From e2d50d7922a1247ab8e679f150f6a33ca9caefb4 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Thu, 21 Nov 2024 10:29:47 -0500
Subject: [PATCH] [REFACTOR] Phase out VTA

This PR phases out VTA from the current development main branch. The
component remains available in past releases but is no longer actively
maintained.
---
 .gitmodules | 3 -
 3rdparty/cma/cma.h | 43 -
 3rdparty/cma/cma_api_impl.h | 173 --
 3rdparty/cma/settings.mk | 19 -
 3rdparty/vta-hw | 1 -
 CMakeLists.txt | 1 -
 Makefile | 15 +-
 apps/vta_rpc/start_rpc_server.sh | 22 -
 apps/vta_rpc/start_rpc_server_to_tracker.sh | 26 -
 ci/jenkins/generated/arm_jenkinsfile.groovy | 7 -
 .../generated/cortexm_jenkinsfile.groovy | 7 -
 ci/jenkins/generated/cpu_jenkinsfile.groovy | 12 -
 .../generated/docker_jenkinsfile.groovy | 7 -
 ci/jenkins/generated/gpu_jenkinsfile.groovy | 7 -
 .../generated/hexagon_jenkinsfile.groovy | 7 -
 ci/jenkins/generated/i386_jenkinsfile.groovy | 8 -
 ci/jenkins/generated/lint_jenkinsfile.groovy | 7 -
 .../minimal_cross_isa_jenkinsfile.groovy | 7 -
 .../generated/minimal_jenkinsfile.groovy | 7 -
 ci/jenkins/generated/riscv_jenkinsfile.groovy | 7 -
 ci/jenkins/generated/wasm_jenkinsfile.groovy | 7 -
 .../templates/cpu_jenkinsfile.groovy.j2 | 5 -
 .../templates/i386_jenkinsfile.groovy.j2 | 3 -
 ci/jenkins/templates/utils/Build.groovy.j2 | 7 -
 ci/jenkins/unity_jenkinsfile.groovy | 6 -
 cmake/modules/VTA.cmake | 152 --
 docs/conf.py | 17 -
 docs/faq.rst | 69 -
 docs/how_to/legacy_index.rst | 1 -
 docs/index.rst | 1 -
 docs/legacy_redirect.py | 272 ----
 docs/reference/langref/relay_pattern.rst | 2 +-
 docs/topic/vta/.gitignore | 1 -
 docs/topic/vta/dev/config.rst | 73 -
 docs/topic/vta/dev/hardware.rst | 298 ----
 docs/topic/vta/dev/index.rst | 31 -
 docs/topic/vta/index.rst | 55 -
 docs/topic/vta/install.rst | 488 ------
 .../how_to/work_with_schedules/tensorize.py | 2 -
 python/tvm/autotvm/measure/measure_methods.py | 54 +-
 python/tvm/autotvm/task/relay_integration.py | 11 -
 python/tvm/target/__init__.py | 1 -
 python/tvm/target/target.py | 6 -
 tests/azure-pipelines/main.yml | 81 -
 tests/lint/blocklint.sh | 2 +-
 tests/lint/cpplint.sh | 3 +-
 tests/lint/pylint.sh | 1 -
 .../python/contrib/test_verilator/__init__.py | 18 -
 .../contrib/test_verilator/infrastructure.py | 198 ---
 .../contrib/test_verilator/test_mobilenet.py | 245 ---
 .../test_verilator/test_verilator_ops.py | 199 ---
 tests/python/target/test_target_target.py | 4 +-
 tests/scripts/ci.py | 2 -
 tests/scripts/release/make_notes.py | 1 -
 tests/scripts/task_build.py | 3 +-
 tests/scripts/task_config_build_arm.sh | 1 -
 tests/scripts/task_config_build_gpu.sh | 1 -
 tests/scripts/task_config_build_wasm.sh | 1 -
 tests/scripts/task_cpp_unittest.sh | 5 -
 tests/scripts/task_microtvm_cpp_tests.sh | 3 -
 tests/scripts/task_python_vta_fsim.sh | 32 -
 tests/scripts/task_python_vta_tsim.sh | 50 -
 vta/README.md | 33 -
 vta/python/vta/__init__.py | 38 -
 vta/python/vta/autotvm.py | 52 -
 vta/python/vta/bitstream.py | 92 --
 vta/python/vta/build_module.py | 199 ---
 vta/python/vta/environment.py | 266 ----
 vta/python/vta/exec/__init__.py | 18 -
 vta/python/vta/exec/rpc_server.py | 170 --
 vta/python/vta/intrin.py | 139 --
 vta/python/vta/libinfo.py | 80 -
 vta/python/vta/program_bitstream.py | 85 -
 vta/python/vta/rpc_client.py | 66 -
 vta/python/vta/testing/__init__.py | 20 -
 vta/python/vta/testing/simulator.py | 113 --
 vta/python/vta/testing/utils.py | 81 -
 vta/python/vta/top/__init__.py | 27 -
 vta/python/vta/top/bitpack.py | 91 --
 vta/python/vta/top/graphpack.py | 628 --------
 vta/python/vta/top/op.py | 268 ----
 vta/python/vta/top/utils.py | 26 -
 vta/python/vta/top/vta_conv2d.py | 196 ---
 vta/python/vta/top/vta_conv2d_transpose.py | 205 ---
 vta/python/vta/top/vta_dense.py | 171 --
 vta/python/vta/top/vta_group_conv2d.py | 207 ---
 vta/python/vta/transform.py | 1123 -------------
 vta/runtime/device_api.cc | 95 --
 vta/runtime/runtime.cc | 1417 -----------------
 vta/runtime/runtime.h | 261 ---
 vta/scripts/tune_conv2d.py | 180 ---
 vta/scripts/tune_conv2d_transpose.py | 171 --
 vta/scripts/tune_dense.py | 137 --
 vta/scripts/tune_group_conv2d.py | 175 --
 vta/tests/python/de10nano/test_program_rpc.py | 47 -
 .../python/integration/test_benchmark_gemm.py | 287 ----
 .../integration/test_benchmark_topi_conv2d.py | 318 ----
 .../test_benchmark_topi_conv2d_transpose.py | 305 ----
 .../integration/test_benchmark_topi_dense.py | 215 ---
 .../test_benchmark_topi_group_conv2d.py | 315 ----
 vta/tests/python/pynq/test_program_rpc.py | 47 -
 vta/tests/python/unittest/test_environment.py | 37 -
 vta/tests/python/unittest/test_vta_insn.py | 569 -------
 vta/tutorials/README.txt | 5 -
 vta/tutorials/frontend/README.txt | 4 -
 vta/tutorials/frontend/deploy_detection.py | 322 ----
 vta/tutorials/matrix_multiply.py | 474 ------
 vta/tutorials/optimize/README.txt | 2 -
 vta/tutorials/optimize/convolution_opt.py | 458 ------
 vta/tutorials/optimize/matrix_multiply_opt.py | 374 -----
 vta/tutorials/vta_get_started.py | 405 -----
 111 files changed, 30 insertions(+), 13782 deletions(-)

 delete mode 100644 3rdparty/cma/cma.h
 delete mode 100644 3rdparty/cma/cma_api_impl.h
 delete mode 100644 3rdparty/cma/settings.mk
 delete mode 160000 3rdparty/vta-hw
 delete mode 100755 apps/vta_rpc/start_rpc_server.sh
 delete mode 100755 apps/vta_rpc/start_rpc_server_to_tracker.sh
 delete mode 100644 cmake/modules/VTA.cmake
 delete mode 100644 docs/faq.rst
 delete mode 100644 docs/legacy_redirect.py
 delete mode 100644 docs/topic/vta/.gitignore
 delete mode 100644 docs/topic/vta/dev/config.rst
 delete mode 100644 docs/topic/vta/dev/hardware.rst
 delete mode 100644 docs/topic/vta/dev/index.rst
 delete mode 100644 docs/topic/vta/index.rst
 delete mode 100644 docs/topic/vta/install.rst
 delete mode 100644 tests/azure-pipelines/main.yml
 delete mode 100644 tests/python/contrib/test_verilator/__init__.py
 delete mode 100644 tests/python/contrib/test_verilator/infrastructure.py
 delete mode 100644 tests/python/contrib/test_verilator/test_mobilenet.py
 delete mode 100644 tests/python/contrib/test_verilator/test_verilator_ops.py
 delete mode 100644 vta/README.md
 delete mode 100644 vta/python/vta/__init__.py
 delete mode 100644 vta/python/vta/autotvm.py
 delete mode 100644 vta/python/vta/bitstream.py
 delete mode 100644 vta/python/vta/build_module.py
 delete mode 100644 vta/python/vta/environment.py
 delete mode 100644 vta/python/vta/exec/__init__.py
 delete mode 100644 vta/python/vta/exec/rpc_server.py
 delete mode 100644 vta/python/vta/intrin.py
 delete mode 100644 vta/python/vta/libinfo.py
 delete mode 100644 vta/python/vta/program_bitstream.py
 delete mode 100644 vta/python/vta/rpc_client.py
 delete mode 100644 vta/python/vta/testing/__init__.py
 delete mode 100644 vta/python/vta/testing/simulator.py
 delete mode 100644 vta/python/vta/testing/utils.py
 delete mode 100644 vta/python/vta/top/__init__.py
 delete mode 100644 vta/python/vta/top/bitpack.py
 delete mode 100644 vta/python/vta/top/graphpack.py
 delete mode 100644 vta/python/vta/top/op.py
 delete mode 100644 vta/python/vta/top/utils.py
 delete mode 100644 vta/python/vta/top/vta_conv2d.py
 delete mode 100644
vta/python/vta/top/vta_conv2d_transpose.py delete mode 100644 vta/python/vta/top/vta_dense.py delete mode 100644 vta/python/vta/top/vta_group_conv2d.py delete mode 100644 vta/python/vta/transform.py delete mode 100644 vta/runtime/device_api.cc delete mode 100644 vta/runtime/runtime.cc delete mode 100644 vta/runtime/runtime.h delete mode 100644 vta/scripts/tune_conv2d.py delete mode 100644 vta/scripts/tune_conv2d_transpose.py delete mode 100644 vta/scripts/tune_dense.py delete mode 100644 vta/scripts/tune_group_conv2d.py delete mode 100644 vta/tests/python/de10nano/test_program_rpc.py delete mode 100644 vta/tests/python/integration/test_benchmark_gemm.py delete mode 100644 vta/tests/python/integration/test_benchmark_topi_conv2d.py delete mode 100644 vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py delete mode 100644 vta/tests/python/integration/test_benchmark_topi_dense.py delete mode 100644 vta/tests/python/integration/test_benchmark_topi_group_conv2d.py delete mode 100644 vta/tests/python/pynq/test_program_rpc.py delete mode 100644 vta/tests/python/unittest/test_environment.py delete mode 100644 vta/tests/python/unittest/test_vta_insn.py delete mode 100644 vta/tutorials/README.txt delete mode 100644 vta/tutorials/frontend/README.txt delete mode 100644 vta/tutorials/frontend/deploy_detection.py delete mode 100644 vta/tutorials/matrix_multiply.py delete mode 100644 vta/tutorials/optimize/README.txt delete mode 100644 vta/tutorials/optimize/convolution_opt.py delete mode 100644 vta/tutorials/optimize/matrix_multiply_opt.py delete mode 100644 vta/tutorials/vta_get_started.py diff --git a/.gitmodules b/.gitmodules index b5102d9a9b0b..cb22b3d3d38e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,9 +7,6 @@ [submodule "3rdparty/rang"] path = 3rdparty/rang url = https://github.com/agauniyal/rang.git -[submodule "3rdparty/vta-hw"] - path = 3rdparty/vta-hw - url = https://github.com/apache/tvm-vta.git [submodule "3rdparty/libbacktrace"] path = 3rdparty/libbacktrace url = https://github.com/tlc-pack/libbacktrace.git diff --git a/3rdparty/cma/cma.h b/3rdparty/cma/cma.h deleted file mode 100644 index 2cd550122614..000000000000 --- a/3rdparty/cma/cma.h +++ /dev/null @@ -1,43 +0,0 @@ -/* cma.h - * - * The MIT License (MIT) - * - * COPYRIGHT (C) 2017 Institute of Electronics and Computer Science (EDI), Latvia. - * AUTHOR: Rihards Novickis (rihards.novickis@edi.lv) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#ifndef VTA_DE10_NANO_KERNEL_MODULE_CMA_H_ -#define VTA_DE10_NANO_KERNEL_MODULE_CMA_H_ - -/* Should be defined in settings.mk file */ -#ifndef CMA_IOCTL_MAGIC -#define CMA_IOCTL_MAGIC 0xf2 -#endif - -#define CMA_ALLOC_CACHED _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 1, 4) -#define CMA_ALLOC_NONCACHED _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 2, 4) -#define CMA_FREE _IOC(_IOC_WRITE, CMA_IOCTL_MAGIC, 3, 4) -#define CMA_GET_PHY_ADDR _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 4, 4) -#define CMA_GET_SIZE _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 5, 4) - -#define CMA_IOCTL_MAXNR 5 - -#endif // VTA_DE10_NANO_KERNEL_MODULE_CMA_H_ diff --git a/3rdparty/cma/cma_api_impl.h b/3rdparty/cma/cma_api_impl.h deleted file mode 100644 index 317be5c9af1a..000000000000 --- a/3rdparty/cma/cma_api_impl.h +++ /dev/null @@ -1,173 +0,0 @@ -/* - * The MIT License (MIT) - * - * COPYRIGHT (C) 2017 Institute of Electronics and Computer Science (EDI), Latvia. - * AUTHOR: Rihards Novickis (rihards.novickis@edi.lv) - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - */ - -/*! - * Copyright (c) 2018 by Contributors - * \file cma_api.cc - * \brief Application layer implementation for contigous memory allocation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cma_api.h" - -#ifndef CMA_IOCTL_MAGIC -#define CMA_IOCTL_MAGIC 0xf2 -#endif - -#define CMA_ALLOC_CACHED _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 1, 4) -#define CMA_ALLOC_NONCACHED _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 2, 4) -#define CMA_FREE _IOC(_IOC_WRITE, CMA_IOCTL_MAGIC, 3, 4) -#define CMA_GET_PHY_ADDR _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 4, 4) -#define CMA_GET_SIZE _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 5, 4) - -#define CMA_IOCTL_MAXNR 5 - -#ifndef CMA_DEBUG -#define CMA_DEBUG 0 -#endif -#ifndef DRIVER_NODE_NAME -#define DRIVER_NODE_NAME "cma" -#endif - -#if CMA_DEBUG == 1 -#define __DEBUG(fmt, args...) printf("CMA_API_DEBUG: " fmt, ##args) -#else -#define __DEBUG(fmt, args...) 
-#endif - -#define ROUND_UP(N, S) ((((N) + (S)-1) / (S)) * (S)) - -/* Private functions */ -void* cma_alloc(size_t size, unsigned ioctl_cmd); - -/* Global file descriptor */ -int cma_fd = 0; - -int cma_init(void) { - __DEBUG("Opening \"/dev/" DRIVER_NODE_NAME "\" file\n"); - - cma_fd = open("/dev/" DRIVER_NODE_NAME, O_RDWR); - if (cma_fd == -1) { - __DEBUG("Failed to initialize api - \"%s\"\n", strerror(errno)); - return -1; - } - - return 0; -} - -int cma_release(void) { - __DEBUG("Closing \"/dev/" DRIVER_NODE_NAME "\" file\n"); - - if (close(cma_fd) == -1) { - __DEBUG("Failed to finilize api - \"%s\"\n", strerror(errno)); - return -1; - } - - return 0; -} - -void* cma_alloc_cached(size_t size) { return cma_alloc(size, CMA_ALLOC_CACHED); } - -void* cma_alloc_noncached(size_t size) { return cma_alloc(size, CMA_ALLOC_NONCACHED); } - -int cma_free(void* mem) { - __DEBUG("Releasing contigous memory from 0x%x\n", (unsigned)mem); - unsigned data, v_addr; - - /* save user space pointer value */ - data = (unsigned)mem; - v_addr = (unsigned)mem; - - if (ioctl(cma_fd, CMA_GET_SIZE, &data) == -1) { - __DEBUG("cma_free - ioctl command unsuccsessful - 0\n"); - return -1; - } - /* data now contains size */ - - /* unmap memory */ - munmap(mem, data); - - /* free cma entry */ - if (ioctl(cma_fd, CMA_FREE, &v_addr) == -1) { - __DEBUG("cma_free - ioctl command unsuccsessful - 1\n"); - return -1; - } - - return 0; -} - -unsigned cma_get_phy_addr(void* mem) { - unsigned data; - __DEBUG("Getting physical address from 0x%x\n", (unsigned)mem); - - /* save user space pointer value */ - data = (unsigned)mem; - - /* get physical address */ - if (ioctl(cma_fd, CMA_GET_PHY_ADDR, &data) == -1) { - __DEBUG("cma_free - ioctl command unsuccsessful\n"); - return 0; - } - /* data now contains physical address */ - - return data; -} - -void* cma_alloc(size_t size, unsigned ioctl_cmd) { - unsigned data; - void* mem; - __DEBUG("Allocating 0x%x bytes of contigous memory\n", size); - - /* Page align size */ - size = ROUND_UP(size, getpagesize()); - - /* ioctl cmd to allocate contigous memory */ - data = (unsigned)size; - if (ioctl(cma_fd, ioctl_cmd, &data) == -1) { - __DEBUG("cma_alloc - ioctl command unsuccsessful\n"); - return NULL; - } - - /* at this point phy_addr is written to data */ - - /* mmap memory */ - mem = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, cma_fd, data); - if (mem == MAP_FAILED) { - __DEBUG("cma_alloc - mmap unsuccsessful\n"); - return NULL; - } - - return mem; -} diff --git a/3rdparty/cma/settings.mk b/3rdparty/cma/settings.mk deleted file mode 100644 index 7403845023d5..000000000000 --- a/3rdparty/cma/settings.mk +++ /dev/null @@ -1,19 +0,0 @@ -# ==================== COMPILATION RELATED SETTINGS ==================== -# Path to the kernel sources (from "./driver", if relative path is used) -KSOURCE_DIR=/opt/intel/linux-socfpga-rel_socfpga-4.9.78-ltsi_18.08.02_pr - -# Cross compiler "prepend" string -CROSS_COMPILE=arm-linux-gnueabihf- - -# Architecture -ARCH=arm - -# Compile with debug information -CMA_DEBUG?=0 - -# ==================== DRIVER RELATED SETTINGS ==================== -# Node name used in "/dev" folder -DRIVER_NODE_NAME="cma" - -# Unique (across system) ioctl magic number. Every ioctl interface should have one. 
-CMA_IOC_MAGIC=0xf2 diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw deleted file mode 160000 index 36a91576edf6..000000000000 --- a/3rdparty/vta-hw +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 36a91576edf633479c78649e050f18dd2ddc8103 diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fb28c869474..cac2e726fbda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -569,7 +569,6 @@ else() endif() # Module rules -include(cmake/modules/VTA.cmake) include(cmake/modules/StandaloneCrt.cmake) include(cmake/modules/CUDA.cmake) include(cmake/modules/Hexagon.cmake) # This must come before logging.cmake diff --git a/Makefile b/Makefile index d5dd45161cf1..3a880b94a5af 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,8 @@ .PHONY: all \ - runtime vta cpptest crttest \ - lint pylint cpplint scalalint \ + runtime cpptest crttest \ + lint pylint cpplint \ cppdoc docs \ web webclean \ cython cython3 cyclean \ @@ -39,15 +39,10 @@ TVM_BUILD_PATH := $(abspath $(TVM_BUILD_PATH)) # packaged version. DMLC_CORE_PATH ?= $(ROOTDIR)/3rdparty/dmlc-core DLPACK_PATH ?= $(ROOTDIR)/3rdparty/dlpack -VTA_HW_PATH ?= $(ROOTDIR)/3rdparty/vta-hw - - - all: $(addsuffix /all,$(TVM_BUILD_PATH)) runtime: $(addsuffix /runtime,$(TVM_BUILD_PATH)) -vta: $(addsuffix /vta,$(TVM_BUILD_PATH)) cpptest: $(addsuffix /cpptest,$(TVM_BUILD_PATH)) crttest: $(addsuffix /crttest,$(TVM_BUILD_PATH)) @@ -78,7 +73,7 @@ FORCE: # Since the pattern stem is already being used for the directory name, # cannot also have it refer to the command passed to cmake. # Therefore, explicitly listing out the delegated. -CMAKE_TARGETS = all runtime vta cpptest crttest clean +CMAKE_TARGETS = all runtime cpptest crttest clean define GEN_CMAKE_RULE %/$(CMAKE_TARGET): %/CMakeCache.txt FORCE @@ -107,10 +102,6 @@ pylint: jnilint: python3 3rdparty/dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src -scalalint: - make -C $(VTA_HW_PATH)/hardware/chisel lint - - mypy: tests/scripts/task_mypy.sh diff --git a/apps/vta_rpc/start_rpc_server.sh b/apps/vta_rpc/start_rpc_server.sh deleted file mode 100755 index 46258f9d7962..000000000000 --- a/apps/vta_rpc/start_rpc_server.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )" - -export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python -export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq -python3 -m vta.exec.rpc_server diff --git a/apps/vta_rpc/start_rpc_server_to_tracker.sh b/apps/vta_rpc/start_rpc_server_to_tracker.sh deleted file mode 100755 index 40d01557fe23..000000000000 --- a/apps/vta_rpc/start_rpc_server_to_tracker.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -PROJROOT="$( cd "$( dirname '${BASH_SOURCE[0]}' )/../../" && pwd )" - -# Derive target specified by vta_config.json -VTA_CONFIG=${VTA_HW_PATH}/config/vta_config.py -TARGET=$(python ${VTA_CONFIG} --target) - -export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python -export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq -python3 -m vta.exec.rpc_server --tracker fleet:9190 --key $TARGET diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy index 199619bed0d8..d4447d2ca81d 100644 --- a/ci/jenkins/generated/arm_jenkinsfile.groovy +++ b/ci/jenkins/generated/arm_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/cortexm_jenkinsfile.groovy b/ci/jenkins/generated/cortexm_jenkinsfile.groovy index 8f2c3ace05ea..8efdf23f9f88 100644 --- a/ci/jenkins/generated/cortexm_jenkinsfile.groovy +++ b/ci/jenkins/generated/cortexm_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy index 8e90daa6d2b7..a97ae1484f0c 100644 --- a/ci/jenkins/generated/cpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ @@ -812,11 +805,6 @@ def shard_run_unittest_CPU_1_of_1(node_type='CPU-SMALL-SPOT', on_demand=false) { cpp_unittest(ci_cpu) micro_cpp_unittest(ci_cpu) python_unittest(ci_cpu) - fsim_test(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh", - label: 'Run VTA tests in TSIM', - ) }) } } finally { diff --git a/ci/jenkins/generated/docker_jenkinsfile.groovy b/ci/jenkins/generated/docker_jenkinsfile.groovy index dbef6d2cae54..0451f698488d 100644 --- a/ci/jenkins/generated/docker_jenkinsfile.groovy +++ b/ci/jenkins/generated/docker_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/gpu_jenkinsfile.groovy 
b/ci/jenkins/generated/gpu_jenkinsfile.groovy index 95448b779815..0c9e48fc9d0b 100644 --- a/ci/jenkins/generated/gpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/gpu_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/hexagon_jenkinsfile.groovy b/ci/jenkins/generated/hexagon_jenkinsfile.groovy index 9f1e13fb8a56..0aaaec858a9b 100644 --- a/ci/jenkins/generated/hexagon_jenkinsfile.groovy +++ b/ci/jenkins/generated/hexagon_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/i386_jenkinsfile.groovy b/ci/jenkins/generated/i386_jenkinsfile.groovy index cd0e43ee520b..840a0a5d9d8b 100644 --- a/ci/jenkins/generated/i386_jenkinsfile.groovy +++ b/ci/jenkins/generated/i386_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ @@ -667,7 +660,6 @@ def shard_run_python_i386_2_of_3(node_type='CPU-SMALL-SPOT', on_demand=false) { script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", label: 'Run i386 integration tests', ) - fsim_test(ci_i386) }) } } finally { diff --git a/ci/jenkins/generated/lint_jenkinsfile.groovy b/ci/jenkins/generated/lint_jenkinsfile.groovy index 80dc952187d7..ac7796b329fd 100644 --- a/ci/jenkins/generated/lint_jenkinsfile.groovy +++ b/ci/jenkins/generated/lint_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/minimal_cross_isa_jenkinsfile.groovy b/ci/jenkins/generated/minimal_cross_isa_jenkinsfile.groovy index 5a7b85c762dc..76c0bd5830f5 100644 --- a/ci/jenkins/generated/minimal_cross_isa_jenkinsfile.groovy +++ b/ci/jenkins/generated/minimal_cross_isa_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/minimal_jenkinsfile.groovy b/ci/jenkins/generated/minimal_jenkinsfile.groovy index fb649ea1b6d7..7f57cef32e64 100644 --- a/ci/jenkins/generated/minimal_jenkinsfile.groovy +++ b/ci/jenkins/generated/minimal_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/riscv_jenkinsfile.groovy b/ci/jenkins/generated/riscv_jenkinsfile.groovy index 26f6aad9466c..1667cd02c994 100644 --- a/ci/jenkins/generated/riscv_jenkinsfile.groovy +++ 
b/ci/jenkins/generated/riscv_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/generated/wasm_jenkinsfile.groovy b/ci/jenkins/generated/wasm_jenkinsfile.groovy index 7deb5898efcb..84f511de9558 100644 --- a/ci/jenkins/generated/wasm_jenkinsfile.groovy +++ b/ci/jenkins/generated/wasm_jenkinsfile.groovy @@ -480,13 +480,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 index a7142ef4d09b..3fb21863ccfc 100644 --- a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 @@ -72,11 +72,6 @@ cpp_unittest(ci_cpu) micro_cpp_unittest(ci_cpu) python_unittest(ci_cpu) - fsim_test(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh", - label: 'Run VTA tests in TSIM', - ) {% endcall %} {% call(shard_index, num_shards) m.sharded_test_step( name="frontend: CPU", diff --git a/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 b/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 index 56a2d7d9f18e..78cf8cb2c04c 100644 --- a/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/i386_jenkinsfile.groovy.j2 @@ -58,9 +58,6 @@ script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", label: 'Run i386 integration tests', ) - {% if shard_index == 2 or num_shards < 2 %} - fsim_test(ci_i386) - {% endif %} {% endcall %} {{ m.invoke_tests(test_method_names) -}} diff --git a/ci/jenkins/templates/utils/Build.groovy.j2 b/ci/jenkins/templates/utils/Build.groovy.j2 index 311222747410..ce05f1c62b14 100644 --- a/ci/jenkins/templates/utils/Build.groovy.j2 +++ b/ci/jenkins/templates/utils/Build.groovy.j2 @@ -12,13 +12,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} - def make_standalone_crt(image, build_dir) { sh ( script: """ diff --git a/ci/jenkins/unity_jenkinsfile.groovy b/ci/jenkins/unity_jenkinsfile.groovy index 2a7a4fee3797..3e6213ff265e 100755 --- a/ci/jenkins/unity_jenkinsfile.groovy +++ b/ci/jenkins/unity_jenkinsfile.groovy @@ -309,12 +309,6 @@ def python_unittest(image) { ) } -def fsim_test(image) { - sh ( - script: "${docker_run} ${image} ./tests/scripts/task_python_vta_fsim.sh", - label: 'Run VTA tests in FSIM', - ) -} def cmake_build(image, path, make_flag) { sh ( diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake deleted file mode 100644 index f3fd4732519c..000000000000 --- a/cmake/modules/VTA.cmake +++ /dev/null @@ -1,152 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# CMake Build rules for VTA -find_program(PYTHON NAMES python python3 python3.6) - -# Throw error if VTA_HW_PATH is not set -if(NOT DEFINED ENV{VTA_HW_PATH}) - set(VTA_HW_PATH ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw) -else() - set(VTA_HW_PATH $ENV{VTA_HW_PATH}) -endif() - -if(MSVC) - message(STATUS "VTA build is skipped in Windows..") -elseif(NOT EXISTS ${VTA_HW_PATH}) - if (USE_VTA_TSIM OR USE_VTA_FSIM OR USE_UFPGA) - message(FATAL_ERROR "VTA path " ${VTA_HW_PATH} " does not exist") - endif() -elseif(PYTHON) - message(STATUS "VTA build with VTA_HW_PATH=" ${VTA_HW_PATH}) - set(VTA_CONFIG ${PYTHON} ${VTA_HW_PATH}/config/vta_config.py) - - if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/vta_config.json) - message(STATUS "Use VTA config " ${CMAKE_CURRENT_BINARY_DIR}/vta_config.json) - set(VTA_CONFIG ${PYTHON} ${VTA_HW_PATH}/config/vta_config.py - --use-cfg=${CMAKE_CURRENT_BINARY_DIR}/vta_config.json) - endif() - - execute_process(COMMAND ${VTA_CONFIG} --target OUTPUT_VARIABLE VTA_TARGET OUTPUT_STRIP_TRAILING_WHITESPACE) - - message(STATUS "Build VTA runtime with target: " ${VTA_TARGET}) - - execute_process(COMMAND ${VTA_CONFIG} --defs OUTPUT_VARIABLE __vta_defs) - - string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_=.]*" VTA_DEFINITIONS "${__vta_defs}") - - # Fast simulator driver build - if(USE_VTA_FSIM) - # Add fsim driver sources - tvm_file_glob(GLOB FSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/*.cc) - tvm_file_glob(GLOB FSIM_RUNTIME_SRCS vta/runtime/*.cc) - list(APPEND FSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/sim/sim_driver.cc) - list(APPEND FSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/sim/sim_tlpp.cc) - list(APPEND FSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc) - # Target lib: vta_fsim - add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS}) - target_include_directories(vta_fsim SYSTEM PUBLIC ${VTA_HW_PATH}/include) - target_compile_definitions(vta_fsim PUBLIC DMLC_USE_LOGGING_LIBRARY=) - foreach(__def ${VTA_DEFINITIONS}) - string(SUBSTRING ${__def} 3 -1 __strip_def) - target_compile_definitions(vta_fsim PUBLIC ${__strip_def}) - endforeach() - if(APPLE) - set_property(TARGET vta_fsim APPEND PROPERTY LINK_FLAGS "-undefined dynamic_lookup") - endif(APPLE) - target_compile_definitions(vta_fsim PUBLIC USE_FSIM_TLPP) - endif() - - # Cycle accurate simulator driver build - if(USE_VTA_TSIM) - if(DEFINED ENV{VERILATOR_INC_DIR}) - set(VERILATOR_INC_DIR $ENV{VERILATOR_INC_DIR}) - elseif (EXISTS /usr/local/share/verilator/include) - set(VERILATOR_INC_DIR /usr/local/share/verilator/include) - elseif (EXISTS /usr/share/verilator/include) - set(VERILATOR_INC_DIR /usr/share/verilator/include) - else() - message(STATUS "Verilator not found in /usr/local/share/verilator/include") - message(STATUS "Verilator not found in /usr/share/verilator/include") - message(FATAL_ERROR "Cannot find Verilator, VERILATOR_INC_DIR is not defined") - endif() - # Add tsim driver sources - tvm_file_glob(GLOB TSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/*.cc) - tvm_file_glob(GLOB TSIM_RUNTIME_SRCS vta/runtime/*.cc) - list(APPEND TSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/tsim/tsim_driver.cc) - list(APPEND TSIM_RUNTIME_SRCS 
${VTA_HW_PATH}/src/dpi/module.cc) - list(APPEND TSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc) - # Target lib: vta_tsim - add_library(vta_tsim SHARED ${TSIM_RUNTIME_SRCS}) - target_include_directories(vta_tsim SYSTEM PUBLIC ${VTA_HW_PATH}/include ${VERILATOR_INC_DIR} ${VERILATOR_INC_DIR}/vltstd) - target_compile_definitions(vta_tsim PUBLIC DMLC_USE_LOGGING_LIBRARY=) - foreach(__def ${VTA_DEFINITIONS}) - string(SUBSTRING ${__def} 3 -1 __strip_def) - target_compile_definitions(vta_tsim PUBLIC ${__strip_def}) - endforeach() - if(APPLE) - set_property(TARGET vta_fsim APPEND PROPERTY LINK_FLAGS "-undefined dynamic_lookup") - endif(APPLE) - endif() - - # VTA FPGA driver sources - if(USE_VTA_FPGA) - tvm_file_glob(GLOB FSIM_RUNTIME_SRCS ${VTA_HW_PATH}/src/*.cc) - tvm_file_glob(GLOB FPGA_RUNTIME_SRCS vta/runtime/*.cc) - # Rules for Zynq-class FPGAs with pynq OS support (see pynq.io) - if(${VTA_TARGET} STREQUAL "pynq" OR - ${VTA_TARGET} STREQUAL "ultra96") - list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/pynq/pynq_driver.cc) - # Rules for Pynq v2.4 - find_library(__cma_lib NAMES cma PATH /usr/lib) - elseif(${VTA_TARGET} STREQUAL "de10nano") # DE10-Nano rules - tvm_file_glob(GLOB DE10_FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc) - list(APPEND FPGA_RUNTIME_SRCS ${DE10_FPGA_RUNTIME_SRCS}) - elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules - tvm_file_glob(GLOB FOCL_SRC ${VTA_HW_PATH}/src/oclfpga/*.cc) - list(APPEND FPGA_RUNTIME_SRCS ${FOCL_SRC}) - list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h) - endif() - # Target lib: vta - add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) - target_include_directories(vta PUBLIC vta/runtime) - target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) - target_compile_definitions(vta PUBLIC DMLC_USE_LOGGING_LIBRARY=) - foreach(__def ${VTA_DEFINITIONS}) - string(SUBSTRING ${__def} 3 -1 __strip_def) - target_compile_definitions(vta PUBLIC ${__strip_def}) - endforeach() - if(${VTA_TARGET} STREQUAL "pynq" OR - ${VTA_TARGET} STREQUAL "ultra96") - target_link_libraries(vta ${__cma_lib}) - elseif(${VTA_TARGET} STREQUAL "de10nano") # DE10-Nano rules - #target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21) - target_include_directories(vta SYSTEM PUBLIC ${VTA_HW_PATH}/src/de10nano) - target_include_directories(vta SYSTEM PUBLIC 3rdparty) - target_include_directories(vta SYSTEM PUBLIC - "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") - elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules - target_include_directories(vta PUBLIC 3rdparty) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") - target_link_libraries(vta -lOpenCL) - endif() - endif() - - -else() - message(STATUS "Cannot found python in env, VTA build is skipped..") -endif() diff --git a/docs/conf.py b/docs/conf.py index acc03161e559..c858e9c45045 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -53,7 +53,6 @@ tvm_path = Path(os.pardir) sys.path.insert(0, str(tvm_path.resolve() / "python")) -sys.path.insert(0, str(tvm_path.resolve() / "vta" / "python")) sys.path.insert(0, str(tvm_path.resolve() / "docs")) # -- General configuration ------------------------------------------------ @@ -420,7 +419,6 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): tvm_path.joinpath("gallery", "how_to", "tune_with_autoscheduler"), tvm_path.joinpath("gallery", "how_to", "work_with_microtvm"), 
tvm_path.joinpath("gallery", "how_to", "extend_tvm"), - tvm_path.joinpath("vta", "tutorials"), # New tutorial structure under docs folder tvm_path.joinpath("docs", "get_started", "tutorials"), tvm_path.joinpath("docs", "how_to", "tutorials"), @@ -440,7 +438,6 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): "how_to/tune_with_autoscheduler", "how_to/work_with_microtvm", "how_to/extend_tvm", - "topic/vta/tutorials", # New tutorial structure under docs folder "get_started/tutorials/", "how_to/tutorials/", @@ -448,15 +445,6 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): "deep_dive/tensor_ir/tutorials/", ] - -subsection_order = ExplicitOrder( - str(p) - for p in [ - tvm_path / "vta" / "tutorials" / "frontend", - tvm_path / "vta" / "tutorials" / "optimize", - ] -) - # Explicitly define the order within a subsection. # The listed files are sorted according to the list. # The unlisted files are sorted by filenames. @@ -575,7 +563,6 @@ def force_gc(gallery_conf, fname): "examples_dirs": examples_dirs, "within_subsection_order": WithinSubsectionOrder, "gallery_dirs": gallery_dirs, - "subsection_order": subsection_order, "filename_pattern": os.environ.get("TVM_TUTORIAL_EXEC_PATTERN", filename_pattern_default), "download_all_examples": False, "min_reported_time": 60, @@ -769,9 +756,6 @@ def process_docstring(app, what, name, obj, options, lines): distinguish_class_name(name, lines) -from legacy_redirect import build_legacy_redirect - - def strip_ipython_magic(app, docname, source): """Prevents IPython magic commands from being rendered in HTML files. @@ -784,4 +768,3 @@ def strip_ipython_magic(app, docname, source): def setup(app): app.connect("source-read", strip_ipython_magic) app.connect("autodoc-process-docstring", process_docstring) - app.connect("build-finished", build_legacy_redirect(tvm_path)) diff --git a/docs/faq.rst b/docs/faq.rst deleted file mode 100644 index 4104c82a12a3..000000000000 --- a/docs/faq.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - - -Frequently Asked Questions -========================== - - -How to Install --------------- -See :ref:`installation`. - - -How to add a new Hardware Backend ---------------------------------- - -- If the hardware backend has LLVM support, - then we can directly generate the code by setting the correct target triple as in :py:mod:`~tvm.target`. -- If the target hardware is a GPU, try to use the cuda, opencl or vulkan backend. -- If the target hardware is a special accelerator, - checkout :ref:`vta-index` and :ref:`relay-bring-your-own-codegen`. -- For all of the above cases, You may want to add target specific - optimization templates using AutoTVM, see :ref:`tutorials-autotvm-sec`. 
-- Besides using LLVM's vectorization, we can also embed micro-kernels to leverage hardware intrinsics, - see :ref:`tutorials-tensorize`. - - -TVM's relation to Other IR/DSL Projects ---------------------------------------- -There are usually two levels of abstractions of IR in the deep learning systems. -TensorFlow's XLA and Intel's ngraph both use a computation graph representation. -This representation is high level, and can be helpful to perform generic optimizations -such as memory reuse, layout transformation and automatic differentiation. - -TVM adopts a low-level representation, that explicitly express the choice of memory -layout, parallelization pattern, locality and hardware primitives etc. -This level of IR is closer to directly target hardwares. -The low-level IR adopts ideas from existing image processing languages like Halide, darkroom -and loop transformation tools like loopy and polyhedra-based analysis. -We specifically focus on expressing deep learning workloads (e.g. recurrence), -optimization for different hardware backends and embedding with frameworks to provide -end-to-end compilation stack. - - -TVM's relation to libDNN, cuDNN -------------------------------- -TVM can incorporate these libraries as external calls. One goal of TVM is to be able to -generate high-performing kernels. We will evolve TVM an incremental manner as -we learn from the techniques of manual kernel crafting and add these as primitives in DSL. -See also top for recipes of operators in TVM. - - -Security --------- -See :ref:`dev-security` diff --git a/docs/how_to/legacy_index.rst b/docs/how_to/legacy_index.rst index a98e04c96978..d675adbee2da 100644 --- a/docs/how_to/legacy_index.rst +++ b/docs/how_to/legacy_index.rst @@ -35,4 +35,3 @@ schedule with tesor expressions?" work_with_microtvm/index extend_tvm/index profile/index - ../faq diff --git a/docs/index.rst b/docs/index.rst index 3abc39e82fd1..041931552b03 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -75,7 +75,6 @@ driving its costs down. dev/how_to/how_to.rst reference/langref/index topic/microtvm/index - topic/vta/index .. toctree:: :maxdepth: 1 diff --git a/docs/legacy_redirect.py b/docs/legacy_redirect.py deleted file mode 100644 index 502c7dd0b5bf..000000000000 --- a/docs/legacy_redirect.py +++ /dev/null @@ -1,272 +0,0 @@ -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from string import Template -import json -import os - -legacy_redirects = [ - ["dev/benchmark.html", "../arch/benchmark.html"], - ["dev/convert_layout.html", "../arch/convert_layout.html"], - ["dev/debugger.html", "../arch/debugger.html"], - ["dev/device_target_interactions.html", "../arch/device_target_interactions.html"], - ["dev/frontend/tensorflow.html", "../../arch/frontend/tensorflow.html"], - ["dev/hybrid_script.html", "../arch/hybrid_script.html"], - ["dev/index.html", "../arch/index.html"], - ["dev/inferbound.html", "../arch/inferbound.html"], - [ - "dev/introduction_to_module_serialization.html", - "../arch/introduction_to_module_serialization.html", - ], - ["dev/microtvm_design.html", "../arch/microtvm_design.html"], - ["dev/model_library_format.html", "../arch/model_library_format.html"], - ["dev/pass_infra.html", "../arch/pass_infra.html"], - ["dev/relay_intro.html", "../arch/relay_intro.html"], - ["dev/relay_op_strategy.html", "../arch/relay_op_strategy.html"], - ["dev/runtime.html", "../arch/runtime.html"], - ["dev/runtimes/vulkan.html", "../../arch/runtimes/vulkan.html"], - ["dev/security.html", "../arch/security.html"], - ["dev/virtual_machine.html", "../arch/virtual_machine.html"], - ["dev/how_to.html", "index.html"], - ["dev/pytest_target_parametrization.html", "how_to/pytest_target_parametrization.html"], - ["dev/relay_add_op.html", "how_to/relay_add_op.html"], - ["dev/relay_add_pass.html", "how_to/relay_add_pass.html"], - ["dev/relay_bring_your_own_codegen.html", "how_to/relay_bring_your_own_codegen.html"], - ["dev/codebase_walkthrough.html", "tutorial/codebase_walkthrough.html"], - ["deploy/android.html", "../how_to/deploy/android.html"], - ["deploy/arm_compute_lib.html", "../how_to/deploy/arm_compute_lib.html"], - ["deploy/bnns.html", "../how_to/deploy/bnns.html"], - ["deploy/cpp_deploy.html", "../how_to/deploy/cpp_deploy.html"], - ["deploy/hls.html", "../how_to/deploy/hls.html"], - ["deploy/index.html", "../how_to/deploy/index.html"], - ["deploy/integrate.html", "../how_to/deploy/integrate.html"], - ["deploy/tensorrt.html", "../how_to/deploy/tensorrt.html"], - ["deploy/vitis_ai.html", "../how_to/deploy/vitis_ai.html"], - ["profiling/index.html", "../how_to/profile/index.html"], - ["profiling/papi.html", "../how_to/profile/papi.html"], - ["api/links.html", "../reference/api/links.html"], - ["api/python/auto_scheduler.html", "../../reference/api/python/auto_scheduler.html"], - ["api/python/autotvm.html", "../../reference/api/python/autotvm.html"], - ["api/python/contrib.html", "../../reference/api/python/contrib.html"], - ["api/python/driver.html", "../../reference/api/python/driver.html"], - ["api/python/error.html", "../../reference/api/python/error.html"], - ["api/python/graph_executor.html", "../../reference/api/python/graph_executor.html"], - ["api/python/index.html", "../../reference/api/python/index.html"], - ["api/python/ir.html", "../../reference/api/python/ir.html"], - ["api/python/micro.html", "../../reference/api/python/micro.html"], - ["api/python/ndarray.html", "../../reference/api/python/ndarray.html"], - ["api/python/relay/analysis.html", "../../../reference/api/python/relay/analysis.html"], - ["api/python/relay/backend.html", "../../../reference/api/python/relay/backend.html"], - [ - "api/python/relay/dataflow_pattern.html", - "../../../reference/api/python/relay/dataflow_pattern.html", - ], - ["api/python/relay/frontend.html", "../../../reference/api/python/relay/frontend.html"], - ["api/python/relay/image.html", 
"../../../reference/api/python/relay/image.html"], - ["api/python/relay/index.html", "../../../reference/api/python/relay/index.html"], - ["api/python/relay/nn.html", "../../../reference/api/python/relay/nn.html"], - ["api/python/relay/testing.html", "../../../reference/api/python/relay/testing.html"], - ["api/python/relay/transform.html", "../../../reference/api/python/relay/transform.html"], - ["api/python/relay/vision.html", "../../../reference/api/python/relay/vision.html"], - ["api/python/rpc.html", "../../reference/api/python/rpc.html"], - ["api/python/runtime.html", "../../reference/api/python/runtime.html"], - ["api/python/target.html", "../../reference/api/python/target.html"], - ["api/python/te.html", "../../reference/api/python/te.html"], - ["api/python/tir.html", "../../reference/api/python/tir.html"], - ["api/python/topi.html", "../../reference/api/python/topi.html"], - ["api/python/vta/index.html", "../../../reference/api/python/vta/index.html"], - ["langref/hybrid_script.html", "../reference/langref/hybrid_script.html"], - ["langref/index.html", "../reference/langref/index.html"], - ["langref/relay_adt.html", "../reference/langref/relay_adt.html"], - ["langref/relay_expr.html", "../reference/langref/relay_expr.html"], - ["langref/relay_op.html", "../reference/langref/relay_op.html"], - ["langref/relay_pattern.html", "../reference/langref/relay_pattern.html"], - ["langref/relay_type.html", "../reference/langref/relay_type.html"], - ["microtvm/index.html", "../topic/microtvm/index.html"], - ["vta/dev/config.html", "../../topic/vta/dev/config.html"], - ["vta/dev/hardware.html", "../../topic/vta/dev/hardware.html"], - ["vta/dev/index.html", "../../topic/vta/dev/index.html"], - ["vta/index.html", "../topic/vta/index.html"], - ["vta/install.html", "../topic/vta/install.html"], - ["tutorials/index.html", "../tutorial/index.html"], - ["tutorials/frontend/from_caffe2.html", "../../how_to/compile_models/from_caffe2.html"], - ["tutorials/frontend/from_coreml.html", "../../how_to/compile_models/from_coreml.html"], - ["tutorials/frontend/from_darknet.html", "../../how_to/compile_models/from_darknet.html"], - ["tutorials/frontend/from_keras.html", "../../how_to/compile_models/from_keras.html"], - ["tutorials/frontend/from_mxnet.html", "../../how_to/compile_models/from_mxnet.html"], - ["tutorials/frontend/from_onnx.html", "../../how_to/compile_models/from_onnx.html"], - ["tutorials/frontend/from_paddle.html", "../../how_to/compile_models/from_paddle.html"], - ["tutorials/frontend/from_pytorch.html", "../../how_to/compile_models/from_pytorch.html"], - ["tutorials/frontend/from_tensorflow.html", "../../how_to/compile_models/from_tensorflow.html"], - ["tutorials/frontend/from_tflite.html", "../../how_to/compile_models/from_tflite.html"], - [ - "tutorials/frontend/deploy_model_on_android.html", - "../../how_to/deploy_models/deploy_model_on_android.html", - ], - [ - "tutorials/frontend/deploy_model_on_rasp.html", - "../../how_to/deploy_models/deploy_model_on_rasp.html", - ], - [ - "tutorials/frontend/deploy_object_detection_pytorch.html", - "../../how_to/deploy_models/deploy_object_detection_pytorch.html", - ], - [ - "tutorials/frontend/deploy_prequantized.html", - "../../how_to/deploy_models/deploy_prequantized.html", - ], - [ - "tutorials/frontend/deploy_prequantized_tflite.html", - "../../how_to/deploy_models/deploy_prequantized_tflite.html", - ], - [ - "tutorials/frontend/deploy_quantized.html", - "../../how_to/deploy_models/deploy_quantized.html", - ], - 
["tutorials/frontend/deploy_sparse.html", "../../how_to/deploy_models/deploy_sparse.html"], - [ - "tutorials/dev/bring_your_own_datatypes.html", - "../../how_to/extend_tvm/bring_your_own_datatypes.html", - ], - [ - "tutorials/dev/low_level_custom_pass.html", - "../../how_to/extend_tvm/low_level_custom_pass.html", - ], - ["tutorials/dev/use_pass_infra.html", "../../how_to/extend_tvm/use_pass_infra.html"], - ["tutorials/dev/use_pass_instrument.html", "../../how_to/extend_tvm/use_pass_instrument.html"], - ["tutorials/optimize/opt_conv_cuda.html", "../../how_to/optimize_operators/opt_conv_cuda.html"], - [ - "tutorials/optimize/opt_conv_tensorcore.html", - "../../how_to/optimize_operators/opt_conv_tensorcore.html", - ], - ["tutorials/optimize/opt_gemm.html", "../../how_to/optimize_operators/opt_gemm.html"], - [ - "tutorials/auto_scheduler/tune_conv2d_layer_cuda.html", - "../../how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html", - ], - [ - "tutorials/auto_scheduler/tune_network_arm.html", - "../../how_to/tune_with_autoscheduler/tune_network_arm.html", - ], - [ - "tutorials/auto_scheduler/tune_network_cuda.html", - "../../how_to/tune_with_autoscheduler/tune_network_cuda.html", - ], - [ - "tutorials/auto_scheduler/tune_network_mali.html", - "../../how_to/tune_with_autoscheduler/tune_network_mali.html", - ], - [ - "tutorials/auto_scheduler/tune_network_x86.html", - "../../how_to/tune_with_autoscheduler/tune_network_x86.html", - ], - [ - "tutorials/auto_scheduler/tune_sparse_x86.html", - "../../how_to/tune_with_autoscheduler/tune_sparse_x86.html", - ], - [ - "tutorials/autotvm/tune_conv2d_cuda.html", - "../../how_to/tune_with_autotvm/tune_conv2d_cuda.html", - ], - ["tutorials/autotvm/tune_relay_arm.html", "../../how_to/tune_with_autotvm/tune_relay_arm.html"], - [ - "tutorials/autotvm/tune_relay_cuda.html", - "../../how_to/tune_with_autotvm/tune_relay_cuda.html", - ], - [ - "tutorials/autotvm/tune_relay_mobile_gpu.html", - "../../how_to/tune_with_autotvm/tune_relay_mobile_gpu.html", - ], - ["tutorials/autotvm/tune_relay_x86.html", "../../how_to/tune_with_autotvm/tune_relay_x86.html"], - ["tutorials/micro/micro_autotune.html", "../../how_to/work_with_microtvm/micro_autotune.html"], - [ - "tutorials/micro/micro_reference_vm.html", - "../../how_to/work_with_microtvm/micro_reference_vm.html", - ], - ["tutorials/micro/micro_tflite.html", "../../how_to/work_with_microtvm/micro_tflite.html"], - ["tutorials/frontend/build_gcn.html", "../../how_to/work_with_relay/build_gcn.html"], - [ - "tutorials/frontend/using_external_lib.html", - "../../how_to/work_with_relay/using_external_lib.html", - ], - ["tutorials/language/extern_op.html", "../../how_to/work_with_schedules/extern_op.html"], - ["tutorials/language/reduction.html", "../../how_to/work_with_schedules/reduction.html"], - ["tutorials/language/scan.html", "../../how_to/work_with_schedules/scan.html"], - [ - "tutorials/language/schedule_primitives.html", - "../../how_to/work_with_schedules/schedule_primitives.html", - ], - ["tutorials/language/tedd.html", "../../how_to/work_with_schedules/tedd.html"], - ["tutorials/language/tensorize.html", "../../how_to/work_with_schedules/tensorize.html"], - ["tutorials/language/tuple_inputs.html", "../../how_to/work_with_schedules/tuple_inputs.html"], - [ - "tutorials/get_started/auto_scheduler_matmul_x86.html", - "../../tutorial/auto_scheduler_matmul_x86.html", - ], - ["tutorials/get_started/autotvm_matmul_x86.html", "../../tutorial/autotvm_matmul_x86.html"], - ["tutorials/get_started/autotvm_relay_x86.html", 
"../../tutorial/autotvm_relay_x86.html"], - [ - "tutorials/get_started/cross_compilation_and_rpc.html", - "../../tutorial/cross_compilation_and_rpc.html", - ], - ["tutorials/get_started/install.html", "../../tutorial/install.html"], - ["tutorials/topi/intro_topi.html", "../../tutorial/intro_topi.html"], - ["tutorials/get_started/introduction.html", "../../tutorial/introduction.html"], - ["tutorials/get_started/relay_quick_start.html", "../../tutorial/relay_quick_start.html"], - [ - "tutorials/get_started/tensor_expr_get_started.html", - "../../tutorial/tensor_expr_get_started.html", - ], - [ - "tutorials/get_started/tvmc_command_line_driver.html", - "../../tutorial/tvmc_command_line_driver.html", - ], - [ - "tutorials/get_started/tvmc_python.html", - "../../tutorial/tvmc_python.html", - ], -] - -redirect_template = """ - - - - - - - -""" - - -def build_legacy_redirect(tvm_path): - def legacy_redirect(app, docname): # Sphinx expects two arguments - if app.builder.name == "html": - - src = Template(redirect_template) - - for frm, to in legacy_redirects: - frm = tvm_path.resolve() / "docs" / "_build" / "html" / frm - redirect = src.substitute({"to": to}) - os.makedirs(os.path.dirname(frm), exist_ok=True) - with open(frm, "w") as f: - f.write(redirect) - - return legacy_redirect diff --git a/docs/reference/langref/relay_pattern.rst b/docs/reference/langref/relay_pattern.rst index 16211b2cb125..a80c55323b98 100644 --- a/docs/reference/langref/relay_pattern.rst +++ b/docs/reference/langref/relay_pattern.rst @@ -20,7 +20,7 @@ Pattern Matching in Relay ========================= -There are many places in TVM where we identify pure data-flow sub-graphs of the Relay program and attempt to transform them in some way example passes include fusion, quantization, external code generation, and device specific optimizations such as bitpacking, and layer slicing used by VTA. +There are many places in TVM where we identify pure data-flow sub-graphs of the Relay program and attempt to transform them in some way example passes include fusion, quantization, external code generation, and device specific optimizations. Many of these passes today require a lots of boring boilerplate code in order to implement as well as requiring users to think in terms of visitors and AST matching. Many of these transformations can easily be described in terms of graph rewrites. In order to build a rewriter or other advanced machinery we first need a language of patterns to describe what we can match. diff --git a/docs/topic/vta/.gitignore b/docs/topic/vta/.gitignore deleted file mode 100644 index 7445cd0171c7..000000000000 --- a/docs/topic/vta/.gitignore +++ /dev/null @@ -1 +0,0 @@ -tutorials diff --git a/docs/topic/vta/dev/config.rst b/docs/topic/vta/dev/config.rst deleted file mode 100644 index b3ec49e769af..000000000000 --- a/docs/topic/vta/dev/config.rst +++ /dev/null @@ -1,73 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. 
Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -VTA Configuration -================= - -The VTA stack incorporates both a hardware accelerator stack and -a TVM based software stack. -VTA incorporates flexibility out of the box: by modifying the -``3rdparty/vta-hw/config/vta_config.json`` high-level configuration file, -the user can change the shape of the tensor intrinsic, -clock frequency, pipelining, data type width, and on-chip buffer sizes. - -Parameters Overview -------------------- - -We explain the parameters listed in the ``vta_config.json`` file in the table -below. - -+-----------------------+------------+--------------------------------------------------------+ -| Attribute | Format | Description | -+=======================+============+========================================================+ -| ``TARGET`` | String | The TVM device target. | -+-----------------------+------------+--------------------------------------------------------+ -| ``HW_VER`` | String | VTA hardware version number. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_INP_WIDTH`` | Int (log2) | Input data type signed integer width. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_WGT_WIDTH`` | Int (log2) | Weight data type signed integer width. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_ACC_WIDTH`` | Int (log2) | Accumulator data type signed integer width. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_BATCH`` | Int (log2) | VTA matrix multiply intrinsic input/output dimension 0.| -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_BLOCK`` | Int (log2) | VTA matrix multiply inner dimensions. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_UOP_BUFF_SIZE`` | Int (log2) | Micro-op on-chip buffer in Bytes. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_INP_BUFF_SIZE`` | Int (log2) | Input on-chip buffer in Bytes. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_WGT_BUFF_SIZE`` | Int (log2) | Weight on-chip buffer in Bytes. | -+-----------------------+------------+--------------------------------------------------------+ -| ``LOG_ACC_BUFF_SIZE`` | Int (log2) | Accumulator on-chip buffer in Bytes. | -+-----------------------+------------+--------------------------------------------------------+ - - - .. note:: - - When a parameter name is preceded with ``LOG``, it means that it describes a value that can only be expressed a power of two. - For that reason we describe these parameters by their log2 value. - For instance, to describe an integer width of 8-bits for the input data types, we set the ``LOG_INP_WIDTH`` to be 3, which is the log2 of 8. - Similarly, to descibe a 64kB micro-op buffer, we would set ``LOG_UOP_BUFF_SIZE`` to be 16. 
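To make the log2 convention concrete, the short sketch below derives the corresponding concrete values from the configuration file. It is illustrative only and not part of the VTA tooling; the field names follow the table above.

.. code:: python

   import json

   # Sketch: turn the log2-encoded fields of vta_config.json into concrete values.
   with open("3rdparty/vta-hw/config/vta_config.json") as f:
       cfg = json.load(f)

   inp_width_bits = 1 << cfg["LOG_INP_WIDTH"]      # e.g. LOG_INP_WIDTH = 3 -> 8-bit inputs
   uop_buff_bytes = 1 << cfg["LOG_UOP_BUFF_SIZE"]  # e.g. LOG_UOP_BUFF_SIZE = 16 -> 64 kB buffer
   gemm_shape = (
       1 << cfg["LOG_BATCH"],   # dimension 0 of the matrix multiply intrinsic
       1 << cfg["LOG_BLOCK"],   # inner dimensions
       1 << cfg["LOG_BLOCK"],
   )
   print(inp_width_bits, uop_buff_bytes, gemm_shape)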
- -We provide additional detail below regarding each parameter: - - - ``TARGET``: Can be set to ``"pynq"``, ``"ultra96"``, ``"sim"`` (fast simulator), or ``"tsim"`` (cycle accurate sim with verilator). - - ``HW_VER``: Hardware version which increments every time the VTA hardware design changes. This parameter is used to uniquely identity hardware bitstreams. - - ``LOG_BATCH``: Equivalent to A in multiplication of shape (A, B) x (B, C), or typically, the batch dimension of inner tensor computation. - - ``LOG_BLOCK``: Equivalent to B and C in multiplication of shape (A, B) x (B, C), or typically, the input/output channel dimensions of the inner tensor computation. diff --git a/docs/topic/vta/dev/hardware.rst b/docs/topic/vta/dev/hardware.rst deleted file mode 100644 index 8251278994da..000000000000 --- a/docs/topic/vta/dev/hardware.rst +++ /dev/null @@ -1,298 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -VTA Hardware Guide -================== - -We present a top-down overview of the VTA hardware design. -This hardware design guide covers VTA hardware at two levels: - - - An architectural overview of the VTA design and its ISA hardware-software - interface. - - A micro-architectural overview of the VTA hardware modules, and the - micro-code specification for the compute core. - -VTA Overview ------------- - -VTA is a generic deep learning accelerator built for fast and efficient dense linear algebra. -VTA incorporates a simple RISC-like processor that can perform dense linear algebra operations on rank 1 or 2 tensor registers. -In addition the design adopts decoupled access-execute to hide memory access latency. - - -To a broader extent, VTA can serve as a template deep learning accelerator design for full stack optimization, exposing a generic tensor computation interface to the compiler stack. - -.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/blogpost/vta_overview.png - :align: center - :width: 80% - -The figure above gives a high-level overview of the VTA hardware organization. -VTA is composed of four modules that communicate among each other via FIFO queues and local memory blocks (SRAM), to enable task-level pipeline parallelism: - -- The fetch module takes care of loading an instruction stream from DRAM. It also decodes those instructions to route them into one of three command queues. -- The load module takes care of loading input and weight tensors from DRAM into data-specialized on-chip memories. -- The compute module performs both dense linear algebra computation with its GEMM core, and general computation with its tensor ALU. It also takes care of loading data from DRAM into the register file, and loading micro-op kernels into the micro-op cache. 
-- The store module stores results produced by the compute core back to DRAM. - -HLS Hardware Source Organization --------------------------------- - -The VTA design is currently specified in Vivado HLS C++, which is only supported -by Xilinx toolchains. -The VTA hardware sources are contained under ``3rdparty/vta-hw/hardware/xilinx/sources``: - - - ``vta.cc`` contains the definitions for each VTA module, as well as a top - level behavioral model for the top-level VTA design. - - ``vta.h`` contains type definitions using Xilinx ``ap_int`` types, and - function prototypes declarations. - -In addition preprocessor macros are defined under ``3rdparty/vta-hw/include/vta/hw_spec.h``. -Much of these macro definitions are derived from the parameters listed in the -``3rdparty/vta-hw/config/vta_config.json`` file. -The json file is processed by ``3rdparty/vta-hw/config/vta_config.py`` to produce a string of -compile flags that define the preprocessor macros. -That string is used by the makefile in order to set those high-level -parameters in both the HLS hardware synthesis compiler, and the C++ -compiler that builds the VTA runtime. - -HLS Module Example -~~~~~~~~~~~~~~~~~~ - -We show a definition of one of the VTA modules defined in C++: - -.. code-block:: c - - void fetch( - uint32_t insn_count, - volatile insn_T *insns, - hls::stream &load_queue, - hls::stream &gemm_queue, - hls::stream &store_queue) { - #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS - #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port - #pragma HLS INTERFACE axis port = load_queue - #pragma HLS INTERFACE axis port = gemm_queue - #pragma HLS INTERFACE axis port = store_queue - #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS - - INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) { - #pragma HLS PIPELINE II = 1 - // Read instruction fields - insn_T insn = insns[pc]; - // Do some partial decoding - opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0); - memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0); - // Push to appropriate instruction queue - if (opcode == VTA_OPCODE_STORE) { - store_queue.write(insn); - } else if (opcode == VTA_OPCODE_LOAD && - (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) { - load_queue.write(insn); - } else { - gemm_queue.write(insn); - } - } - } - -A few observations on HLS coding: - - *Parameters:* The parameter list of each function, combined with the - interface pragmas define the hardware interface exposed by the - generated hardware module. - - - Parameters passed by value indicate a read-only hardware memory-mapped - register that the host can write to. - This fetch function for instance has an ``insn_count`` parameter - which will be synthesized as a memory mapped register for the host - to write to, in order to set the length of a given VTA instruction - sequence. - - Pointer parameters can mean one of two things depending on the interface - pragma being used. - - - When used with a ``m_axi`` interface pragma, an AXI requestor interface - gets generated to provide DMA access to DRAM. - - When used with a ``bram`` interface pragma, a BRAM interface gets - generated to expose read and/or write ports to an FPGA block-RAM. - - HLS streams being passed by reference combined with the ``axis`` interface - pragma produce FIFO interfaces to the module. Hardware FIFOs provide a - useful synchronization mechanism between modules. 
- - *Pragmas*: Compiler pragmas are essential to define hardware implementation - of each module. We list several pragmas used in the VTA design to communicate - implementation requirements to the compiler. - - - ``HLS INTERFACE``: specifies the interface of the synthesized - hardware module. - - ``HLS PIPELINE``: defines hardware pipeline performance target by setting - an initiation interval goal. When the ``II == 1`` target is set, it tells - the compiler that the synthesized hardware pipeline should be able to - execute one loop iteration per cycle. - - ``HLS DEPENDENCE``: instructs the compiler to ignore certain types - of dependence checks in a given loop. Consider a loop body that writes - and reads to the same BRAM structure, and needs to achieve an II of 1. - The HLS compiler has to assume worst-case scenario, whereby a read is - issued to an address that a past write updated the cycle prior: this - cannot be achieved given BRAM timing characteristics (it takes at least - 2 cycles to see the updated value). Therefore in order to achieve an II of 1, - the dependence checks have to be relaxed. - Note that when turning this optimization on, it falls onto - the software stack to prevent writes followed by reads to the same address. - - .. note:: - This `reference guide `_ - provides a much more in-depth, and complete specification of HLS for the Xilinx 2018.2 toolchains. - -Architectural Overview ----------------------- - -Instruction Set Architecture -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -VTA's instruction set architecture (ISA) is composed of 4 CISC instructions that have a variable execution latency, two of which execute a micro-coded instruction sequence to perform computation. - -The VTA instructions are listed below: - -- ``LOAD`` instruction: loads a 2D tensor from DRAM into the input buffer, weight buffer, or register file. It can also load a micro-kernel into the micro-op cache. Supports dynamic padding when loading input and weight tiles. -- ``GEMM`` instruction: performs a micro-op sequence of matrix-matrix multiplications over an input tensor and a weight tensors, and adds the result to a register-file tensor. -- ``ALU`` instruction: performs a micro-op sequence of matrix-matrix ALU operations over register-file tensor data. -- ``STORE`` instruction: stores a 2D tensor from the output buffer to DRAM. - -The ``LOAD`` instructions are executed by the load and compute modules depending on the store memory buffer location target. -The ``GEMM`` and ``ALU`` instructions are executed by the compute module's GEMM core and tensor ALU. -Finally, the ``STORE`` instructions are executed by the store module exclusively. -The fields of each instruction is described in the figure below. -The meaning of each field will be further explained in the :ref:`vta-uarch` section. - -.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/vta_instructions.png - :align: center - :width: 100% - -.. note:: - Note that the VTA ISA changes as VTA's architectural parameters are modified (i.e. GEMM core shape, data type, memory size etc.), and as a result the ISA does not guarantee compatibility across all variants of VTA. - This is acceptable however, since the VTA runtime adapts to parameter changes, and produces binary code tailored for the version of the accelerator that gets generated. - This exemplifies the co-design philosophy adopted by the VTA stack which embraces fluidity of the hardware-software interface. 
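To summarize which module executes which instruction, the sketch below gives a minimal, illustrative Python model of the dispatch rule described above. It is not part of the VTA codebase: the opcode and memory-id strings loosely mirror the constants used in the HLS fetch function shown earlier, and the queues are plain Python lists rather than hardware FIFOs.

.. code:: python

   # Illustrative model of VTA instruction dispatch (not part of the VTA codebase).
   from collections import namedtuple

   Insn = namedtuple("Insn", ["opcode", "memory_type"])

   load_queue, gemm_queue, store_queue = [], [], []

   def route(insn):
       if insn.opcode == "STORE":
           store_queue.append(insn)   # executed by the store module
       elif insn.opcode == "LOAD" and insn.memory_type in ("INP", "WGT"):
           load_queue.append(insn)    # input/weight loads: executed by the load module
       else:
           gemm_queue.append(insn)    # GEMM, ALU, and micro-op/register-file loads: compute module

   route(Insn("LOAD", "INP"))   # ends up in load_queue
   route(Insn("GEMM", None))    # ends up in gemm_queue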
- -Dataflow Execution -~~~~~~~~~~~~~~~~~~ - -VTA relies on dependence FIFO queues between hardware modules to synchronize the execution of concurrent tasks. -The figure below shows how a given hardware module can execute concurrently from its producer and consumer modules in a dataflow fashion through the use of dependence FIFO queues, and single-reader/single-writer SRAM buffers. -Each module is connected to its consumer and producer via read-after-write (RAW) and write-after-read (WAR) dependence queues. - -.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/dataflow.png - :align: center - :width: 100% - -The pseudo-code in the figure above describes how a module executes a given instruction predicated on dependences with other instructions. -First, the dependence flags within each instruction are decoded in hardware. -If the instruction has an incoming RAW dependence, execution is predicated upon receiving a RAW dependence token from the producer module. -Similarly, if the task has an incoming WAR dependence, execution is predicated upon receiving a WAR dependence token from the consumer module. -Finally, when the task is done, we check for outgoing RAW and WAR dependences, and notify the consumer and producer modules respectively. - -.. note:: - Note that the dependence tokens in this scenario are information-less. - This is because the instructions executed by each module cannot be reordered by design, as they arrive in FIFO order. - -Pipeline Expandability -~~~~~~~~~~~~~~~~~~~~~~ - -The default VTA design is composed of four modules that describe a 3-stage ``load-compute-store`` task pipeline. -Following the dataflow hardware organization principle, we can extend the VTA pipeline to include more stages. -For example, we can envision separating the tensor ALU from the GEMM core in order to maximize the utilization of the GEMM core. -This would result in a ``load-gemm-activate-store`` task pipeline which closely reflects the TPU design. -Adding more stages has a cost, however: it can add storage and extra logic overhead, which is why we opted for a default 3-stage pipeline. - -.. _vta-uarch: - -Microarchitectural Overview ---------------------------- - -We describe the modules that compose the VTA design. -The module definitions are contained in ``3rdparty/vta-hw/hardware/xilinx/sources/vta.cc``. - -Fetch Module -~~~~~~~~~~~~ - -VTA is programmed by a linear instruction stream. -The fetch module is the entry point of VTA to the CPU and is programmed via three memory-mapped registers: - -- The read-write ``control`` register starts the fetch module, and is read to check for its completion. -- The write-only ``insn_count`` register sets the number of instructions to execute. -- The write-only ``insns`` register sets the start address of the instruction stream in DRAM. - -The CPU places the instruction stream in a physically-contiguous DRAM buffer prepared by the VTA runtime. -When the instruction stream is ready, the CPU writes the start physical address into the ``insns`` register, the length of the instruction stream into the ``insn_count`` register, and asserts the start signal in the ``control`` register. -This procedure starts VTA, which reads in the instruction stream from DRAM via DMA. 
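As a sketch of that start-up sequence, consider the hypothetical host-side helper below. The register names (``insns``, ``insn_count``, ``control``) come from the list above, but the ``mmio`` object, its ``read``/``write`` methods, and the control-register bit layout are placeholders for whatever memory-mapped I/O interface and encoding the platform actually provides.

.. code:: python

   # Hypothetical sketch of the host-side start-up sequence described above.
   # `mmio` stands in for a platform-specific memory-mapped register interface;
   # the bit layout of the control register is an assumption, not the real encoding.
   START_BIT = 0x1
   DONE_BIT = 0x2

   def run_instruction_stream(mmio, insn_phys_addr, insn_count):
       mmio.write("insns", insn_phys_addr)    # start address of the instruction stream in DRAM
       mmio.write("insn_count", insn_count)   # number of instructions to execute
       mmio.write("control", START_BIT)       # assert the start signal

       # The read-write control register is polled to check for completion.
       while not (mmio.read("control") & DONE_BIT):
           pass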
- -Upon accessing the instruction stream, the fetch module partially decodes instructions, and pushes those instructions into command queues that feed into the load, compute, and store modules: - -- ``STORE`` instructions are pushed to the store command queue to be processed by the store module. -- ``GEMM`` and ``ALU`` instructions are pushed to the compute command queue to be processed by the compute module. -- ``LOAD`` instructions that describe a load operation of micro-op kernels or register file data are pushed to the compute command queue to be processed by the compute module. -- ``LOAD`` instructions that describe a load operation of input or weight data are pushed to the load command queue to be processed by the load module. - -When one of the command queues becomes full, the fetch module stalls until the queue is not full. -Consequently, the command queues are sized to be deep enough to allow for a wide execution window, and allow multiple tasks to be in flight concurrently across the ``load-compute-store`` pipeline. - - -Compute Module -~~~~~~~~~~~~~~ - -VTA's compute module acts as a RISC processor that performs computation on tensor registers rather than scalar registers. -Two functional units mutate the register file: the tensor ALU, and the GEMM core. - -The compute module executes RISC micro-ops from the micro-op cache. -There are two types of compute micro-ops: ALU and GEMM operations. -To minimize the footprint of micro-op kernels, while avoiding the need for control-flow instructions such as conditional jumps, the compute module executes micro-op sequences inside a two-level nested loop that computes the location of each tensor register location via an affine function. -This compression approach helps reduce the micro-kernel instruction footprint, and applies to both matrix multiplication and 2D convolution, commonly found in neural network operators. - -.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/gemm_core.png - :align: center - :width: 100% - -The **GEMM core** evaluates GEMM instructions, by executing a micro-code sequence in a 2-level nested loop described in the Figure above. -The GEMM core can perform one input-weight matrix multiplication per cycle. -The dimensions of the single-cycle matrix multiplication defines a hardware *tensorization intrinsic* which the TVM compiler has to lower a computation schedule onto. -This tensorization intrinsic is defined by the dimensions of the input, weight and accumulator tensors. -Each data type can have a different integer precision: typically both weight and input types are low-precision (8-bits or less), while the accumulator tensor has a wider type to prevent overflows (32-bits). -In order to keep the GEMM core busy, each of the input buffer, weight buffer, and register file have to expose sufficient read/write bandwidth. - -.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/alu_core.png - :align: center - :width: 100% - -The **Tensor ALU** supports a set of standard operations to implement common activation, normalization, and pooling operators. -VTA being a modular design, the range of operators that the Tensor ALU supports can be extended for higher operator coverage, at the expense of higher resource utilization. -The Tensor ALU can perform tensor-tensor operations, as well as tensor-scalar operations on an immediate value. -The opcode of the tensor ALU, and the immediate value are specified by the high-level CISC instruction. 
-The micro-code in the context of tensor ALU computation only takes care of specifying data access patterns. - -.. note:: - In terms of computational throughput, the Tensor ALU does not execute at a rate of one operation per cycle. - The limitation comes from the lack of read-ports: since one register file tensor can be read per cycle, the tensor ALU has an initiation interval of at least 2 (i.e. performs at most 1 operation every 2 cycles). - In addition, performing a single tensor-tensor operation at once can be expensive especially given that register file types are wide, typically 32-bit integers. - As a result, in order to balance the resource utilization footprint of the Tensor ALU with the GEMM core, a tensor-tensor operation is by default performed via vector-vector operations over multiple cycles. - - -Load and Store Modules -~~~~~~~~~~~~~~~~~~~~~~ - -.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/developer/2d_dma.png - :align: center - :width: 100% - -The load and store modules perform 2D DMA loads with a strided access pattern from DRAM to SRAM. -In addition, the load module can insert 2D padding on the fly, which is useful when blocking 2D convolution. -This means that VTA can tile 2D convolution inputs without paying the overhead of re-laying data out in DRAM to insert spatial padding around input and weight tiles. diff --git a/docs/topic/vta/dev/index.rst b/docs/topic/vta/dev/index.rst deleted file mode 100644 index 753af7a21721..000000000000 --- a/docs/topic/vta/dev/index.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -VTA Design and Developer Guide -============================== - -This developer guide details the complete VTA-TVM hardware-software stack. - -.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/blogpost/vta_stack.png - :align: center - :width: 60% - -.. toctree:: - :maxdepth: 2 - - config - hardware diff --git a/docs/topic/vta/index.rst b/docs/topic/vta/index.rst deleted file mode 100644 index d09b31bb360b..000000000000 --- a/docs/topic/vta/index.rst +++ /dev/null @@ -1,55 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. 
See the License for the - specific language governing permissions and limitations - under the License. - -.. _vta-index: - -VTA: Versatile Tensor Accelerator -================================= - -The Versatile Tensor Accelerator (VTA) is an open, generic, and customizable deep learning accelerator with a complete TVM-based compiler stack. We designed VTA to expose the most salient and common characteristics of mainstream deep learning accelerators. Together TVM and VTA form an end-to-end hardware-software deep learning system stack that includes hardware design, drivers, a JIT runtime, and an optimizing compiler stack based on TVM. - -.. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/blogpost/vta_overview.png - :align: center - :width: 60% - -VTA has the following key features: - -- Generic, modular, open-source hardware. -- Streamlined workflow to deploy to FPGAs. -- Simulator support to prototype compilation passes on regular workstations. -- Pynq-based driver and JIT runtime for both simulated and FPGA hardware back-end. -- End to end TVM stack integration. - -This page contains links to all the resources related to VTA: - - -.. toctree:: - :maxdepth: 1 - - install - dev/index - tutorials/index - - -Literature ----------- - -- Read the VTA `release blog post`_. -- Read the VTA tech report: `An Open Hardware Software Stack for Deep Learning`_. - -.. _release blog post: https://tvm.apache.org/2018/07/12/vta-release-announcement -.. _An Open Hardware Software Stack for Deep Learning: https://arxiv.org/abs/1807.04188 diff --git a/docs/topic/vta/install.rst b/docs/topic/vta/install.rst deleted file mode 100644 index ba76df410c1f..000000000000 --- a/docs/topic/vta/install.rst +++ /dev/null @@ -1,488 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -VTA Installation Guide -====================== - -We present three installation guides, each extending on the previous one: - -1. `VTA Simulator Installation`_ -2. `Xilinx Pynq FPGA Setup`_ -3. `Intel DE10 FPGA Setup`_ -4. `Bitstream Generation with Xilinx Toolchains`_ -5. `Bitstream Generation with Intel Toolchains`_ - - -VTA Simulator Installation --------------------------- - -You need :ref:`TVM installed ` on your machine. For a quick and -easy start, checkout the :ref:`Docker Guide `. - -You'll need to set the following paths to use VTA: - -.. code:: bash - - export TVM_PATH= - export VTA_HW_PATH=$TVM_PATH/3rdparty/vta-hw - -The VTA functional simulation library needs to be enabled when building TVM. - -.. code:: bash - - cd - mkdir build - cp cmake/config.cmake build/. - echo 'set(USE_VTA_FSIM ON)' >> build/config.cmake - cd build && cmake .. && make -j4 - -Add the VTA python library to your python path to run the VTA examples. - -.. 
code:: bash - - export PYTHONPATH=/path/to/vta/python:${PYTHONPATH} - -Testing your VTA Simulation Setup -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -To ensure that you've properly installed the VTA python package, run the following 2D convolution testbench. - -.. code:: bash - - python /vta/tests/python/integration/test_benchmark_topi_conv2d.py - -You are invited to try out our :ref:`VTA programming tutorials `. - - **Note**: You'll notice that for every convolution layer, the throughput gets reported in GOPS. These numbers are actually the computational throughput that the simulator achieves, by evaluating the convolutions in software. - -Advanced Configuration (optional) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -VTA is a generic configurable deep learning accelerator. -The configuration is specified by ``vta_config.json`` under ``3rdparty/vta-hw/config``. -This file provides an architectural specification of the VTA accelerator to parameterize the TVM compiler stack and the VTA hardware stack. - -The VTA configuration file also specifies the TVM compiler target. -When ``TARGET`` is set to ``sim``, all TVM workloads execute on the VTA simulator. -You can modify the content of the configuration file to rebuild VTA to a different parameterization. -To do so, - -.. code:: bash - - cd - vim 3rdparty/vta-hw/config/vta_config.json - # edit vta_config.json - make - - - -Xilinx Pynq FPGA Setup ----------------------- - -This second guide extends the *VTA Simulator Installation* guide above to run FPGA hardware tests of the complete TVM and VTA software-hardware stack. -In terms of hardware components you'll need: - -* The `Pynq `_ FPGA development board which can be acquired for $200, or $150 for academics from `Digilent `_. -* An Ethernet-to-USB adapter to connect the Pynq board to your development machine. -* An 8+GB micro SD card. -* An AC to DC 12V 3A power adapter. - -This guide covers the following themes: - -1. Pynq board setup instructions. -2. Pynq-side RPC server build and deployment. -3. Revisiting the test examples from the *VTA Simulator Installation* guide, this time executing on the Pynq board. - -Pynq Board Setup -^^^^^^^^^^^^^^^^ - -Setup your Pynq board based on the `Pynq board getting started tutorial `_. - -You should follow the instructions up to and including the *Turning On the PYNQ-Z1* step (no need to pursue the tutorial beyond this point). - -* Make sure that you've downloaded the latest Pynq image, `PYNQ-Z1 v2.5 `_, and have imaged your SD card with it (we recommend the free `Etcher `_ program). -* For this test setup, follow the `"Connect to a Computer" `_ Ethernet setup instructions. To be able to talk to the board, make sure to `assign your computer a static IP address `_ - -Once the board is powered on and connected to your development machine, try connecting to it to make sure you've properly set up your Pynq board: - -.. code:: bash - - # To connect to the Pynq board use the combo: - ssh xilinx@192.168.2.99 - -Pynq-Side RPC Server Build & Deployment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Because the direct board-to-computer connection prevents the board from directly accessing the internet, we'll need to mount the Pynq's file system to your development machine's file system with `sshfs `_. Next we directly clone the TVM repository into the sshfs mountpoint on your development machine. - -.. 
code:: bash - - # On the Host-side - mkdir - sshfs xilinx@192.168.2.99:/home/xilinx - cd - git clone --recursive https://github.com/apache/tvm tvm - # When finished, you can leave the moutpoint and unmount the directory - cd ~ - sudo umount - -Now that we've cloned the VTA repository in the Pynq's file system, we can ssh into it and launch the build of the TVM-based RPC server. -The build process should take roughly 5 minutes. - -.. code:: bash - - ssh xilinx@192.168.2.99 - # Build TVM runtime library (takes 5 mins) - cd /home/xilinx/tvm - mkdir build - cp cmake/config.cmake build/. - echo 'set(USE_VTA_FPGA ON)' >> build/config.cmake - # Copy pynq specific configuration - cp 3rdparty/vta-hw/config/pynq_sample.json 3rdparty/vta-hw/config/vta_config.json - cd build - cmake .. - make runtime vta -j2 - # FIXME (tmoreau89): remove this step by fixing the cmake build - make clean; make runtime vta -j2 - # Build VTA RPC server (takes 1 min) - cd .. - sudo ./apps/vta_rpc/start_rpc_server.sh # pw is 'xilinx' - - -You should see the following being displayed when starting the RPC server. In order to run the next examples, you'll need to leave the RPC server running in an ``ssh`` session. - -.. code:: bash - - INFO:root:RPCServer: bind to 0.0.0.0:9091 - - -Tips regarding the Pynq RPC Server: - -* The RPC server should be listening on port ``9091``. If not, an earlier process might have terminated unexpectedly and it's recommended in this case to just reboot the Pynq, and re-run the RPC server. -* To kill the RPC server, just send the ``Ctrl + c`` command. You can re-run it with ``sudo ./apps/pynq_rpc/start_rpc_server.sh``. -* If unresponsive, the board can be rebooted by power-cycling it with the physical power switch. - -Testing your Pynq-based Hardware Setup -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Before running the examples on your development machine, you'll need to configure your host environment as follows: - -.. code:: bash - - # On the Host-side - export VTA_RPC_HOST=192.168.2.99 - export VTA_RPC_PORT=9091 - - -In addition, you'll need to edit the ``vta_config.json`` file on the host to indicate that we are targeting the Pynq platform, by setting the ``TARGET`` field to ``"pynq"``. -> Note: in contrast to our simulation setup, there are no libraries to compile on the host side since the host offloads all of the computation to the Pynq board. - -.. code:: bash - - # On the Host-side - cd - cp 3rdparty/vta-hw/config/pynq_sample.json 3rdparty/vta-hw/config/vta_config.json - - -This time again, we will run the 2D convolution testbench. -Beforehand, we need to program the Pynq board FPGA with a VTA bitstream, and build the VTA runtime via RPC. -The following ``test_program_rpc.py`` script will perform two operations: - -* FPGA programming, by downloading a pre-compiled bitstream from a `VTA bitstream repository `_ that matches the default ``vta_config.json`` configuration set by the host, and sending it over to the Pynq via RPC to program the Pynq's FPGA. -* Runtime building on the Pynq, which needs to be run every time the ``vta_config.json`` configuration is modified. This ensures that the VTA software runtime that generates the accelerator's executable via just-in-time (JIT) compilation matches the specifications of the VTA design that is programmed on the FPGA. The build process takes about 30 seconds to complete so be patient! - -.. code:: bash - - # On the Host-side - python /vta/tests/python/pynq/test_program_rpc.py - - -We are now ready to run the 2D convolution testbench in hardware. 
- -.. code:: bash - - # On the Host-side - python /vta/tests/python/integration/test_benchmark_topi_conv2d.py - -The performance metrics measured on the Pynq board will be reported for each convolutional layer. - -**Tip**: You can track progress of the FPGA programming and the runtime rebuilding steps by looking at the RPC server's logging messages in your Pynq ``ssh`` session. - -You can also try out our :ref:`VTA programming tutorials `. - -Intel DE10 FPGA Setup --------------------- - -Similar to the Pynq-side setup steps, this third guide walks through setting up the Linux environment for Intel FPGA boards such as the DE10-Nano. - -In terms of hardware components, you would need the `DE10-Nano Development Kit `_, which can be acquired for $130, or $100 for academics from `Terasic `_. A microSD card is included with the kit. Power and USB cables are included as well. However, an additional Ethernet cable is needed to connect the board to your LAN. - -The rest of this guide covers the following steps: - -* Flash the microSD card with the latest Angstrom Linux image -* Cross-compilation setup -* Device-side RPC server setup and deployment - -DE10-Nano Board Setup -^^^^^^^^^^^^^^^^^^^^^ - -Before powering up the device, we need to flash the microSD card with the latest Angstrom Linux image. - -Flash SD Card and Boot Angstrom Linux -""""""""""""""""""""""""""""""""""""" - -To flash the SD card and boot Linux on the DE10-Nano, it is recommended to navigate to the `Resource `_ tab of the DE10-Nano product page from Terasic Inc. -After registering and logging in on the webpage, the prebuilt Angstrom Linux image is available for download and flashing. -Specifically, to flash the downloaded Linux SD card image into your physical SD card: - -First, extract the gzipped archive file. - -.. code:: bash - - tar xf de10-nano-image-Angstrom-v2016.12.socfpga-sdimg.2017.03.31.tgz - -This produces a single SD card image named ``de10-nano-image-Angstrom-v2016.12.socfpga-sdimg`` (approx. 2.4 GB), which contains all the file systems needed to boot Angstrom Linux. - -Second, plug an SD card that is ready to flash into your PC, and identify the device id for the disk with ``fdisk -l``, or ``gparted`` if you prefer a GUI. The typical device id for your disk is likely to be ``/dev/sdb``. - -Then, flash the disk image into your physical SD card with the following command: - -.. code:: bash - - # NOTE: root privilege is typically required to run the following command. - dd if=de10-nano-image-Angstrom-v2016.12.socfpga-sdimg of=/dev/sdb status=progress - -It will take a few minutes for your PC to write the file systems onto the SD card. -After this process completes, you are ready to unmount the SD card and insert it into your DE10-Nano board. -Now you can connect the power cable and serial port to boot Angstrom Linux. - - **Note**: When booting up from the microSD card, you might find that the Linux kernel ``zImage`` shipped on the card is incompatible. - In this case, you might need to build your own ``zImage`` file from the `socfpga-4.9.78-ltsi `_ branch of the `linux-socfpga `_ repository. - For a quick fix, you can also download a prebuilt version of the ``zImage`` file `from this link `_. - -After connecting the USB cables to the DE10-Nano board, power on the board by connecting the power cable. You may then connect to the serial port of the device by using ``minicom`` on your host PC: - -.. 
code:: bash - - # NOTE: root privilege is typically required to run the following command. - minicom -D /dev/ttyUSB0 - -The default user name for the device is ``root``, and the password is empty. - -You may now install the supporting Python 3 packages (TVM has dropped support for Python 2), specifically ``numpy``, ``attrs`` and ``decorator``. - - **Note**: You might fail to install ``numpy`` by using ``pip3`` on the DE10-Nano device. - In that case, you have the option to either build your own filesystem image for the board from the `meta-de10-nano `_ repository; - an alternative option is to download prebuilt packages from existing Linux distributions, e.g. Debian. - For a quick fix, we have concatenated the supplementary binary files `here `_, and you can extract the files into the root filesystem. - -Install Required Python Packages -"""""""""""""""""""""""""""""""" - -After accessing the bash terminal through the serial port, we need to install the required Python packages before building and installing TVM and VTA programs. - -Build Additional Components to Use VTA Bitstream -"""""""""""""""""""""""""""""""""""""""""""""""" - -To use the bitstream built above on DE10-Nano hardware, several additional components need to be compiled for the system. -Specifically, to compile application executables for the system, you need to download and install `SoCEDS `_ (recommended), or alternatively install the ``g++-arm-linux-gnueabihf`` package on your host machine. You would also need a ``cma`` kernel module to allocate contiguous memory, and a driver for communicating with the VTA subsystem. - - -Bitstream Generation with Xilinx Toolchains ------------------------------------------- - -If you're interested in generating the Xilinx FPGA bitstream on your own instead of using the pre-built VTA bitstreams, follow the instructions below. - -Xilinx Toolchain Installation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We recommend using Vivado 2020.1, since our scripts have been tested to work on this version of the Xilinx toolchains. -Our guide is written for Linux (Ubuntu) installation. - -You’ll need to install Xilinx’ FPGA compilation toolchain, `Vivado HL WebPACK 2020.1 `_, which is a license-free version of the Vivado HLx toolchain. - -Obtaining and Launching the Vivado GUI Installer -"""""""""""""""""""""""""""""""""""""""""""""""" - -1. Go to the `download webpage `_, and download the Linux Self Extracting Web Installer for Vivado HLx 2020.1: WebPACK and Editions. -2. You’ll have to sign in with a Xilinx account. Creating a Xilinx account takes about 2 minutes. -3. Complete the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called ``Xilinx_Unified_2020.1_0602_1208_Lin64.bin``. -4. Now that the file is downloaded, go to your ``Downloads`` directory, and change the file permissions so it can be executed: - -.. code:: bash - - chmod u+x Xilinx_Unified_2020.1_0602_1208_Lin64.bin - -5. Now you can execute the binary: - -.. code:: bash - - ./Xilinx_Unified_2020.1_0602_1208_Lin64.bin - -Xilinx Vivado GUI Installer Steps -""""""""""""""""""""""""""""""""" - -At this point you've launched the Vivado 2020.1 Installer GUI program. - -1. Click “Next” on the "Welcome" screen. -2. On the "Select Install Type" screen, enter your Xilinx user credentials under the “User Authentication” box and select the “Download and Install Now” option before clicking “Next”. -3. 
On the "Accept License Agreements" screen, accept all terms before clicking “Next”. -4. On the "Select Edition to Install" screen, select the “Vivado HL WebPACK” before clicking “Next”. -5. Under the "Vivado HL WebPACK" screen, before hitting “Next", check the following options (the rest should be unchecked): - * Design Tools -> Vivado Design Suite -> Vivado - * Devices -> Production Devices -> SoCs -> Zynq-7000 (if you are targeting the Pynq board) - * Devices -> Production Devices -> SoCs -> UltraScale+ MPSoC (if you are targeting the Ultra-96 board) -6. Your total download size should be about 5GB and the amount of Disk Space Required 23GB. -7. On the "Select Destination Directory" screen, set the installation directory before clicking “Next”. It might highlight some paths as red - that’s because the installer doesn’t have the permission to write to the directory. In that case select a path that doesn’t require special write permissions (e.g. your home directory). -8. On the "Installation Summary" screen, hit “Install”. -9. An "Installation Progress" window will pop-up to track progress of the download and the installation. -10. This process will take about 20-30 minutes depending on your connection speed. -11. A pop-up window will inform you that the installation completed successfully. Click "OK". -12. Finally the "Vivado License Manager" will launch. Select "Get Free ISE WebPACK, ISE/Vivado IP or PetaLinux License" and click "Connect Now" to complete the license registration process. - -Environment Setup -""""""""""""""""" - -The last step is to update your ``~/.bashrc`` with the following lines. This will include all of the Xilinx binary paths so you can launch compilation scripts from the command line. - -.. code:: bash - - # Xilinx Vivado 2020.1 environment - export XILINX_VIVADO=${XILINX_PATH}/Vivado/2020.1 - export PATH=${XILINX_VIVADO}/bin:${PATH} - -HLS-based Custom VTA Bitstream Compilation for Pynq -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -High-level hardware parameters are listed in the VTA configuration file and can be customized by the user. -For this custom VTA bitstream compilation exercise, we'll change the frequency of our design, so it can be clocked a little faster. - -* Set the ``HW_FREQ`` field to ``142``. The Pynq board supports 100, 142, 167 and 200MHz clocks. Note that the higher the frequency, the harder it will be to close timing. Increasing the frequency can lead to timing violation and thus faulty hardware execution. -* Set the ``HW_CLK_TARGET`` to ``6``. This parameters refers to the target clock period in nano seconds for HLS - a lower clock period leads to more aggressive pipelining to achieve timing closure at higher frequencies. Technically a 142MHz clock would require a 7ns target, but we intentionally lower the clock target to 6ns to more aggressively pipeline our design. - -Bitstream generation is driven by a top-level ``Makefile`` under ``/3rdparty/vta-hw/hardware/xilinx/``. - -If you just want to simulate the VTA design in software emulation to make sure that it is functional, enter: - -.. code:: bash - - cd /3rdparty/vta-hw/hardware/xilinx - make ip MODE=sim - - -If you just want to generate the HLS-based VTA IP cores without launching the entire design place and route, enter: - -.. 
code:: bash - - make ip - -You'll be able to view the HLS synthesis reports under ``/3rdparty/vta-hw/build/hardware/xilinx/hls///solution0/syn/report/_csynth.rpt`` - - **Note**: The ```` name is a string that summarizes the VTA configuration parameters listed in the ``vta_config.json``. The ```` name refers to the specific module (or HLS function) that composes the high-level VTA pipeline. - -Finally, to run the full hardware compilation and generate the VTA bitstream, run ``make``. - -This process is lengthy, and can take up to an hour to complete depending on your machine's specs. -We recommend setting the ``VTA_HW_COMP_THREADS`` variable in the Makefile to take full advantage of all the cores on your development machine. - -Once the compilation completes, the generated bitstream can be found under ``/3rdparty/vta-hw/build/hardware/xilinx/vivado//export/vta.bit``. - -Using A Custom Bitstream -^^^^^^^^^^^^^^^^^^^^^^^^ - -We can program the new VTA FPGA bitstream by setting the bitstream path of the ``vta.program_fpga()`` function in the tutorial examples, or in the ``test_program_rpc.py`` script. - -.. code:: python - - vta.program_fpga(remote, bitstream="/3rdparty/vta-hw/build/hardware/xilinx/vivado//export/vta.bit") - -Instead of downloading a pre-built bitstream from the VTA bitstream repository, TVM will use the new bitstream you just generated, which is a VTA design clocked at a higher frequency. -Do you observe a noticeable performance increase on the ImageNet classification example? - - - -Bitstream Generation with Intel Toolchains ------------------------------------------- - -If you're interested in generating the Intel FPGA bitstream on your own instead of using the pre-built VTA bitstreams, follow the instructions below. - -Intel Toolchain Installation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -It is recommended to use ``Intel Quartus Prime 18.1``, since the test scripts contained in this document have been tested on this version. - -You would need to install Intel's FPGA compilation toolchain, `Quartus Prime Lite `_, which is a license-free version of the Intel Quartus Prime software. - -Obtaining and Launching the Quartus GUI Installer -""""""""""""""""""""""""""""""""""""""""""""""""" - -1. Go to the `download center `_, and download the Linux version of "Quartus Prime (include Nios II EDS)" and "Cyclone V device support" files in the "Separate file" tab. This avoids downloading unused device support files. -2. Sign in if you have an account, or register on the right side of the web page to create an account. -3. After signing in, you can download the installer and the device support files. -4. Now that the files are downloaded, go to your ``Downloads`` directory, and change the file permissions: - -.. code:: bash - - chmod u+x QuartusLiteSetup-18.1.0.625-linux.run - -5. Now ensure both the installer and device support files are in the same directory, and you can run the installer with: - -.. code:: bash - - ./QuartusLiteSetup-18.1.0.625-linux.run - -6. Follow the instructions on the pop-up GUI form, and install all the content in the ``/usr/local`` directory. After installation, ``/usr/local/intelFPGA_lite/18.1`` will be created, and the Quartus program along with other programs will be available in that folder. - -Environment Setup -""""""""""""""""" - -Similar to the Xilinx toolchain setup, the following lines should be added to your ``~/.bashrc``. - -.. 
code:: bash - - # Intel Quartus 18.1 environment - export QUARTUS_ROOTDIR="/usr/local/intelFPGA_lite/18.1/quartus" - export PATH=${QUARTUS_ROOTDIR}/bin:${PATH} - export PATH=${QUARTUS_ROOTDIR}/sopc_builder/bin:${PATH} - -This adds the Quartus binary paths to your ``PATH`` environment variable, so you can launch compilation scripts from the command line. - -Chisel-based Custom VTA Bitstream Compilation for DE10-Nano -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Similar to the HLS-based design, high-level hardware parameters in the Chisel-based design are listed in the VTA configuration file `Configs.scala `_, and they can be customized by the user. - -For Intel FPGAs, bitstream generation is driven by a top-level ``Makefile`` under ``/3rdparty/vta-hw/hardware/intel``. - -If you just want to generate the Chisel-based VTA IP core for the DE10-Nano board without compiling the design for the FPGA hardware, enter: - -.. code:: bash - - cd /3rdparty/vta-hw/hardware/intel - make ip - -Then you'll be able to locate the generated Verilog file at ``/3rdparty/vta-hw/build/hardware/intel/chisel//VTA.DefaultDe10Config.v``. - -If you would like to run the full hardware compilation for the ``de10nano`` board: - -.. code:: bash - - make - -This process might be a bit lengthy, and can take up to half an hour to complete depending on the performance of your PC. The Quartus Prime software automatically detects the number of cores available on your PC and tries to utilize all of them for this process. - -Once the compilation completes, the generated bitstream can be found under ``/3rdparty/vta-hw/build/hardware/intel/quartus//export/vta.rbf``. You can also open the Quartus project file (.qpf) available at ``/3rdparty/vta-hw/build/hardware/intel/quartus//de10_nano_top.qpf`` to browse the generated reports. diff --git a/gallery/how_to/work_with_schedules/tensorize.py b/gallery/how_to/work_with_schedules/tensorize.py index 63ba8299033c..8f7035511e1d 100644 --- a/gallery/how_to/work_with_schedules/tensorize.py +++ b/gallery/how_to/work_with_schedules/tensorize.py @@ -312,8 +312,6 @@ def _reduce_update(): # Tensorize provides a way for users to get fully optimized schedule via micro-kernels. # For example, INT8 quantization on Intel CPUs uses tensorization # to invoke AVX instruction directly. -# It also enables TVM to compile to ASICs - -# checkout :ref:`vta-index` for details. # We also demonstrates how to use inline assembly importing, # which helps users inject asm easily into the schedule. 
# diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index c57ebfc88bd2..c1ea34cefe30 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -480,41 +480,29 @@ def _build_func_common(measure_input, runtime=None, checks=None, build_option=No if not config.valid(): raise InstantiationError(config.errors) - # if target is vta, we need to use vta build - if ( - hasattr(measure_input.target, "device_name") - and measure_input.target.device_name == "vta" - ): - # pylint: disable=import-outside-toplevel - import vta + current_pass_context: tvm.ir.transform.PassContext = tvm.ir.transform.PassContext.current() + current_config = dict(current_pass_context.config) + if build_option is not None: + current_config.update(build_option) - func = vta.build(s, args, target_host=task.target_host) + if "tir.add_lower_pass" in current_config: + current_add_lower_pass = list(current_config["tir.add_lower_pass"]) else: - current_pass_context: tvm.ir.transform.PassContext = ( - tvm.ir.transform.PassContext.current() - ) - current_config = dict(current_pass_context.config) - if build_option is not None: - current_config.update(build_option) - - if "tir.add_lower_pass" in current_config: - current_add_lower_pass = list(current_config["tir.add_lower_pass"]) - else: - current_add_lower_pass = [] - if checks.get("gpu"): - current_add_lower_pass.append((2, gpu_verify_pass(**checks.get("gpu")))) - if checks.get("hexagon"): - current_add_lower_pass.append((2, vtcm_verify_pass(**checks.get("hexagon")))) - current_config["tir.add_lower_pass"] = current_add_lower_pass - - with tvm.ir.transform.PassContext( - opt_level=current_pass_context.opt_level, - required_pass=current_pass_context.required_pass, - disabled_pass=current_pass_context.disabled_pass, - instruments=current_pass_context.instruments, - config=current_config, - ): - func = build(s, args, target=target, runtime=runtime) + current_add_lower_pass = [] + if checks.get("gpu"): + current_add_lower_pass.append((2, gpu_verify_pass(**checks.get("gpu")))) + if checks.get("hexagon"): + current_add_lower_pass.append((2, vtcm_verify_pass(**checks.get("hexagon")))) + current_config["tir.add_lower_pass"] = current_add_lower_pass + + with tvm.ir.transform.PassContext( + opt_level=current_pass_context.opt_level, + required_pass=current_pass_context.required_pass, + disabled_pass=current_pass_context.disabled_pass, + instruments=current_pass_context.instruments, + config=current_config, + ): + func = build(s, args, target=target, runtime=runtime) return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args) diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 11f40ed62756..4ee92641917b 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -32,21 +32,10 @@ logger = logging.getLogger("autotvm") -# TODO(moreau89) find a more elegant way to lower for VTAs def _lower(mod, target, params, opt_level=3): """Helper to lower VTA properly.""" # pylint: disable=import-outside-toplevel from tvm import relay - from tvm.relay.backend import graph_executor_codegen - - if hasattr(target, "device_name") and target.device_name == "vta": - import vta - - with vta.build_config(opt_level=opt_level, disabled_pass={"AlterOpLayout"}): - mod, _ = relay.optimize(mod, target=target, params=params) - grc = graph_executor_codegen.GraphExecutorCodegen(None, target) - 
grc.codegen(mod, mod["main"]) - return # Alter op layout code has been written expecting that tuning is applied # without it, so we disable AlterOpLayout to maintain that behavior. diff --git a/python/tvm/target/__init__.py b/python/tvm/target/__init__.py index 14bd4753d400..beaddf03c7fa 100644 --- a/python/tvm/target/__init__.py +++ b/python/tvm/target/__init__.py @@ -66,7 +66,6 @@ intel_graphics, arm_cpu, rasp, - vta, bifrost, riscv_cpu, hexagon, diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index c4199c72c2ca..35ebaf46f067 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -607,12 +607,6 @@ def rasp(options=None): return arm_cpu("rasp3b", options) -def vta(model="unknown", options=None): - opts = ["-device=vta", "-keys=vta,cpu", "-model=%s" % model] - opts = _merge_opts(opts, options) - return Target(" ".join(["ext_dev"] + opts)) - - def bifrost(model="unknown", options=None): """Return an ARM Mali GPU target (Bifrost architecture). diff --git a/tests/azure-pipelines/main.yml b/tests/azure-pipelines/main.yml deleted file mode 100644 index 49d488aba5fd..000000000000 --- a/tests/azure-pipelines/main.yml +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Azure pipeline -# We use it to cover windows and mac build -# Jenkins is still the primary CI - -name: $(Date:yyyyMMdd)$(Rev:.r) -jobs: - - job: Windows_VS2017_x86 - pool: - vmImage: 'vs2017-win2016' - steps: - - script: git submodule update --recursive --init - displayName: Initialize submodules - - script: mkdir build.common - displayName: Make Build Directory - - task: CMake@1 - inputs: - workingDirectory: 'build.common' - cmakeArgs: > - -DUSE_SORT=ON - -DUSE_RPC=ON - -DUSE_GRAPH_EXECUTOR=ON - .. - - task: MSBuild@1 - inputs: - solution: 'build.common/ALL_BUILD.vcxproj' - maximumCpuCount: true - configuration: 'Debug' - - job: Windows_VS2017_x64 - pool: - vmImage: 'vs2017-win2016' - steps: - - script: git submodule update --recursive --init - displayName: Initialize submodules - - script: mkdir build.common - displayName: Make Build Directory - - task: CMake@1 - inputs: - workingDirectory: 'build.common' - cmakeArgs: > - -DUSE_SORT=ON - -DUSE_RPC=ON - -DUSE_GRAPH_EXECUTOR=ON - .. - - task: MSBuild@1 - inputs: - solution: 'build.common/ALL_BUILD.vcxproj' - - job: MacOS_XCode9 - pool: - vmImage: 'xcode9-macos10.13' - steps: - - script: git submodule update --recursive --init - displayName: Initialize submodules - - script: mkdir build.common - displayName: Make Build Directory - - task: CMake@1 - inputs: - workingDirectory: 'build.common' - cmakeArgs: > - -DUSE_SORT=ON - -DUSE_RPC=ON - -DUSE_GRAPH_EXECUTOR=ON - .. 
- - script: cd build.common && make -j`sysctl -n hw.ncpu` - displayName: Build the project diff --git a/tests/lint/blocklint.sh b/tests/lint/blocklint.sh index 7525bfa64cf7..8ced0b1bc189 100755 --- a/tests/lint/blocklint.sh +++ b/tests/lint/blocklint.sh @@ -26,7 +26,7 @@ do for subdir in $(find $dir -type d -print) do blocklint --blocklist blacklist,whitelist,white\ box,master\ ,\ master,master_,_master,slave $subdir \ - --skip-files tests/lint/blocklint.sh,tests/lint/pylintrc,conda/recipe/meta.yaml,rust/tvm-sys/build.rs,docs/topic/vta/dev/hardware.rst,src/target/source/codegen_vhls.cc,tests/micro/zephyr/test_utils.py + --skip-files tests/lint/blocklint.sh,tests/lint/pylintrc,conda/recipe/meta.yaml,rust/tvm-sys/build.rs,src/target/source/codegen_vhls.cc,tests/micro/zephyr/test_utils.py done fi done diff --git a/tests/lint/cpplint.sh b/tests/lint/cpplint.sh index b948c91c1edd..39b86937adc9 100755 --- a/tests/lint/cpplint.sh +++ b/tests/lint/cpplint.sh @@ -18,8 +18,7 @@ set -e -echo "Running 2 cpplints (VTA and TVM)..." -python3 3rdparty/dmlc-core/scripts/lint.py --quiet vta cpp vta/include vta/src +echo "Running 2 cpplints..." python3 3rdparty/dmlc-core/scripts/lint.py --quiet tvm cpp \ include src \ examples/extension/src examples/graph_executor/src \ diff --git a/tests/lint/pylint.sh b/tests/lint/pylint.sh index 2657723c1d18..2e0e0afa145b 100755 --- a/tests/lint/pylint.sh +++ b/tests/lint/pylint.sh @@ -18,7 +18,6 @@ set -euxo pipefail python3 -m pylint python/tvm --rcfile="$(dirname "$0")"/pylintrc -python3 -m pylint vta/python/vta --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/micro/test_crt.py --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/tvmscript/test_tvmscript_type.py --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/contrib/test_cmsisnn --rcfile="$(dirname "$0")"/pylintrc diff --git a/tests/python/contrib/test_verilator/__init__.py b/tests/python/contrib/test_verilator/__init__.py deleted file mode 100644 index 4838dc3f4371..000000000000 --- a/tests/python/contrib/test_verilator/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" Infrastructure and tests for Verilator codegen """ diff --git a/tests/python/contrib/test_verilator/infrastructure.py b/tests/python/contrib/test_verilator/infrastructure.py deleted file mode 100644 index 779f7872eb2b..000000000000 --- a/tests/python/contrib/test_verilator/infrastructure.py +++ /dev/null @@ -1,198 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Verilator utility functions""" - -import os -import sys -import subprocess as sp -import json - -import tvm -from tvm import relay -import tvm.relay.testing -from tvm import runtime -from tvm.relay import transform - - -def _register_verilator_op(op_name, supported=True): - """The helper function to indicate that a given operator can be supported by Verilator. - - Paramters - --------- - op_name : Str - The name of operator that will be registered. - - Returns - ------- - f : callable - A function that returns if the operator is supported by DNNL. - """ - - @tvm.ir.register_op_attr(op_name, "target.verilator") - def _func_wrapper(expr): - return supported - - return _func_wrapper - - -_register_verilator_op("add") -_register_verilator_op("nn.bias_add") - - -def skip_test(): - """Skip test if it requires the Verilator codegen and it's not present.""" - if not tvm.get_global_func("relay.ext.verilator", True): - print("Skip test because Verilator codegen is not available.") - return True - if sys.platform == "win32": - print("Skip test on Windows for now") - return True - return False - - -def clear_stats(): - """Clear profiler statistics.""" - f = tvm.get_global_func("verilator.profiler_clear", True) - if f: - f() - - -def stats(): - """Get profiler statistics.""" - - x = tvm.get_global_func("verilator.profiler_status")() - return json.loads(x) - - -def offload(mod): - """Offload ops based on the registered ops - - Paramters - --------- - mod : Module - The input module. - - Returns - ------- - mod : Module - The output module with offloaded ops. - """ - - backend = "verilator" - mod = transform.AnnotateTarget([backend])(mod) - mod = transform.PartitionGraph()(mod) - return mod - - -def verilator_app_path(): - """Create verilator hardware app path.""" - - cur_dir = os.path.dirname(os.path.realpath(__file__)) - return os.path.join( - cur_dir, - "..", - "..", - "..", - "..", - "3rdparty", - "vta-hw", - "apps", - "verilator", - "add", - ) - - -def compile_hardware(lanes): - """Compile hardware into shared library - - Paramters - --------- - lanes : Int - The number of vector lanes. - - Returns - ------- - path : Str - The path of the shared library. - """ - lib_name = "libverilator_{}".format(lanes) - lib_name_ext = "{}.so".format(lib_name) - lib = os.path.join(verilator_app_path(), lib_name_ext) - if not os.path.isfile(lib): - opt_lib_name = "LIB_NAME={}".format(lib_name) - opt_lanes = "LANES={}".format(lanes) - cmd = [] - cmd.append("make") - cmd.append("--directory") - cmd.append(verilator_app_path()) - cmd.append(opt_lib_name) - cmd.append(opt_lanes) - sp.run(cmd, check=True, stdout=sp.DEVNULL) - return lib - - -def compiler_opts(lib): - """Create compiler options - - Paramters - --------- - lib : Str - The path of the hardware shared library. - - Returns - ------- - opts : Dict - The compiler options. 
- """ - opts = { - "lib_path": lib, - "profiler_enable": True, - "profiler_cycle_counter_id": 0, - } - return opts - - -def run_module(inp, mod, params=None, opts=None): - """Compile Relay module and hardware library - - Paramters - --------- - inp : Data - The input data. - - mod : Module - The relay module. - - params : Parameters - The model Parameters. - - opts : Dict - The compiler - - Returns - ------- - out : Data - The output data. - """ - - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.verilator.options": opts}): - lib = relay.vm.compile(mod, target="llvm", params=params) - code, lib = lib.save() - exe = runtime.vm.Executable.load_exec(code, lib) - vm = runtime.vm.VirtualMachine(exe, tvm.cpu()) - out = vm.run(**inp) - return out diff --git a/tests/python/contrib/test_verilator/test_mobilenet.py b/tests/python/contrib/test_verilator/test_mobilenet.py deleted file mode 100644 index 5728bc8bb25c..000000000000 --- a/tests/python/contrib/test_verilator/test_mobilenet.py +++ /dev/null @@ -1,245 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import tvm -from tvm import te, relay, transform -from tvm.contrib.download import download_testdata -from tvm.contrib import graph_executor as runtime - -import os -import pytest -from PIL import Image -import numpy as np - -from test_verilator.infrastructure import ( - skip_test, - compile_hardware, - compiler_opts, - offload, - clear_stats, - stats, -) - - -def extract(path): - """Extract a tgz or gz file. - - Paramters - --------- - path : Str - The path of the compressed file. - """ - import tarfile - - if path.endswith("tgz") or path.endswith("gz"): - dir_path = os.path.dirname(path) - tar = tarfile.open(path) - tar.extractall(path=dir_path) - tar.close() - else: - raise RuntimeError("Could not decompress the file: " + path) - - -def get_real_image(im_height, im_width): - """Get a real image. - - Paramters - --------- - im_height : Int - The image height. - - im_width : Int - The image width. - - Returns - ------- - data: Data - The image array. 
- """ - repo_base = "https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/" - img_name = "elephant-299.jpg" - image_url = os.path.join(repo_base, img_name) - img_path = download_testdata(image_url, img_name, module="data") - image = Image.open(img_path).resize((im_height, im_width)) - x = np.array(image).astype("uint8") - data = np.reshape(x, (1, im_height, im_width, 3)) - return data - - -def get_mobilenet_model(): - """Return mobilenet model.""" - model_url = "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz" - model_path = download_testdata( - model_url, "mobilenet_v1_1.0_224_quant.tgz", module=["tf", "official"] - ) - model_dir = os.path.dirname(model_path) - extract(model_path) - tflite_model_file = os.path.join(model_dir, "mobilenet_v1_1.0_224_quant.tflite") - tflite_model_buf = open(tflite_model_file, "rb").read() - try: - import tflite - - return tflite.Model.GetRootAsModel(tflite_model_buf, 0) - except AttributeError: - import tflite.Model - - return tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) - - -def get_input_tensor_name(): - """Return input name.""" - return "input" - - -def compile_model_to_relay(model): - """Compile model to relay. - - Paramters - --------- - model : Model - The input model. - - Returns - ------- - mod: Module - The relay module. - - params: Parameters - The model parameters. - """ - input_tensor = get_input_tensor_name() - input_shape = (1, 224, 224, 3) - input_dtype = "uint8" - mod, params = relay.frontend.from_tflite( - model, - shape_dict={input_tensor: input_shape}, - dtype_dict={input_tensor: input_dtype}, - ) - return mod, params - - -def run_model(mod, params=None, opts=None): - """Run model. - - Paramters - --------- - mod: Module - The relay module. - - params: Parameters - The model parameters. - - opts: Dict - The compiler options. - - Returns - ------- - out: Data - The output data. - """ - with transform.PassContext(opt_level=3, config={"relay.ext.verilator.options": opts}): - lib = relay.build(mod, target="llvm", params=params) - module = runtime.GraphModule(lib["default"](tvm.cpu())) - image_data = get_real_image(224, 224) - input_tensor = get_input_tensor_name() - module.set_input(input_tensor, image_data) - module.run() - out = module.get_output(0).numpy() - return out - - -def get_labels(): - """Return labels.""" - label_file_url = "".join( - [ - "https://raw.githubusercontent.com/", - "tensorflow/tensorflow/master/tensorflow/lite/java/demo/", - "app/src/main/assets/", - "labels_mobilenet_quant_v1_224.txt", - ] - ) - label_file = "labels_mobilenet_quant_v1_224.txt" - label_path = download_testdata(label_file_url, label_file, module="data") - # List of 1001 classes - with open(label_path) as f: - labels = f.readlines() - return labels - - -def check_result(res): - """Check prediction.""" - labels = get_labels() - predictions = np.squeeze(res) - prediction = np.argmax(predictions) - # 387 is the elephant - assert prediction == 387 - - -def print_test_info(lanes, cycles): - """Print test info - - Paramters - --------- - lanes : Int - The number of vector lanes. - - cycles : Int - The number of cycles. 
- """ - print( - "[mobilenet] vector-lanes:{} number of cycles:{} spent in nn.bias_add".format(lanes, cycles) - ) - - -def is_tflite_available(): - """Skip test if tensorflow-lite is not installed.""" - try: - import tflite - - return True - except: - return False - - -@pytest.mark.skipif(skip_test(), reason="Skip because Verilator codegen is not available") -def tmobilenet(lanes): - """Mobilenet test template. - Paramters - --------- - lanes : Int - The number of vector lanes. - """ - if skip_test(): - return - if not is_tflite_available(): - return - model = get_mobilenet_model() - mod, params = compile_model_to_relay(model) - mod = offload(mod) - lib = compile_hardware(lanes) - opts = compiler_opts(lib) - clear_stats() - res = run_model(mod, params, opts) - values = stats() - check_result(res) - print_test_info(lanes, values["cycle_counter"]) - - -def test_mobilenet(): - """Mobilenet tests.""" - tmobilenet(4) - tmobilenet(32) diff --git a/tests/python/contrib/test_verilator/test_verilator_ops.py b/tests/python/contrib/test_verilator/test_verilator_ops.py deleted file mode 100644 index 29d54890b367..000000000000 --- a/tests/python/contrib/test_verilator/test_verilator_ops.py +++ /dev/null @@ -1,199 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Verilator codegen tests""" - -import numpy as np - -import tvm -import tvm.testing -from tvm import relay -import pytest - -from test_verilator.infrastructure import ( - skip_test, - compile_hardware, - compiler_opts, - run_module, - offload, - clear_stats, - stats, -) - - -def create_module_add(shape, dtype): - """Create add module. - - Paramters - --------- - shape : Tuple - The shape tuple. - - dtype : Str - The data type. - - Returns - ------- - mod: Module - The relay module. - """ - x = relay.var("x", shape=shape, dtype=dtype) - y = relay.var("y", shape=shape, dtype=dtype) - z = relay.add(x, y) - f = relay.Function([x, y], z) - mod = tvm.IRModule() - mod["main"] = f - return mod - - -def create_module_bias_add(xshape, yshape, dtype): - """Create bias_add module. - - Paramters - --------- - xshape : Tuple - The x shape tuple. - - yshape : Tuple - The y shape tuple. - - dtype : Str - The data type. - - Returns - ------- - mod: Module - The relay module. - """ - x = relay.var("x", shape=xshape, dtype=dtype) - y = relay.var("y", shape=yshape, dtype=dtype) - z = relay.nn.bias_add(x, y, axis=3) - f = relay.Function([x, y], z) - mod = tvm.IRModule() - mod["main"] = f - return mod - - -def run_and_check(xshape, yshape, dtype, mod, opts): - """Run and check values. - - Paramters - --------- - xshape : Tuple - The x shape tuple. - - yshape : Tuple - The y shape tuple. - - dtype : Str - The data type. - - mod: Module - The relay module. - - opts: Dict - The compiler options. 
- - Returns - ------- - cycles: Int - The number of cycles. - """ - x_data = np.random.randint(5, size=xshape, dtype=dtype) - y_data = np.random.randint(5, size=yshape, dtype=dtype) - ref = x_data + y_data - inp = {"x": x_data, "y": y_data} - clear_stats() - out = run_module(inp, mod, params=None, opts=opts) - values = stats() - tvm.testing.assert_allclose(out.numpy(), ref, rtol=1e-5, atol=1e-5) - return values["cycle_counter"] - - -def print_test_info(test, lanes, cycles): - """Print counter - - Paramters - --------- - test : Str - The name of the test. - - lanes : Int - The number of vector lanes. - - cycles : Int - The number of cycles. - """ - print("test:{} vector-lanes:{} number of cycles:{}".format(test, lanes, cycles)) - - -@pytest.mark.skipif(skip_test(), reason="Skip because Verilator codegen is not available") -def tadd(lanes): - """Print counter - - Paramters - --------- - lanes : Int - The number of vector lanes. - """ - if skip_test(): - return - dtype = "int32" - shape = (8, 4) - mod = create_module_add(shape, dtype) - mod = offload(mod) - lib = compile_hardware(lanes) - opts = compiler_opts(lib) - cycles = run_and_check(shape, shape, dtype, mod, opts) - print_test_info("add", lanes, cycles) - - -@pytest.mark.skipif(skip_test(), reason="Skip because Verilator codegen is not available") -def tbias(lanes): - """Print counter - - Paramters - --------- - lanes : Int - The number of vector lanes. - """ - if skip_test(): - return - dtype = "int32" - xshape = (1, 112, 112, 32) - yshape = (32,) - mod = create_module_bias_add(xshape, yshape, dtype) - mod = offload(mod) - lib = compile_hardware(lanes) - opts = compiler_opts(lib) - cycles = run_and_check(xshape, yshape, dtype, mod, opts) - print_test_info("nn.bias_add", lanes, cycles) - - -def test_add(): - """add tests.""" - tadd(1) - tadd(4) - - -def test_bias_add(): - """bias_add tests.""" - tbias(1) - tbias(32) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/target/test_target_target.py b/tests/python/target/test_target_target.py index 1a52a46da1fc..fd79661ce632 100644 --- a/tests/python/target/test_target_target.py +++ b/tests/python/target/test_target_target.py @@ -19,7 +19,7 @@ import pytest import tvm import tvm.testing -from tvm.target import Target, arm_cpu, bifrost, cuda, intel_graphics, mali, rocm, vta +from tvm.target import Target, arm_cpu, bifrost, cuda, intel_graphics, mali, rocm @tvm.target.generic_func @@ -179,7 +179,7 @@ def test_target_llvm_jit_options(): def test_target_create(): - targets = [cuda(), rocm(), mali(), intel_graphics(), arm_cpu("rk3399"), vta(), bifrost()] + targets = [cuda(), rocm(), mali(), intel_graphics(), arm_cpu("rk3399"), bifrost()] for tgt in targets: assert tgt is not None diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index b9bc93343b6b..3c7c9a4b3a10 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -622,8 +622,6 @@ def add_subparser( "run unit tests", [ "./tests/scripts/task_python_unittest.sh", - "./tests/scripts/task_python_vta_fsim.sh", - "./tests/scripts/task_python_vta_tsim.sh", ], ), "frontend": ("run frontend tests", ["./tests/scripts/task_python_frontend_cpu.sh"]), diff --git a/tests/scripts/release/make_notes.py b/tests/scripts/release/make_notes.py index 2835a7241ff7..8877d97253dd 100644 --- a/tests/scripts/release/make_notes.py +++ b/tests/scripts/release/make_notes.py @@ -71,7 +71,6 @@ "rpc": "Misc", "transform": "Misc", "tophub": "Misc", - "vta": "Misc", "ux": "Misc", "APP": "Misc", "docker": "Docker", diff --git 
a/tests/scripts/task_build.py b/tests/scripts/task_build.py index 5fbc22aa297d..742436680208 100755 --- a/tests/scripts/task_build.py +++ b/tests/scripts/task_build.py @@ -39,13 +39,12 @@ parser.add_argument("--cmake-target", help="optional build target") parser.add_argument("--debug", required=False, action="store_true", help="build in debug mode") args = parser.parse_args() - - env = {"VTA_HW_PATH": str(Path(os.getcwd()) / "3rdparty" / "vta-hw")} sccache_exe = shutil.which("sccache") use_sccache = sccache_exe is not None build_dir = Path(os.getcwd()) / args.build_dir build_dir = build_dir.relative_to(REPO_ROOT) + env = {} if use_sccache: if args.sccache_bucket and "AWS_ACCESS_KEY_ID" in os.environ: diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh index 6aa53f510001..87a3ee24750a 100755 --- a/tests/scripts/task_config_build_arm.sh +++ b/tests/scripts/task_config_build_arm.sh @@ -30,7 +30,6 @@ echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-17\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake -echo set\(USE_VTA_FSIM ON\) >> config.cmake echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake echo set\(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR "/opt/acl"\) >> config.cmake echo set\(USE_CCACHE OFF\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 03f90c5ad4a1..530c44e170ef 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -39,7 +39,6 @@ echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake echo set\(USE_STACKVM_RUNTIME ON\) >> config.cmake echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake -echo set\(USE_VTA_FSIM ON\) >> config.cmake echo set\(USE_BLAS openblas\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake echo set\(USE_TENSORRT_CODEGEN ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh index d4cdb7028e9c..86ab32c71b3c 100755 --- a/tests/scripts/task_config_build_wasm.sh +++ b/tests/scripts/task_config_build_wasm.sh @@ -31,6 +31,5 @@ echo set\(USE_LLVM llvm-config-15\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake -echo set\(USE_VTA_FSIM ON\) >> config.cmake echo set\(USE_CCACHE OFF\) >> config.cmake echo set\(SUMMARIZE ON\) >> config.cmake diff --git a/tests/scripts/task_cpp_unittest.sh b/tests/scripts/task_cpp_unittest.sh index bc152dfba34f..bd526db1857c 100755 --- a/tests/scripts/task_cpp_unittest.sh +++ b/tests/scripts/task_cpp_unittest.sh @@ -28,11 +28,6 @@ else BUILD_DIR=build fi - -# NOTE: important to use abspath, when VTA is enabled. -VTA_HW_PATH=$(pwd)/3rdparty/vta-hw -export VTA_HW_PATH - # to avoid CI thread throttling. export TVM_BIND_THREADS=0 export OMP_NUM_THREADS=1 diff --git a/tests/scripts/task_microtvm_cpp_tests.sh b/tests/scripts/task_microtvm_cpp_tests.sh index 03628c1d0d4d..ce4c62ecee0c 100755 --- a/tests/scripts/task_microtvm_cpp_tests.sh +++ b/tests/scripts/task_microtvm_cpp_tests.sh @@ -24,9 +24,6 @@ BUILD_DIR=$1 source tests/scripts/setup-pytest-env.sh export LD_LIBRARY_PATH="lib:${LD_LIBRARY_PATH:-}" -# NOTE: important to use abspath, when VTA is enabled. -VTA_HW_PATH=$(pwd)/3rdparty/vta-hw -export VTA_HW_PATH # to avoid CI thread throttling. 
export TVM_BIND_THREADS=0 diff --git a/tests/scripts/task_python_vta_fsim.sh b/tests/scripts/task_python_vta_fsim.sh index 14004361ee08..6edd950cc7c2 100755 --- a/tests/scripts/task_python_vta_fsim.sh +++ b/tests/scripts/task_python_vta_fsim.sh @@ -15,35 +15,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -set -euxo pipefail - -source tests/scripts/setup-pytest-env.sh -# to avoid CI thread throttling. -export TVM_BIND_THREADS=0 -export OMP_NUM_THREADS=1 - -export PYTHONPATH=${PYTHONPATH}:${TVM_PATH}/vta/python -export VTA_HW_PATH=`pwd`/3rdparty/vta-hw - -# disable fsim test for now -exit 0 - -# cleanup pycache -find . -type f -path "*.pyc" | xargs rm -f - -rm -rf ~/.tvm - -# Rebuild cython -make cython3 - -# Reset default fsim simulation -cp ${VTA_HW_PATH}/config/fsim_sample.json ${VTA_HW_PATH}/config/vta_config.json - -# Run unit tests in functional/fast simulator -echo "Running unittest in fsim..." -run_pytest cython python-vta-fsim-unittest ${TVM_PATH}/vta/tests/python/unittest - -# Run unit tests in functional/fast simulator -echo "Running integration test in fsim..." -run_pytest cython python-vta-fsim-integration ${TVM_PATH}/vta/tests/python/integration diff --git a/tests/scripts/task_python_vta_tsim.sh b/tests/scripts/task_python_vta_tsim.sh index d6a181fb570c..6edd950cc7c2 100755 --- a/tests/scripts/task_python_vta_tsim.sh +++ b/tests/scripts/task_python_vta_tsim.sh @@ -15,53 +15,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -set -euxo pipefail - -source tests/scripts/setup-pytest-env.sh -export PYTHONPATH=${PYTHONPATH}:${TVM_PATH}/vta/python -export VTA_HW_PATH=`pwd`/3rdparty/vta-hw - -# to avoid CI CPU thread throttling. -export TVM_BIND_THREADS=0 -export OMP_NUM_THREADS=1 - -# temporary skip tsim test, enable later -exit 0 - -# cleanup pycache -find . -type f -path "*.pyc" | xargs rm -f - -rm -rf ~/.tvm - -# Rebuild cython -make cython3 - -# Set default VTA config to use TSIM cycle accurate sim -cp ${VTA_HW_PATH}/config/tsim_sample.json ${VTA_HW_PATH}/config/vta_config.json - -# Build and run the TSIM apps (disable until refactor is complete) -# echo "Test the TSIM apps..." -# make -C ${VTA_HW_PATH}/apps/tsim_example/ run_verilog -# make -C ${VTA_HW_PATH}/apps/tsim_example/ run_chisel -# make -C ${VTA_HW_PATH}/apps/gemm/ default - -# Check style of scala code -echo "Check style of scala code..." -make -C ${VTA_HW_PATH}/hardware/chisel lint - -# Build VTA chisel design and verilator simulator -echo "Building VTA chisel design..." -make -C ${VTA_HW_PATH}/hardware/chisel cleanall -make -C ${VTA_HW_PATH}/hardware/chisel USE_THREADS=0 lib - -# Run unit tests in cycle accurate simulator -echo "Running unittest in tsim..." -run_pytest cython python-vta-tsim-unittest ${TVM_PATH}/vta/tests/python/unittest - -# Run unit tests in cycle accurate simulator -echo "Running integration test in tsim..." 
-run_pytest cython python-vta-tsim-integration ${TVM_PATH}/vta/tests/python/integration - -# Reset default fsim simulation -cp ${VTA_HW_PATH}/config/fsim_sample.json ${VTA_HW_PATH}/config/vta_config.json diff --git a/vta/README.md b/vta/README.md deleted file mode 100644 index debf2482b479..000000000000 --- a/vta/README.md +++ /dev/null @@ -1,33 +0,0 @@ - - - - - - - - - - - - - - - - - -VTA: Open, Modular, Deep Learning Accelerator Stack -=================================================== -VTA (versatile tensor accelerator) is an open-source deep learning accelerator complemented with an end-to-end TVM-based compiler stack. - -The key features of VTA include: - -- Generic, modular, open-source hardware - - Streamlined workflow to deploy to FPGAs. - - Simulator support to prototype compilation passes on regular workstations. -- Driver and JIT runtime for both simulator and FPGA hardware back-end. -- End-to-end TVM stack integration - - Direct optimization and deployment of models from deep learning frameworks via TVM. - - Customized and extensible TVM compiler back-end. - - Flexible RPC support to ease deployment, and program FPGAs with the convenience of Python. - -Learn more about VTA [here](https://tvm.apache.org/docs/vta/index.html). diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py deleted file mode 100644 index af840c9c55f3..000000000000 --- a/vta/python/vta/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""VTA Package is a TVM backend extension to support VTA hardware. - -Besides the compiler toolchain, it also includes utility functions to -configure the hardware environment and access remote device through RPC. -""" -import sys -import tvm._ffi.base - -from .autotvm import module_loader -from .bitstream import get_bitstream_path, download_bitstream -from .environment import get_env, Environment -from .rpc_client import reconfig_runtime, program_fpga - -__version__ = "0.1.0" - - -# do not from tvm import topi when running vta.exec.rpc_server -# in lib tvm runtime only mode -if not tvm._ffi.base._RUNTIME_ONLY: - from . import top - from .build_module import build_config, lower, build diff --git a/vta/python/vta/autotvm.py b/vta/python/vta/autotvm.py deleted file mode 100644 index 285e30923b13..000000000000 --- a/vta/python/vta/autotvm.py +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Defines AutoTVM components used with VTA.""" - -from tvm.autotvm.measure import default_module_loader -from . import rpc_client - - -def module_loader(bitstream=None): - """Construct a ModuleLoader implementation specialized for VTA. - - Parameters - ---------- - bitsream : Optional[str] - Path to the bitstream to write prior to uploading code. - - Returns - ------- - ModuleLoader : - The ModuleLoader instance. - """ - - def reprogram_fpga(remote, _build_result): - """default_module_loader callback which reprograms the FPGA. - - Parameters - ---------- - remote : tvm.rpc.RPCSession - RPC session established to the remote device. - - _build_result : tvm.autotvm.measure.measure_methods.BuildResult - Artifact from the build phase, unused here. - """ - rpc_client.program_fpga(remote, bitstream) - rpc_client.reconfig_runtime(remote) - - return default_module_loader(reprogram_fpga) diff --git a/vta/python/vta/bitstream.py b/vta/python/vta/bitstream.py deleted file mode 100644 index 3f7064061c06..000000000000 --- a/vta/python/vta/bitstream.py +++ /dev/null @@ -1,92 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
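The module_loader helper deleted just above wrapped AutoTVM's default_module_loader so that the FPGA was reprogrammed and the runtime reconfigured before each uploaded measurement. A sketch of how it was typically plugged into a tuning session on a release that still ships the vta package; the device key, tracker host, and port are placeholders, and it assumes that release's RPCRunner accepts a module_loader argument, as the AutoTVM measurement API did at the time.

from tvm import autotvm
import vta  # only importable on past releases that still include VTA

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.RPCRunner(
        "pynq",               # placeholder device key registered with the RPC tracker
        host="tracker-host",  # placeholder tracker address
        port=9190,            # placeholder tracker port
        module_loader=vta.module_loader(),  # reprograms the FPGA, then reconfigures the runtime
    ),
)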
-"""VTA specific bitstream management library.""" -from __future__ import absolute_import as _abs - -import os -import sys - -from tvm.contrib.download import download -from .environment import get_env - -if sys.version_info >= (3,): - import urllib.error as urllib2 -else: - import urllib2 - -# bitstream repo -BITSTREAM_URL = "https://github.com/uwsampl/vta-distro/raw/master/bitstreams/" - - -def get_bitstream_path(): - """Returns the path to the cached bitstream corresponding to the current config - - Returns - ------- - bit_path: str - Corresponding to the filepath of the bitstream - """ - - env = get_env() - - # Derive destination path - cache_dir = os.getenv("VTA_CACHE_PATH", os.path.join(os.getenv("HOME"), ".vta_cache/")) - cache_dir = os.path.join(cache_dir, env.TARGET) - cache_dir = os.path.join(cache_dir, env.HW_VER.replace(".", "_")) - # Create the directory if it didn't exist - if not os.path.exists(cache_dir): - os.makedirs(cache_dir) - bit_path = os.path.join(cache_dir, env.BITSTREAM) + ".bit" - - return bit_path - - -def download_bitstream(): - """Downloads a cached bitstream corresponding to the current config""" - - env = get_env() - - success = False - bit = get_bitstream_path() - url = os.path.join(BITSTREAM_URL, env.TARGET) - url = os.path.join(url, env.HW_VER) - url = os.path.join(url, env.BITSTREAM + ".bit") - - try: - download(url, bit) - except urllib2.HTTPError as err: - if err.code == 404: - raise RuntimeError( - # Raise error - the solution when this happens it to build your - # own bitstream and add it to your $VTA_CACHE_PATH - "{} is not available. It appears that this configuration \ -bistream has not been cached. Please compile your own bitstream (see hardware \ -compilation guide to get Xilinx toolchains setup) and add it to your \ -$VTA_CACHE_PATH. Alternatively edit your config.json back to its default \ -settings. You can see the list of available bitstreams under {}".format( - url, BITSTREAM_URL - ) - ) - raise RuntimeError( - # This could happen when trying to access the URL behind a proxy - "Something went wrong when trying to access {}. Check your \ -internet connection or proxy settings.".format( - url - ) - ) - - return success diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py deleted file mode 100644 index 8ced8e5ce494..000000000000 --- a/vta/python/vta/build_module.py +++ /dev/null @@ -1,199 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=unused-argument, invalid-name -"""VTA specific buildin for runtime.""" -import tvm -from tvm.ir import register_intrin_lowering -from . 
import transform -from .environment import get_env, Environment - - -def EarlyRewrite(): - """Try to do storage rewrite in early pass.""" - - def _transform(mod, ctx): - try: - return tvm.tir.transform.StorageRewrite()(mod) - except tvm.error.TVMError: - return mod - - return tvm.transform.module_pass(_transform, opt_level=0, name="tir.vta.EarlyRewrite") - - -def build_config(debug_flag=0, **kwargs): - """Build a build config for VTA. - - Parameters - ---------- - debug_flag : int - The dbeug flag to be passed. - - kwargs : dict - Additional configurations. - - Returns - ------- - build_config: tvm.transform.PassContext - The build config that can be used in TVM. - - Example - -------- - .. code-block:: python - - # build a vta module. - with vta.build_config(): - vta_module = tvm.build(s, ...) - """ - env = get_env() - - @tvm.tir.transform.prim_func_pass(opt_level=0) - def add_debug(f, *_): - debug = tvm.tir.call_extern("int32", "VTASetDebugMode", env.dev.command_handle, debug_flag) - - return f.with_body(tvm.tir.stmt_seq(debug, f.body)) - - pass_list = [ - (0, transform.InjectConv2DTransposeSkip()), - (1, transform.InjectDMAIntrin()), - (1, transform.InjectSkipCopy()), - (1, transform.AnnotateALUCoProcScope()), - (1, tvm.tir.transform.LiftAttrScope("coproc_uop_scope")), - (1, transform.LiftAllocToScopeBegin()), - (1, tvm.tir.transform.LiftAttrScope("coproc_scope")), - (1, transform.InjectCoProcSync()), - (1, EarlyRewrite()), - ] - if debug_flag: - pass_list.append((1, add_debug)) - pass_list.append((2, transform.InjectALUIntrin())) - pass_list.append((3, tvm.tir.transform.LowerDeviceStorageAccessInfo())) - pass_list.append((3, transform.FoldUopLoop())) - pass_list.append((3, transform.CPUAccessRewrite())) - config = {"tir.add_lower_pass": pass_list} - if kwargs.get("config"): - config.update(kwargs[config]) - del kwargs["config"] - - return tvm.transform.PassContext(config=config, **kwargs) - - -def lower(*args, **kwargs): - """Thin wrapper of tvm.lower - - This wrapper automatically applies VTA's build_config - if there is no user specified build_config in context. - - See Also - -------- - tvm.lower : The original TVM's lower function - """ - pass_ctx = tvm.transform.PassContext.current() - if not pass_ctx.config.get("add_lower_pass"): - with build_config(): - return tvm.lower(*args, **kwargs) - return tvm.lower(*args, **kwargs) - - -def build(*args, **kwargs): - """Thin wrapper of tvm.build - - This wrapper automatically applies VTA's build_config - if there is no user specified build_config in context. 
- - See Also - -------- - tvm.build : The original TVM's build function - """ - pass_ctx = tvm.transform.PassContext.current() - if not pass_ctx.config.get("tir.add_lower_pass"): - with build_config(): - return tvm.build(*args, **kwargs) - return tvm.build(*args, **kwargs) - - -# Register key ops -tvm.ir.register_op_attr("tir.vta.coproc_sync", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) -tvm.ir.register_op_attr("tir.vta.coproc_dep_push", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) -tvm.ir.register_op_attr("tir.vta.coproc_dep_pop", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) - -tvm.ir.register_op_attr("tir.vta.uop_push", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) -tvm.ir.register_op_attr("tir.vta.uop_push", "TGlobalSymbol", "VTAUopPush") - -tvm.ir.register_op_attr("tir.vta.command_handle", "TGlobalSymbol", "VTATLSCommandHandle") -tvm.ir.register_op_attr("tir.vta.command_handle", "TCallEffectKind", tvm.tir.CallEffectKind.Opaque) - -# The memory information for the compiler -@tvm.register_func("tvm.info.mem.%s" % Environment.inp_scope) -def mem_info_inp_buffer(): - spec = get_env() - return tvm.ir.make_node( - "MemoryInfo", - unit_bits=spec.INP_ELEM_BITS, - max_simd_bits=spec.INP_ELEM_BITS, - max_num_bits=spec.INP_BUFF_SIZE * 8, - head_address=None, - ) - - -@tvm.register_func("tvm.info.mem.%s" % Environment.wgt_scope) -def mem_info_wgt_buffer(): - spec = get_env() - return tvm.ir.make_node( - "MemoryInfo", - unit_bits=spec.WGT_ELEM_BITS, - max_simd_bits=spec.WGT_ELEM_BITS, - max_num_bits=spec.WGT_BUFF_SIZE * 8, - head_address=None, - ) - - -@tvm.register_func("tvm.info.mem.%s" % Environment.acc_scope) -def mem_info_acc_buffer(): - spec = get_env() - return tvm.ir.make_node( - "MemoryInfo", - unit_bits=spec.ACC_ELEM_BITS, - max_simd_bits=spec.ACC_ELEM_BITS, - max_num_bits=spec.ACC_BUFF_SIZE * 8, - head_address=None, - ) - - -# TVM Op related registration -@register_intrin_lowering("tir.vta.coproc_sync", "default") -def coproc_sync(op): - _ = op - return tvm.tir.call_extern( - "int32", - "VTASynchronize", - get_env().dev.command_handle, - tvm.runtime.const(1 << 31, dtype="uint32"), - ) - - -@register_intrin_lowering("tir.vta.coproc_dep_push", "default") -def coproc_dep_push(op): - return tvm.tir.call_extern( - "int32", "VTADepPush", get_env().dev.command_handle, op.args[0], op.args[1] - ) - - -@register_intrin_lowering("tir.vta.coproc_dep_pop", "default") -def coproc_dep_pop(op): - return tvm.tir.call_extern( - "int32", "VTADepPop", get_env().dev.command_handle, op.args[0], op.args[1] - ) diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py deleted file mode 100644 index 087c7e852cf6..000000000000 --- a/vta/python/vta/environment.py +++ /dev/null @@ -1,266 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
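The registrations deleted above also illustrate the generic pattern for describing a special on-chip memory scope to TVM's lowering passes: a packed function named tvm.info.mem.<scope> returns a MemoryInfo node giving the scope's element width and capacity. A sketch of the same pattern for a made-up scope; the scope name and the widths below are illustrative values, not VTA's.

import tvm

# Hypothetical buffer scope, used only to illustrate the registration pattern.
EXAMPLE_SCOPE = "local.example_buffer"

@tvm.register_func("tvm.info.mem.%s" % EXAMPLE_SCOPE)
def mem_info_example_buffer():
    return tvm.ir.make_node(
        "MemoryInfo",
        unit_bits=8,           # smallest addressable unit, in bits
        max_simd_bits=128,     # widest single access the scope allows
        max_num_bits=1 << 20,  # total capacity of the scope, in bits
        head_address=None,
    )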
-"""Configurable VTA Hareware Environment scope.""" -# pylint: disable=invalid-name, exec-used -from __future__ import absolute_import as _abs - -import os -import json -import copy -import tvm -from tvm import te -from . import intrin - - -def get_vta_hw_path(): - """Get the VTA HW path.""" - curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - vta_hw_default = os.path.abspath(os.path.join(curr_path, "../../../3rdparty/vta-hw")) - VTA_HW_PATH = os.getenv("VTA_HW_PATH", vta_hw_default) - return os.path.abspath(VTA_HW_PATH) - - -def pkg_config(cfg): - """Returns PkgConfig pkg config object.""" - pkg_config_py = os.path.join(get_vta_hw_path(), "config/pkg_config.py") - libpkg = {"__file__": pkg_config_py} - exec(compile(open(pkg_config_py, "rb").read(), pkg_config_py, "exec"), libpkg, libpkg) - PkgConfig = libpkg["PkgConfig"] - return PkgConfig(cfg) - - -class DevContext(object): - """Internal development context - - This contains all the non-user facing compiler - internal context that is hold by the Environment. - - Parameters - ---------- - env : Environment - The environment hosting the DevContext - - Note - ---- - This class is introduced so we have a clear separation - of developer related, and user facing attributes. - """ - - # Memory id for DMA - MEM_ID_UOP = 0 - MEM_ID_WGT = 1 - MEM_ID_INP = 2 - MEM_ID_ACC = 3 - MEM_ID_OUT = 4 - MEM_ID_ACC_8BIT = 5 - # VTA ALU Opcodes - ALU_OPCODE_MIN = 0 - ALU_OPCODE_MAX = 1 - ALU_OPCODE_ADD = 2 - ALU_OPCODE_SHR = 3 - ALU_OPCODE_MUL = 4 - # Task queue id (pipeline stage) - QID_LOAD_INP = 1 - QID_LOAD_WGT = 1 - QID_LOAD_OUT = 2 - QID_STORE_OUT = 3 - QID_COMPUTE = 2 - - def __init__(self, env): - self.vta_axis = te.thread_axis("vta") - self.vta_push_uop = tvm.tir.StringImm("VTAPushGEMMOp") - ctx = tvm.tir.call_intrin("handle", "tir.vta.command_handle") - self.command_handle = tvm.tir.Call("handle", "tir.tvm_thread_context", [ctx]) - self.DEBUG_NO_SYNC = False - env._dev_ctx = self - self.gemm = intrin.gemm(env, env.mock_mode) - - def get_task_qid(self, qid): - """Get transformed queue index.""" - return 1 if self.DEBUG_NO_SYNC else qid - - -class Environment(object): - """Hardware configuration object. - - This object contains all the information - needed for compiling to a specific VTA backend. - - Parameters - ---------- - cfg : dict of str to value. - The configuration parameters. - - Example - -------- - .. 
code-block:: python - - # the following code reconfigures the environment - # temporarily to attributes specified in new_cfg.json - new_cfg = json.load(json.load(open("new_cfg.json"))) - with vta.Environment(new_cfg): - # env works on the new environment - env = vta.get_env() - """ - - current = None - # constants - MAX_XFER = 1 << 22 - # debug flags - DEBUG_DUMP_INSN = 1 << 1 - DEBUG_DUMP_UOP = 1 << 2 - DEBUG_SKIP_READ_BARRIER = 1 << 3 - DEBUG_SKIP_WRITE_BARRIER = 1 << 4 - # memory scopes - inp_scope = "local.inp_buffer" - wgt_scope = "local.wgt_buffer" - acc_scope = "local.acc_buffer" - - # initialization function - def __init__(self, cfg): - # Produce the derived parameters and update dict - self.pkg = pkg_config(cfg) - self.__dict__.update(self.pkg.cfg_dict) - # data type width - self.INP_WIDTH = 1 << self.LOG_INP_WIDTH - self.WGT_WIDTH = 1 << self.LOG_WGT_WIDTH - self.ACC_WIDTH = 1 << self.LOG_ACC_WIDTH - self.OUT_WIDTH = 1 << self.LOG_OUT_WIDTH - # tensor intrinsic shape - self.BATCH = 1 << self.LOG_BATCH - self.BLOCK_IN = 1 << self.LOG_BLOCK_IN - self.BLOCK_OUT = 1 << self.LOG_BLOCK_OUT - # buffer size - self.UOP_BUFF_SIZE = 1 << self.LOG_UOP_BUFF_SIZE - self.INP_BUFF_SIZE = 1 << self.LOG_INP_BUFF_SIZE - self.WGT_BUFF_SIZE = 1 << self.LOG_WGT_BUFF_SIZE - self.ACC_BUFF_SIZE = 1 << self.LOG_ACC_BUFF_SIZE - self.OUT_BUFF_SIZE = 1 << self.LOG_OUT_BUFF_SIZE - # bytes per buffer - self.INP_ELEM_BITS = self.BATCH * self.BLOCK_IN * self.INP_WIDTH - self.WGT_ELEM_BITS = self.BLOCK_OUT * self.BLOCK_IN * self.WGT_WIDTH - self.ACC_ELEM_BITS = self.BATCH * self.BLOCK_OUT * self.ACC_WIDTH - self.OUT_ELEM_BITS = self.BATCH * self.BLOCK_OUT * self.OUT_WIDTH - self.INP_ELEM_BYTES = self.INP_ELEM_BITS // 8 - self.WGT_ELEM_BYTES = self.WGT_ELEM_BITS // 8 - self.ACC_ELEM_BYTES = self.ACC_ELEM_BITS // 8 - self.OUT_ELEM_BYTES = self.OUT_ELEM_BITS // 8 - # dtypes - self.acc_dtype = "int%d" % self.ACC_WIDTH - self.inp_dtype = "int%d" % self.INP_WIDTH - self.wgt_dtype = "int%d" % self.WGT_WIDTH - self.out_dtype = "int%d" % self.OUT_WIDTH - # bistream name - self.BITSTREAM = self.pkg.bitstream - # model string - self.MODEL = self.TARGET + "_" + self.BITSTREAM - # lazy cached members - self.mock_mode = False - self._mock_env = None - self._dev_ctx = None - self._last_env = None - - def __enter__(self): - self._last_env = Environment.current - Environment.current = self - return self - - def __exit__(self, ptype, value, trace): - Environment.current = self._last_env - - @property - def cfg_dict(self): - return self.pkg.cfg_dict - - @property - def dev(self): - """Developer context""" - if self._dev_ctx is None: - self._dev_ctx = DevContext(self) - return self._dev_ctx - - @property - def mock(self): - """A mock version of the Environment - - The ALU, dma_copy and intrinsics will be - mocked to be nop. 
- """ - if self.mock_mode: - return self - if self._mock_env is None: - self._mock_env = copy.copy(self) - self._mock_env._dev_ctx = None - self._mock_env.mock_mode = True - return self._mock_env - - @property - def dma_copy(self): - """DMA copy pragma""" - return "dma_copy" if not self.mock_mode else "skip_dma_copy" - - @property - def alu(self): - """ALU pragma""" - return "alu" if not self.mock_mode else "skip_alu" - - @property - def gemm(self): - """GEMM intrinsic""" - return self.dev.gemm - - @property - def target(self): - return tvm.target.vta(model=self.MODEL) - - @property - def target_host(self): - """The target host""" - if self.TARGET in ["pynq", "de10nano"]: - return "llvm -mtriple=armv7-none-linux-gnueabihf" - if self.TARGET == "ultra96": - return "llvm -mtriple=aarch64-linux-gnu" - if self.TARGET in ["sim", "tsim", "intelfocl"]: - return "llvm" - raise ValueError("Unknown target %s" % self.TARGET) - - @property - def target_vta_cpu(self): - return tvm.target.arm_cpu(model=self.TARGET) - - -def get_env(): - """Get the current VTA Environment. - - Returns - ------- - env : Environment - The current environment. - """ - return Environment.current - - -def _init_env(): - """Initialize the default global env""" - config_path = os.path.join(get_vta_hw_path(), "config/vta_config.json") - if not os.path.exists(config_path): - raise RuntimeError("Cannot find config in %s" % str(config_path)) - cfg = json.load(open(config_path)) - return Environment(cfg) - - -Environment.current = _init_env() diff --git a/vta/python/vta/exec/__init__.py b/vta/python/vta/exec/__init__.py deleted file mode 100644 index 8297ff04097d..000000000000 --- a/vta/python/vta/exec/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""VTA Command line utils.""" diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py deleted file mode 100644 index 1abad98b2216..000000000000 --- a/vta/python/vta/exec/rpc_server.py +++ /dev/null @@ -1,170 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -"""VTA customized TVM RPC Server - -Provides additional runtime function and library loading. -""" -from __future__ import absolute_import - -import logging -import argparse -import os -import ctypes -import json -import tvm -from tvm import rpc -from tvm.contrib import cc -from vta import program_bitstream - -from ..environment import get_env, pkg_config -from ..libinfo import find_libvta - - -def server_start(): - """VTA RPC server extension.""" - # pylint: disable=unused-variable - curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - proj_root = os.path.abspath(os.path.join(curr_path, "../../../../")) - dll_path = find_libvta("libvta")[0] - cfg_path = os.path.abspath(os.path.join(proj_root, "3rdparty/vta-hw/config/vta_config.json")) - runtime_dll = [] - _load_module = tvm.get_global_func("tvm.rpc.server.load_module") - - def load_vta_dll(): - """Try to load vta dll""" - if not runtime_dll: - runtime_dll.append(ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL)) - logging.info("Loading VTA library: %s", dll_path) - return runtime_dll[0] - - @tvm.register_func("tvm.rpc.server.load_module", override=True) - def load_module(file_name): - load_vta_dll() - return _load_module(file_name) - - @tvm.register_func("device_api.ext_dev") - def ext_dev_callback(): - load_vta_dll() - return tvm.get_global_func("device_api.ext_dev")() - - @tvm.register_func("tvm.contrib.vta.init", override=True) - def program_fpga(file_name): - # pylint: disable=import-outside-toplevel - env = get_env() - if env.TARGET == "pynq": - from pynq import xlnk - - # Reset xilinx driver - xlnk.Xlnk().xlnk_reset() - elif env.TARGET == "de10nano": - # Load the de10nano program function. - load_vta_dll() - path = tvm.get_global_func("tvm.rpc.server.workpath")(file_name) - program_bitstream.bitstream_program(env.TARGET, path) - logging.info("Program FPGA with %s ", file_name) - - @tvm.register_func("tvm.rpc.server.shutdown", override=True) - def server_shutdown(): - if runtime_dll: - runtime_dll[0].VTARuntimeShutdown() - runtime_dll.pop() - - @tvm.register_func("tvm.contrib.vta.reconfig_runtime", override=True) - def reconfig_runtime(cfg_json): - """Rebuild and reload runtime with new configuration. - - Parameters - ---------- - cfg_json : str - JSON string used for configurations. 
- """ - env = get_env() - if runtime_dll: - if env.TARGET == "de10nano": - print("Please reconfigure the runtime AFTER programming a bitstream.") - raise RuntimeError("Can only reconfig in the beginning of session...") - cfg = json.loads(cfg_json) - cfg["TARGET"] = env.TARGET - pkg = pkg_config(cfg) - # check if the configuration is already the same - if os.path.isfile(cfg_path): - old_cfg = json.loads(open(cfg_path, "r").read()) - if pkg.same_config(old_cfg): - logging.info("Skip reconfig_runtime due to same config.") - return - cflags = ["-O2", "-std=c++17"] - cflags += pkg.cflags - ldflags = pkg.ldflags - lib_name = dll_path - source = pkg.lib_source - logging.info( - "Rebuild runtime:\n output=%s,\n cflags=%s,\n source=%s,\n ldflags=%s", - dll_path, - "\n\t".join(cflags), - "\n\t".join(source), - "\n\t".join(ldflags), - ) - cc.create_shared(lib_name, source, cflags + ldflags) - with open(cfg_path, "w") as outputfile: - outputfile.write(pkg.cfg_json) - - -def main(): - """Main funciton""" - parser = argparse.ArgumentParser() - parser.add_argument( - "--host", type=str, default="0.0.0.0", help="The host IP address the server binds to" - ) - parser.add_argument("--port", type=int, default=9091, help="The port of the RPC") - parser.add_argument("--port-end", type=int, default=9199, help="The end search port of the RPC") - parser.add_argument( - "--key", type=str, default="", help="RPC key used to identify the connection type." - ) - parser.add_argument("--tracker", type=str, default="", help="Report to RPC tracker") - args = parser.parse_args() - logging.basicConfig(level=logging.INFO) - - if args.tracker: - url, port = args.tracker.split(":") - port = int(port) - tracker_addr = (url, port) - if not args.key: - raise RuntimeError("Need key to present type of resource when tracker is available") - else: - tracker_addr = None - - # register the initialization callback - def server_init_callback(): - # pylint: disable=redefined-outer-name, reimported, import-outside-toplevel, import-self - import tvm - import vta.exec.rpc_server - - tvm.register_func("tvm.rpc.server.start", vta.exec.rpc_server.server_start, override=True) - - server = rpc.Server( - args.host, - args.port, - args.port_end, - key=args.key, - tracker_addr=tracker_addr, - server_init_callback=server_init_callback, - ) - server.proc.join() - - -if __name__ == "__main__": - main() diff --git a/vta/python/vta/intrin.py b/vta/python/vta/intrin.py deleted file mode 100644 index 52bf5869f9cc..000000000000 --- a/vta/python/vta/intrin.py +++ /dev/null @@ -1,139 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""VTA related intrinsics""" -from __future__ import absolute_import as _abs - -import tvm -from tvm import te - - -def gemm(env, mock=False): - """Matrix-matrix multiply intrinsic - - Parameters - ---------- - env : Environment - The Environment - - mock : bool - Whether create a mock version. - """ - wgt_lanes = env.WGT_ELEM_BITS // env.WGT_WIDTH - assert wgt_lanes == env.BLOCK_OUT * env.BLOCK_IN - wgt_shape = (env.BLOCK_OUT, env.BLOCK_IN) - assert wgt_shape[0] * wgt_shape[1] == wgt_lanes - - inp_lanes = env.INP_ELEM_BITS // env.INP_WIDTH - assert inp_lanes == env.BATCH * env.BLOCK_IN - inp_shape = (env.BATCH, env.BLOCK_IN) - assert inp_shape[0] * inp_shape[1] == inp_lanes - - out_lanes = env.ACC_ELEM_BITS // env.ACC_WIDTH - assert out_lanes == env.BATCH * env.BLOCK_OUT - out_shape = (env.BATCH, env.BLOCK_OUT) - assert out_shape[0] * out_shape[1] == out_lanes - - wgt = te.placeholder( - (wgt_shape[0], wgt_shape[1]), dtype="int%d" % env.WGT_WIDTH, name=env.wgt_scope - ) - inp = te.placeholder( - (inp_shape[0], inp_shape[1]), dtype="int%d" % env.INP_WIDTH, name=env.inp_scope - ) - k = te.reduce_axis((0, wgt_shape[1]), name="k") - out_dtype = "int%d" % env.ACC_WIDTH - out = te.compute( - (out_shape[0], out_shape[1]), - lambda i, j: te.sum(inp[i, k].astype(out_dtype) * wgt[j, k].astype(out_dtype), axis=[k]), - name="out", - ) - wgt_layout = tvm.tir.decl_buffer( - wgt.shape, - wgt.dtype, - env.wgt_scope, - scope=env.wgt_scope, - offset_factor=wgt_lanes, - data_alignment=wgt_lanes, - ) - inp_layout = tvm.tir.decl_buffer( - inp.shape, - inp.dtype, - env.inp_scope, - scope=env.inp_scope, - offset_factor=inp_lanes, - data_alignment=inp_lanes, - ) - out_layout = tvm.tir.decl_buffer( - out.shape, - out.dtype, - env.acc_scope, - scope=env.acc_scope, - offset_factor=out_lanes, - data_alignment=out_lanes, - ) - - def intrin_func(ins, outs): - """Matrix-matrix multiply intrinsic function""" - dinp, dwgt = ins - dout = outs[0] - - def instr(index): - """Generate matrix-matrix multiply VTA instruction""" - irb = tvm.tir.ir_builder.create() - dev = env.dev - irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) - irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) - if index in (0, 2): - irb.emit( - tvm.tir.call_intrin( - "int32", - "tir.vta.uop_push", - 0, - 0, - dout.access_ptr("rw", "int32"), - dinp.access_ptr("r", "int32"), - dwgt.access_ptr("r", "int32"), - 0, - 0, - 0, - ) - ) - else: - irb.emit( - tvm.tir.call_intrin( - "int32", - "tir.vta.uop_push", - 0, - 1, - dout.access_ptr("rw", "int32"), - 0, - 0, - 0, - 0, - 0, - ) - ) - return irb.get() - - # return a triple of normal-set, reset, update - nop = tvm.tir.Evaluate(0) - if mock: - return (nop, nop, nop) - return (instr(0), instr(1), instr(2)) - - return te.decl_tensor_intrin( - out.op, intrin_func, name="GEMM", binds={inp: inp_layout, wgt: wgt_layout, out: out_layout} - ) diff --git a/vta/python/vta/libinfo.py b/vta/python/vta/libinfo.py deleted file mode 100644 index 65ea7083a255..000000000000 --- a/vta/python/vta/libinfo.py +++ /dev/null @@ -1,80 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Library information.""" -from __future__ import absolute_import -import sys -import os - -from .environment import get_vta_hw_path - - -def _get_lib_name(lib_name): - """Get lib name with extension - - Returns - ------- - lib_name_ext : str - Name of VTA shared library with extension - - Parameters - ------------ - lib_name : str - Name of VTA shared library - """ - if sys.platform.startswith("win32"): - return lib_name + ".dll" - if sys.platform.startswith("darwin"): - return lib_name + ".dylib" - return lib_name + ".so" - - -def find_libvta(lib_vta, optional=False): - """Find VTA Chisel-based library - - Returns - ------- - lib_found : str - Library path - - Parameters - ------------ - lib_vta : str - Name of VTA shared library - - optional : bool - Enable error check - """ - curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - tvm_library_path = os.environ.get("TVM_LIBRARY_PATH", None) - if tvm_library_path is None: - tvm_library_path = os.path.join( - curr_path, - os.pardir, - os.pardir, - os.pardir, - "build", - ) - - lib_search = [tvm_library_path, os.path.join(get_vta_hw_path(), "build")] - lib_name = _get_lib_name(lib_vta) - lib_path = [os.path.join(x, lib_name) for x in lib_search] - lib_found = [x for x in lib_path if os.path.exists(x)] - if not lib_found and not optional: - raise RuntimeError( - "Cannot find the files.\n" + "List of candidates:\n" + str("\n".join(lib_path)) - ) - return lib_found diff --git a/vta/python/vta/program_bitstream.py b/vta/python/vta/program_bitstream.py deleted file mode 100644 index a7da89d2f637..000000000000 --- a/vta/python/vta/program_bitstream.py +++ /dev/null @@ -1,85 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""VTA specific bitstream program library.""" -import os -import argparse - - -def main(): - """Main function""" - parser = argparse.ArgumentParser() - parser.add_argument("target", type=str, default="", help="target") - parser.add_argument("bitstream", type=str, default="", help="bitstream path") - args = parser.parse_args() - - if args.target not in ("pynq", "ultra96", "de10nano", "sim", "tsim"): - raise RuntimeError("Unknown target {}".format(args.target)) - - curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - path_list = [ - os.path.join(curr_path, "/{}".format(args.bitstream)), - os.path.join("./", "{}".format(args.bitstream)), - ] - ok_path_list = [p for p in path_list if os.path.exists(p)] - if not ok_path_list: - raise RuntimeError("Cannot find bitstream file in %s" % str(path_list)) - - bitstream_program(args.target, args.bitstream) - - -def pynq_bitstream_program(bitstream_path): - # pylint: disable=import-outside-toplevel - from pynq import Bitstream - - bitstream = Bitstream(bitstream_path) - bitstream.download() - - -def de10nano_bitstream_program(bitstream_path): - # pylint: disable=import-outside-toplevel - from tvm import get_global_func - - program = get_global_func("vta.de10nano.program") - program(bitstream_path) - - -def intelfocl_bitstream_program(bitstream_path, mem_size=4 * 1024 * 1024 * 1024): - # pylint: disable=import-outside-toplevel - from tvm import get_global_func - - program = get_global_func("vta.oclfpga.program") - program(bitstream_path, mem_size) - - -def bitstream_program(target, bitstream, *args): - """program bitstream to devices""" - - if target in ["pynq", "ultra96"]: - pynq_bitstream_program(bitstream) - elif target in ["de10nano"]: - de10nano_bitstream_program(bitstream) - elif target in ["sim", "tsim"]: - # In simulation, bit stream programming is a no-op - return - elif target in ["intelfocl"]: - intelfocl_bitstream_program(bitstream, *args) - else: - raise RuntimeError("Unknown target {}".format(target)) - - -if __name__ == "__main__": - main() diff --git a/vta/python/vta/rpc_client.py b/vta/python/vta/rpc_client.py deleted file mode 100644 index 90203983987a..000000000000 --- a/vta/python/vta/rpc_client.py +++ /dev/null @@ -1,66 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""VTA RPC client function""" -import os - -from tvm import rpc -from vta import program_bitstream -from .environment import get_env -from .bitstream import download_bitstream, get_bitstream_path - - -def reconfig_runtime(remote): - """Reconfigure remote runtime based on current hardware spec. 
- - Parameters - ---------- - remote : RPCSession - The TVM RPC session - """ - env = get_env() - freconfig = remote.get_function("tvm.contrib.vta.reconfig_runtime") - freconfig(env.pkg.cfg_json) - - -def program_fpga(remote, bitstream=None): - """Upload and program bistream - - Parameters - ---------- - remote : RPCSession - The TVM RPC session - - bitstream : str, optional - Path to a local bistream file. If unset, tries to download from cache server. - """ - env = get_env() - - if bitstream: - assert os.path.isfile(bitstream) - else: - bitstream = get_bitstream_path() - if not os.path.isfile(bitstream): - if env.TARGET == "de10nano": - return - download_bitstream() - - if isinstance(remote, rpc.LocalSession): - program_bitstream.bitstream_program(env.TARGET, bitstream) - else: - fprogram = remote.get_function("tvm.contrib.vta.init") - remote.upload(bitstream) - fprogram(os.path.basename(bitstream)) diff --git a/vta/python/vta/testing/__init__.py b/vta/python/vta/testing/__init__.py deleted file mode 100644 index 8d294c2f4d22..000000000000 --- a/vta/python/vta/testing/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Testing utilities, this namespace is not imported by default.""" - -from .utils import run diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py deleted file mode 100644 index 5374fe77ca03..000000000000 --- a/vta/python/vta/testing/simulator.py +++ /dev/null @@ -1,113 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=superfluous-parens -"""Utilities to start simulator.""" -import ctypes -import json -import warnings -import tvm -from ..environment import get_env -from ..libinfo import find_libvta - - -def _load_sw(): - """Load hardware library for simulator.""" - - env = get_env() - lib_driver_name = ( - "libvta_tsim" - if env.TARGET == "tsim" - else "libvta" - if env.TARGET == "intelfocl" - else "libvta_fsim" - ) - require_sim = env.TARGET in ("sim", "tsim") - libs = [] - - # Load driver library - lib_driver = find_libvta(lib_driver_name, optional=(not require_sim)) - - if not lib_driver: - return [] - - try: - libs = [ctypes.CDLL(lib_driver[0], ctypes.RTLD_GLOBAL)] - except OSError as err: - if require_sim: - raise err - warnings.warn("Error when loading VTA driver {}: {}".format(lib_driver[0], err)) - return [] - - if env.TARGET == "tsim": - lib_hw = find_libvta("libvta_hw", optional=True) - assert lib_hw # make sure to make in ${VTA_HW_PATH}/hardware/chisel - f = tvm.get_global_func("vta.tsim.init") - m = tvm.runtime.load_module(lib_hw[0], "vta-tsim") - f(m) - return lib_hw - - return libs - - -def enabled(): - """Check if simulator is enabled.""" - f = tvm.get_global_func("vta.simulator.profiler_clear", True) - return f is not None - - -def clear_stats(): - """Clear profiler statistics.""" - env = get_env() - if env.TARGET == "sim": - f = tvm.get_global_func("vta.simulator.profiler_clear", True) - else: - f = tvm.get_global_func("vta.tsim.profiler_clear", True) - if f: - f() - - -def stats(): - """Get profiler statistics - - Returns - ------- - stats : dict - Current profiler statistics - """ - env = get_env() - if env.TARGET == "sim": - x = tvm.get_global_func("vta.simulator.profiler_status")() - else: - x = tvm.get_global_func("vta.tsim.profiler_status")() - return json.loads(x) - - -# debug flag to skip execution. -DEBUG_SKIP_EXEC = 1 - - -def debug_mode(flag): - """Set debug mode - Paramaters - ---------- - flag : int - The debug flag, 0 means clear all flags. - """ - tvm.get_global_func("vta.simulator.profiler_debug_mode")(flag) - - -LIBS = _load_sw() diff --git a/vta/python/vta/testing/utils.py b/vta/python/vta/testing/utils.py deleted file mode 100644 index f163359667f1..000000000000 --- a/vta/python/vta/testing/utils.py +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Test Utilities""" -from __future__ import absolute_import as _abs - -import os -from tvm import rpc, autotvm -from ..environment import get_env -from . import simulator - - -def run(run_func): - """Run test function on all available env. - - Parameters - ---------- - run_func : function(env, remote) - """ - env = get_env() - - if env.TARGET in ["sim", "tsim", "intelfocl"]: - # Talk to local RPC if necessary to debug RPC server. 
- # Compile vta on your host with make at the root. - # Make sure TARGET is set to "sim" in the config.json file. - # Then launch the RPC server on the host machine - # with ./apps/vta_rpc/start_rpc_server.sh - # Set your VTA_LOCAL_SIM_RPC environment variable to - # the port it's listening to, e.g. 9090 - local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) - if local_rpc: - remote = rpc.connect("127.0.0.1", local_rpc) - run_func(env, remote) - else: - # Make sure simulation library exists - # If this fails, build vta on host (make) - # with TARGET="sim" in the json.config file. - if env.TARGET == "sim": - assert simulator.enabled() - run_func(env, rpc.LocalSession()) - - elif env.TARGET in ["pynq", "ultra96", "de10nano"]: - # The environment variables below should be set if we are using - # a tracker to obtain a remote for a test device - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - # Otherwise, we can set the variables below to directly - # obtain a remote from a test device - pynq_host = os.environ.get("VTA_RPC_HOST", None) - pynq_port = os.environ.get("VTA_RPC_PORT", None) - # Run device from fleet node if env variables are defined - if tracker_host and tracker_port: - remote = autotvm.measure.request_remote( - env.TARGET, tracker_host, int(tracker_port), timeout=10000 - ) - run_func(env, remote) - else: - # Next, run on PYNQ if env variables are defined - if pynq_host and pynq_port: - remote = rpc.connect(pynq_host, int(pynq_port)) - run_func(env, remote) - else: - raise RuntimeError( - "Please set the VTA_RPC_HOST and VTA_RPC_PORT environment variables" - ) - - else: - raise RuntimeError("Unknown target %s" % env.TARGET) diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py deleted file mode 100644 index b9ebe55703c5..000000000000 --- a/vta/python/vta/top/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""TVM TOPI connector, eventually most of these should go to TVM repo""" - -from . import bitpack -from .graphpack import graph_pack -from . import op -from .vta_conv2d import conv2d_packed, schedule_conv2d_packed -from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed -from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed -from .vta_dense import dense_packed, schedule_dense_packed -from . import utils diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py deleted file mode 100644 index 630fd93f448a..000000000000 --- a/vta/python/vta/top/bitpack.py +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=ungrouped-imports, unsupported-binary-operation - -"""Bit packing operators""" -from __future__ import absolute_import as _abs - -import tvm -from tvm import te -from tvm.topi import utils - -from tvm.relay.op.op import register_compute, register_injective_schedule -from tvm.relay.op.op import register_pattern, OpPattern - - -def bitpack(data, bits, pack_type="int8", name="bitpack"): - """Packs lowest dimension into format needed by VTA - - Parameters - ---------- - pack_axis : int - index of the axis to pack in data - bit_axis : int - index of axis to place bit axis in resulting packed data - - Returns - ------- - packed : Tensor - The packed tensor. - """ - shape_vec = list(data.shape) - if pack_type == "int8": - data_width = 8 - elif pack_type == "int16": - data_width = 16 - elif pack_type == "int32": - data_width = 32 - else: - raise RuntimeError("Unknown pack type %s" % pack_type) - assert data_width % bits == 0 - lanes = data_width // bits - - # Data must be in multiples of the data_width - assert utils.get_const_int(shape_vec[-1]) % lanes == 0, "Not a multiple of word size" - shape_vec[-1] = shape_vec[-1] // lanes - oshape = tuple(shape_vec) - - def _bitpack(*indices): - ret = None - mask = tvm.tir.const((1 << bits) - 1, pack_type) - for k in range(lanes): - idx = list(indices) - idx[-1] = idx[-1] * lanes + k - elem = data(*idx).astype(pack_type) - if k == 0: - ret = elem & mask - else: - val = (elem & mask) << tvm.tir.const(k * bits, pack_type) - ret = ret | val - return ret - - return te.compute(oshape, _bitpack, name=name, tag="bitpack") - - -@register_compute("bitpack", level=15) -def compute_bitpack(attrs, inputs): - lanes = attrs.lanes - dtype = inputs[0].dtype - assert dtype == "int8" - width = 8 - assert width % lanes == 0 - bits = 8 // lanes - return bitpack(inputs[0], bits, dtype) - - -register_injective_schedule("bitpack") -register_pattern("bitpack", OpPattern.INJECTIVE) diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py deleted file mode 100644 index f375c87157d8..000000000000 --- a/vta/python/vta/top/graphpack.py +++ /dev/null @@ -1,628 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=unused-argument, bad-chained-comparison -"""A Relay implementation of graph packing.""" - -import tvm -from tvm import relay -from tvm.relay import op, transform -from tvm.relay import ExprMutator - - -def run_opt_pass(expr, opt_pass): - """Exectue a relay pass.""" - assert isinstance(opt_pass, tvm.transform.Pass) - mod = tvm.IRModule.from_expr(expr) - mod = opt_pass(mod) - entry = mod["main"] - return entry if isinstance(expr, relay.Function) else entry.body - - -def _to_shape(shape): - """convert shape into tuple.""" - return tuple(int(sh) for sh in shape) - - -def _pack_batch_channel(data, dshape, bfactor, cfactor): - """Pack the data channel dimension.""" - assert int(dshape[0]) % bfactor == 0 - assert int(dshape[1]) % cfactor == 0 - data = op.reshape( - data, - newshape=( - int(dshape[0]) // bfactor, - bfactor, - int(dshape[1]) // cfactor, - cfactor, - int(dshape[2]), - int(dshape[3]), - ), - ) - data = op.transpose(data, axes=(0, 2, 4, 5, 1, 3)) - return data - - -def _unpack_batch_channel(data, old_shape, unpack_transpose=False): - """Unpack the data channel dimension.""" - if unpack_transpose: - data = op.transpose(data, axes=(0, 4, 1, 5, 2, 3)) - data = op.reshape(data, newshape=old_shape) - return data - - -def _channel_const_match(channel_length, cfactor_out): - """Round the channel const variant if the value not divisible by cfactor_out""" - diff = int(channel_length) % cfactor_out - if diff != 0: - diff = cfactor_out - diff - channel_length = channel_length + diff - - return diff, channel_length - - -def _const_shape_match(data, dshape, cfactor_out): - """Pad the constant if the shape[0] not divisible by cfactor_out.""" - assert len(dshape) == 3 - pad_width = int(dshape[0]) % cfactor_out - if pad_width != 0: - pad_width = cfactor_out - pad_width - data = op.nn.pad(data, [[0, pad_width], [0, 0], [0, 0]]) - dshape = tuple([dshape[0] + pad_width, dshape[1], dshape[2]]) - return data, dshape - - -def _weight_shape_match(data, dshape, channels, cfactor_out, transpose=False): - """Pad the weight if the shape[0] not divisible by cfactor_out.""" - assert len(dshape) == 4 - pad_width = int(dshape[0]) % cfactor_out - channels_pad = int(channels) % cfactor_out - if pad_width != 0: - pad_width = cfactor_out - pad_width - data = op.nn.pad(data, [[0, pad_width], [0, 0], [0, 0], [0, 0]]) - dshape = tuple([dshape[0] + pad_width, dshape[1], dshape[2], dshape[3]]) - - if channels_pad != 0: - channels = channels + (cfactor_out - channels_pad) - - return data, dshape, channels - - -def _weight_shape_match_transpose(data, dshape, channels, cfactor_out): - """Pad the weight if the shape[1] not divisible by cfactor_out.""" - assert len(dshape) == 4 - pad_width = int(dshape[1]) % cfactor_out - channels_pad = int(channels) % cfactor_out - if pad_width != 0: - pad_width = cfactor_out - pad_width - data = op.nn.pad(data, [[0, 0], [0, pad_width], [0, 0], [0, 0]]) - dshape = tuple(dshape[0], [dshape[1] + pad_width, dshape[2], dshape[3]]) - - if channels_pad != 0: - channels = channels + (cfactor_out - channels_pad) - - return data, dshape, channels - - -def _pack_weight(data, dshape, cfactor): - """Pack the weight into packed format.""" - assert len(dshape) == 4 - assert int(dshape[0]) % cfactor == 0 - assert int(dshape[1]) % cfactor == 0 - data = op.reshape( - data, - newshape=( - int(dshape[0]) // cfactor, - cfactor, - int(dshape[1]) // cfactor, - cfactor, - int(dshape[2]), - 
int(dshape[3]), - ), - ) - data = op.transpose(data, axes=(0, 2, 4, 5, 1, 3)) - return data - - -def _pack_weight_conv2d_transpose(data, dshape, cfactor): - """Pack the weight into packed format.""" - dshape = _to_shape(dshape) - assert len(dshape) == 4 - assert dshape[0] % cfactor == 0 - assert dshape[1] % cfactor == 0 - data = op.reshape( - data, - newshape=( - dshape[0] // cfactor, - cfactor, - dshape[1] // cfactor, - cfactor, - dshape[2], - dshape[3], - ), - ) - data = op.transpose(data, axes=(2, 0, 4, 5, 3, 1)) - return data - - -def _pack_const(data, dshape, dtype, bfactor, cfactor): - """Pack a constant parameter.""" - dshape = _to_shape(dshape) - assert len(dshape) == 3 - assert dshape[0] % cfactor == 0 - data = op.reshape(data, newshape=(dshape[0] // cfactor, cfactor, dshape[1], dshape[2], 1)) - data = op.transpose(data, axes=(0, 2, 3, 4, 1)) - - # broadcast batch dimension to bfactor - data = op.broadcast_to( - data, shape=(dshape[0] // cfactor, dshape[1], dshape[2], bfactor, cfactor) - ) - return data - - -def _get_tensor_shape(node): - """Get node shape.""" - if isinstance(node.checked_type, relay.ty.TensorType): - return _to_shape(node.checked_type.shape) - return [] - - -def _get_tensor_type(node): - """Get node type.""" - if isinstance(node.checked_type, relay.ty.TensorType): - return node.checked_type.dtype - return "float32" - - -def _operator_idx_inc(expr, count_meta, operator_current_idx): - """Increase operator index""" - if isinstance(expr, relay.expr.Constant): - operator_current_idx = operator_current_idx + 1 if count_meta else operator_current_idx - else: - operator_current_idx = operator_current_idx + 1 - return operator_current_idx - - -class ExprDeviceAnnot(ExprMutator): - """Visitor to perform graph annotation on an AST. - - Parameters - ---------- - start: int - the start location to mark run on vta (inclusive) - end: int - the end location to mark run on vta (exclusive) - - Returns - --------- - None - """ - - def __init__(self, start=-1, end=-1): - self.ext_dev = tvm.device("ext_dev") - self.cpu_dev = tvm.device("cpu") - self.cast = op.op.get("cast") - self.counter = -1 - self.start = start - self.end = end - super().__init__() - - def visit_call(self, call): - """Visit the children.""" - # First visit the children. 
- args = [self.visit(arg) for arg in call.args] - - self.counter += 1 - if self.counter == self.start: - ret = relay.Call(call.op, args, call.attrs) - ret = relay.annotation.on_device(ret, self.ext_dev) - return ret - - if self.counter == self.end: - ret = relay.Call(call.op, args, call.attrs) - ret = relay.annotation.on_device(ret, self.cpu_dev) - return ret - - if self.counter > self.start and self.counter < self.end: - ret = relay.Call(call.op, args, call.attrs) - - # skip the float op, i.e., float->int cast - if self.is_float_op(call): - return ret - - return relay.annotation.on_device(ret, self.ext_dev) - - return relay.Call(self.visit(call.op), args, call.attrs) - - def is_float_op(self, call): - """check if this op belongs to a float op - in general, float op's odtype is float; - a special case is float->int cast, which follow this op sequence: - multiply(float) -> round(float) -> clip(float) -> cast(int); - """ - args = call.args - odtype = _get_tensor_type(call) - - if odtype == "float32": - return True - - if call.op == self.cast: - idtype = _get_tensor_type(args[0]) - if idtype == "float32": - return True - - return False - - -class ExprLocator(ExprMutator): - """Visitor to locate op on an AST.""" - - def __init__(self): - self.counter = -1 - self.op2nodes = {} - super().__init__() - - def visit_call(self, call): - """Visit the children.""" - # First visit the children. - args = [self.visit(arg) for arg in call.args] - - odtype = _get_tensor_type(call) - self.counter += 1 - if (call.op, odtype) in self.op2nodes: - self.op2nodes[(call.op, odtype)].append(self.counter) - else: - self.op2nodes[(call.op, odtype)] = [self.counter] - - return relay.Call(self.visit(call.op), args, call.attrs) - - -class ExprPack(ExprMutator): - """Visitor to perform graph packing on an AST.""" - - def __init__(self, bfactor, cfactor, weight_bits): - self.bfactor = bfactor - self.cfactor = cfactor - self.weight_bits = weight_bits - self.start_pack = False - # Cache Operator the algorithm matches against. - self.bitpack_start = op.op.get("annotation.bitpack_start") - self.bitpack_end = op.op.get("annotation.bitpack_end") - self.conv2d = op.op.get("nn.conv2d") - self.conv2d_transpose = op.op.get("nn.conv2d_transpose") - self.add = op.op.get("add") - self.multiply = op.op.get("multiply") - self.bias_add = op.op.get("nn.bias_add") - self.pad = op.op.get("nn.pad") - self.upsampling = op.op.get("nn.upsampling") - self.reshape = op.op.get("reshape") - self.number_of_conv2d = 0 - self.unpack_transpose = True - super().__init__() - - def visit_call(self, call): - """Visit the children.""" - # First visit the children. - oshape = _get_tensor_shape(call) - odtype = _get_tensor_type(call) - input_types = [arg.checked_type for arg in call.args] - args = [self.visit(arg) for arg in call.args] - - # Start and stop cases. 
- if call.op == self.bitpack_start: - assert not self.start_pack - self.start_pack = True - return _pack_batch_channel(args[0], oshape, self.bfactor, self.cfactor) - if call.op == self.bitpack_end: - if self.start_pack: - self.start_pack = False - data = args[0] - data_shape = _get_tensor_shape(call.args[0]) - return _unpack_batch_channel(data, data_shape, self.unpack_transpose) - if self.start_pack: - # Operator cases - if call.op == self.conv2d and odtype == "int32": - self.number_of_conv2d += 1 - assert 8 % self.weight_bits == 0 - w_lanes = 8 // self.weight_bits - data_layout = "NCHW%dn%dc" % (self.bfactor, self.cfactor) - kernel_layout = "OIHW%do%di" % (self.cfactor, self.cfactor) - data, weight = args - data_shape = _to_shape(input_types[0].shape) - kernel_shape = _to_shape(input_types[1].shape) - channels = call.attrs.channels - weight, kernel_shape, channels = _weight_shape_match( - weight, kernel_shape, channels, self.cfactor - ) - kernel = _pack_weight(weight, kernel_shape, self.cfactor) - # insert bit packing when necessary - if w_lanes != 1: - assert 8 % w_lanes == 0 - kernel = op.bitpack(kernel, lanes=w_lanes) - - conv2d = op.nn.conv2d( - data, - kernel, - strides=call.attrs.strides, - padding=call.attrs.padding, - dilation=call.attrs.dilation, - groups=call.attrs.groups, - channels=channels, - kernel_size=call.attrs.kernel_size, - data_layout=data_layout, - kernel_layout=kernel_layout, - out_dtype=call.attrs.out_dtype, - ) - return conv2d - - if call.op == self.conv2d_transpose and odtype == "int32": - self.number_of_conv2d += 1 - assert 8 % self.weight_bits == 0 - w_lanes = 8 // self.weight_bits - if self.start_pack: - data_layout = "NCHW%dn%dc" % (self.bfactor, self.cfactor) - kernel_layout = "IOHW%di%do" % (self.cfactor, self.cfactor) - data, weight = args - data_shape = _to_shape(input_types[0].shape) - kernel_shape = _to_shape(input_types[1].shape) - channels = call.attrs.channels - weight, kernel_shape, channels = _weight_shape_match_transpose( - weight, kernel_shape, channels, self.cfactor - ) - kernel = _pack_weight_conv2d_transpose(weight, kernel_shape, self.cfactor) - conv2d = op.nn.conv2d_transpose( - data, - kernel, - strides=call.attrs.strides, - padding=call.attrs.padding, - dilation=call.attrs.dilation, - groups=call.attrs.groups, - channels=call.attrs.channels, - kernel_size=call.attrs.kernel_size, - data_layout=data_layout, - kernel_layout=kernel_layout, - output_padding=call.attrs.output_padding, - out_dtype=call.attrs.out_dtype, - ) - return conv2d - if call.op == self.add and tuple(input_types[0].shape) == tuple(input_types[1].shape): - pass - elif call.op == self.add and len(input_types[1].shape) == 3: - data, const = args - const, input_shape = _const_shape_match(const, input_types[1].shape, self.cfactor) - const = _pack_const( - const, _to_shape(input_shape), input_types[1].dtype, self.bfactor, self.cfactor - ) - return relay.Call(self.add, [data, const]) - elif call.op == self.multiply and tuple(input_types[0].shape) == tuple( - input_types[1].shape - ): - pass - elif call.op == self.multiply and len(input_types[1].shape) == 3: - data, const = args - const = _pack_const( - const, - _to_shape(input_types[1].shape), - input_types[1].dtype, - self.bfactor, - self.cfactor, - ) - return relay.Call(self.multiply, [data, const]) - elif self.start_pack and call.op == self.bias_add: - data, bias = args - bias = _pack_const( - bias, - _to_shape(input_types[1].shape), - input_types[1].dtype, - self.bfactor, - self.cfactor, - ) - return relay.Call(self.add, [data, 
bias]) - elif ( - self.start_pack and call.op == op.op.get("cast") and input_types[0].dtype == "int32" - ): - cast = relay.Call(op.op.get("cast"), [args[0]], call.attrs) - return cast - elif call.op == self.pad: - pad_width = call.attrs.pad_width - if len(pad_width) == 6: - pass - elif len(pad_width) == 4: - (data, pad_value) = args - new_pad_width = [] - new_pad_width.extend(pad_width) - for _ in range(2): - new_pad_width.append([0, 0]) - return op.nn.pad(data, pad_value=pad_value, pad_width=new_pad_width) - elif call.op == self.upsampling: - (data,) = args - scale_h = call.attrs.scale_h - scale_w = call.attrs.scale_w - data_layout = "NCHW%dn%dc" % (self.bfactor, self.cfactor) - method = call.attrs.method - align_corners = call.attrs.align_corners - return op.nn.upsampling(data, scale_h, scale_w, data_layout, method, align_corners) - elif call.op == self.reshape and len(input_types[0].shape) == 4: - (data,) = args - self.unpack_transpose = False - data = op.transpose(data, axes=(0, 4, 1, 5, 2, 3)) - new_shape = [int(x) for x in input_types[0].shape] - # Check if the reshape match with such shape after pad - pad, new_shape[1] = _channel_const_match(new_shape[1], self.cfactor) - data = op.reshape(data, new_shape) - # remove pad data - if pad != 0: - new_pad_width = [[0, 0], [0, -pad], [0, 0], [0, 0]] - data = op.nn.pad(data, pad_width=new_pad_width) - return data - - return relay.Call(self.visit(call.op), args, call.attrs) - - -class BT(Exception): - pass - - -def get_subgraph(expr, start_name, stop_name, start_name_idx, stop_name_idx, count_meta): - """We assume stop_name only appears once for simplicity. - This constraint will be lifted in the future. - bitpack_start and bitpack_end are both inclusive. - """ - bitpack_start = op.op.get("annotation.bitpack_start") - bitpack_end = op.op.get("annotation.bitpack_end") - anf = run_opt_pass(expr, transform.ToANormalForm()) - operator_current_idx = 0 - - def _recursion(anf, start_found, stop_found, operator_current_idx): - """Helper to obtain the subgraph.""" - if isinstance(anf, relay.Function): - return relay.Function( - anf.params, - _recursion(anf.body, start_found, stop_found, operator_current_idx), - anf.ret_type, - anf.type_params, - anf.attrs, - ) - if isinstance(anf, relay.expr.Let): - value = anf.value - if isinstance(value, relay.expr.Call): - if isinstance(value.op, tvm.ir.Op): - if value.op.name == start_name and not start_found: - if operator_current_idx == start_name_idx or start_name_idx is None: - value = relay.expr.Call(bitpack_start, [value]) - start_found = True - elif value.op.name == stop_name: - if operator_current_idx == stop_name_idx or stop_name_idx is None: - raise BT() - - operator_current_idx = _operator_idx_inc(value, count_meta, operator_current_idx) - - try: - return relay.expr.Let( - anf.var, - value, - _recursion(anf.body, start_found, stop_found, operator_current_idx), - ) - except BT: - assert start_found - assert not stop_found - stop_found = True - value = relay.expr.Call(bitpack_end, [value]) - # todo: check anf.body has no more stop_name beside that one - return relay.expr.Let(anf.var, value, anf.body) - else: - assert start_found - assert stop_found - return anf - - annotated = _recursion(anf, False, False, operator_current_idx) - return run_opt_pass(annotated, transform.ToGraphNormalForm()) - - -def graph_pack( - expr, - bfactor, - cfactor, - weight_bits, - start_name="nn.max_pool2d", - stop_name="nn.global_avg_pool2d", - start_name_idx=None, - stop_name_idx=None, - count_meta=False, - 
device_annot=False, - annot_start_name="nn.conv2d", - annot_end_name="annotation.stop_fusion", -): - """Pack the graph into batch&channel packed format. - - Parameters - ---------- - expr : relay.Expr - The input program. - - bfactor : int - The packing factor in batch - - cfactor : int - The packing factor in channel - - weight_bits: int - The bit-width of the weights. - - start_name: str, optional - Start packing from certain known node when start_name_idx is None. - - stop_name: str, optional - Stop packing from certain known node when stop_name_idx is None. - - start_name_idx: int, optional - When start_name_idx not None, start packing only when node name equal start_name - and node idx equals start_name_idx. - - stop_name_idx: int, optional - When stop_name_idx not None, stop packing only when node name equal stop_name - and node index equals stop_name_idx. - - count_meta:boolean, optional - When count_meta is False, the operator increase logic would not count the meta that have - the type 'relay.expr.Constant', start_name_idx and stop_name_idx follow the index from - 'expr.astext(show_meta_data=False)'. When count_meta is True, the operator increase - logic would count the meta. - - device_annot: boolean, optional - if we want to annoate the device_type - - annot_start_name: str, optional - device annotation start node, from which we mark the nodes as `ext_dev` - - annot_end_name: str, optional - device annotation end node, after which we mark the nodes as 'cpu' - - Returns - ------- - expr : Expr - The transformed expression. - """ - assert isinstance(expr, relay.Function) - assert ( - (start_name != stop_name) - or (start_name_idx is None != stop_name_idx is None) - or (not (start_name_idx is None and stop_name_idx is None)) - or (start_name_idx < stop_name_idx) - ) - expr = get_subgraph(expr, start_name, stop_name, start_name_idx, stop_name_idx, count_meta) - expr = run_opt_pass(expr, transform.InferType()) - packer = ExprPack(bfactor, cfactor, weight_bits) - expr = packer.visit(expr) - assert not packer.start_pack - expr = run_opt_pass(expr, transform.InferType()) - - if device_annot: - expr_locator = ExprLocator() - expr_locator.visit(expr) - - annot_start = op.op.get(annot_start_name) - start = expr_locator.op2nodes[(annot_start, "int32")][0] - - annot_end = op.op.get(annot_end_name) - # we mark the next op to the last stop_fusion on cpu device - end = expr_locator.op2nodes[(annot_end, "int8")][-1] + 1 - - device_annot = ExprDeviceAnnot(start=start, end=end) - expr = device_annot.visit(expr) - return run_opt_pass(expr, transform.InferType()) - - return expr diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py deleted file mode 100644 index 4fa5b6ff8438..000000000000 --- a/vta/python/vta/top/op.py +++ /dev/null @@ -1,268 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=unused-argument, ungrouped-imports -"""Namespace for supporting Relay operators on VTA.""" -from __future__ import absolute_import as _abs - -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi - -from tvm.relay.op import op as reg -from tvm.relay.op import strategy as _strategy -from tvm.relay.op.op import OpPattern, OpStrategy - -from .utils import is_packed_layout -from .vta_conv2d import conv2d_packed, schedule_conv2d_packed -from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed -from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed -from .vta_dense import dense_packed, schedule_dense_packed -from ..environment import get_env - -ENV = get_env() - -# override to force partition at copy -reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) - -# add clip vta strategy -def compute_clip_vta(attrs, inputs, output_type): - """Clip operator.""" - x = inputs[0] - a_min = attrs.a_min - a_max = attrs.a_max - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - with tvm.te.tag_scope(topi.tag.ELEMWISE): - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return [x] - - -def clip_strategy_vta(attrs, inputs, out_type, target): - strategy = OpStrategy() - strategy.add_implementation( - compute_clip_vta, - _strategy.wrap_topi_schedule(topi.generic.schedule_injective), - name="clip.vta", - ) - return strategy - - -reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta") - - -@autotvm.register_topi_compute("add.vta") -def add_packed(cfg, lhs, rhs): - return topi.add(lhs, rhs) - - -@autotvm.register_topi_compute("multiply.vta") -def multiply_packed(cfg, lhs, rhs): - return topi.multiply(lhs, rhs) - - -def schedule_alu_packed(cfg, outs): - """alu packed schedule""" - assert len(outs) == 1 - - def is_cast_op(op): - return op.name == "T_cast" - - outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - output = outs[0] - s = te.create_schedule([x.op for x in outs]) - te.schedule.AutoInlineInjective(s) - - # other target does not support alu-only ops - if not (ENV.TARGET in ["sim", "tsim", "intelfocl"]): - return s - - # only put the int-related ops to vta - if "int" in output.dtype and len(output.shape) == 6: - ewise_inputs = [] - ewise_ops = [] - const_ops = [] - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - elif not is_cast_op(op): - ewise_ops.append(op) - - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - elif is_cast_op(tensor.op) and not op.same_as(output.op): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - for tensor in op.input_tensors: - if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) and ( - not is_cast_op(tensor.op) - ): - _traverse(tensor.op) - - op = output.op - _traverse(op) - for _, t in ewise_inputs: - if t.dtype == "float32": - return s - - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - - cfg.define_split("tile_co", x_co, num_outputs=2) - cfg.define_split("tile_h", x_i, num_outputs=2) - cfg.define_split("tile_w", x_j, num_outputs=2) - - x_co0, x_co1 = cfg["tile_co"].apply(s, output, x_co) - x_i0, x_i1 = 
cfg["tile_h"].apply(s, output, x_i) - x_j0, x_j1 = cfg["tile_w"].apply(s, output, x_j) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - - for e_o in ewise_ops: - s[e_o].set_scope(ENV.acc_scope) - s[e_o].pragma(s[e_o].op.axis[0], ENV.alu) - s[e_o].compute_at(s[output], store_pt) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - cache_read_ewise.append(s.cache_read(tensor, ENV.acc_scope, [consumer])) - - for tensor in cache_read_ewise: - if s[tensor].op.axis: - s[tensor].pragma(s[tensor].op.axis[0], ENV.dma_copy) - s[tensor].compute_at(s[output], store_pt) - - for op in const_ops: - s[op].compute_inline() - - s[output].pragma(x_co1, ENV.dma_copy) - - return s - - -@autotvm.register_topi_schedule("add.vta") -def schedule_add_packed(cfg, outs): - return schedule_alu_packed(cfg, outs) - - -@autotvm.register_topi_schedule("multiply.vta") -def schedule_multiply_packed(cfg, outs): - return schedule_alu_packed(cfg, outs) - - -def add_strategy_vta(attrs, inputs, out_type, target): - strategy = OpStrategy() - strategy.add_implementation( - _strategy.wrap_topi_compute(add_packed), - _strategy.wrap_topi_schedule(schedule_add_packed), - name="add.vta", - ) - return strategy - - -def multiply_strategy_vta(attrs, inputs, out_type, target): - strategy = OpStrategy() - strategy.add_implementation( - _strategy.wrap_topi_compute(multiply_packed), - _strategy.wrap_topi_schedule(schedule_multiply_packed), - name="multiply.vta", - ) - return strategy - - -# other target does not support alu-only ops -if ENV.TARGET in ["sim", "intelfocl"]: - reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") - reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") - - -@_strategy.conv2d_strategy.register("vta") -def conv2d_strategy_vta(attrs, inputs, out_type, target): - """conv2d vta strategy""" - strategy = OpStrategy() - kernel = inputs[1] - dilation = topi.utils.get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - - assert dilation == (1, 1), "support for dilation limited to (1, 1)" - if is_packed_layout(layout): - if groups == 1: - assert ENV.LOG_INP_WIDTH == 3, "only support 8bit inp for now" - assert ENV.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" - assert kernel.dtype == "int8" - - strategy.add_implementation( - _strategy.wrap_compute_conv2d(conv2d_packed, need_data_layout=True), - _strategy.wrap_topi_schedule(schedule_conv2d_packed), - name="conv2d_packed.vta", - ) - else: # group_conv2d - strategy.add_implementation( - _strategy.wrap_compute_conv2d(group_conv2d_packed, has_groups=True), - _strategy.wrap_topi_schedule(schedule_group_conv2d_packed), - name="group_conv2d_packed.vta", - ) - return strategy - - # If it's not packed, run on ARM CPU - arm_tgt = tvm.target.arm_cpu(target.model) - return _strategy.arm_cpu.conv2d_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt) - - -@_strategy.conv2d_transpose_strategy.register("vta") -def conv2d_transpose_strategy_vta(attrs, inputs, out_type, target): - """conv2d_transpose vta strategy""" - dilation = topi.utils.get_const_tuple(attrs.dilation) - layout = attrs.data_layout - assert dilation == (1, 1), "support for dilation limited to (1, 1)" - - if is_packed_layout(layout): - strategy = OpStrategy() - strategy.add_implementation( - _strategy.wrap_compute_conv2d_transpose(conv2d_transpose_packed), - _strategy.wrap_topi_schedule(schedule_conv2d_transpose_packed), - name="conv2d_transpose_packed.vta", - 
) - return strategy - - # If it's not packed, run on ARM CPU - arm_tgt = tvm.target.arm_cpu(target.model) - return _strategy.arm_cpu.conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt) - - -@_strategy.dense_strategy.register("vta") -def dense_strategy_vta(attrs, inputs, out_type, target): - """dense vta strategy""" - if len(inputs[0].shape) == 4: # this implies the layout is packed - strategy = OpStrategy() - strategy.add_implementation( - _strategy.wrap_compute_dense(dense_packed), - _strategy.wrap_topi_schedule(schedule_dense_packed), - name="dense_packed.vta", - ) - return strategy - # If it's not packed, run on ARM CPU - arm_tgt = tvm.target.arm_cpu(target.model) - return _strategy.x86.dense_strategy_cpu(attrs, inputs, out_type, arm_tgt) diff --git a/vta/python/vta/top/utils.py b/vta/python/vta/top/utils.py deleted file mode 100644 index 46a3a8851425..000000000000 --- a/vta/python/vta/top/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""VTA TOPI Utils.""" - - -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py deleted file mode 100644 index 5271b407fb8d..000000000000 --- a/vta/python/vta/top/vta_conv2d.py +++ /dev/null @@ -1,196 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Conv2D operator declaration and schedule registration for VTA.""" - -import numpy as np - -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi - -from .utils import is_packed_layout -from ..environment import get_env - - -@autotvm.register_topi_compute("conv2d_packed.vta") -def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """Packed conv2d function.""" - if not is_packed_layout(layout): - raise topi.InvalidShapeError() - assert dilation == (1, 1) - - if padding[0]: - pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") - else: - pad_data = data - assert len(data.shape) == 6 - assert len(kernel.shape) == 6 - oheight = topi.utils.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) - owidth = topi.utils.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) - oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4]) - - ishape = topi.utils.get_const_tuple(data.shape) - kshape = topi.utils.get_const_tuple(kernel.shape) - d_i = te.reduce_axis((0, kshape[2]), name="d_i") - d_j = te.reduce_axis((0, kshape[3]), name="d_j") - k_o = te.reduce_axis((0, ishape[1]), name="k_o") - k_i = te.reduce_axis((0, ishape[-1]), name="k_i") - hstride, wstride = strides - res = te.compute( - oshape, - lambda b_o, c_o, i, j, b_i, c_i: te.sum( - pad_data[b_o, k_o, i * hstride + d_i, j * wstride + d_j, b_i, k_i].astype(out_dtype) - * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), - axis=[k_o, d_i, d_j, k_i], - ), - name="res", - tag="conv2d_dense", - ) - - cfg.add_flop( - 2 - * np.prod(topi.utils.get_const_tuple(oshape)) - * kshape[2] - * kshape[3] - * ishape[1] - * ishape[-1] - ) - - return res - - -@autotvm.register_topi_schedule("conv2d_packed.vta") -def schedule_conv2d_packed(cfg, outs): - """Schedule packed conv2d""" - assert len(outs) == 1 - output = outs[0] - const_ops = [] - ewise_inputs = [] - ewise_ops = [] - conv2d_res = [] - assert "int" in output.op.input_tensors[0].dtype - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - else: - ewise_ops.append(op) - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - assert op.tag == "conv2d_dense" - conv2d_res.append(op) - - _traverse(output.op) - assert len(conv2d_res) == 1 - conv2d_stage = conv2d_res[0].output(0) - s = te.create_schedule(output.op) - - ##### space definition begin ##### - b, c_o, x_i, x_j, _, _ = s[conv2d_stage].op.axis - c_i, _, _, _ = s[conv2d_stage].op.reduce_axis - cfg.define_split("tile_b", b, num_outputs=2) - cfg.define_split("tile_h", x_i, num_outputs=2) - cfg.define_split("tile_w", x_j, num_outputs=2) - cfg.define_split("tile_ci", c_i, num_outputs=2) - cfg.define_split("tile_co", c_o, num_outputs=2) - cfg.define_knob("oc_nthread", [1, 2]) - cfg.define_knob("h_nthread", [1, 2]) - ###### space definition end ###### - - data, kernel = conv2d_stage.op.input_tensors - if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: - temp = data.op.input_tensors[0] - pad_data = data - data = temp - else: - pad_data = None - - env = get_env() - - # setup pad - if pad_data is not None: - cdata = pad_data - s[pad_data].set_scope(env.inp_scope) - else: - cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) - ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) - 
s[conv2d_stage].set_scope(env.acc_scope) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - cache_read_ewise.append(s.cache_read(tensor, env.acc_scope, [consumer])) - - # set ewise scope - for op in ewise_ops: - s[op].set_scope(env.acc_scope) - s[op].pragma(s[op].op.axis[0], env.alu) - - for op in const_ops: - s[op].compute_inline() - - # tile - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - x_co0, x_co1 = cfg["tile_co"].apply(s, output, x_co) - x_i0, x_i1 = cfg["tile_h"].apply(s, output, x_i) - x_j0, x_j1 = cfg["tile_w"].apply(s, output, x_j) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - - # set all compute scopes - s[conv2d_stage].compute_at(s[output], store_pt) - for op in ewise_ops: - s[op].compute_at(s[output], store_pt) - - for tensor in cache_read_ewise: - s[tensor].compute_at(s[output], store_pt) - s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - - # virtual threading along output channel axes - if cfg["oc_nthread"].val > 1: - _, v_t = s[output].split(x_co0, factor=cfg["oc_nthread"].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, te.thread_axis("cthread")) - - # virtual threading along spatial rows - if cfg["h_nthread"].val > 1: - _, v_t = s[output].split(x_i0, factor=cfg["h_nthread"].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, te.thread_axis("cthread")) - - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis - k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis - s[conv2d_stage].reorder(x_bo, k_o, x_j, d_j, d_i, x_co, x_i, x_bi, x_ci, k_i) - - k_o, _ = cfg["tile_ci"].apply(s, conv2d_stage, k_o) - s[cdata].compute_at(s[conv2d_stage], k_o) - s[ckernel].compute_at(s[conv2d_stage], k_o) - - # Use VTA instructions - s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) - s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) - s[conv2d_stage].tensorize(x_bi, env.gemm) - s[output].pragma(x_co1, env.dma_copy) - - return s diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py deleted file mode 100644 index 5a44104baa57..000000000000 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ /dev/null @@ -1,205 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Conv2D_transpose operator declaration and schedule registration for VTA.""" - -import numpy as np - -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi -from tvm.topi.utils import get_const_tuple -from tvm.topi.nn.utils import get_pad_tuple - -from ..environment import get_env - - -@autotvm.register_topi_compute("conv2d_transpose_packed.vta") -def conv2d_transpose_packed(cfg, data, kernel, strides, padding, out_dtype, output_padding=(0, 0)): - """Packed conv2d_transpose compute""" - ishape = get_const_tuple(data.shape) - kshape = get_const_tuple(kernel.shape) - b, c_i, i_h, i_w, t_b, t_ci = ishape - c_o, _, k_h, k_w, t_co, t_ci = kshape - stride_h, stride_w = strides - opad_h, opad_w = output_padding - # FIXME(tmoreau89): currently IR pass breaks when output padding != (0,0) - assert opad_h == 0 and opad_w == 0, "VTA does not support output padding for now" - - # derive padding parameters - fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (k_h, k_w)) - bpad_top = k_h - 1 - fpad_top - bpad_bottom = k_h - 1 - fpad_bottom + opad_h - bpad_left = k_w - 1 - fpad_left - bpad_right = k_w - 1 - fpad_right + opad_w - - # padding stage - dilated_input = topi.nn.dilate(data, [1, 1, stride_h, stride_w, 1, 1]) - data_pad = topi.nn.pad( - dilated_input, [0, 0, bpad_top, bpad_left, 0, 0], [0, 0, bpad_bottom, bpad_right, 0, 0] - ) - - # convolution transpose stage - out_h = (i_h - 1) * stride_h - fpad_top - fpad_bottom + k_h + opad_h - out_w = (i_w - 1) * stride_w - fpad_left - fpad_right + k_w + opad_w - oshape = (b, c_o, out_h, out_w, t_b, t_co) - d_c = te.reduce_axis((0, c_i), name="d_c") - d_h = te.reduce_axis((0, k_h), name="d_h") - d_w = te.reduce_axis((0, k_w), name="d_w") - d_ci = te.reduce_axis((0, t_ci), name="d_ci") - - out = te.compute( - oshape, - lambda i_n, i_c, i_h, i_w, j_n, j_c: te.sum( - data_pad(i_n, d_c, i_h + d_h, i_w + d_w, j_n, d_ci).astype(out_dtype) - * kernel[i_c, d_c, d_h, d_w, j_c, d_ci].astype(out_dtype), - axis=[d_c, d_h, d_w, d_ci], - ), - tag="packed_conv2d_transpose", - name="res", - ) - - cfg.add_flop( - 2 - * np.prod(topi.utils.get_const_tuple(oshape)) - * kshape[2] - * kshape[3] - * ishape[1] - * ishape[-1] - ) - - return out - - -@autotvm.register_topi_schedule("conv2d_transpose_packed.vta") -def schedule_conv2d_transpose_packed(cfg, outs): - """Schedule packed conv2d_transpose""" - assert len(outs) == 1 - output = outs[0] - ewise_inputs = [] - ewise_ops = [] - conv2d_res = [] - assert output.dtype == "int8" - assert output.op.input_tensors[0].dtype == "int32" - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - ewise_ops.append(op) - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - assert op.tag == "packed_conv2d_transpose" - conv2d_res.append(op) - - _traverse(output.op) - assert len(conv2d_res) == 1 - conv2d_stage = conv2d_res[0].output(0) - s = te.create_schedule(output.op) - - ##### space definition begin ##### - b, c_o, x_i, x_j, _, c_i = s[conv2d_stage].op.axis - c_i, _, _, _ = s[conv2d_stage].op.reduce_axis - cfg.define_split("tile_b", b, num_outputs=2) - cfg.define_split("tile_h", x_i, num_outputs=2) - cfg.define_split("tile_w", x_j, num_outputs=2) - cfg.define_split("tile_ci", c_i, num_outputs=2) - cfg.define_split("tile_co", c_o, num_outputs=2) - cfg.define_knob("oc_nthread", [1, 2]) - cfg.define_knob("h_nthread", [1, 2]) - ###### space definition end ###### - 
- data, kernel = conv2d_stage.op.input_tensors - if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: - temp = data.op.input_tensors[0] - pad_data = data - data = temp - else: - pad_data = None - - env = get_env() - - # setup pad - if pad_data is not None: - cdata = pad_data - s[pad_data].set_scope(env.inp_scope) - else: - cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) - ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) - s[conv2d_stage].set_scope(env.acc_scope) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - cache_read_ewise.append(s.cache_read(tensor, env.acc_scope, [consumer])) - # set ewise scope - for op in ewise_ops: - s[op].set_scope(env.acc_scope) - s[op].pragma(s[op].op.axis[0], env.alu) - - # tile - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - x_co0, x_co1 = cfg["tile_co"].apply(s, output, x_co) - x_i0, x_i1 = cfg["tile_h"].apply(s, output, x_i) - x_j0, x_j1 = cfg["tile_w"].apply(s, output, x_j) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - - # set all compute scopes - s[conv2d_stage].compute_at(s[output], store_pt) - for op in ewise_ops: - s[op].compute_at(s[output], store_pt) - - for tensor in cache_read_ewise: - s[tensor].compute_at(s[output], store_pt) - s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - - # virtual threading along output channel axes - if cfg["oc_nthread"].val > 1: - _, v_t = s[output].split(x_co0, factor=cfg["oc_nthread"].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, te.thread_axis("cthread")) - - # virtual threading along spatial rows - if cfg["h_nthread"].val > 1: - _, v_t = s[output].split(x_i0, factor=cfg["h_nthread"].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, te.thread_axis("cthread")) - - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis - k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis - x_i, x_ii = s[conv2d_stage].split(x_i, 4) - x_j, x_jj = s[conv2d_stage].split(x_j, 2) - s[conv2d_stage].reorder(x_bo, k_o, x_j, x_co, x_i, x_jj, d_j, d_i, x_ii, x_bi, x_ci, k_i) - - for axis in [d_j, d_i, x_ii, x_jj]: - s[conv2d_stage].unroll(axis) - - k_o, _ = cfg["tile_ci"].apply(s, conv2d_stage, k_o) - s[cdata].compute_at(s[conv2d_stage], k_o) - s[ckernel].compute_at(s[conv2d_stage], k_o) - - # Use VTA instructions - s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) - s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) - s[conv2d_stage].pragma(x_bi, "conv2d_transpose_gemm") - s[output].pragma(x_co1, env.dma_copy) - - return s diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py deleted file mode 100644 index 5e06cf9f5624..000000000000 --- a/vta/python/vta/top/vta_dense.py +++ /dev/null @@ -1,171 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=unused-argument -"""Dense operator declaration and schedule registration for VTA.""" - -import numpy as np -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi - -from ..environment import get_env - - -def is_packed_layout(layout): - """Check if layout is packed layout""" - if layout == "NCHW": - return False - if "n" in layout and "c" in layout: - return True - return False - - -@autotvm.register_topi_compute("dense_packed.vta") -def dense_packed(cfg, data, weight, bias=None, out_dtype=None): - """Dense function declaration.""" - - # Make sure that the dense operator is packed - if len(data.shape) != 4 or len(weight.shape) != 4: - raise topi.InvalidShapeError() - - # Derive shapes - ishape = topi.utils.get_const_tuple(data.shape) - wshape = topi.utils.get_const_tuple(weight.shape) - oshape = (data.shape[0], weight.shape[0], data.shape[2], weight.shape[2]) - - # Reduction axes (input channel) - assert ishape[1] == wshape[1] - assert ishape[3] == wshape[3] - k_o = te.reduce_axis((0, ishape[1]), name="k_o") - k_i = te.reduce_axis((0, ishape[3]), name="k_i") - res = te.compute( - oshape, - lambda b_o, c_o, b_i, c_i: te.sum( - data[b_o, k_o, b_i, k_i].astype(out_dtype) - * weight[c_o, k_o, c_i, k_i].astype(out_dtype), - axis=[k_o, k_i], - ), - name="res", - tag="dense_pack", - ) - - cfg.add_flop(2 * np.prod(topi.utils.get_const_tuple(oshape)) * ishape[1] * ishape[3]) - - return res - - -@autotvm.register_topi_schedule("dense_packed.vta") -def schedule_dense_packed(cfg, outs): - """Packed dense schedule.""" - - assert len(outs) == 1 - output = outs[0] - const_ops = [] - ewise_inputs = [] - ewise_ops = [] - dense_res = [] - assert "int" in output.op.input_tensors[0].dtype - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - else: - ewise_ops.append(op) - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - assert op.tag == "dense_pack" - dense_res.append(op) - - _traverse(output.op) - assert len(dense_res) == 1 - dense_stage = dense_res[0].output(0) - s = te.create_schedule(output.op) - - ##### space definition begin ##### - b, c_o, _, _ = s[dense_stage].op.axis - c_i, _ = s[dense_stage].op.reduce_axis - cfg.define_split("tile_b", b, num_outputs=2) - cfg.define_split("tile_ci", c_i, num_outputs=2) - cfg.define_split("tile_co", c_o, num_outputs=2) - cfg.define_knob("oc_nthread", [1, 2]) - ###### space definition end ###### - - data, weight = dense_stage.op.input_tensors - - env = get_env() - - cdata = s.cache_read(data, env.inp_scope, [dense_stage]) - cweight = s.cache_read(weight, env.wgt_scope, [dense_stage]) - s[dense_stage].set_scope(env.acc_scope) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - cache_read_ewise.append(s.cache_read(tensor, env.acc_scope, [consumer])) - - # set ewise scope - for op in ewise_ops: - s[op].set_scope(env.acc_scope) - s[op].pragma(s[op].op.axis[0], env.alu) - - for op in const_ops: - s[op].compute_inline() - - # apply tiling for SRAM reuse - x_b, x_c, _, _ = s[output].op.axis - x_bo, x_bi = cfg["tile_b"].apply(s, output, x_b) - x_co, x_ci = cfg["tile_co"].apply(s, output, x_c) - s[output].reorder(x_bo, x_co, x_bi, x_ci) - store_pt = x_co - - # set all compute scopes - 
s[dense_stage].compute_at(s[output], store_pt) - for op in ewise_ops: - s[op].compute_at(s[output], store_pt) - - for tensor in cache_read_ewise: - s[tensor].compute_at(s[output], store_pt) - s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - - # virtual threading along output channel axes - if cfg["oc_nthread"].val > 1: - _, v_t = s[output].split(x_co, factor=cfg["oc_nthread"].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, te.thread_axis("cthread")) - - x_bo, x_co, x_bi, _ = s[dense_stage].op.axis - k_o, _ = s[dense_stage].op.reduce_axis - s[dense_stage].reorder(x_bo, k_o, x_co) - - k_o, _ = cfg["tile_ci"].apply(s, dense_stage, k_o) - s[cdata].compute_at(s[dense_stage], k_o) - s[cweight].compute_at(s[dense_stage], k_o) - - # Use VTA instructions - s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) - s[cweight].pragma(s[cweight].op.axis[0], env.dma_copy) - s[dense_stage].tensorize(x_bi, env.gemm) - s[output].pragma(x_ci, env.dma_copy) - - return s diff --git a/vta/python/vta/top/vta_group_conv2d.py b/vta/python/vta/top/vta_group_conv2d.py deleted file mode 100644 index 69d2579ad78c..000000000000 --- a/vta/python/vta/top/vta_group_conv2d.py +++ /dev/null @@ -1,207 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
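
# A self-contained sketch of the dense_packed declaration removed above, with
# hypothetical block sizes standing in for the VTA environment parameters.
# Both the outer (k_o) and inner (k_i) input-channel axes are reduced, mirroring
# the packed (batch_outer, channel_outer, batch_inner, channel_inner) layout.
import tvm
from tvm import te

b_o, c_out, c_in = 1, 4, 4      # outer tiles (hypothetical)
t_b, t_co, t_ci = 1, 16, 16     # inner block sizes (hypothetical)

data = te.placeholder((b_o, c_in, t_b, t_ci), dtype="int8", name="data")
weight = te.placeholder((c_out, c_in, t_co, t_ci), dtype="int8", name="weight")
k_o = te.reduce_axis((0, c_in), name="k_o")
k_i = te.reduce_axis((0, t_ci), name="k_i")
res = te.compute(
    (b_o, c_out, t_b, t_co),
    lambda bo, co, bi, ci: te.sum(
        data[bo, k_o, bi, k_i].astype("int32")
        * weight[co, k_o, ci, k_i].astype("int32"),
        axis=[k_o, k_i],
    ),
    name="res",
)
print(res.shape)  # [1, 4, 1, 16]
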
-"""Group conv2D operator declaration and schedule registration for VTA.""" - -import numpy as np - -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi - -from ..environment import get_env - - -@autotvm.register_topi_compute("group_conv2d_packed.vta") -def group_conv2d_packed(cfg, data, kernel, strides, padding, dilation, group, out_dtype): - """Packed group conv2d nchw function.""" - assert dilation == (1, 1) - - if padding[0]: - pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") - else: - pad_data = data - assert len(data.shape) == 6 - assert len(kernel.shape) == 6 - assert data.dtype == "int8", data.dtype - assert kernel.dtype == "int8", kernel.dtype - assert out_dtype == "int32", out_dtype - - oheight = topi.utils.get_const_int((pad_data.shape[2] - kernel.shape[2]) // strides[0] + 1) - owidth = topi.utils.get_const_int((pad_data.shape[3] - kernel.shape[3]) // strides[1] + 1) - oshape = (data.shape[0], kernel.shape[0], oheight, owidth, data.shape[4], kernel.shape[4]) - - ishape = topi.utils.get_const_tuple(data.shape) - kshape = topi.utils.get_const_tuple(kernel.shape) - assert group * kshape[1] == ishape[1] - assert kshape[0] % group == 0 - d_i = te.reduce_axis((0, kshape[2]), name="d_i") - d_j = te.reduce_axis((0, kshape[3]), name="d_j") - k_o = te.reduce_axis((0, kshape[1]), name="k_o") - k_i = te.reduce_axis((0, kshape[-1]), name="k_i") - hstride, wstride = strides - out = te.compute( - oshape, - lambda b_o, c_o, i, j, b_i, c_i: te.sum( - pad_data[ - b_o, - c_o // (kshape[0] // group) * kshape[1] + k_o, - i * hstride + d_i, - j * wstride + d_j, - b_i, - k_i, - ].astype(out_dtype) - * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), - axis=[k_o, d_i, d_j, k_i], - ), - name="res", - tag="packed_group_conv2d", - ) - - cfg.add_flop( - 2 - * np.prod(topi.utils.get_const_tuple(oshape)) - * kshape[2] - * kshape[3] - * ishape[1] - * kshape[-1] - ) - - return out - - -@autotvm.register_topi_schedule("group_conv2d_packed.vta") -def schedule_group_conv2d_packed(cfg, outs): - """Schedule the packed conv2d.""" - assert len(outs) == 1 - output = outs[0] - const_ops = [] - ewise_inputs = [] - ewise_ops = [] - conv2d_res = [] - assert output.dtype == "int8" - assert output.op.input_tensors[0].dtype == "int32" - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - else: - ewise_ops.append(op) - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - assert op.tag == "packed_group_conv2d" - conv2d_res.append(op) - - _traverse(output.op) - assert len(conv2d_res) == 1 - conv2d_stage = conv2d_res[0].output(0) - s = te.create_schedule(output.op) - - ##### space definition begin ##### - b, c_o, x_i, x_j, _, _ = s[conv2d_stage].op.axis - c_i, _, _, _ = s[conv2d_stage].op.reduce_axis - cfg.define_split("tile_b", b, num_outputs=2) - cfg.define_split("tile_h", x_i, num_outputs=2) - cfg.define_split("tile_w", x_j, num_outputs=2) - cfg.define_split("tile_ci", c_i, num_outputs=2) - cfg.define_split("tile_co", c_o, num_outputs=2) - cfg.define_knob("oc_nthread", [1, 2]) - cfg.define_knob("h_nthread", [1, 2]) - ###### space definition end ###### - - data, kernel = conv2d_stage.op.input_tensors - if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: - temp = data.op.input_tensors[0] - pad_data = data - data = temp - else: - pad_data = None - - env = get_env() 
- - # setup pad - if pad_data is not None: - cdata = pad_data - s[pad_data].set_scope(env.inp_scope) - else: - cdata = s.cache_read(data, env.inp_scope, [conv2d_stage]) - ckernel = s.cache_read(kernel, env.wgt_scope, [conv2d_stage]) - s[conv2d_stage].set_scope(env.acc_scope) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - cache_read_ewise.append(s.cache_read(tensor, env.acc_scope, [consumer])) - - # set ewise scope - for op in ewise_ops: - s[op].set_scope(env.acc_scope) - s[op].pragma(s[op].op.axis[0], env.alu) - - for op in const_ops: - s[op].compute_inline() - - # tile - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - x_co0, x_co1 = cfg["tile_co"].apply(s, output, x_co) - x_i0, x_i1 = cfg["tile_h"].apply(s, output, x_i) - x_j0, x_j1 = cfg["tile_w"].apply(s, output, x_j) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - - # set all compute scopes - s[conv2d_stage].compute_at(s[output], store_pt) - for op in ewise_ops: - s[op].compute_at(s[output], store_pt) - - for tensor in cache_read_ewise: - s[tensor].compute_at(s[output], store_pt) - s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - - # virtual threading along output channel axes - if cfg["oc_nthread"].val > 1: - _, v_t = s[output].split(x_co0, factor=cfg["oc_nthread"].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, te.thread_axis("cthread")) - - # virtual threading along spatial rows - if cfg["h_nthread"].val > 1: - _, v_t = s[output].split(x_i0, factor=cfg["h_nthread"].val) - s[output].reorder(v_t, x_bo) - s[output].bind(v_t, te.thread_axis("cthread")) - - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis - k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis - s[conv2d_stage].reorder(x_bo, k_o, x_j, d_j, d_i, x_co, x_i, x_bi, x_ci, k_i) - - k_o, _ = cfg["tile_ci"].apply(s, conv2d_stage, k_o) - s[cdata].compute_at(s[conv2d_stage], k_o) - s[ckernel].compute_at(s[conv2d_stage], k_o) - - # Use VTA instructions - s[cdata].pragma(s[cdata].op.axis[0], env.dma_copy) - s[ckernel].pragma(s[ckernel].op.axis[0], env.dma_copy) - s[conv2d_stage].tensorize(x_bi, env.gemm) - s[output].pragma(x_co1, env.dma_copy) - - return s diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py deleted file mode 100644 index ae83a9d66392..000000000000 --- a/vta/python/vta/transform.py +++ /dev/null @@ -1,1123 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Additional Transformation Passes. 
for VTA""" -# pylint: disable=len-as-condition, no-else-return, unused-argument, invalid-name -import tvm -from tvm import te -from tvm.topi import utils -from tvm.script import tir as T - -from .environment import get_env - - -def _match_pragma(stmt, key): - """Internal helper to match stmt to pragma stmt. - - Parameters - ---------- - stmt : Stmt - The AttrStmt - - key : str - The pragma key - """ - return (stmt.attr_key == "pragma_" + key) or ( - stmt.attr_key == "pragma_scope" and stmt.value.value == key - ) - - -def FoldUopLoop(): - """Detect and fold uop loop. - - VTA support uop programming model - that recognizes loop structure. - This pass detect the loop structure - and extract that into uop loop AST. - - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - - def _fold_outermost_loop(body): - stmt = body - if not isinstance(stmt, tvm.tir.For): - return None, body, None - - loop_var = stmt.loop_var - gemm_offsets = [None, None, None] - fail = [False] - builtin_uop_push = tvm.ir.Op.get("tir.vta.uop_push") - - def _post_order(op): - assert isinstance(op, tvm.tir.Call) - base_args = 2 - if op.op.same_as(builtin_uop_push): - args = [] - args += op.args[:base_args] - for i in range(3): - m = tvm.arith.detect_linear_equation(op.args[i + base_args], [loop_var]) - if not m: - fail[0] = True - return op - if gemm_offsets[i] is not None: - if not tvm.ir.structural_equal(m[0], gemm_offsets[i]): - fail[0] = True - return op - args.append(m[1]) - else: - gemm_offsets[i] = m[0] - args.append(m[1]) - args += op.args[base_args + 3 :] - return tvm.tir.call_intrin("int32", builtin_uop_push, *args) - if op.op.name not in ("tir.vta.command_handle", "tir.tvm_thread_context"): - raise RuntimeError("unexpected op %s" % op) - return op - - ret = tvm.tir.stmt_functor.ir_transform(stmt.body, None, _post_order, ["tir.Call"]) - - if not fail[0] and all(x is not None for x in gemm_offsets): - - def _visit(op): - if op.same_as(loop_var): - fail[0] = True - - tvm.tir.stmt_functor.post_order_visit(ret, _visit) - if not fail[0]: - begin = tvm.tir.call_extern("int32", "VTAUopLoopBegin", stmt.extent, *gemm_offsets) - end = tvm.tir.call_extern("int32", "VTAUopLoopEnd") - return [begin, ret, end] - raise ValueError("Failed to fold the GEMM instructions..") - - def _do_fold(stmt): - env = get_env() - if ( - stmt.attr_key == "coproc_uop_scope" - and isinstance(stmt.value, tvm.tir.StringImm) - and stmt.value.value == env.dev.vta_push_uop.value - ): - body = stmt.body - begins = [] - ends = [] - try: - begin, body, end = _fold_outermost_loop(body) - if begin is not None: - begins.append(begin) - if end is not None: - ends.append(end) - begin, body, end = _fold_outermost_loop(body) - if begin is not None: - begins.append(begin) - if end is not None: - ends.append(end) - except ValueError: - pass - if body == stmt.body: - return stmt - ends = list(reversed(ends)) - body = tvm.tir.stmt_seq(*(begins + [body] + ends)) - return tvm.tir.AttrStmt(stmt.node, stmt.attr_key, stmt.value, body) - return None - - def _ftransform(f, mod, ctx): - return f.with_body( - tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) - ) - - return tvm.tir.transform.prim_func_pass(_ftransform, opt_level=0, name="tir.vta.FoldUopLoop") - - -def CPUAccessRewrite(): - """Detect CPU access to VTA buffer and get address correctly. - - VTA's buffer is an opaque handle that do not - correspond to address in CPU. - This pass detect CPU access and rewrite to use pointer - returned VTABufferCPUPtr for CPU access. 
- - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - - def _ftransform(f, mod, ctx): - env = get_env() - - var_remap = {} - buf_remap = {} - - def find_var_remap(old_var): - if old_var in var_remap: - return var_remap[old_var] - - new_var = tvm.tir.Var(old_var.name + "_ptr", dtype=old_var.type_annotation) - var_remap[old_var] = new_var - return new_var - - def find_buf_remap(old_buf): - if old_buf in buf_remap: - return buf_remap[old_buf] - - new_var = find_var_remap(old_buf.data) - new_buf = tvm.tir.decl_buffer( - shape=old_buf.shape, - dtype=old_buf.dtype, - data=new_var, - strides=old_buf.strides, - elem_offset=old_buf.elem_offset, - scope=old_buf.scope, - data_alignment=old_buf.data_alignment, - offset_factor=old_buf.offset_factor, - buffer_type="auto_broadcast" if (old_buf.buffer_type == 2) else "", - axis_separators=old_buf.axis_separators, - ) - buf_remap[old_buf] = new_buf - return new_buf - - def _post_order(op): - if isinstance(op, tvm.tir.Allocate): - buffer_var = op.buffer_var - if buffer_var not in var_remap: - return None - new_var = var_remap[buffer_var] - let_stmt = tvm.tir.LetStmt( - new_var, - tvm.tir.call_extern( - "handle", "VTABufferCPUPtr", env.dev.command_handle, buffer_var - ), - op.body, - ) - alloc = tvm.tir.Allocate(buffer_var, op.dtype, op.extents, op.condition, let_stmt) - del var_remap[buffer_var] - bufs_to_delete = [ - old_buf for old_buf in buf_remap if old_buf.data.same_as(buffer_var) - ] - for buf in bufs_to_delete: - del buf_remap[buf] - return alloc - - if isinstance(op, tvm.tir.BufferLoad): - return tvm.tir.BufferLoad(find_buf_remap(op.buffer), op.indices) - - if isinstance(op, tvm.tir.BufferStore): - return tvm.tir.BufferStore(find_buf_remap(op.buffer), op.value, op.indices) - - raise RuntimeError("not reached") - - stmt_in = f.body - stmt = tvm.tir.stmt_functor.ir_transform( - stmt_in, None, _post_order, ["tir.Allocate", "tir.BufferLoad", "tir.BufferStore"] - ) - - for old_var, new_var in var_remap.items(): - stmt = tvm.tir.LetStmt( - new_var, - tvm.tir.call_extern("handle", "VTABufferCPUPtr", env.dev.command_handle, old_var), - stmt, - ) - return f.with_body(stmt) - - return tvm.tir.transform.prim_func_pass( - _ftransform, opt_level=0, name="tir.vta.CPUAccessRewrite" - ) - - -def LiftAllocToScopeBegin(): - """Lift allocate to beginning of the current scope. 
- - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - - def _ftransform(f, mod, ctx): - lift_stmt = [[]] - - def _merge_block(slist, body): - for op in slist: - if op.body == body: - body = op - elif isinstance(op, tvm.tir.Allocate): - body = tvm.tir.Allocate(op.buffer_var, op.dtype, op.extents, op.condition, body) - elif isinstance(op, tvm.tir.AttrStmt): - body = tvm.tir.AttrStmt(op.node, op.attr_key, op.value, body) - elif isinstance(op, tvm.tir.For): - body = tvm.tir.For( - op.loop_var, - op.min, - op.extent, - op.kind, - body, - op.thread_binding, - op.annotations, - ) - else: - raise RuntimeError("unexpected op") - del slist[:] - return body - - def _pre_order(op): - if isinstance(op, tvm.tir.For): - lift_stmt.append([]) - elif isinstance(op, tvm.tir.AttrStmt): - if op.attr_key == "virtual_thread": - lift_stmt.append([]) - - def _post_order(op): - if isinstance(op, tvm.tir.Allocate): - lift_stmt[-1].append(op) - return op.body - if isinstance(op, tvm.tir.AttrStmt): - if op.attr_key == "storage_scope": - lift_stmt[-1].append(op) - return op.body - if op.attr_key == "virtual_thread": - return _merge_block(lift_stmt.pop() + [op], op.body) - return op - if isinstance(op, tvm.tir.For): - return _merge_block(lift_stmt.pop() + [op], op.body) - raise RuntimeError("not reached") - - stmt_in = f.body - stmt = tvm.tir.stmt_functor.ir_transform( - stmt_in, _pre_order, _post_order, ["tir.Allocate", "tir.AttrStmt", "tir.For"] - ) - assert len(lift_stmt) == 1 - return f.with_body(_merge_block(lift_stmt[0], stmt)) - - return tvm.tir.transform.prim_func_pass( - _ftransform, opt_level=0, name="tir.vta.LiftAllocToScopeBegin" - ) - - -def InjectSkipCopy(): - """Pass to inject skip copy stmt, used for debug purpose. - - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - - def _do_fold(stmt): - if _match_pragma(stmt, "skip_dma_copy"): - return tvm.tir.Evaluate(0) - return None - - def _ftransform(f, mod, ctx): - return f.with_body( - tvm.tir.stmt_functor.ir_transform(f.body, _do_fold, None, ["tir.AttrStmt"]) - ) - - return tvm.tir.transform.prim_func_pass(_ftransform, opt_level=0, name="tir.vta.InjectSkipCopy") - - -def InjectCoProcSync(): - """Pass inject coproc sync - - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - - def _ftransform(f, *_): - success = [False] - - def _do_fold(stmt): - if _match_pragma(stmt, "coproc_sync"): - success[0] = True - sync = tvm.tir.Call("int32", "vta.coproc_sync", []) - return tvm.tir.SeqStmt([stmt.body, tvm.tir.Evaluate(sync)]) - if _match_pragma(stmt, "trim_loop"): - op = stmt.body - assert isinstance(op, tvm.tir.For) - return tvm.tir.For( - op.loop_var, op.min, 2, op.kind, op.body, op.thread_binding, op.annotations - ) - return None - - return f.with_body( - tvm.tir.stmt_functor.ir_transform(f.body, None, _do_fold, ["tir.AttrStmt"]) - ) - - return tvm.transform.Sequential( - [ - tvm.tir.transform.prim_func_pass(_ftransform, 0, "tir.vta.InjectCoProcSync"), - tvm.tir.transform.CoProcSync(), - ], - opt_level=0, - name="tir.vta.InjectCoProcSync", - ) - - -def InjectDMAIntrin(): - """Pass to inject DMA copy intrinsics. 
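
# Every pass in this file follows the same skeleton: an _ftransform that rewrites
# f.body through tvm.tir.stmt_functor.ir_transform, wrapped into a pass by
# tvm.tir.transform.prim_func_pass. A runnable sketch of that skeleton over a
# trivial PrimFunc (assuming a TVM build that still provides te.create_schedule
# and tvm.lower, as the removed code does; the pass name is made up).
import tvm
from tvm import te

n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1, name="B")
mod = tvm.lower(te.create_schedule(B.op), [A, B])

seen = []

def _ftransform(f, mod, ctx):
    def _post_order(op):
        seen.append(op)   # record every For node; returning None keeps it unchanged
        return None
    return f.with_body(
        tvm.tir.stmt_functor.ir_transform(f.body, None, _post_order, ["tir.For"])
    )

CountFors = tvm.tir.transform.prim_func_pass(
    _ftransform, opt_level=0, name="tir.sketch.CountFors"
)
CountFors(mod)
print(len(seen))  # the single loop produced by the compute above
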
- - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod - - def _check_compact(buf): - ndim = len(buf.shape) - size = tvm.tir.const(1, buf.shape[0].dtype) - for i in reversed(range(ndim)): - if not utils.equal_const_int(size - buf.strides[i], 0): - raise RuntimeError( - "Cannot prove compact: shape=%s, strides=%s" % (buf.shape, buf.strides) - ) - size = size * buf.shape[i] - - def _fold_buffer_dim(buf, scope, elem_block): - ndim = len(buf.shape) - x_size = 1 - base = 0 - for i in range(1, ndim + 1): - if not utils.equal_const_int(buf.strides[ndim - i] - x_size, 0): - raise RuntimeError("scope %s needs to have block=%d" % (scope, elem_block)) - x_size = x_size * buf.shape[ndim - i] - if utils.equal_const_int(x_size - elem_block, 0): - base = i + 1 - break - if base == 0: - raise RuntimeError( - "scope %s need to have block=%d, shape=%s" % (scope, elem_block, buf.shape) - ) - shape = [elem_block] - strides = [1] - - if base < ndim + 1 and not utils.equal_const_int(buf.strides[ndim - base], elem_block): - shape.append(1) - strides.append(elem_block) - - analyzer = tvm.arith.Analyzer() - while base < ndim + 1: - x_size = 1 - x_stride = buf.strides[ndim - base] - next_base = base - if not utils.equal_const_int(idxm(x_stride, elem_block), 0): - raise RuntimeError( - "scope %s need to have block=%d, shape=%s, strides=%s" - % (scope, elem_block, buf.shape, buf.strides) - ) - for i in range(base, ndim + 1): - k = ndim - i - if not utils.equal_const_int(x_size * x_stride - buf.strides[k], 0): - break - x_size = x_size * buf.shape[k] - next_base = i + 1 - shape.append(analyzer.simplify(x_size)) - strides.append(x_stride) - assert next_base != base - base = next_base - - strides = list(reversed(strides)) - shape = list(reversed(shape)) - return shape, strides - - def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold): - elem_block = elem_bytes * 8 // elem_width - shape, strides = buf.shape, buf.strides - if not utils.equal_const_int(idxm(buf.elem_offset, elem_block), 0): - raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block)) - if allow_fold: - shape, strides = _fold_buffer_dim(buf, scope, elem_block) - else: - shape = list(x for x in shape) - strides = list(x for x in strides) - - def raise_error(): - """Internal function to raise error""" - raise RuntimeError( - ( - "Scope[%s]: cannot detect 2d pattern with elem_block=%d:" - + " shape=%s, strides=%s" - ) - % (scope, elem_block, buf.shape, buf.strides) - ) - - ndim = len(shape) - - # Check if the inner-tensor is already flat - flat = utils.equal_const_int(shape[-1], elem_block) - - if flat: - if not utils.equal_const_int(strides[-1], 1): - raise_error() - - if ndim == 1: - x_size = 1 - x_stride = 1 - y_size = 1 - return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - if not utils.equal_const_int(strides[-2] - elem_block, 0): - raise_error() - - if ndim == 2: - x_size = shape[-2] - x_stride = shape[-2] - y_size = 1 - return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - if not utils.equal_const_int(idxm(strides[-3], elem_block), 0): - raise_error() - - if ndim == 3: - x_size = shape[-2] - x_stride = idxd(strides[-3], elem_block) - y_size = shape[-3] - return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - - else: - if not utils.equal_const_int(strides[-1], 1): - raise_error() - if not utils.equal_const_int(strides[-2] - shape[-1], 0): - raise_error() - if not utils.equal_const_int(shape[-1] * shape[-2], 
elem_block): - raise_error() - - if ndim == 2: - x_size = 1 - x_stride = 1 - y_size = 1 - return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - if not utils.equal_const_int(strides[-3], elem_block): - raise_error() - - if ndim == 3: - x_size = shape[-3] - x_stride = shape[-3] - y_size = 1 - return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - if not utils.equal_const_int(idxm(strides[-4], elem_block), 0): - raise_error() - - if ndim == 4: - x_size = shape[-3] - x_stride = idxd(strides[-4], elem_block) - y_size = shape[-4] - return x_size, y_size, x_stride, idxd(buf.elem_offset, elem_block) - - raise_error() - - def _inject_copy(src, dst, pad_before, pad_after, pad_value): - # FIXME: pad_value is ignored... - env = get_env() - _ = pad_value - if dst.scope() == "global": - # Store - if pad_before or pad_after: - raise RuntimeError("Do not support copy into DRAM with pad") - if src.scope() == env.acc_scope: - elem_width = env.OUT_WIDTH - elem_bytes = env.OUT_ELEM_BYTES - mem_type = env.dev.MEM_ID_OUT - data_type = "int%d" % env.OUT_WIDTH - task_qid = env.dev.QID_STORE_OUT - else: - raise RuntimeError("Do not support copy %s->dram" % (src.scope())) - _check_compact(src) - x_size, y_size, x_stride, offset = _get_2d_pattern( - dst, elem_width, elem_bytes, data_type, src.scope(), allow_fold=True - ) - irb = tvm.tir.ir_builder.create() - irb.scope_attr(env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(task_qid)) - irb.emit( - tvm.tir.call_extern( - "int32", - "VTAStoreBuffer2D", - env.dev.command_handle, - src.access_ptr("r", "int32"), - mem_type, - dst.data, - offset, - x_size, - y_size, - x_stride, - ) - ) - return irb.get() - elif src.scope() == "global": - if dst.scope() == env.acc_scope: - elem_width = env.ACC_WIDTH - elem_bytes = env.ACC_ELEM_BYTES - mem_type = env.dev.MEM_ID_ACC - data_type = "int%d" % env.ACC_WIDTH - task_qid = env.dev.QID_LOAD_OUT - elif dst.scope() == env.inp_scope: - elem_width = env.INP_WIDTH - elem_bytes = env.INP_ELEM_BYTES - mem_type = env.dev.MEM_ID_INP - data_type = "int%d" % env.INP_WIDTH - task_qid = env.dev.QID_LOAD_INP - elif dst.scope() == env.wgt_scope: - elem_width = env.WGT_WIDTH - elem_bytes = env.WGT_ELEM_BYTES - mem_type = env.dev.MEM_ID_WGT - data_type = "int%d" % env.WGT_WIDTH - task_qid = env.dev.QID_LOAD_WGT - else: - raise RuntimeError("Do not support copy dram->%s" % (dst.scope())) - # collect pad statistics - if pad_before: - assert pad_after - ndim = len(pad_before) - if ndim <= 2 or ndim > 5: - raise ValueError("Limitation of 2D pad load forbid ndim=%d" % ndim) - if ndim == 5: - # This case occurs when batch size N > 1 - y_pad_before = pad_before[1] - x_pad_before = pad_before[2] - y_pad_after = pad_after[1] - x_pad_after = pad_after[2] - for dim in range(3, ndim): - if not utils.equal_const_int(pad_before[dim], 0): - raise ValueError("Do not support pad on the innermost block") - if not utils.equal_const_int(pad_after[dim], 0): - raise ValueError("Do not support pad on the innermost block") - else: - y_pad_before = pad_before[0] - x_pad_before = pad_before[1] - y_pad_after = pad_after[0] - x_pad_after = pad_after[1] - for dim in range(2, ndim): - if not utils.equal_const_int(pad_before[dim], 0): - raise ValueError("Do not support pad on the innermost block") - if not utils.equal_const_int(pad_after[dim], 0): - raise ValueError("Do not support pad on the innermost block") - allow_fold = False - else: - x_pad_before = 0 - y_pad_before = 0 - x_pad_after = 0 - y_pad_after = 0 - allow_fold = True - - 
_check_compact(dst) - x_size, y_size, x_stride, offset = _get_2d_pattern( - src, elem_width, elem_bytes, data_type, dst.scope(), allow_fold=allow_fold - ) - - if data_type != src.dtype: - assert data_type == "int%d" % env.ACC_WIDTH and src.dtype == "int%d" % env.INP_WIDTH - mem_type = env.dev.MEM_ID_ACC_8BIT - - irb = tvm.tir.ir_builder.create() - irb.scope_attr(env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(task_qid)) - - irb.emit( - tvm.tir.call_extern( - "int32", - "VTALoadBuffer2D", - env.dev.command_handle, - src.data, - offset, - x_size, - y_size, - x_stride, - x_pad_before, - y_pad_before, - x_pad_after, - y_pad_after, - dst.access_ptr("r", "int32"), - mem_type, - ) - ) - return irb.get() - - else: - raise RuntimeError("Do not support copy %s->%s" % (src.scope(), dst.scope())) - - return tvm.tir.transform.InjectCopyIntrin("dma_copy", _inject_copy) - - -def _get_gemm_intrin_buffer(): - env = get_env() - wgt_lanes = env.WGT_ELEM_BITS // env.WGT_WIDTH - assert wgt_lanes == env.BLOCK_OUT * env.BLOCK_IN - wgt_shape = (env.BLOCK_OUT, env.BLOCK_IN) - assert wgt_shape[0] * wgt_shape[1] == wgt_lanes - inp_lanes = env.INP_ELEM_BITS // env.INP_WIDTH - assert inp_lanes == env.BATCH * env.BLOCK_IN - inp_shape = (env.BATCH, env.BLOCK_IN) - assert inp_shape[0] * inp_shape[1] == inp_lanes - out_lanes = env.ACC_ELEM_BITS // env.ACC_WIDTH - assert out_lanes == env.BATCH * env.BLOCK_OUT - out_shape = (env.BATCH, env.BLOCK_OUT) - assert out_shape[0] * out_shape[1] == out_lanes - wgt = te.placeholder( - (wgt_shape[0], wgt_shape[1]), dtype="int%d" % env.WGT_WIDTH, name=env.wgt_scope - ) - inp = te.placeholder( - (inp_shape[0], inp_shape[1]), dtype="int%d" % env.INP_WIDTH, name=env.inp_scope - ) - k = te.reduce_axis((0, wgt_shape[1]), name="k") - out_dtype = "int%d" % env.ACC_WIDTH - out = te.compute( - (out_shape[0], out_shape[1]), - lambda i, j: te.sum(inp[i, k].astype(out_dtype) * wgt[j, k].astype(out_dtype), axis=[k]), - name="out", - ) - wgt_layout = tvm.tir.decl_buffer( - wgt.shape, - wgt.dtype, - env.wgt_scope, - scope=env.wgt_scope, - offset_factor=wgt_lanes, - data_alignment=wgt_lanes, - ) - inp_layout = tvm.tir.decl_buffer( - inp.shape, - inp.dtype, - env.inp_scope, - scope=env.inp_scope, - offset_factor=inp_lanes, - data_alignment=inp_lanes, - ) - out_layout = tvm.tir.decl_buffer( - out.shape, - out.dtype, - env.acc_scope, - scope=env.acc_scope, - offset_factor=out_lanes, - data_alignment=out_lanes, - ) - - return wgt_layout, inp_layout, out_layout - - -def InjectConv2DTransposeSkip(): - """Pass to skip 0-weights in conv2d transpose with stride > 1. 
- - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - - def _ftransform(func, mod, ctx): - env = get_env() - dwgt, dinp, dout = _get_gemm_intrin_buffer() - - calls = [] - selects = [] - - def _find_basics(op): - if isinstance(op, tvm.tir.BufferLoad): - calls.append(op) - elif isinstance(op, tvm.tir.Select): - selects.append(op) - - def _do_fold(op): - if _match_pragma(op, "conv2d_transpose_gemm"): - is_init = "_init" in str(op) - tvm.tir.stmt_functor.post_order_visit(op, _find_basics) - - if is_init: - # create inner most block - irb = tvm.tir.ir_builder.create() - dev = env.dev - irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) - irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) - irb.emit( - tvm.tir.call_intrin( - "int32", - "tir.vta.uop_push", - 0, - 1, - dout.access_ptr("rw", "int32"), - 0, - 0, - 0, - 0, - 0, - ) - ) - inner = irb.get() - # TODO(@tmoreau89): This is only a temporary fix, please take a look. - body = op.body.body - while isinstance(body, tvm.tir.IfThenElse): - body = body.then_case - args = body.indices - res_buffer = body.buffer - tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT) - inner = tvm.tir.AttrStmt( - [dout, res_buffer], - "buffer_bind_scope", - tvm.tir.call_intrin("handle", "tir.tvm_tuple", *tpl), - inner, - ) - return inner - else: - conv_call, data_call, kernel_call = calls[-3:] - pad_data_tensor = data_call.buffer - kernel_tensor = kernel_call.buffer - res_tensor = conv_call.buffer - - if selects: - condition = selects[0].condition - else: - condition = tvm.tir.const(1, "int") - - # create inner most block - irb = tvm.tir.ir_builder.create() - with irb.if_scope(condition): - dev = env.dev - irb.scope_attr( - dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE) - ) - irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) - irb.emit( - tvm.tir.call_intrin( - "int32", - "tir.vta.uop_push", - 0, - 0, - dout.access_ptr("rw", "int32"), - dinp.access_ptr("r", "int32"), - dwgt.access_ptr("r", "int32"), - 0, - 0, - 0, - ) - ) - inner = irb.get() - - args = conv_call.indices - tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT) - inner = tvm.tir.AttrStmt( - [dout, res_tensor], - "buffer_bind_scope", - tvm.tir.call_intrin("handle", "tir.tvm_tuple", *tpl), - inner, - ) - args = kernel_call.indices - tpl = ( - args[0], - 1, - args[1], - 1, - args[2], - 1, - args[3], - 1, - 0, - env.BLOCK_OUT, - 0, - env.BLOCK_IN, - ) - inner = tvm.tir.AttrStmt( - [dwgt, kernel_tensor], - "buffer_bind_scope", - tvm.tir.call_intrin("handle", "tir.tvm_tuple", *tpl), - inner, - ) - args = data_call.indices - tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_IN) - inner = tvm.tir.AttrStmt( - [dinp, pad_data_tensor], - "buffer_bind_scope", - tvm.tir.call_intrin("handle", "tir.tvm_tuple", *tpl), - inner, - ) - return inner - return None - - return func.with_body( - tvm.tir.stmt_functor.ir_transform(func.body, _do_fold, None, ["tir.AttrStmt"]) - ) - - return tvm.tir.transform.prim_func_pass( - _ftransform, opt_level=0, name="tir.vta.InjectConv2DTrasnposeSkip" - ) - - -def AnnotateALUCoProcScope(): - """Pass to insert ALU instruction. 
- - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - - def _ftransform(func, mod, ctx): - env = get_env() - - def _do_fold(stmt): - if _match_pragma(stmt, "alu"): - irb = tvm.tir.ir_builder.create() - irb.scope_attr( - env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(env.dev.QID_COMPUTE) - ) - irb.scope_attr( - env.dev.vta_axis, "coproc_uop_scope", tvm.tir.StringImm("VTAPushALUOp") - ) - irb.emit(stmt) - return irb.get() - if _match_pragma(stmt, "skip_alu"): - return tvm.tir.Evaluate(0) - return stmt - - return func.with_body( - tvm.tir.stmt_functor.ir_transform(func.body, None, _do_fold, ["tir.AttrStmt"]) - ) - - return tvm.tir.transform.prim_func_pass( - _ftransform, opt_level=0, name="tir.vta.AnnotateALUCoProcScope" - ) - - -def InjectALUIntrin(): - """Pass to inject ALU micro-ops. - - Returns - ------- - fpass : tvm.transform.Pass - The pass - """ - - def _ftransform(func, mod, ctx): - env = get_env() - idxm = tvm.tir.indexmod - analyzer = tvm.arith.Analyzer() - - def _do_fold(stmt): - def _flatten_loop(src_coeff, dst_coeff, extents): - src_coeff = list(src_coeff) - dst_coeff = list(dst_coeff) - extents = list(extents) - rev_src_coeff = [src_coeff.pop()] - rev_dst_coeff = [dst_coeff.pop()] - rev_extents = [] - assert src_coeff - vsrc = src_coeff.pop() - vdst = dst_coeff.pop() - vext = extents.pop() - while src_coeff: - next_src = src_coeff.pop() - next_dst = dst_coeff.pop() - next_ext = extents.pop() - - if analyzer.can_prove_equal(next_src, vsrc * vext) and analyzer.can_prove_equal( - next_dst, vdst * vext - ): - vext = analyzer.simplify(vext * next_ext) - else: - rev_src_coeff.append(vsrc) - rev_dst_coeff.append(vdst) - rev_extents.append(vext) - vsrc = next_src - vdst = next_dst - vext = next_ext - rev_src_coeff.append(vsrc) - rev_dst_coeff.append(vdst) - rev_extents.append(vext) - rev_src_coeff.reverse() - rev_dst_coeff.reverse() - rev_extents.reverse() - - return rev_src_coeff, rev_dst_coeff, rev_extents - - if _match_pragma(stmt, "alu"): - # Get to the innermost loop body - loop_body = stmt.body - nest_size = 0 - while isinstance(loop_body, tvm.tir.For): - loop_body = loop_body.body - nest_size += 1 - # Get the src/dst arguments - dst_var = loop_body.buffer.data - dst_idx = loop_body.indices[0] - # Derive loop variables and extents - tmp_body = stmt.body - indices = [] - extents = [] - for _ in range(nest_size): - indices.append(tmp_body.loop_var) - extents.append(tmp_body.extent) - tmp_body = tmp_body.body - # Derive opcode - if isinstance(loop_body.value, tvm.tir.Add): - alu_opcode = env.dev.ALU_OPCODE_ADD - lhs = loop_body.value.a - rhs = loop_body.value.b - elif isinstance(loop_body.value, tvm.tir.Sub): - alu_opcode = env.dev.ALU_OPCODE_SUB - lhs = loop_body.value.a - rhs = loop_body.value.b - elif isinstance(loop_body.value, tvm.tir.Mul): - alu_opcode = env.dev.ALU_OPCODE_MUL - lhs = loop_body.value.a - rhs = loop_body.value.b - elif isinstance(loop_body.value, tvm.tir.Min): - alu_opcode = env.dev.ALU_OPCODE_MIN - lhs = loop_body.value.a - rhs = loop_body.value.b - elif isinstance(loop_body.value, tvm.tir.Max): - alu_opcode = env.dev.ALU_OPCODE_MAX - lhs = loop_body.value.a - rhs = loop_body.value.b - elif isinstance(loop_body.value, tvm.tir.Call): - if loop_body.value.op.name == "tir.shift_left": - alu_opcode = env.dev.ALU_OPCODE_SHR - lhs = loop_body.value.args[0] - rhs = analyzer.simplify(-loop_body.value.args[1]) - elif loop_body.value.op.name == "tir.shift_right": - alu_opcode = env.dev.ALU_OPCODE_SHR - lhs = loop_body.value.args[0] - rhs = 
loop_body.value.args[1] - else: - raise RuntimeError( - "Function call not recognized %s" % (loop_body.value.op.name) - ) - elif isinstance(loop_body.value, tvm.tir.BufferLoad): - alu_opcode = env.dev.ALU_OPCODE_SHR - lhs = loop_body.value - rhs = tvm.tir.const(0, "int32") - else: - raise RuntimeError( - "Expression not recognized %s, %s, %s" - % (type(loop_body.value), str(loop_body.value), str(stmt)) - ) - - # Derive array index coefficients - dst_coeff = tvm.arith.detect_linear_equation(dst_idx, indices) - # Check if lhs/rhs is immediate - use_imm = False - imm_val = None - if isinstance(rhs, tvm.tir.IntImm): - assert lhs.buffer.data.same_as(dst_var) - src_coeff = tvm.arith.detect_linear_equation(lhs.indices[0], indices) - use_imm = True - imm_val = rhs - if isinstance(lhs, tvm.tir.IntImm): - assert rhs.buffer.data.same_as(dst_var) - src_coeff = tvm.arith.detect_linear_equation(rhs.indices[0], indices) - use_imm = True - imm_val = lhs - if imm_val is None: - imm_val = 0 - assert lhs.buffer.data.same_as(dst_var) and rhs.buffer.data.same_as(dst_var) - src_lhs_coeff = tvm.arith.detect_linear_equation(lhs.indices[0], indices) - src_rhs_coeff = tvm.arith.detect_linear_equation(rhs.indices[0], indices) - # Determine which side has the same coefficients - lhs_equal = True - rhs_equal = True - for i, coef in enumerate(dst_coeff): - if not tvm.ir.structural_equal(coef, src_lhs_coeff[i]): - lhs_equal = False - if not tvm.ir.structural_equal(coef, src_rhs_coeff[i]): - rhs_equal = False - # Make sure at least one of the source is identical to the - # destination (in-place computation) - assert lhs_equal or rhs_equal - # Assign the source coefficients - if lhs_equal: - src_coeff = src_rhs_coeff - else: - src_coeff = src_lhs_coeff - - # Ensure that we have the proper tensor dimensions in the - # innermost loop (pattern match) - src_coeff = list(src_coeff) - dst_coeff = list(dst_coeff) - extents = list(extents) - assert len(src_coeff) > 1 - assert len(dst_coeff) > 1 - assert len(extents) != 0 - tvm.ir.assert_structural_equal( - analyzer.simplify(idxm(src_coeff[-1], env.BATCH * env.BLOCK_OUT)), T.int32(0) - ) - tvm.ir.assert_structural_equal( - analyzer.simplify(idxm(dst_coeff[-1], env.BATCH * env.BLOCK_OUT)), T.int32(0) - ) - tvm.ir.assert_structural_equal(src_coeff[-2], T.int32(1)) - tvm.ir.assert_structural_equal(dst_coeff[-2], T.int32(1)) - if env.BATCH > 1: - assert len(src_coeff) > 2 - assert len(dst_coeff) > 2 - assert len(extents) > 1 - tvm.ir.assert_structural_equal(src_coeff[-3], T.int32(env.BLOCK_OUT)) - tvm.ir.assert_structural_equal(dst_coeff[-3], T.int32(env.BLOCK_OUT)) - - # Apply tensorization of the loop coefficients - src_offset = src_coeff[-1] - dst_offset = dst_coeff[-1] - if env.BATCH == 1: - src_coeff = src_coeff[:-2] - dst_coeff = dst_coeff[:-2] - extents = extents[:-1] - else: - src_coeff = src_coeff[:-3] - dst_coeff = dst_coeff[:-3] - extents = extents[:-2] - src_coeff.append(src_offset) - dst_coeff.append(dst_offset) - src_coeff = [analyzer.simplify(c // (env.BATCH * env.BLOCK_OUT)) for c in src_coeff] - dst_coeff = [analyzer.simplify(c // (env.BATCH * env.BLOCK_OUT)) for c in dst_coeff] - - # Flatten the outer loops - if extents: - src_coeff, dst_coeff, extents = _flatten_loop(src_coeff, dst_coeff, extents) - - # Insert ALU micro-ops - irb = tvm.tir.ir_builder.create() - for idx, extent in enumerate(extents): - irb.emit( - tvm.tir.call_extern( - "int32", - "VTAUopLoopBegin", - extent, - dst_coeff[idx], - src_coeff[idx], - 0, - ) - ) - use_imm = int(use_imm) - irb.emit( - 
tvm.tir.call_intrin( - "int32", - "tir.vta.uop_push", - 1, - 0, - dst_coeff[len(dst_coeff) - 1], - src_coeff[len(src_coeff) - 1], - 0, - alu_opcode, - use_imm, - imm_val, - ) - ) - for extent in extents: - irb.emit(tvm.tir.call_extern("int32", "VTAUopLoopEnd")) - return irb.get() - return stmt - - return func.with_body( - tvm.tir.stmt_functor.ir_transform(func.body, None, _do_fold, ["tir.AttrStmt"]) - ) - - return tvm.tir.transform.prim_func_pass( - _ftransform, opt_level=0, name="tir.vta.InjectALUIntrin" - ) diff --git a/vta/runtime/device_api.cc b/vta/runtime/device_api.cc deleted file mode 100644 index b021ed103933..000000000000 --- a/vta/runtime/device_api.cc +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file device_api.cc - * \brief TVM device API for VTA - */ - -#include -#include - -#include "../../src/runtime/workspace_pool.h" -#include "runtime.h" - -namespace tvm { -namespace runtime { - -class VTADeviceAPI final : public DeviceAPI { - public: - void SetDevice(Device dev) final {} - - void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final { - if (kind == kExist) { - *rv = 1; - } - } - - void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final { - return VTABufferAlloc(size); - } - - void FreeDataSpace(Device dev, void* ptr) final { VTABufferFree(ptr); } - - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - Device dev_from, Device dev_to, DLDataType type_hint, - TVMStreamHandle stream) final { - int kind_mask = 0; - if (dev_from.device_type != kDLCPU) { - kind_mask |= 2; - } - if (dev_to.device_type != kDLCPU) { - kind_mask |= 1; - } - VTABufferCopy(from, from_offset, to, to_offset, size, kind_mask); - } - - void StreamSync(Device dev, TVMStreamHandle stream) final {} - - void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; - - void FreeWorkspace(Device dev, void* data) final; - - static VTADeviceAPI* Global() { - static VTADeviceAPI* inst = new VTADeviceAPI(); - return inst; - } -}; - -struct VTAWorkspacePool : public WorkspacePool { - VTAWorkspacePool() : WorkspacePool(kDLExtDev, VTADeviceAPI::Global()) {} -}; - -void* VTADeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { - return dmlc::ThreadLocalStore::Get()->AllocWorkspace(dev, size); -} - -void VTADeviceAPI::FreeWorkspace(Device dev, void* data) { - dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); -} - -// Register device api with override. 
-static TVM_ATTRIBUTE_UNUSED auto& __register_dev__ = - ::tvm::runtime::Registry::Register("device_api.ext_dev", true) - .set_body([](TVMArgs args, TVMRetValue* rv) { - DeviceAPI* ptr = VTADeviceAPI::Global(); - *rv = static_cast(ptr); - }); -} // namespace runtime -} // namespace tvm diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc deleted file mode 100644 index 66a428cc5590..000000000000 --- a/vta/runtime/runtime.cc +++ /dev/null @@ -1,1417 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file runtime.cc - * \brief Generic VTA runtime in C++11. - * - * The runtime depends on specific instruction - * stream spec as specified in hw_spec.h - */ -#include "runtime.h" - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace vta { - -// Avoid bad configurations. -static_assert(VTA_UOP_WIDTH == sizeof(VTAUop) * 8, "VTA_UOP_WIDTH do not match VTAUop size"); - -/*! \brief Enable coherent access of data buffers between VTA and CPU */ -static const bool kBufferCoherent = VTA_COHERENT_ACCESSES; -/*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */ -static const bool kAlwaysCache = true; - -template -class AlignmentAllocator : public std::allocator { - public: - typedef T value_type; - typedef std::size_t size_type; - typedef std::ptrdiff_t difference_type; - - typedef T* pointer; - typedef const T* const_pointer; - - typedef T& reference; - typedef const T& const_reference; - - inline AlignmentAllocator() throw() {} - - template - inline AlignmentAllocator(const AlignmentAllocator&) throw() {} - - inline ~AlignmentAllocator() throw() {} - - inline pointer address(reference r) { return &r; } - - inline const_pointer address(const_reference r) const { return &r; } - - inline pointer allocate(size_type n) { - pointer mem = nullptr; - const int err = posix_memalign((void**)&mem, N, n * sizeof(value_type)); - ICHECK_EQ(err, 0) << "InternalError: failed to allocate aligned memory. "; - return mem; - } - - inline void deallocate(pointer p, size_type) { free(p); } - - inline void construct(pointer p, const value_type& wert) { new (p) value_type(wert); } - - inline void destroy(pointer p) { p->~value_type(); } - - inline size_type max_size() const throw() { return size_type(-1) / sizeof(value_type); } - - template - struct rebind { - typedef AlignmentAllocator other; - }; - - bool operator!=(const AlignmentAllocator& other) const { return !(*this == other); } - - // Returns true if and only if storage allocated from *this - // can be deallocated from other, and vice versa. - // Always returns true for stateless allocators. 
- bool operator==(const AlignmentAllocator& other) const { return true; } -}; - -class DeviceAllocStat { - public: - void AddAlloc(const void* ptr) { - std::lock_guard lock(mtx_); - allocated_.insert(ptr); - } - - bool CheckAlloc(const void* ptr) { - std::lock_guard lock(mtx_); - return allocated_.count(ptr); - } - - void DelAlloc(const void* ptr) { - std::lock_guard lock(mtx_); - allocated_.erase(ptr); - } - - private: - std::set allocated_; - std::mutex mtx_; -}; - -// here we use a global variable to memorize the allocation stats -static std::shared_ptr alloc_stat(new DeviceAllocStat()); - -/*! - * \brief Data buffer represents data on CMA. - */ -struct DataBuffer { - DataBuffer() { alloc_stat_ = alloc_stat; } - - /*! \return Virtual address of the data. */ - void* virt_addr() const { return data_; } - /*! \return Physical address of the data. */ - vta_phy_addr_t phy_addr() const { return phy_addr_; } - /*! - * \brief Invalidate the cache of given location in data buffer. - * \param offset The offset to the data. - * \param size The size of the data. - */ - void InvalidateCache(size_t offset, size_t size) { - if (!kBufferCoherent && kAlwaysCache) { - VTAInvalidateCache(reinterpret_cast(data_) + offset, phy_addr_ + offset, size); - } - } - /*! - * \brief Invalidate the cache of certain location in data buffer. - * \param offset The offset to the data. - * \param size The size of the data. - */ - void FlushCache(size_t offset, size_t size) { - if (!kBufferCoherent && kAlwaysCache) { - VTAFlushCache(reinterpret_cast(data_) + offset, phy_addr_ + offset, size); - } - } - /*! - * \brief Performs a copy operation from host memory to buffer allocated with VTAMemAlloc. - * \param dst The desination buffer in FPGA-accessible memory. Has to be allocated with - * VTAMemAlloc(). \param src The source buffer in host memory. \param size Size of the region in - * Bytes. - */ - void MemCopyFromHost(void* dst, const void* src, size_t size) { - VTAMemCopyFromHost(dst, src, size); - } - /*! - * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. - * \param dst The desination buffer in host memory. - * \param src The source buffer in FPGA-accessible memory. Has to be allocated with VTAMemAlloc(). - * \param size Size of the region in Bytes. - */ - void MemCopyToHost(void* dst, const void* src, size_t size) { VTAMemCopyToHost(dst, src, size); } - /*! - * \brief Allocate a buffer of a given size. - * \param size The size of the buffer. - */ - static DataBuffer* Alloc(size_t size) { - void* data = VTAMemAlloc(size, kAlwaysCache); - CHECK(data != nullptr); - DataBuffer* buffer = new DataBuffer(); - buffer->data_ = data; - buffer->phy_addr_ = VTAMemGetPhyAddr(data); - - alloc_stat->AddAlloc(buffer); - return buffer; - } - /*! - * \brief Free the data buffer. - * \param buffer The buffer to be freed. - */ - static void Free(DataBuffer* buffer) { - alloc_stat->DelAlloc(buffer); - VTAMemFree(buffer->data_); - delete buffer; - } - /*! - * \brief Create data buffer header from buffer ptr. - * \param buffer The buffer pointer. - * \return The corresponding data buffer header. - */ - static DataBuffer* FromHandle(const void* buffer) { - if (alloc_stat->CheckAlloc(buffer)) { - return const_cast(reinterpret_cast(buffer)); - } else { - return nullptr; - } - } - - private: - /*! \brief The internal data. */ - void* data_; - /*! \brief The physical address of the buffer, excluding header. 
*/ - vta_phy_addr_t phy_addr_; - - // a copy of global shared_ptr instance - // to avoid the global instance is destructed before there are still some pending DataBuffers not - // destructed - std::shared_ptr alloc_stat_; -}; - -/*! - * \brief Micro op kernel. - * Contains functions to construct the kernel with prefix Push. - */ -class UopKernel { - public: - /*! \brief Loop information. */ - struct LoopEntry { - uint32_t extent; - uint32_t dst_factor; - uint32_t src_factor; - uint32_t wgt_factor; - }; - /*! - * \brief Construct UopKernel with signature. - * \param signature The pointer to signature. - * \param nbytes Number of bytes. - */ - UopKernel(const char* signature, int nbytes) : signature_(signature, signature + nbytes) {} - /*! - * \brief Verify if the signature is correct. - * \param signature Signature ptr. - * \param nbytes Number of bytes. - */ - bool MatchSignature(void* signature, int nbytes) const { - if (static_cast(nbytes) != signature_.size()) return false; - return memcmp(signature, signature_.data(), nbytes) == 0; - } - /*! \return Whether the kernel is cached in SRAM. */ - bool cached() const { return sram_begin_ != sram_end_; } - /*! \return The length of the micro op sequence. */ - size_t size() const { return seq_.size(); } - /*! \return The micro-op data. */ - const VTAUop* data() const { return seq_.data(); } - /*! \return The loop structure. */ - const std::vector& loop() const { return loop_; } - /*! - * \brief Declare loop start. - * \param extent The loop extent. - * \param dst_factor Loop factor of accum index. - * \param src_factor Loop factor of input index - * \param wgt_factor Loop factor of weight index. - */ - void PushLoopBegin(uint32_t extent, uint32_t dst_factor, uint32_t src_factor, - uint32_t wgt_factor) { - LoopEntry le; - le.extent = extent; - le.dst_factor = dst_factor; - le.src_factor = src_factor; - le.wgt_factor = wgt_factor; - CHECK_EQ(seq_.size(), 0U); - CHECK_LT(loop_.size(), 2U); - loop_.push_back(le); - ++loop_ptr_; - } - /*! - * \brief Declare loop end. - */ - void PushLoopEnd() { --loop_ptr_; } - /*! - * \brief Push micro op into kernel. - * \param mode Set to GEMM mode if set to 0, ALU mode is set to 1. - * \param reset_out Resets the accum to 0. - * \param dst_index The accum memory index. - * \param src_index The input memory (gemm) / accum memory (alu) index. - * \param wgt_index The weight memory index. - * \param opcode The ALU opcode. - * \param use_imm Use immediate in ALU mode if set to true. - * \param imm_val Immediate value in ALU mode. - */ - void Push(uint32_t mode, uint32_t reset_out, uint32_t dst_index, uint32_t src_index, - uint32_t wgt_index, uint32_t opcode, uint32_t use_imm, int32_t imm_val) { - // The loop nest structure - VerifyDep(dst_index); - VTAUop op; - op.dst_idx = dst_index; - op.src_idx = src_index; - op.wgt_idx = wgt_index; - seq_.push_back(op); - // Ensure that mode is consistent if set - if (mode_ == 0xFFFFFFFF) { - mode_ = mode; - } else { - CHECK(mode_ == mode); - } - // Set reset_out field if unset - if (reset_out_ == 0xFFFFFFFF) { - reset_out_ = reset_out; - } else { - CHECK(reset_out_ == reset_out); - } - // Check kernel op and imm/imm_val in ALU mode - if (mode == 1) { - if (opcode_ == 0xFFFFFFFF) { - opcode_ = opcode; - use_imm_ = use_imm; - imm_val_ = imm_val; - } else { - CHECK(opcode_ == opcode); - CHECK(use_imm_ == use_imm); - CHECK(imm_val_ == imm_val); - } - } - } - /*! \brief Dump kernel micro ops to stdout. 
*/ - void Dump() { - uint32_t size = seq_.size(); - printf("There are %u uops\n", size); - for (uint32_t i = 0; i < size; ++i) { - printf("[%04u]\t acc=%u, inp=%u, wgt=%u\n", i, seq_[i].dst_idx, seq_[i].src_idx, - seq_[i].wgt_idx); - } - printf("\n"); - } - - public: - // The kernel's mode, opcode, immediate setting and value - uint32_t mode_{0xFFFFFFFF}; // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU - uint32_t opcode_{0xFFFFFFFF}; - uint32_t reset_out_{0xFFFFFFFF}; - bool use_imm_{false}; - int16_t imm_val_{0}; - - private: - // Verify that we don't write to the same acc_mem index two cycles in a row - void VerifyDep(uint32_t dst_index) { - size_t step = std::min(static_cast(2U), seq_.size()); - for (size_t i = seq_.size() - step; i < seq_.size(); ++i) { - CHECK(seq_[i].dst_idx != dst_index); - } - } - // The uop buffer - template - friend class UopQueue; - friend class CommandQueue; - // SRAM location if begin != end - uint32_t sram_begin_{0}; - uint32_t sram_end_{0}; - // The signature used for verification - std::vector signature_; - // Internal sequence - std::vector seq_; - // The loop nest structure specific to ALU instructions - std::vector loop_; - // The loop pointer - size_t loop_ptr_{0}; -}; - -/*! - * \brief Base class of all queues to send and recv serial data. - */ -template -class BaseQueue { - public: - virtual ~BaseQueue() { - if (fpga_buff_ != nullptr) { - VTAMemFree(fpga_buff_); - } - } - /*! \return Content of DRAM buffer. */ - char* dram_buffer() const { return dram_buffer_; } - /*! \return Physical address of DRAM. */ - vta_phy_addr_t dram_phy_addr() const { - CHECK(fpga_buff_phy_); - return fpga_buff_phy_; - } - /*! \return Whether there is pending information. */ - bool pending() const { return sram_begin_ != sram_end_; } - /*! \brief Initialize the space of the buffer. */ - void InitSpace(uint32_t elem_bytes, uint32_t max_bytes, bool coherent, bool always_cache) { - coherent_ = coherent; - always_cache_ = always_cache; - elem_bytes_ = elem_bytes; - // Allocate buffer ahead of time - fpga_buff_ = static_cast(VTAMemAlloc(max_bytes, coherent_ || always_cache_)); - CHECK(fpga_buff_ != nullptr); - fpga_buff_phy_ = VTAMemGetPhyAddr(fpga_buff_); - } - /*! - * \brief Reset the pointer of the buffer. - * Set SRAM pointer to be the current end. - */ - virtual void Reset() { - dram_buffer_.clear(); - // reset to 0 as we always copy data to area starting from fpga_buff base - // we do mem copy for every DeviceRun - sram_end_ = 0; - sram_begin_ = sram_end_; - } - - protected: - // Cache coherence access (shared memory only) - bool coherent_{false}; - // Make the buffer cacheable - bool always_cache_{false}; - // Element bytes - uint32_t elem_bytes_{0}; - // Begin location of current SRAM read in FIFO mode - uint32_t sram_begin_{0}; - // End location of current SRAM write in FIFO mode - uint32_t sram_end_{0}; - // The buffer in DRAM - std::vector> dram_buffer_; - // FPGA accessible buffer - void* fpga_buff_{NULL}; - // Physical address of the FPGA buffer - vta_phy_addr_t fpga_buff_phy_{0}; -}; - -/*! - * \brief Micro op buffer that manages the micro op cache. 
- */ -template -class UopQueue : public BaseQueue { - public: - void InitSpace() { BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache); } - // Push data to the queue - template - void Push(UopKernel* kernel, FAutoSync fautosync) { - // if the micro-op is cached in VTA SRAM, skip - if (kernel->cached()) return; - // check if we've exceeded the size of the allocated FPGA readable buffer - size_t num_op = kernel->size(); - if (dram_buffer_.size() + num_op > kMaxElems) { - fautosync(); - CHECK(dram_buffer_.size() <= kMaxElems); - } - // Cannot have a micro-op kernel larger than SRAM buffer - CHECK(num_op <= kMaxNumUop); - uint32_t uop_begin = 0; - if (sram_end_ + num_op > kMaxNumUop) { - // Need to evict - cache_idx_ = 0; - sram_begin_ = 0; - sram_end_ = num_op; - } else { - uop_begin = sram_end_; - sram_end_ += num_op; - } - // Simple eviction policy - uint32_t evict_begin = cache_idx_; - for (; cache_idx_ < cache_.size(); ++cache_idx_) { - if (cache_[cache_idx_]->sram_begin_ >= sram_end_) break; - // Mark the kernel as "invalid" - cache_[cache_idx_]->sram_begin_ = 0; - cache_[cache_idx_]->sram_end_ = 0; - } - // Increase size of buffer - kernel->sram_begin_ = uop_begin; - kernel->sram_end_ = sram_end_; - CHECK(kernel->cached()); - cache_.insert(cache_.begin() + cache_idx_, kernel); - cache_.erase(cache_.begin() + evict_begin, cache_.begin() + cache_idx_); - cache_idx_ = evict_begin + 1; - } - // Flush micro op load instruction - void FlushUopLoad(VTAMemInsn* insn) { - if (sram_begin_ != sram_end_) { - // Derive offset in FPGA-readable buffer - int32_t offset = 0; - for (uint32_t i = 0; i < cache_idx_ - 1; ++i) { - offset += cache_[i]->size() * kElemBytes; - } - insn->memory_type = VTA_MEM_ID_UOP; - insn->sram_base = sram_begin_; - // Update cache idx to physical address map - insn->dram_base = (fpga_buff_phy_ + offset) / kElemBytes; - insn->y_size = 1; - insn->x_size = (sram_end_ - sram_begin_); - insn->x_stride = (sram_end_ - sram_begin_); - insn->y_pad_0 = 0; - insn->y_pad_1 = 0; - insn->x_pad_0 = 0; - insn->x_pad_1 = 0; - // Reset indices - sram_begin_ = sram_end_; - } - } - /*! \brief clear cache and reset base queue buffer.*/ - void Reset() { - // unmark "cached" status - // as we cannot assume it is still in SRAM across DeviceRun - for (UopKernel* kernel : cache_) { - kernel->sram_begin_ = 0; - kernel->sram_end_ = 0; - } - - cache_.clear(); - cache_idx_ = 0; - BaseQueue::Reset(); - } - void AutoReadBarrier() { ReadBarrier(); } - /*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */ - void ReadBarrier() { - CHECK(fpga_buff_ != nullptr); - CHECK(fpga_buff_phy_); - // Iterate over caches; allocate buffer in FPGA-readable memory - uint32_t buff_size = 0; - for (uint32_t i = 0; i < cache_.size(); ++i) { - buff_size += cache_[i]->size() * kElemBytes; - } - CHECK(buff_size <= kMaxBytes); - - // merge all the cache entries and do CopyFromHost once - uint32_t total_size = 0; - for (uint32_t i = 0; i < cache_.size(); ++i) { - uint32_t ksize = cache_[i]->size() * kElemBytes; - total_size += ksize; - } - - char* lbuf = nullptr; - const int err = posix_memalign((void**)&lbuf, ALLOC_ALIGNMENT, total_size); - ICHECK_EQ(err, 0) << "InternalError: failed to allocate aligned memory for load buffer. 
"; - uint32_t offset = 0; - for (uint32_t i = 0; i < cache_.size(); ++i) { - uint32_t ksize = cache_[i]->size() * kElemBytes; - memcpy(lbuf + offset, cache_[i]->data(), ksize); - offset += ksize; - } - VTAMemCopyFromHost(static_cast(fpga_buff_), lbuf, total_size); - free(lbuf); - - // Flush if we're using a shared memory system - // and if interface is non-coherent - if (!coherent_ && always_cache_) { - VTAFlushCache(fpga_buff_, fpga_buff_phy_, offset); - } - } - - private: - // Cache pointer - uint32_t cache_idx_{0}; - // Cached ring, sorted by sram_begin - std::vector cache_; - // Constants - static constexpr int kElemBytes = sizeof(VTAUop); - static constexpr int kMaxNumUop = VTA_UOP_BUFF_DEPTH; - static constexpr int kMaxElems = kMaxBytes / kElemBytes; -}; - -// Internal kernel structure -class UopKernelMap { - public: - // Simple hash map - UopKernel** Get(void* signature, int nbytes) { - uint32_t key = 0; - CHECK(nbytes == 0 || nbytes == sizeof(int)); - if (nbytes == sizeof(int)) { - memcpy(&key, signature, sizeof(int)); - key = key + 1; - } - CHECK_LT(key, 100); - if (kmap_.size() <= key) { - kmap_.resize(key + 1, nullptr); - } - return &(kmap_[key]); - } - - private: - std::vector kmap_; -}; - -enum PipelineStage : int { kNoneStage = 0, kLoadStage = 1, kComputeStage = 2, kStoreStage = 3 }; - -// Instruction Queue -template -class InsnQueue : public BaseQueue { - public: - /*! \brief Initialize the space. */ - void InitSpace() { - BaseQueue::InitSpace(kElemBytes, kMaxBytes, kCoherent, kAlwaysCache); - // Initialize the stage - std::fill(pending_pop_prev_, pending_pop_prev_ + 4, 0); - std::fill(pending_pop_next_, pending_pop_next_ + 4, 0); - } - /*! \return The data pointer. */ - VTAGenericInsn* data() { return dram_buffer_.data(); } - /*! \return Number of instructions. 
*/ - uint32_t count() { return dram_buffer_.size(); } - // Insert dependency push of load - void DepPop(int from, int to) { - // NOTE: This instruction executes on queue[to] - if (from < to) { - if (pending_pop_prev_[to]) { - this->CommitPendingPop(to); - } - pending_pop_prev_[to] = 1; - } else { - if (pending_pop_next_[to]) { - this->CommitPendingPop(to); - } - pending_pop_next_[to] = 1; - } - // Impossible condition - CHECK(from != kLoadStage || to != kStoreStage); - CHECK(from != kStoreStage || to != kLoadStage); - } - // Insert dependency push of load - void DepPush(int from, int to) { - // NOTE: this instruction executes on queue[from] - this->CommitPendingPop(from); - if (!dram_buffer_.empty()) { - VTAMemInsn* mptr = reinterpret_cast(&dram_buffer_.back()); - if (GetPipelineStage(mptr) == from) { - if (from < to && !mptr->push_next_dep) { - // push(LD->C) or push(C->ST) - mptr->push_next_dep = true; - return; - } else if (from > to && !mptr->push_prev_dep) { - // push(C->LD) or push(ST->C) - mptr->push_prev_dep = true; - return; - } - } - } - if (from < to) { - // Push next dep - PushNoop(from, false, true, false, false); - } else { - // Push prev dep - PushNoop(from, true, false, false, false); - } - } - // Create a new instruction for a GEMM stage - VTAGemInsn* CreateGemInsn() { return reinterpret_cast(Create(kComputeStage)); } - // Create a new instruction for a ALU stage - VTAAluInsn* CreateAluInsn() { return reinterpret_cast(Create(kComputeStage)); } - // Create a new instruction for a memory stage - VTAMemInsn* CreateMemInsn(int memory_type) { - return reinterpret_cast(Create(GetMemPipelineStage(memory_type))); - } - // create a new instruction for a store stage - VTAMemInsn* CreateStoreInsn() { return reinterpret_cast(Create(kStoreStage)); } - // Rewrite instruction stream to force serial execution - void RewriteForceSerial() { - int insn_count = count(); - VTAMemInsn* mem_ptr = reinterpret_cast(data()); - VTAMemInsn* mem_last_store_ptr = nullptr; - VTAMemInsn* mem_last_ptr = nullptr; - for (int i = 1; i < insn_count; ++i) { - PipelineStage prev = GetPipelineStageAll(mem_ptr + i - 1); - PipelineStage now = GetPipelineStageAll(mem_ptr + i); - if (prev == kLoadStage && now == kComputeStage) { - mem_ptr[i - 1].push_prev_dep = false; - mem_ptr[i - 1].push_next_dep = true; - mem_ptr[i].pop_prev_dep = true; - mem_ptr[i].pop_next_dep = false; - } else if (prev == kComputeStage && now == kLoadStage) { - mem_ptr[i - 1].push_prev_dep = true; - mem_ptr[i - 1].push_next_dep = false; - mem_ptr[i].pop_prev_dep = false; - mem_ptr[i].pop_next_dep = true; - } else if (prev == kStoreStage && now == kComputeStage) { - mem_ptr[i - 1].push_prev_dep = true; - mem_ptr[i - 1].push_next_dep = false; - mem_ptr[i].pop_prev_dep = false; - mem_ptr[i].pop_next_dep = true; - } else if (prev == kComputeStage && now == kStoreStage) { - mem_ptr[i - 1].push_prev_dep = false; - mem_ptr[i - 1].push_next_dep = true; - mem_ptr[i].pop_prev_dep = true; - mem_ptr[i].pop_next_dep = false; - } else { - mem_ptr[i - 1].push_prev_dep = false; - mem_ptr[i - 1].push_next_dep = false; - mem_ptr[i].pop_prev_dep = false; - mem_ptr[i].pop_next_dep = false; - } - if (now == kStoreStage) { - mem_last_store_ptr = &mem_ptr[i]; - } - mem_last_ptr = &mem_ptr[i]; - } - // set dependency to make sure all core instruction get excuted - // before last FINISH instruction - if (mem_last_store_ptr && mem_last_ptr == mem_last_store_ptr) { - mem_last_store_ptr->push_prev_dep = true; - if (!pending_pop_next_[kComputeStage]) { - 
DepPop(kStoreStage, kComputeStage); - } - CommitPendingPop(kComputeStage); - } else { - pending_pop_next_[kComputeStage] = 0; - } - DepPush(kComputeStage, kLoadStage); - DepPop(kLoadStage, kComputeStage); - if (!pending_pop_next_[kLoadStage]) { - DepPop(kComputeStage, kLoadStage); - } - CommitPendingPop(kLoadStage); - DepPush(kLoadStage, kComputeStage); - CommitPendingPop(kComputeStage); - } - // Helper function: Get Opcode string - const char* getOpcodeString(int opcode, bool use_imm) { - // The string name - if (opcode == VTA_ALU_OPCODE_MIN) { - if (use_imm) { - return "min imm"; - } else { - return "min"; - } - } else if (opcode == VTA_ALU_OPCODE_MAX) { - if (use_imm) { - return "max imm"; - } else { - return "max"; - } - } else if (opcode == VTA_ALU_OPCODE_ADD) { - if (use_imm) { - return "add imm"; - } else { - return "add"; - } - } else if (opcode == VTA_ALU_OPCODE_SHR) { - return "shr"; - } else if (opcode == VTA_ALU_OPCODE_MUL) { - return "mul"; - } - - return "unknown op"; - } - // Dump instructions in the queue - void DumpInsn() { - // Keep tabs on dependence queues - int l2g_queue = 0; - int g2l_queue = 0; - int s2g_queue = 0; - int g2s_queue = 0; - // Converter - union VTAInsn c; - // Iterate over all instructions - int insn_count = count(); - const VTAGenericInsn* insn = data(); - printf("There are %u instructions\n", insn_count); - for (int i = 0; i < insn_count; ++i) { - // Fetch instruction and decode opcode - c.generic = insn[i]; - printf("INSTRUCTION %u: ", i); - if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { - if (c.mem.x_size == 0) { - if (c.mem.opcode == VTA_OPCODE_STORE) { - printf("NOP-STORE-STAGE\n"); - } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { - printf("NOP-COMPUTE-STAGE\n"); - } else { - printf("NOP-MEMORY-STAGE\n"); - } - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - // Count status in queues - if (c.mem.opcode == VTA_OPCODE_STORE) { - CHECK(c.mem.pop_next_dep == false); - CHECK(c.mem.push_next_dep == false); - if (c.mem.pop_prev_dep) g2s_queue--; - if (c.mem.push_prev_dep) s2g_queue++; - } else if (c.mem.opcode == VTA_OPCODE_LOAD && - (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) { - CHECK(c.mem.pop_prev_dep == false); - CHECK(c.mem.push_prev_dep == false); - if (c.mem.pop_next_dep) g2l_queue--; - if (c.mem.push_next_dep) l2g_queue++; - } else { - if (c.mem.pop_prev_dep) l2g_queue--; - if (c.mem.push_prev_dep) g2l_queue++; - if (c.mem.pop_next_dep) s2g_queue--; - if (c.mem.push_next_dep) g2s_queue++; - } - printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); - printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); - continue; - } - // Print instruction field information - if (c.mem.opcode == VTA_OPCODE_LOAD) { - printf("LOAD "); - if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n"); - if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n"); - if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n"); - if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n"); - } - if (c.mem.opcode == VTA_OPCODE_STORE) { - printf("STORE:\n"); - } - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\tDRAM: 0x%08x, 
SRAM:0x%04x\n", static_cast(c.mem.dram_base), - static_cast(c.mem.sram_base)); - printf("\ty: size=%d, pad=[%d, %d]\n", static_cast(c.mem.y_size), - static_cast(c.mem.y_pad_0), static_cast(c.mem.y_pad_1)); - printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", static_cast(c.mem.x_size), - static_cast(c.mem.x_stride), static_cast(c.mem.x_pad_0), - static_cast(c.mem.x_pad_1)); - } else if (c.mem.opcode == VTA_OPCODE_GEMM) { - // Print instruction field information - printf("GEMM\n"); - - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\treset_out: %d\n", static_cast(c.gemm.reset_reg)); - printf("\trange (%d, %d)\n", static_cast(c.gemm.uop_bgn), - static_cast(c.gemm.uop_end)); - printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", - static_cast(c.gemm.iter_out), static_cast(c.gemm.wgt_factor_out), - static_cast(c.gemm.src_factor_out), static_cast(c.gemm.dst_factor_out)); - printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", - static_cast(c.gemm.iter_in), static_cast(c.gemm.wgt_factor_in), - static_cast(c.gemm.src_factor_in), static_cast(c.gemm.dst_factor_in)); - } else if (c.mem.opcode == VTA_OPCODE_ALU) { - // Print instruction field information - printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\treset_out: %d\n", static_cast(c.alu.reset_reg)); - printf("\trange (%d, %d)\n", static_cast(c.alu.uop_bgn), - static_cast(c.alu.uop_end)); - printf("\touter loop - iter: %d, dst: %d, src: %d\n", static_cast(c.alu.iter_out), - static_cast(c.alu.dst_factor_out), static_cast(c.alu.src_factor_out)); - printf("\tinner loop - iter: %d, dst: %d, src: %d\n", static_cast(c.alu.iter_in), - static_cast(c.alu.dst_factor_in), static_cast(c.alu.src_factor_in)); - } else if (c.mem.opcode == VTA_OPCODE_FINISH) { - printf("FINISH\n"); - } - - // Count status in queues - if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { - if (c.mem.opcode == VTA_OPCODE_STORE) { - CHECK(c.mem.pop_next_dep == false); - CHECK(c.mem.push_next_dep == false); - if (c.mem.pop_prev_dep) g2s_queue--; - if (c.mem.push_prev_dep) s2g_queue++; - } else if (c.mem.opcode == VTA_OPCODE_LOAD && - (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) { - CHECK(c.mem.pop_prev_dep == false); - CHECK(c.mem.push_prev_dep == false); - if (c.mem.pop_next_dep) g2l_queue--; - if (c.mem.push_next_dep) l2g_queue++; - } else { - if (c.mem.pop_prev_dep) l2g_queue--; - if (c.mem.push_prev_dep) g2l_queue++; - if (c.mem.pop_next_dep) s2g_queue--; - if (c.mem.push_next_dep) g2s_queue++; - } - } else if (c.mem.opcode == VTA_OPCODE_GEMM || c.mem.opcode == VTA_OPCODE_ALU) { - // Print instruction field information - if (c.gemm.pop_prev_dep) l2g_queue--; - if (c.gemm.push_prev_dep) g2l_queue++; - if (c.gemm.pop_next_dep) s2g_queue--; - if (c.gemm.push_next_dep) g2s_queue++; - } - printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); - printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); - } - } - // Commit all pending pop of corresponding stage - void CommitPendingPop(int stage) { - // Handle the LD<->compute queue - // NOTE: pop executes on target(stage) 
- CHECK(stage > 0 && stage < 4); - if (pending_pop_prev_[stage] || pending_pop_next_[stage]) { - PushNoop(stage, false, false, pending_pop_prev_[stage], pending_pop_next_[stage]); - pending_pop_prev_[stage] = 0; - pending_pop_next_[stage] = 0; - } - } - void CommitPending() { - for (int i = kLoadStage; i <= kStoreStage; ++i) { - CommitPendingPop(i); - } - } - bool PendingPop() { - for (int i = kLoadStage; i <= kStoreStage; ++i) { - if (pending_pop_prev_[i]) return true; - if (pending_pop_next_[i]) return true; - } - return false; - } - void AutoReadBarrier() { ReadBarrier(); } - /*! \brief Writer barrier to make sure that data written by CPU is visible to VTA. */ - void ReadBarrier() { - CHECK(fpga_buff_ != nullptr); - CHECK(fpga_buff_phy_); - uint32_t buff_size = dram_buffer_.size() * elem_bytes_; - CHECK(buff_size <= kMaxBytes); - // Copy contents of DRAM buffer to FPGA buff - VTAMemCopyFromHost(fpga_buff_, dram_buffer_.data(), buff_size); - // Flush if we're using a shared memory system - // and if interface is non-coherent - if (!coherent_ && always_cache_) { - VTAFlushCache(fpga_buff_, fpga_buff_phy_, buff_size); - } - } - - protected: - /*! \return Add new instruction to the buffer. */ - VTAGenericInsn* NextInsn() { - VTAGenericInsn insn = {}; - dram_buffer_.push_back(insn); - return &dram_buffer_.back(); - } - // Create a new instruction for a given stage - VTAGenericInsn* Create(PipelineStage stage) { - VTAGenericInsn* gptr = NextInsn(); - VTAMemInsn* mptr = reinterpret_cast(gptr); - mptr->pop_prev_dep = pending_pop_prev_[stage]; - mptr->pop_next_dep = pending_pop_next_[stage]; - mptr->push_prev_dep = false; - mptr->push_next_dep = false; - pending_pop_prev_[stage] = 0; - pending_pop_next_[stage] = 0; - return gptr; - } - // Get stage of the memory - static PipelineStage GetMemPipelineStage(int memory_type) { - if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage; - if (memory_type == VTA_MEM_ID_UOP) return kComputeStage; - return kLoadStage; - } - // Get stage of the computation - static PipelineStage GetPipelineStage(VTAMemInsn* insn) { - if (insn->opcode == VTA_OPCODE_GEMM) return kComputeStage; - if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage; - if (insn->opcode == VTA_OPCODE_LOAD) { - if (insn->x_size == 0) return kNoneStage; - if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8BIT) - return kComputeStage; - if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage; - return kLoadStage; - } - if (insn->opcode == VTA_OPCODE_STORE) { - // FIXME: Right now memory_type is a 2-bit field which means that - // VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from - // checking the memory_type to avoid an CHECK error... - return kStoreStage; - } - LOG(FATAL) << "not reached"; - } - - // Get stage of memory and computation - static PipelineStage GetPipelineStageAll(VTAMemInsn* insn) { - PipelineStage stage = GetPipelineStage(insn); - if (stage != kNoneStage) return stage; - return GetMemPipelineStage(insn->memory_type); - } - - // Push no-op - void PushNoop(int stage, bool push_prev_dep, bool push_next_dep, bool pop_prev_dep, - bool pop_next_dep) { - VTAMemInsn* insn = reinterpret_cast(NextInsn()); - insn->opcode = (stage == kStoreStage ? 
VTA_OPCODE_STORE : VTA_OPCODE_LOAD); - insn->push_prev_dep = push_prev_dep; - insn->push_next_dep = push_next_dep; - insn->pop_prev_dep = pop_prev_dep; - insn->pop_next_dep = pop_next_dep; - insn->sram_base = 0; - insn->dram_base = 0; - insn->y_size = 0; - insn->x_size = 0; - insn->x_stride = 0; - insn->y_pad_0 = 0; - insn->y_pad_1 = 0; - insn->x_pad_0 = 0; - insn->x_pad_1 = 0; - insn->memory_type = (stage == kLoadStage ? VTA_MEM_ID_INP : VTA_MEM_ID_UOP); - } - - private: - // Pending pop of each isntruction queue, qid=0 is not used - int pending_pop_prev_[4]; - int pending_pop_next_[4]; - static constexpr int kElemBytes = sizeof(VTAGenericInsn); - static constexpr int kMaxElems = kMaxBytes / kElemBytes; -}; - -/*! - * \brief The command queue object that handles the request. - */ -class CommandQueue { - public: - CommandQueue() { this->InitSpace(); } - void InitSpace() { - uop_queue_.InitSpace(); - insn_queue_.InitSpace(); - device_ = VTADeviceAlloc(); - CHECK(device_ != nullptr); - } - - ~CommandQueue() { VTADeviceFree(device_); } - - uint32_t GetElemBytes(uint32_t memory_id) { - uint32_t elem_bytes = 0; - switch (memory_id) { - case VTA_MEM_ID_UOP: - elem_bytes = VTA_UOP_ELEM_BYTES; - break; - case VTA_MEM_ID_INP: - elem_bytes = VTA_INP_ELEM_BYTES; - break; - case VTA_MEM_ID_WGT: - elem_bytes = VTA_WGT_ELEM_BYTES; - break; - case VTA_MEM_ID_ACC: - elem_bytes = VTA_ACC_ELEM_BYTES; - break; - case VTA_MEM_ID_OUT: - elem_bytes = VTA_OUT_ELEM_BYTES; - break; - case VTA_MEM_ID_ACC_8BIT: - elem_bytes = VTA_ACC_ELEM_BYTES / 4; - break; - default: - LOG(FATAL) << "Memory id not recognized:" << memory_id; - break; - } - /* - * elements size should not larger than VTA_PAGE_BYTES. - * - */ - CHECK_GE(VTA_PAGE_BYTES, elem_bytes); - return elem_bytes; - } - - void LoadBuffer2D(void* src_dram_addr, uint32_t src_elem_offset, uint32_t x_size, uint32_t y_size, - uint32_t x_stride, uint32_t x_pad_before, uint32_t y_pad_before, - uint32_t x_pad_after, uint32_t y_pad_after, uint32_t dst_sram_index, - uint32_t dst_memory_type) { - VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type); - insn->opcode = VTA_OPCODE_LOAD; - insn->memory_type = dst_memory_type; - insn->sram_base = dst_sram_index; - DataBuffer* src = DataBuffer::FromHandle(src_dram_addr); - insn->dram_base = src->phy_addr() / GetElemBytes(dst_memory_type) + src_elem_offset; - insn->y_size = y_size; - insn->x_size = x_size; - insn->x_stride = x_stride; - insn->y_pad_0 = y_pad_before; - insn->y_pad_1 = y_pad_after; - insn->x_pad_0 = x_pad_before; - insn->x_pad_1 = x_pad_after; - this->CheckInsnOverFlow(); - } - - void StoreBuffer2D(uint32_t src_sram_index, uint32_t src_memory_type, void* dst_dram_addr, - uint32_t dst_elem_offset, uint32_t x_size, uint32_t y_size, - uint32_t x_stride) { - VTAMemInsn* insn = insn_queue_.CreateStoreInsn(); - insn->opcode = VTA_OPCODE_STORE; - insn->memory_type = src_memory_type; - insn->sram_base = src_sram_index; - DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr); - insn->dram_base = dst->phy_addr() / GetElemBytes(src_memory_type) + dst_elem_offset; - insn->y_size = y_size; - insn->x_size = x_size; - insn->x_stride = x_stride; - insn->y_pad_0 = 0; - insn->y_pad_1 = 0; - insn->x_pad_0 = 0; - insn->x_pad_1 = 0; - this->CheckInsnOverFlow(); - } - - void DepPush(int from_qid, int to_qid) { insn_queue_.DepPush(from_qid, to_qid); } - - void DepPop(int from_qid, int to_qid) { insn_queue_.DepPop(from_qid, to_qid); } - - void ReadBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) { - if 
(!(debug_flag_ & VTA_DEBUG_SKIP_READ_BARRIER)) { - uint32_t elem_bytes = (elem_bits + 8 - 1) / 8; - DataBuffer::FromHandle(buffer)->FlushCache(elem_bytes * start, elem_bytes * extent); - } - } - - void WriteBarrier(void* buffer, uint32_t elem_bits, uint32_t start, uint32_t extent) { - if (!(debug_flag_ & VTA_DEBUG_SKIP_WRITE_BARRIER)) { - uint32_t elem_bytes = (elem_bits + 8 - 1) / 8; - DataBuffer::FromHandle(buffer)->InvalidateCache(elem_bytes * start, elem_bytes * extent); - } - } - - void Synchronize(uint32_t wait_cycles) { - // Insert dependences to force serialization - if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) { - insn_queue_.RewriteForceSerial(); - } else { - // This will issue finish after last store finishes - insn_queue_.DepPush(kStoreStage, kComputeStage); - insn_queue_.DepPush(kLoadStage, kComputeStage); - insn_queue_.DepPop(kStoreStage, kComputeStage); - insn_queue_.DepPop(kLoadStage, kComputeStage); - insn_queue_.CommitPendingPop(kComputeStage); - } - // NOTE: FINISH cannot contain pop - VTAGemInsn* insn = insn_queue_.CreateGemInsn(); - insn->opcode = VTA_OPCODE_FINISH; - CHECK(!insn_queue_.PendingPop()); - // Check if there are no instruction to execute at all - if (insn_queue_.count() == 0) return; - // Synchronization for the queues - uop_queue_.AutoReadBarrier(); - insn_queue_.AutoReadBarrier(); - // Dump instructions if debug enabled - if (debug_flag_ & VTA_DEBUG_DUMP_INSN) { - insn_queue_.DumpInsn(); - } - // Make sure that the last instruction is a finish instruction - CHECK(reinterpret_cast(insn_queue_.data())[insn_queue_.count() - 1].opcode == - VTA_OPCODE_FINISH); - - // Make sure that we don't exceed contiguous physical memory limits - CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) <= VTA_MAX_XFER); - int timeout = - VTADeviceRun(device_, insn_queue_.dram_phy_addr(), insn_queue_.count(), wait_cycles); - CHECK_EQ(timeout, 0); - // Reset buffers - uop_queue_.Reset(); - insn_queue_.Reset(); - } - - // Get record kernel - UopKernel* record_kernel() const { - CHECK(record_kernel_ != nullptr); - return record_kernel_; - } - - // Set debug flag - void SetDebugFlag(int debug_flag) { debug_flag_ = debug_flag; } - - void PushGEMMOp(void** uop_handle, int (*finit)(void*), void* signature, int nbytes) { - UopKernelMap** uptr = reinterpret_cast(uop_handle); - if (uptr[0] == nullptr) { - uptr[0] = new UopKernelMap(); - } - UopKernel** kptr = uptr[0]->Get(signature, nbytes); - if (kptr[0] == nullptr) { - record_kernel_ = new UopKernel(static_cast(signature), nbytes); - CHECK_EQ((*finit)(signature), 0); - kptr[0] = static_cast(record_kernel_); - if (debug_flag_ & VTA_DEBUG_DUMP_UOP) { - record_kernel_->Dump(); - } - record_kernel_ = nullptr; - } - this->PushGEMMOp(static_cast(kptr[0])); - this->CheckInsnOverFlow(); - } - - void PushALUUop(void** uop_handle, int (*finit)(void*), void* signature, int nbytes) { - UopKernelMap** uptr = reinterpret_cast(uop_handle); - if (uptr[0] == nullptr) { - uptr[0] = new UopKernelMap(); - } - UopKernel** kptr = uptr[0]->Get(signature, nbytes); - if (kptr[0] == nullptr) { - record_kernel_ = new UopKernel(static_cast(signature), nbytes); - CHECK_EQ((*finit)(signature), 0); - kptr[0] = static_cast(record_kernel_); - if (debug_flag_ & VTA_DEBUG_DUMP_UOP) { - record_kernel_->Dump(); - } - record_kernel_ = nullptr; - } - this->PushALUUop(static_cast(kptr[0])); - this->CheckInsnOverFlow(); - } - - static std::shared_ptr& ThreadLocal() { - static std::shared_ptr inst = std::make_shared(); - if (inst == nullptr) { - inst = std::make_shared(); - } - 
return inst; - } - - static void Shutdown() { ThreadLocal().reset(); } - - private: - // Push GEMM uop to the command buffer - void PushGEMMOp(UopKernel* kernel) { - uop_queue_.Push(kernel, [this]() { this->AutoSync(); }); - if (uop_queue_.pending()) { - VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP); - insn->opcode = VTA_OPCODE_LOAD; - uop_queue_.FlushUopLoad(insn); - } - VTAGemInsn* insn = insn_queue_.CreateGemInsn(); - insn->opcode = VTA_OPCODE_GEMM; - insn->reset_reg = kernel->reset_out_; - insn->uop_bgn = kernel->sram_begin_; - insn->uop_end = kernel->sram_end_; - const std::vector& loop = kernel->loop(); - if (loop.size() > 0) { - insn->iter_out = loop[0].extent; - insn->wgt_factor_out = loop[0].wgt_factor; - insn->src_factor_out = loop[0].src_factor; - insn->dst_factor_out = loop[0].dst_factor; - } else { - insn->iter_out = 1; - insn->wgt_factor_out = 0; - insn->src_factor_out = 0; - insn->dst_factor_out = 0; - } - if (loop.size() > 1) { - insn->iter_in = loop[1].extent; - insn->wgt_factor_in = loop[1].wgt_factor; - insn->src_factor_in = loop[1].src_factor; - insn->dst_factor_in = loop[1].dst_factor; - } else { - insn->iter_in = 1; - insn->wgt_factor_in = 0; - insn->src_factor_in = 0; - insn->dst_factor_in = 0; - } - } - - // Push ALU uop to the command buffer - void PushALUUop(UopKernel* kernel) { - uop_queue_.Push(kernel, [this]() { this->AutoSync(); }); - if (uop_queue_.pending()) { - VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP); - insn->opcode = VTA_OPCODE_LOAD; - uop_queue_.FlushUopLoad(insn); - } - VTAAluInsn* insn = insn_queue_.CreateAluInsn(); - insn->opcode = VTA_OPCODE_ALU; - insn->reset_reg = kernel->reset_out_; - insn->uop_bgn = kernel->sram_begin_; - insn->uop_end = kernel->sram_end_; - insn->alu_opcode = kernel->opcode_; - insn->use_imm = kernel->use_imm_; - insn->imm = kernel->imm_val_; - const std::vector& loop = kernel->loop(); - if (loop.size() == 0) { - insn->iter_out = 1; - insn->dst_factor_out = 0; - insn->src_factor_out = 0; - insn->iter_in = 1; - insn->dst_factor_in = 0; - insn->src_factor_in = 0; - } else if (loop.size() == 1) { - insn->iter_out = 1; - insn->dst_factor_out = 0; - insn->src_factor_out = 0; - insn->iter_in = loop[0].extent; - insn->dst_factor_in = loop[0].dst_factor; - insn->src_factor_in = loop[0].src_factor; - } else { - insn->iter_out = loop[0].extent; - insn->dst_factor_out = loop[0].dst_factor; - insn->src_factor_out = loop[0].src_factor; - insn->iter_in = loop[1].extent; - insn->dst_factor_in = loop[1].dst_factor; - insn->src_factor_in = loop[1].src_factor; - } - } - - void CheckInsnOverFlow() { - // At each API call, we can at most commit: - // at most: 2 NOP-COMPUTE-STAGE -> 2 NOP-MEMORY-STAGE -> 1 NOP-COMPUTE-STAGE -> 1 FINISH - if ((insn_queue_.count() + 6) * sizeof(VTAGenericInsn) > VTA_MAX_XFER) { - this->AutoSync(); - } - } - // Auto sync when instruction overflow - void AutoSync() { this->Synchronize(1 << 31); } - - // Internal debug flag - int debug_flag_{0}; - // The kernel we are currently recording - UopKernel* record_kernel_{nullptr}; - // Micro op queue - UopQueue uop_queue_; - // instruction queue - InsnQueue insn_queue_; - // Device handle - VTADeviceHandle device_{nullptr}; -}; - -} // namespace vta - -void* VTABufferAlloc(size_t size) { return vta::DataBuffer::Alloc(size); } - -void VTABufferFree(void* buffer) { vta::DataBuffer::Free(vta::DataBuffer::FromHandle(buffer)); } - -void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - int kind_mask) { 
- vta::DataBuffer* from_buffer = nullptr; - vta::DataBuffer* to_buffer = nullptr; - - if (kind_mask & 2) { - from_buffer = vta::DataBuffer::FromHandle(from); - from = from_buffer->virt_addr(); - } - if (kind_mask & 1) { - to_buffer = vta::DataBuffer::FromHandle(to); - to = to_buffer->virt_addr(); - } - - if (from_buffer) { - // This is an FPGA to host mem transfer - from_buffer->InvalidateCache(from_offset, size); - from_buffer->MemCopyToHost(static_cast(to) + to_offset, - static_cast(from) + from_offset, size); - } else if (to_buffer) { - // This is a host to FPGA mem transfer - to_buffer->MemCopyFromHost(static_cast(to) + to_offset, - static_cast(from) + from_offset, size); - to_buffer->FlushCache(to_offset, size); - } -} - -VTACommandHandle VTATLSCommandHandle() { return vta::CommandQueue::ThreadLocal().get(); } - -void VTARuntimeShutdown() { vta::CommandQueue::Shutdown(); } - -void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) { - static_cast(cmd)->SetDebugFlag(debug_flag); -} - -void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) { - auto data_buf = vta::DataBuffer::FromHandle(buffer); - if (data_buf) { - return data_buf->virt_addr(); - } else { // it is a raw ptr allocated by CPU - return buffer; - } -} - -void VTAWriteBarrier(VTACommandHandle cmd, void* buffer, uint32_t elem_bits, uint32_t start, - uint32_t extent) { - static_cast(cmd)->WriteBarrier(buffer, elem_bits, start, extent); -} - -void VTAReadBarrier(VTACommandHandle cmd, void* buffer, uint32_t elem_bits, uint32_t start, - uint32_t extent) { - static_cast(cmd)->ReadBarrier(buffer, elem_bits, start, extent); -} - -void VTALoadBuffer2D(VTACommandHandle cmd, void* src_dram_addr, uint32_t src_elem_offset, - uint32_t x_size, uint32_t y_size, uint32_t x_stride, uint32_t x_pad_before, - uint32_t y_pad_before, uint32_t x_pad_after, uint32_t y_pad_after, - uint32_t dst_sram_index, uint32_t dst_memory_type) { - static_cast(cmd)->LoadBuffer2D( - src_dram_addr, src_elem_offset, x_size, y_size, x_stride, x_pad_before, y_pad_before, - x_pad_after, y_pad_after, dst_sram_index, dst_memory_type); -} - -void VTAStoreBuffer2D(VTACommandHandle cmd, uint32_t src_sram_index, uint32_t src_memory_type, - void* dst_dram_addr, uint32_t dst_elem_offset, uint32_t x_size, - uint32_t y_size, uint32_t x_stride) { - static_cast(cmd)->StoreBuffer2D( - src_sram_index, src_memory_type, dst_dram_addr, dst_elem_offset, x_size, y_size, x_stride); -} - -void VTAUopPush(uint32_t mode, uint32_t reset_out, uint32_t dst_index, uint32_t src_index, - uint32_t wgt_index, uint32_t opcode, uint32_t use_imm, int32_t imm_val) { - vta::CommandQueue::ThreadLocal()->record_kernel()->Push(mode, reset_out, dst_index, src_index, - wgt_index, opcode, use_imm, imm_val); -} - -void VTAUopLoopBegin(uint32_t extent, uint32_t dst_factor, uint32_t src_factor, - uint32_t wgt_factor) { - vta::CommandQueue::ThreadLocal()->record_kernel()->PushLoopBegin(extent, dst_factor, src_factor, - wgt_factor); -} - -void VTAUopLoopEnd() { vta::CommandQueue::ThreadLocal()->record_kernel()->PushLoopEnd(); } - -int VTAPushGEMMOp(void** uop_handle, int (*finit)(void*), void* signature, int nbytes) { - vta::CommandQueue::ThreadLocal()->PushGEMMOp(uop_handle, finit, signature, nbytes); - return 0; -} - -int VTAPushALUOp(void** uop_handle, int (*finit)(void*), void* signature, int nbytes) { - vta::CommandQueue::ThreadLocal()->PushALUUop(uop_handle, finit, signature, nbytes); - return 0; -} - -int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid) { - static_cast(cmd)->DepPush(from_qid, 
to_qid); - return 0; -} - -int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) { - static_cast(cmd)->DepPop(from_qid, to_qid); - return 0; -} - -void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) { - static_cast(cmd)->Synchronize(wait_cycles); -} diff --git a/vta/runtime/runtime.h b/vta/runtime/runtime.h deleted file mode 100644 index e6a6cb26528e..000000000000 --- a/vta/runtime/runtime.h +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file runtime.h - * \brief VTA runtime library. - */ - -#ifndef VTA_RUNTIME_RUNTIME_H_ -#define VTA_RUNTIME_RUNTIME_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -#define VTA_MEMCPY_H2D 1 -#define VTA_MEMCPY_D2H 2 -#define VTA_MEMCPY_D2D 3 - -#define VTA_DEBUG_DUMP_INSN (1 << 1) -#define VTA_DEBUG_DUMP_UOP (1 << 2) -#define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3) -#define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4) -#define VTA_DEBUG_FORCE_SERIAL (1 << 5) - -#define ALLOC_ALIGNMENT 64 - -/*! - * \brief Allocate data buffer. - * \param size Buffer size. - * \return A pointer to the allocated buffer. - */ -TVM_DLL void* VTABufferAlloc(size_t size); - -/*! - * \brief Free data buffer. - * \param buffer The data buffer to be freed. - */ -TVM_DLL void VTABufferFree(void* buffer); - -/*! - * \brief Copy data buffer from one location to another. - * \param from The source buffer base address. - * \param from_offset The offset of the source buffer. - * \param to The target buffer base address. - * \param to_offset The offset of the target buffer. - * \param size Size of copy. - * \param kind_mask The memory copy kind. - */ -TVM_DLL void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t size, int kind_mask); - -/*! \brief VTA command handle */ -typedef void* VTACommandHandle; - -/*! \brief Shutdown hook of VTA to cleanup resources */ -TVM_DLL void VTARuntimeShutdown(); - -/*! - * \brief Get thread local command handle. - * \return A thread local command handle. - */ -TVM_DLL VTACommandHandle VTATLSCommandHandle(); - -/*! - * \brief Get the buffer access pointer on CPU. - * \param cmd The VTA command handle. - * \param buffer The data buffer. - * \return The pointer that can be accessed by the CPU. - */ -TVM_DLL void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer); - -/*! - * \brief Perform a write barrier to make a memory region visible to the CPU. - * \param cmd The VTA command handle. - * \param buffer The head buffer pointer. - * \param elem_bits The size in bits of each element. - * \param start The start of the region (in elements). - * \param extent The end of the region (in elements). 
- */ -TVM_DLL void VTAWriteBarrier(VTACommandHandle cmd, void* buffer, uint32_t elem_bits, uint32_t start, - uint32_t extent); - -/*! - * \brief Perform a read barrier to a memory region visible to VTA. - * \param cmd The VTA command handle. - * \param buffer The head buffer pointer. - * \param elem_bits The unit bits of each elements. - * \param start The start of the region (in elements). - * \param extent The end of the region (in elements). - */ -TVM_DLL void VTAReadBarrier(VTACommandHandle cmd, void* buffer, uint32_t elem_bits, uint32_t start, - uint32_t extent); - -/*! - * \brief Set debug mode on the command handle. - * \param cmd The VTA command handle. - * \param debug_flag The debug flag. - */ -TVM_DLL void VTASetDebugMode(VTACommandHandle cmd, int debug_flag); - -/*! - * \brief Perform a 2D data load from DRAM. - * Sizes are measured in units of vector elements. - * \param cmd The VTA command handle. - * \param src_dram_addr Source DRAM address. - * \param src_elem_offset The source DRAM offset in number of unit elements. - * \param x_size The lowest dimension (x axis) size in number of unit elements. - * \param y_size The number of rows (y axis). - * \param x_stride The x axis stride. - * \param x_pad_before The start padding on x axis. - * \param y_pad_before The start padding on y axis. - * \param x_pad_after The end padding on x axis. - * \param y_pad_after The end padding of y axis. - * \param dst_sram_index Destination SRAM index. - * \param dst_memory_type Destination memory type. - */ -TVM_DLL void VTALoadBuffer2D(VTACommandHandle cmd, void* src_dram_addr, uint32_t src_elem_offset, - uint32_t x_size, uint32_t y_size, uint32_t x_stride, - uint32_t x_pad_before, uint32_t y_pad_before, uint32_t x_pad_after, - uint32_t y_pad_after, uint32_t dst_sram_index, - uint32_t dst_memory_type); - -/*! - * \brief Perform a 2D data store into DRAM - * Sizes are measured in units of vector elements. - * \param cmd The VTA command handle. - * \param src_sram_index Source SRAM index. - * \param src_memory_type Source memory type. - * \param dst_dram_addr Destination DRAM address. - * \param dst_elem_offset The destination DRAM offset in number of unit elements. - * \param x_size The lowest dimension (x axis) size in number of unit elements. - * \param y_size The number of rows. - * \param x_stride The x axis stride. - */ -TVM_DLL void VTAStoreBuffer2D(VTACommandHandle cmd, uint32_t src_sram_index, - uint32_t src_memory_type, void* dst_dram_addr, - uint32_t dst_elem_offset, uint32_t x_size, uint32_t y_size, - uint32_t x_stride); - -/*! - * \brief Push uop into kernel buffer. - * In GEMM mode, do a blocked GEMM with 2d access pattern. - * In ALU mode, do a vectorized ALU operation with 2d access pattern. - * - * \code - * - * DType accum[INP_BUFF_DEPTH][l][n]; - * DType weight[WGT_BUFF_DEPTH][n][m]; - * DType input[INP_BUFF_DEPTH][l][m]; - * if reset_out == 1 - * accum[dst_index] = 0 - * elif mode == 0 - * accum[dst_index] += GEMM(input[src_index], weight[wgt_index]); - * else - * if (use_imm) - * accum[dst_index] = opcode(accum[dst_index], imm_val); - * else - * accum[dst_index] = opcode(accum[dst_index], accum[src_index]); - * - * \endcode - * - * \param mode Set to GEMM mode if set to 0, ALU mode is set to 1. - * \param reset_out Resets the accum to 0. - * \param dst_index The accum memory index. - * \param src_index The input memory (gemm) / accum memory (alu) index. - * \param wgt_index The weight memory index. - * \param opcode The ALU opcode. 
- * \param use_imm Use immediate in ALU mode if set to true. - * \param imm_val Immediate value in ALU mode. - */ -TVM_DLL void VTAUopPush(uint32_t mode, uint32_t reset_out, uint32_t dst_index, uint32_t src_index, - uint32_t wgt_index, uint32_t opcode, uint32_t use_imm, int32_t imm_val); - -/*! - * \brief Mark start of a micro op loop. - * \param extent The extent of the loop. - * \param dst_factor The accum factor. - * \param src_factor The input factor. - * \param wgt_factor The weight factor. - */ -TVM_DLL void VTAUopLoopBegin(uint32_t extent, uint32_t dst_factor, uint32_t src_factor, - uint32_t wgt_factor); - -/*! - * \brief Mark end of a micro op loop. - */ -TVM_DLL void VTAUopLoopEnd(); - -/*! - * \brief Push GEMM uop kernel into the command handle. - * \param uop_handle The uop cache handle. - * \param finit The initalization function to initialize uop. - * \param signature The closure arguments of the finit. - * \param nbytes Number of bytes to in the closure arguments. - * \return 0 if success. - */ -TVM_DLL int VTAPushGEMMOp(void** uop_handle, int (*finit)(void*), void* signature, int nbytes); - -/*! - * \brief Push ALU uop kernel into the command handle. - * \param uop_handle The uop cache handle. - * \param finit The initalization function to initialize uop. - * \param signature The closure arguments of the finit. - * \param nbytes Number of bytes to in the closure arguments. - * \return 0 if success. - */ -TVM_DLL int VTAPushALUOp(void** uop_handle, int (*finit)(void*), void* signature, int nbytes); - -/*! - * \brief Push dependence token. - * \param cmd The VTA command handle. - * \param from_qid The source queue. - * \param to_qid The destination queue. - * \return 0 if success. - */ -TVM_DLL int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid); - -/*! - * \brief Pop dependence signal. - * \param cmd The VTA command handle. - * \param from_qid The source queue. - * \param to_qid The destination queue. - * \return 0 if success. - */ -TVM_DLL int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid); - -/*! - * \brief Synchronize the command handle. - * Commit all the instructions to VTA and wait until - * the accelerator finishes its job. - * Perform all of the out-of-order DRAM stores. - * \param cmd The VTA command handle. - * \param wait_cycles The limit of poll cycles. - * - */ -TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles); - -#ifdef __cplusplus -} -#endif -#endif // VTA_RUNTIME_RUNTIME_H_ diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py deleted file mode 100644 index 6333ac245a95..000000000000 --- a/vta/scripts/tune_conv2d.py +++ /dev/null @@ -1,180 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
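Before the deleted tuning scripts below, one illustration of the runtime API that is being removed: the VTAUopPush documentation above summarizes the micro-op semantics with a short pseudocode block. Purely as an illustration (it is not part of the deleted sources), the same behaviour can be sketched in NumPy as follows; the hypothetical helper name, the array names, the tile shapes, and the callable `opcode` argument are assumptions chosen only to keep the sketch self-contained and runnable.

import numpy as np

def uop_push_semantics(accum, inp, wgt, mode, reset_out,
                       dst_index, src_index, wgt_index,
                       opcode, use_imm, imm_val):
    """Model one micro-op: reset, GEMM accumulation, or element-wise ALU.

    accum[i] is an (l, n) tile, inp[i] an (l, m) tile, wgt[i] an (n, m) tile,
    mirroring the buffer shapes listed in the VTAUopPush comment.
    """
    if reset_out == 1:
        # Reset path: zero the destination accumulator tile
        accum[dst_index] = 0
    elif mode == 0:
        # GEMM mode: accum[dst] += inp[src] x wgt[wgt]^T
        accum[dst_index] = accum[dst_index] + inp[src_index] @ wgt[wgt_index].T
    else:
        # ALU mode: combine with an immediate or with another accumulator tile
        rhs = imm_val if use_imm else accum[src_index]
        accum[dst_index] = opcode(accum[dst_index], rhs)
    return accum

For example, with `accum = np.zeros((4, 1, 16), dtype=np.int32)` and matching `inp`/`wgt` tiles, `mode=0` reproduces the blocked GEMM accumulation, while `mode=1` with `opcode=np.maximum` models the max ALU instruction.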
- -"""Tuning a single conv2d operator""" - -from collections import namedtuple -import logging -import os - -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi -import vta -import vta.testing - -env = vta.get_env() - -Workload = namedtuple( - "Conv2DWorkload", - [ - "batch", - "height", - "width", - "in_filter", - "out_filter", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", - ], -) - -resnet_wkls = [ - # Workloads of resnet18 on imagenet - # ('resnet-18.C1', Workload(env.BATCH, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)), - ("resnet-18.C2", Workload(env.BATCH, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), - ("resnet-18.C3", Workload(env.BATCH, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), - ("resnet-18.C4", Workload(env.BATCH, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), - ("resnet-18.C5", Workload(env.BATCH, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), - ("resnet-18.C6", Workload(env.BATCH, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)), - ("resnet-18.C7", Workload(env.BATCH, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)), - ("resnet-18.C8", Workload(env.BATCH, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)), - ("resnet-18.C9", Workload(env.BATCH, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)), - ("resnet-18.C10", Workload(env.BATCH, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)), - ("resnet-18.C11", Workload(env.BATCH, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)), -] - - -@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return x - - -def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation): - data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN) - kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) - bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) - - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) - - with tvm.target.vta(): - res = topi.nn.conv2d( - input=data, - filter=kernel, - padding=padding, - strides=strides, - dilation=dilation, - layout="NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN), - out_dtype=env.acc_dtype, - ) - res = topi.right_shift(res, env.WGT_WIDTH) - res = topi.add(res, bias) - res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) - res = topi.cast(res, env.out_dtype) - - if tvm.target.Target.current().device_name == "vta": - s = topi.generic.schedule_conv2d_nchw([res]) - else: - s = te.create_schedule([res.op]) - - return s, [data, kernel, bias, res] - - -if __name__ == "__main__": - - # Logging config (for printing tuning log to the screen) - logging.basicConfig() - # logging.getLogger('autotvm').setLevel(logging.DEBUG) - - # Tuning log files - log_file = "%s.conv2d.log" % (env.TARGET) - # create tmp log file - tmp_log_file = log_file + ".tmp" - if os.path.exists(log_file): - os.remove(log_file) - - # Get tracker info from env - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - if not tracker_host or not tracker_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - for idx, (wl_name, wl) in 
enumerate(resnet_wkls): - prefix = "[Task %2d/%2d] " % (idx, len(resnet_wkls)) - - # Read in workload parameters - N = wl.batch - CI = wl.in_filter - H = wl.height - W = wl.width - CO = wl.out_filter - KH = wl.hkernel - KW = wl.wkernel - strides = (wl.hstride, wl.wstride) - padding = (wl.hpad, wl.wpad) - dilation = (1, 1) - - # Create task - task = autotvm.task.create( - conv2d, - args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation), - target=tvm.target.vta(), - target_host=env.target_host, - template_key="direct", - ) - print(task.config_space) - - # Tune - measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(), - runner=autotvm.RPCRunner( - env.TARGET, - host=tracker_host, - port=int(tracker_port), - number=5, - timeout=60, - # check_correctness=True, # TODO: re-enable when check_correctness works again. - ), - ) - - # Run Tuner - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune( - n_trial=len(task.config_space), - early_stopping=None, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(len(task.config_space), prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # Pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_file) - os.remove(tmp_log_file) diff --git a/vta/scripts/tune_conv2d_transpose.py b/vta/scripts/tune_conv2d_transpose.py deleted file mode 100644 index 6251cc86fdce..000000000000 --- a/vta/scripts/tune_conv2d_transpose.py +++ /dev/null @@ -1,171 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
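tune_conv2d.py above and the remaining scripts in this directory (conv2d_transpose, dense, group_conv2d) repeat the same AutoTVM driver: build a task from a schedule template, measure candidates on a board behind the RPC tracker, tune with a random tuner, and keep the best records. A condensed sketch of that shared loop is shown below; the helper name `tune_vta_workloads` and its parameters are hypothetical, but every AutoTVM call mirrors the ones used in the scripts themselves.

import os
from tvm import autotvm

def tune_vta_workloads(template_fn, workloads, make_args, target, target_host,
                       device_key, tracker_host, tracker_port, log_file):
    """Tune each workload and collect the best records into log_file."""
    tmp_log_file = log_file + ".tmp"
    for idx, (wl_name, wl) in enumerate(workloads):
        prefix = "[Task %2d/%2d] " % (idx, len(workloads))
        # Build the AutoTVM task from the schedule template
        task = autotvm.task.create(
            template_fn, args=make_args(wl), target=target,
            target_host=target_host, template_key="direct")
        # Compile locally, run remotely on the device behind the RPC tracker
        measure_option = autotvm.measure_option(
            builder=autotvm.LocalBuilder(),
            runner=autotvm.RPCRunner(device_key, host=tracker_host,
                                     port=int(tracker_port), number=5, timeout=60))
        tuner = autotvm.tuner.RandomTuner(task)
        tuner.tune(n_trial=len(task.config_space),
                   early_stopping=None,
                   measure_option=measure_option,
                   callbacks=[autotvm.callback.progress_bar(len(task.config_space), prefix=prefix),
                              autotvm.callback.log_to_file(tmp_log_file)])
    # Keep only the best record per workload
    autotvm.record.pick_best(tmp_log_file, log_file)
    os.remove(tmp_log_file)

Each script effectively inlines this loop with its own workload list, template function, and log-file name.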
- -"""Tuning a single conv2d transpose operator""" - -from collections import namedtuple -import logging -import os - -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi -import vta -import vta.testing - -# Get batch info from env -env = vta.get_env() - -Workload = namedtuple( - "Conv2DTransposeWorkload", - [ - "batch", - "height", - "width", - "in_filter", - "out_filter", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", - "o_hpad", - "o_wpad", - ], -) - -# DCGAN workloads -dcgan_wkls = [ - # dcgan - ("DCGAN.CT1", Workload(env.BATCH, 4, 4, 1024, 512, 4, 4, 1, 1, 2, 2, 0, 0)), - ("DCGAN.CT2", Workload(env.BATCH, 8, 8, 512, 256, 4, 4, 1, 1, 2, 2, 0, 0)), - ("DCGAN.CT3", Workload(env.BATCH, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2, 0, 0)), -] - - -@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return x - - -def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding, opadding): - data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN) - kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) - - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - - with tvm.target.vta(): - res = topi.nn.conv2d_transpose_nchw( - Input=data, - Filter=kernel, - strides=strides, - padding=padding, - out_dtype=env.acc_dtype, - output_padding=opadding, - ) - res = topi.right_shift(res, env.WGT_WIDTH) - res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) - res = topi.cast(res, env.out_dtype) - - if tvm.target.Target.current().device_name == "vta": - s = topi.generic.schedule_conv2d_transpose_nchw([res]) - else: - s = te.create_schedule([res.op]) - - return s, [data, kernel, res] - - -if __name__ == "__main__": - - # Logging config (for printing tuning log to the screen) - logging.basicConfig() - # logging.getLogger('autotvm').setLevel(logging.DEBUG) - - # Tuning log files - log_file = "%s.conv2d_transpose.log" % (env.TARGET) - # create tmp log file - tmp_log_file = log_file + ".tmp" - if os.path.exists(log_file): - os.remove(log_file) - - # Get tracker info from env - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - if not tracker_host or not tracker_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - for idx, (wl_name, wl) in enumerate(dcgan_wkls): - prefix = "[Task %2d/%2d] " % (idx, len(dcgan_wkls)) - - # Read in workload parameters - N = wl.batch - H = wl.height - W = wl.width - CI = wl.in_filter - CO = wl.out_filter - KH = wl.hkernel - KW = wl.wkernel - strides = (wl.hstride, wl.wstride) - padding = (wl.hpad, wl.wpad) - opadding = (wl.o_hpad, wl.o_wpad) - - # Create task - task = autotvm.task.create( - conv2d_transpose, - args=(N, CI, H, W, CO, KH, KW, strides, padding, opadding), - target=tvm.target.Target(tvm.target.vta(), host=env.target_host), - template_key="direct", - ) - print(task.config_space) - - # Tune - measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(), - runner=autotvm.RPCRunner( - env.TARGET, - host=tracker_host, - 
port=int(tracker_port), - number=5, - timeout=60, - # check_correctness=True, # TODO: re-enable when check_correctness works again. - ), - ) - - # Run Tuner - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune( - n_trial=len(task.config_space), - early_stopping=None, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(len(task.config_space), prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # Pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_file) - os.remove(tmp_log_file) diff --git a/vta/scripts/tune_dense.py b/vta/scripts/tune_dense.py deleted file mode 100644 index 6d600c4c322f..000000000000 --- a/vta/scripts/tune_dense.py +++ /dev/null @@ -1,137 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Tuning a single dense operator""" - -from collections import namedtuple -import logging -import os - -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi -import vta -import vta.testing - -env = vta.get_env() - -Workload = namedtuple("DenseWorkload", ["batch", "in_filter", "out_filter"]) - -dense_wkls = [ - ("lstm.dense.1", Workload(1, 256, 128)), - ("lstm.dense.4", Workload(4, 256, 128)), -] - - -@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return x - - -def dense(N, CI, CO): - data_shape = (N // env.BATCH, CI // env.BLOCK_IN, env.BATCH, env.BLOCK_IN) - kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) - - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - - with tvm.target.vta(): - res = topi.nn.dense(data, kernel, None, "int32") - res = topi.right_shift(res, 8) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - if tvm.target.Target.current().device_name == "vta": - s = topi.generic.schedule_dense([res]) - else: - s = te.create_schedule([res.op]) - - return s, [data, kernel, res] - - -if __name__ == "__main__": - - # Logging config (for printing tuning log to the screen) - logging.basicConfig() - # logging.getLogger('autotvm').setLevel(logging.DEBUG) - - # Tuning log files - log_file = "%s.dense.log" % (env.TARGET) - # create tmp log file - tmp_log_file = log_file + ".tmp" - if os.path.exists(log_file): - os.remove(log_file) - - # Get tracker info from env - tracket_host = os.environ.get("TVM_TRACKER_HOST", None) - tracket_port = 
os.environ.get("TVM_TRACKER_PORT", None) - if not tracket_host or not tracket_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - for idx, (wl_name, wl) in enumerate(dense_wkls): - - prefix = "[Task %2d/%2d] " % (idx, len(dense_wkls)) - - # Workload parameters - N = wl.batch - CI = wl.in_filter - CO = wl.out_filter - - task = autotvm.task.create( - dense, - args=(N, CI, CO), - target=tvm.target.vta(), - target_host=env.target_host, - template_key="direct", - ) - print(task.config_space) - - # Tune - measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(), - runner=autotvm.RPCRunner( - env.TARGET, - host=tracket_host, - port=int(tracket_port), - number=5, - timeout=60, - # check_correctness=True, # TODO: re-enable when check_correctness works again. - ), - ) - - # Run Tuner - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune( - n_trial=len(task.config_space), - early_stopping=None, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(len(task.config_space), prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # Pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_file) - os.remove(tmp_log_file) diff --git a/vta/scripts/tune_group_conv2d.py b/vta/scripts/tune_group_conv2d.py deleted file mode 100644 index ebb7db88845f..000000000000 --- a/vta/scripts/tune_group_conv2d.py +++ /dev/null @@ -1,175 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -"""Tuning a single group conv2d operator""" - -from collections import namedtuple -import logging -import os - -import tvm -from tvm import te -from tvm import autotvm -from tvm import topi -import vta -import vta.testing - -env = vta.get_env() - -Workload = namedtuple( - "GroupConv2DWorkload", - [ - "batch", - "height", - "width", - "in_filter", - "out_filter", - "groups", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", - ], -) - -# Mobilenet (grouped variant) workloads -mobilenet_wkls = [ - ("mobilenet.D1", Workload(env.BATCH, 112, 112, 32, 32, 2, 3, 3, 1, 1, 1, 1)), - ("mobilenet.D2", Workload(env.BATCH, 112, 112, 64, 64, 4, 3, 3, 1, 1, 2, 2)), - ("mobilenet.D3", Workload(env.BATCH, 56, 56, 128, 128, 8, 3, 3, 1, 1, 1, 1)), - ("mobilenet.D4", Workload(env.BATCH, 56, 56, 128, 128, 8, 3, 3, 1, 1, 2, 2)), - ("mobilenet.D5", Workload(env.BATCH, 28, 28, 256, 256, 16, 3, 3, 1, 1, 1, 1)), - ("mobilenet.D6", Workload(env.BATCH, 28, 28, 256, 256, 16, 3, 3, 1, 1, 2, 2)), - ("mobilenet.D7", Workload(env.BATCH, 14, 14, 512, 512, 32, 3, 3, 1, 1, 1, 1)), - ("mobilenet.D8", Workload(env.BATCH, 14, 14, 512, 512, 32, 3, 3, 1, 1, 2, 2)), - ("mobilenet.D9", Workload(env.BATCH, 7, 7, 1024, 1024, 64, 3, 3, 1, 1, 1, 1)), -] - - -@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return x - - -def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, group): - - CI_G = CI // groups - data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN) - kernel_shape = (CO // env.BLOCK_OUT, CI_G // env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) - bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) - - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) - - with tvm.target.vta(): - res = topi.nn.group_conv2d_nchw( - data, kernel, strides, padding, dilation, groups, env.acc_dtype - ) - res = topi.right_shift(res, env.WGT_WIDTH) - res = topi.add(res, bias) - res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) - res = topi.cast(res, env.out_dtype) - - if tvm.target.Target.current().device_name == "vta": - s = topi.generic.schedule_group_conv2d_nchw([res]) - else: - s = te.create_schedule([res.op]) - - return s, [data, kernel, bias, res] - - -if __name__ == "__main__": - - # Logging config (for printing tuning log to the screen) - logging.basicConfig() - - # Tuning log files - log_file = "%s.group_conv2d.log" % (env.TARGET) - # create tmp log file - tmp_log_file = log_file + ".tmp" - if os.path.exists(log_file): - os.remove(log_file) - - # Get tracker info from env - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - if not tracker_host or not tracker_port: - print("Set your AutoTVM tracker node host and port variables to run the autotuner") - exit() - - for idx, (wl_name, wl) in enumerate(mobilenet_wkls): - prefix = "[Task %2d/%2d] " % (idx, len(mobilenet_wkls)) - - # Read in workload parameters - N = wl.batch - CI = wl.in_filter - H = wl.height - W = wl.width - CO = 
wl.out_filter - KH = wl.hkernel - KW = wl.wkernel - strides = (wl.hstride, wl.wstride) - padding = (wl.hpad, wl.wpad) - dilation = (1, 1) - groups = wl.groups - - # Create task - task = autotvm.task.create( - group_conv2d, - args=(N, CI, H, W, CO, KH, KW, strides, padding, dilation, groups), - target=tvm.target.vta(), - target_host=env.target_host, - template_key="direct", - ) - print(task.config_space) - - # Tune - measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(), - runner=autotvm.RPCRunner( - env.TARGET, - host=tracker_host, - port=int(tracker_port), - number=5, - timeout=60, - # check_correctness=True, # TODO: re-enable when check_correctness works again. - ), - ) - - # Run Tuner - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune( - n_trial=len(task.config_space), - early_stopping=None, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(len(task.config_space), prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # Pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_file) - os.remove(tmp_log_file) diff --git a/vta/tests/python/de10nano/test_program_rpc.py b/vta/tests/python/de10nano/test_program_rpc.py deleted file mode 100644 index ea62f3fba2e7..000000000000 --- a/vta/tests/python/de10nano/test_program_rpc.py +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import sys, os -import tvm -from tvm import rpc -from vta import get_bitstream_path, download_bitstream, program_fpga, reconfig_runtime - -host = os.environ.get("VTA_RPC_HOST", "de10nano") -port = int(os.environ.get("VTA_RPC_PORT", "9091")) - - -def program_rpc_bitstream(path=None): - """Program the FPGA on the RPC server - - Parameters - ---------- - path : path to bitstream (optional) - """ - assert tvm.runtime.enabled("rpc") - remote = rpc.connect(host, port) - program_fpga(remote, path) - - -def reconfig_rpc_runtime(): - """Reconfig the RPC server runtime""" - assert tvm.runtime.enabled("rpc") - remote = rpc.connect(host, port) - reconfig_runtime(remote) - - -bitstream = sys.argv[1] if len(sys.argv) == 2 else None -program_rpc_bitstream(bitstream) -reconfig_rpc_runtime() diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py deleted file mode 100644 index 6290ca436f92..000000000000 --- a/vta/tests/python/integration/test_benchmark_gemm.py +++ /dev/null @@ -1,287 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import tvm -import tvm.testing -from tvm import te -import numpy as np -from tvm.contrib import utils -import vta.testing -from vta.testing import simulator - - -def test_gemm(): - def run_gemm_packed(env, remote, batch_size, channel, block): - data_shape = (batch_size // env.BATCH, channel // env.BLOCK_IN, env.BATCH, env.BLOCK_IN) - weight_shape = ( - channel // env.BLOCK_OUT, - channel // env.BLOCK_IN, - env.BLOCK_OUT, - env.BLOCK_IN, - ) - res_shape = (batch_size // env.BATCH, channel // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT) - # To compute number of ops, use a x2 factor for FMA - num_ops = 2 * channel * channel * batch_size - - ko = te.reduce_axis((0, channel // env.BLOCK_IN), name="ko") - ki = te.reduce_axis((0, env.BLOCK_IN), name="ki") - - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - weight = te.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype) - data_buf = te.compute(data_shape, lambda *i: data(*i), "data_buf") - weight_buf = te.compute(weight_shape, lambda *i: weight(*i), "weight_buf") - res_gem = te.compute( - res_shape, - lambda bo, co, bi, ci: te.sum( - data_buf[bo, ko, bi, ki].astype(env.acc_dtype) - * weight_buf[co, ko, ci, ki].astype(env.acc_dtype), - axis=[ko, ki], - ), - name="res_gem", - ) - res_shf = te.compute(res_shape, lambda *i: res_gem(*i) >> 8, name="res_shf") - res_max = te.compute(res_shape, lambda *i: tvm.te.max(res_shf(*i), 0), "res_max") # relu - res_min = te.compute( - res_shape, lambda *i: tvm.te.min(res_max(*i), (1 << (env.INP_WIDTH - 1)) - 1), "res_min" - ) # relu - res = te.compute(res_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res") - - def verify(s): - mod = vta.build( - s, - [data, weight, res], - tvm.target.Target("ext_dev", host=env.target_host), - name="gemm", - ) - temp = utils.tempdir() - mod.save(temp.relpath("gemm.o")) - remote.upload(temp.relpath("gemm.o")) - f = remote.load_module("gemm.o") - # verify - dev = remote.ext_dev(0) - # Data in original format - data_orig = np.random.randint(-128, 128, size=(batch_size, channel)).astype(data.dtype) - weight_orig = np.random.randint(-128, 128, size=(channel, channel)).astype(weight.dtype) - data_packed = data_orig.reshape( - batch_size // env.BATCH, env.BATCH, channel // env.BLOCK_IN, env.BLOCK_IN - ).transpose((0, 2, 1, 3)) - weight_packed = weight_orig.reshape( - channel // env.BLOCK_OUT, env.BLOCK_OUT, channel // env.BLOCK_IN, env.BLOCK_IN - ).transpose((0, 2, 1, 3)) - res_np = np.zeros(res_shape).astype(res.dtype) - data_arr = tvm.nd.array(data_packed, dev) - weight_arr = tvm.nd.array(weight_packed, dev) - res_arr = tvm.nd.array(res_np, dev) - res_ref = np.zeros(res_shape).astype(env.acc_dtype) - for b in range(batch_size // env.BATCH): - for i in range(channel // env.BLOCK_OUT): - for j in range(channel // env.BLOCK_IN): - res_ref[b, i, :] += np.dot( - data_packed[b, j, :].astype(env.acc_dtype), - weight_packed[i, j].T.astype(env.acc_dtype), - ) - res_ref = 
np.right_shift(res_ref, 8) - res_ref = np.clip(res_ref, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype) - time_f = f.time_evaluator("gemm", dev, number=20) - if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - cost = time_f(data_arr, weight_arr, res_arr) - if env.TARGET in ["sim", "tsim"]: - stats = simulator.stats() - print("Execution statistics:") - for k, v in stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - res_unpack = res_arr.numpy().reshape( - batch_size // env.BATCH, channel // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT - ) - return cost - - def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir): - s = te.create_schedule(res.op) - s[data_buf].set_scope(env.inp_scope) - s[weight_buf].set_scope(env.wgt_scope) - s[res_gem].set_scope(env.acc_scope) - s[res_shf].set_scope(env.acc_scope) - s[res_min].set_scope(env.acc_scope) - s[res_max].set_scope(env.acc_scope) - - if block: - bblock = block // env.BATCH - iblock = block // env.BLOCK_IN - oblock = block // env.BLOCK_OUT - xbo, xco, xbi, xci = s[res].op.axis - xb1, xco1, xb2, xco2 = s[res].tile(xbo, xco, bblock, oblock) - store_pt = xb2 - - s[res_gem].compute_at(s[res], xco1) - s[res_shf].compute_at(s[res], xco1) - s[res_min].compute_at(s[res], xco1) - s[res_max].compute_at(s[res], xco1) - - xbo, xco, xbi, xci = s[res_gem].op.axis - # Compute one line at a time - ko1, ko2 = s[res_gem].split(ko, iblock) - s[res_gem].reorder(ko1, ko2, xbo, xco, xbi, xci, ki) - s[data_buf].compute_at(s[res_gem], ko1) - s[weight_buf].compute_at(s[res_gem], ko1) - # Use VTA instructions - s[data_buf].pragma(s[data_buf].op.axis[0], load_inp) - s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt) - s[res_gem].tensorize(xbi, gemm) - s[res_shf].pragma(s[res_shf].op.axis[0], alu) - s[res_min].pragma(s[res_min].op.axis[0], alu) - s[res_max].pragma(s[res_max].op.axis[0], alu) - s[res].pragma(store_pt, store_out) - else: - xbo, xco, xbi, xci = s[res_gem].op.axis - s[res_gem].reorder(ko, xbo, xco, xbi, xci, ki) - # Use VTA instructions - s[data_buf].pragma(s[data_buf].op.axis[0], load_inp) - s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt) - s[res_gem].tensorize(xbi, gemm) - s[res_shf].pragma(s[res_shf].op.axis[0], alu) - s[res_min].pragma(s[res_min].op.axis[0], alu) - s[res_max].pragma(s[res_max].op.axis[0], alu) - s[res].pragma(s[res].op.axis[0], store_out) - - if print_ir: - print(tvm.lower(s, [data, weight, res], simple_mode=True)) - return verify(s) - - def gemm_normal(print_ir): - mock = env.mock - print("----- GEMM GOPS End-to-End Test-------") - - def run_test(header, print_ir): - cost = run_schedule( - env.dma_copy, - env.dma_copy, - env.gemm, - env.alu, - env.dma_copy, - print_ir, - ) - gops = (num_ops / cost.mean) / float(10**9) - print(header) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) - - with vta.build_config(): - run_test("NORMAL", print_ir) - - def gemm_unittest(print_ir): - mock = env.mock - print("----- GEMM Unit Test-------") - - def run_test(header, print_ir): - cost = run_schedule( - mock.dma_copy, mock.dma_copy, env.gemm, mock.alu, mock.dma_copy, print_ir - ) - gops = (num_ops / cost.mean) / float(10**9) - print(header) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) - - with vta.build_config(): - run_test("NORMAL", print_ir) - - def alu_unittest(print_ir): - mock = env.mock - print("----- ALU Unit Test-------") - - def run_test(header, print_ir): - cost = run_schedule( - mock.dma_copy, mock.dma_copy, mock.gemm, env.alu, mock.dma_copy, print_ir - ) - gops = 
(num_ops / cost.mean) / float(10**9) - print(header) - print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) - - with vta.build_config(): - run_test("NORMAL", print_ir) - print("") - - def load_inp_unittest(print_ir): - mock = env.mock - print("----- LoadInp Unit Test-------") - - def run_test(header, print_ir): - cost = run_schedule( - env.dma_copy, mock.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir - ) - gops = (num_ops / cost.mean) / float(10**9) - bandwith = (batch_size * channel * env.INP_WIDTH / cost.mean) / float(10**9) - print(header) - print( - "\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" - % (cost.mean, gops, bandwith) - ) - - with vta.build_config(): - run_test("NORMAL", print_ir) - print("") - - def load_wgt_unittest(print_ir): - mock = env.mock - print("----- LoadWgt Unit Test-------") - - def run_test(header, print_ir): - cost = run_schedule( - mock.dma_copy, env.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir - ) - gops = (num_ops / cost.mean) / float(10**9) - bandwith = (channel * channel * env.WGT_WIDTH / cost.mean) / float(10**9) - print(header) - print( - "\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" - % (cost.mean, gops, bandwith) - ) - - with vta.build_config(): - run_test("NORMAL", print_ir) - print("") - - def store_out_unittest(print_ir): - mock = env.mock - print("----- StoreOut Unit Test-------") - - def run_test(header, print_ir): - cost = run_schedule( - mock.dma_copy, mock.dma_copy, mock.gemm, mock.alu, env.dma_copy, print_ir - ) - gops = (num_ops / cost.mean) / float(10**9) - bandwith = (batch_size * channel * env.OUT_WIDTH / cost.mean) / float(10**9) - print(header) - print( - "\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" - % (cost.mean, gops, bandwith) - ) - - with vta.build_config(): - run_test("NORMAL", print_ir) - print("") - - gemm_normal(False) - gemm_unittest(False) - alu_unittest(False) - - def _run(env, remote): - print("========GEMM 128=========") - run_gemm_packed(env, remote, 128, 128, 128) - - vta.testing.run(_run) - - -if __name__ == "__main__": - test_gemm() diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py deleted file mode 100644 index 64f9ec2debae..000000000000 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ /dev/null @@ -1,318 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -"""Testing topi conv2d operator for VTA""" - -import json -import os - -import pytest -import numpy as np -from collections import namedtuple - -import tvm -from tvm import te -from tvm import relay -from tvm import autotvm -from tvm.contrib import utils -from tvm.contrib.pickle_memoize import memoize -from tvm import topi -import tvm.topi.testing -import vta -from vta import program_fpga, reconfig_runtime -import vta.testing -from vta.testing import simulator - - -Workload = namedtuple( - "Conv2DWorkload", - [ - "batch", - "height", - "width", - "in_filter", - "out_filter", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", - ], -) - -# Get batch info from env -env = vta.get_env() - -# ResNet18 workloads -resnet_wkls = [ - # Workloads of resnet18 on imagenet - # ('resnet-18.C1', Workload(env.BATCH, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)), - ("resnet-18.C2", Workload(env.BATCH, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)), - ("resnet-18.C3", Workload(env.BATCH, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)), - ("resnet-18.C4", Workload(env.BATCH, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)), - ("resnet-18.C5", Workload(env.BATCH, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)), - ("resnet-18.C6", Workload(env.BATCH, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)), - ("resnet-18.C7", Workload(env.BATCH, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)), - ("resnet-18.C8", Workload(env.BATCH, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)), - ("resnet-18.C9", Workload(env.BATCH, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)), - ("resnet-18.C10", Workload(env.BATCH, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)), - ("resnet-18.C11", Workload(env.BATCH, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)), -] - -# FIXME: we need a custom clip operator to circumvent a pattern detection limitation -@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return x - - -def run_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, samples=4): - - # Workload assertions - assert wl.hpad == wl.wpad - - # Perform packing only if we are targeting the accelerator - if "arm_cpu" in target.keys: - data_pack = False - layout = "NCHW" - conv2d_fcompute = topi.arm_cpu.conv2d_nchw_spatial_pack - conv2d_fschedule = topi.arm_cpu.schedule_conv2d_nchw_spatial_pack - elif "vta" in target.keys: - data_pack = True - layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) - conv2d_fcompute = vta.top.conv2d_packed - conv2d_fschedule = vta.top.schedule_conv2d_packed - - # Derive shapes depending upon packing - a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) - b_shape = (wl.batch, wl.out_filter, 1, 1) - if data_pack: - data_shape = ( - wl.batch // env.BATCH, - wl.in_filter // env.BLOCK_IN, - wl.height, - wl.width, - env.BATCH, - env.BLOCK_IN, - ) - kernel_shape = ( - wl.out_filter // env.BLOCK_OUT, - wl.in_filter // env.BLOCK_IN, - wl.hkernel, - wl.wkernel, - env.BLOCK_OUT, - env.BLOCK_IN, - ) - bias_shape = ( - wl.batch // env.BATCH, - wl.out_filter // env.BLOCK_OUT, - 1, - 1, - env.BATCH, - env.BLOCK_OUT, - ) - else: - data_shape = a_shape - kernel_shape = w_shape - bias_shape = b_shape - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = te.placeholder(kernel_shape, 
name="kernel", dtype=env.wgt_dtype) - bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) - padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) - - # Define base computation schedule - with target: - if data_pack: - res = conv2d_fcompute( - data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), layout, env.acc_dtype - ) - else: - res = conv2d_fcompute( - data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), env.acc_dtype - ) - res = topi.right_shift(res, 8) - res = topi.add(res, bias) - res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) - res = topi.cast(res, env.out_dtype) - # Derive base schedule - s = conv2d_fschedule([res]) - if print_ir: - print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) - - # Derive number of ops - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - num_ops = ( - 2 - * wl.batch - * fout_height - * fout_width - * wl.hkernel - * wl.wkernel - * wl.out_filter - * wl.in_filter - ) - - # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") - def get_ref_data(): - # derive min max for act, wgt, and bias types (max non inclusive) - a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) - w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) - b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), 1 << ( - env.INP_WIDTH + env.WGT_WIDTH - 2 - ) - a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) - w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) - b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype) - r_np = tvm.topi.testing.conv2d_nchw_python( - a_np.astype(env.acc_dtype), - w_np.astype(env.acc_dtype), - (wl.hstride, wl.wstride), - wl.hpad, - ).astype(env.acc_dtype) - return a_np, w_np, b_np, r_np - - # Data in original format - data_np, kernel_np, bias_np, res_ref = get_ref_data() - if data_pack: - data_np = data_np.reshape( - wl.batch // env.BATCH, - env.BATCH, - wl.in_filter // env.BLOCK_IN, - env.BLOCK_IN, - wl.height, - wl.width, - ).transpose((0, 2, 4, 5, 1, 3)) - kernel_np = kernel_np.reshape( - wl.out_filter // env.BLOCK_OUT, - env.BLOCK_OUT, - wl.in_filter // env.BLOCK_IN, - env.BLOCK_IN, - wl.hkernel, - wl.wkernel, - ).transpose((0, 2, 4, 5, 1, 3)) - bias_np = bias_np.reshape( - wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT - ) - - # Build - if "vta" in target.keys: - with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}): - mod = vta.build( - s, - [data, kernel, bias, res], - target=tvm.target.Target(target, host=env.target_host), - name="conv2d", - ) - else: - mod = tvm.build( - s, - [data, kernel, bias, res], - target=tvm.target.Target(target, host=env.target_host), - name="conv2d", - ) - temp = utils.tempdir() - mod.save(temp.relpath("conv2d.o")) - remote.upload(temp.relpath("conv2d.o")) - f = remote.load_module("conv2d.o") - dev = remote.device(str(target)) - - res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, dev) - kernel_arr = tvm.nd.array(kernel_np, dev) - bias_arr = tvm.nd.array(bias_np, dev) - res_arr = tvm.nd.array(res_np, dev) - time_f = f.time_evaluator("conv2d", dev, number=samples) - - # In vta sim mode, collect simulator runtime statistics - stats = {} - cost = None - if env.TARGET in ["sim", "tsim"]: - # Check if we're in local RPC mode (allows us to rebuild the - # runtime on the fly when 
varying the VTA designs) - local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) - if local_rpc: - if env.TARGET == "sim": - remote.get_function("vta.simulator.profiler_clear")() - else: - remote.get_function("vta.tsim.profiler_clear")() - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - if env.TARGET == "sim": - stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) - else: - stats = json.loads(remote.get_function("vta.tsim.profiler_status")()) - else: - simulator.clear_stats() - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - stats = simulator.stats() - else: - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - - # Check correctness - correct = False - if check_correctness: - res_orig = res_arr.numpy() - if data_pack: - res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape( - wl.batch, wl.out_filter, fout_height, fout_width - ) - bias_np = bias_np.transpose((0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1) - res_ref = res_ref >> env.WGT_WIDTH - res_ref += bias_np - res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) - res_ref = res_ref.astype(env.out_dtype) - correct = np.allclose(res_orig, res_ref) - - gops = (num_ops / cost.mean) / float(10**9) - status = "PASSED" if correct else "FAILED" - if "arm_cpu" in target.keys: - device = "CPU" - elif "vta" in target.keys: - device = "VTA" - print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) - - return correct, cost, stats - - -@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) -def test_conv2d(device): - def _run(env, remote): - if device == "vta": - target = env.target - if env.TARGET not in ["sim", "tsim", "intelfocl"]: - assert tvm.runtime.enabled("rpc") - program_fpga(remote, bitstream=None) - reconfig_runtime(remote) - elif device == "arm_cpu": - target = env.target_vta_cpu - with autotvm.tophub.context(target): # load pre-tuned schedule parameters - for _, wl in resnet_wkls: - print(wl) - run_conv2d(env, remote, wl, target) - - vta.testing.run(_run) - - -if __name__ == "__main__": - test_conv2d(device="arm_cpu") - test_conv2d(device="vta") diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py deleted file mode 100644 index b0ea2fc113df..000000000000 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ /dev/null @@ -1,305 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -"""Testing topi conv2d_transpose operator for VTA""" - -import json -import os - -import pytest -import numpy as np -from collections import namedtuple - -import tvm -from tvm import te -from tvm import relay -from tvm import autotvm -from tvm.contrib import utils -from tvm.contrib.pickle_memoize import memoize -from tvm import topi -import tvm.topi.testing -import vta -from vta import program_fpga, reconfig_runtime -import vta.testing -from vta.testing import simulator - - -Workload = namedtuple( - "Conv2DTransposeWorkload", - [ - "batch", - "height", - "width", - "in_filter", - "out_filter", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", - "o_hpad", - "o_wpad", - ], -) - -# Get batch info from env -env = vta.get_env() - -# DCGAN workloads -dcgan_wklds = [ - # dcgan - ("DCGAN.CT1", Workload(env.BATCH, 4, 4, 1024, 512, 4, 4, 1, 1, 2, 2, 0, 0)), - ("DCGAN.CT2", Workload(env.BATCH, 8, 8, 512, 256, 4, 4, 1, 1, 2, 2, 0, 0)), - ("DCGAN.CT3", Workload(env.BATCH, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2, 0, 0)), -] - -# FIXME: we need a custom clip operator to circumvent a pattern detection limitation -@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return x - - -# Helper function to get factors -def _find_factors(n): - factors = [] - for f in range(1, n + 1): - if n % f == 0: - factors.append(f) - return factors - - -def run_conv2d_transpose( - env, remote, wl, target, check_correctness=True, print_ir=False, samples=4 -): - - # Workload assertions - assert wl.hpad == wl.wpad - - # Perform packing only if we are targeting the accelerator - if "arm_cpu" in target.keys: - data_pack = False - layout = "NCHW" - fcompute = topi.arm_cpu.conv2d_transpose_nchw - fschedule = topi.arm_cpu.schedule_conv2d_transpose_nchw - elif "vta" in target.keys: - data_pack = True - layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) - fcompute = vta.top.conv2d_transpose_packed - fschedule = vta.top.schedule_conv2d_transpose_packed - - # Derive shapes depending upon packing - - a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) - w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel) - if data_pack: - data_shape = ( - wl.batch // env.BATCH, - wl.in_filter // env.BLOCK_IN, - wl.height, - wl.width, - env.BATCH, - env.BLOCK_IN, - ) - kernel_shape = ( - wl.out_filter // env.BLOCK_OUT, - wl.in_filter // env.BLOCK_IN, - wl.hkernel, - wl.wkernel, - env.BLOCK_OUT, - env.BLOCK_IN, - ) - else: - data_shape = a_shape - kernel_shape = w_shape - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) - - # Define base computation schedule - with target: - - res = fcompute( - data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype, (wl.o_hpad, wl.o_wpad) - ) - res = topi.right_shift(res, env.WGT_WIDTH) - res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) - res = topi.cast(res, env.out_dtype) - # Derive base schedule - s = fschedule([res]) - if print_ir: - print(vta.lower(s, [data, kernel, res], simple_mode=True)) - - # Derive number of ops - fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel + 
wl.o_hpad - fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel + wl.o_wpad - num_ops = ( - 2 - * wl.batch - * fout_height - * fout_width - * wl.hkernel - * wl.wkernel - * wl.out_filter - * wl.in_filter - ) - - # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") - def get_ref_data(): - # derive min max for act and wgt types (max non inclusive) - a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) - w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) - a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) - w_np = np.random.randint( - w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel) - ).astype(kernel.dtype) - r_np = tvm.topi.testing.conv2d_transpose_nchw_python( - a_np.astype(env.acc_dtype), - w_np.astype(env.acc_dtype), - (wl.hstride, wl.wstride), - wl.hpad, - (wl.o_hpad, wl.o_wpad), - ).astype(env.acc_dtype) - return a_np, w_np, r_np - - # Data in original format - data_np, kernel_np, res_ref = get_ref_data() - if data_pack: - data_np = data_np.reshape( - wl.batch // env.BATCH, - env.BATCH, - wl.in_filter // env.BLOCK_IN, - env.BLOCK_IN, - wl.height, - wl.width, - ).transpose((0, 2, 4, 5, 1, 3)) - kernel_np = kernel_np.reshape( - wl.in_filter // env.BLOCK_IN, - env.BLOCK_IN, - wl.out_filter // env.BLOCK_OUT, - env.BLOCK_OUT, - wl.hkernel, - wl.wkernel, - ).transpose((2, 0, 4, 5, 3, 1)) - kernel_np = np.flip(kernel_np, 2) - kernel_np = np.flip(kernel_np, 3) - - # Build - if "vta" in target.keys: - with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}): - mod = vta.build( - s, - [data, kernel, res], - target=target, - target_host=env.target_host, - name="conv2d_transpose", - ) - else: - mod = tvm.build( - s, - [data, kernel, res], - target=target, - target_host=env.target_host, - name="conv2d_transpose", - ) - temp = utils.tempdir() - mod.save(temp.relpath("conv2d_transpose.o")) - remote.upload(temp.relpath("conv2d_transpose.o")) - f = remote.load_module("conv2d_transpose.o") - dev = remote.device(str(target)) - - res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, dev) - kernel_arr = tvm.nd.array(kernel_np, dev) - res_arr = tvm.nd.array(res_np, dev) - time_f = f.time_evaluator("conv2d_transpose", dev, number=samples) - - # In vta sim mode, collect simulator runtime statistics - stats = {} - cost = None - if env.TARGET in ["sim", "tsim"]: - # Check if we're in local RPC mode (allows us to rebuild the - # runtime on the fly when varying the VTA designs) - local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) - if local_rpc: - if env.TARGET == "sim": - remote.get_function("vta.simulator.profiler_clear")() - else: - remote.get_function("vta.tsim.profiler_clear")() - cost = time_f(data_arr, kernel_arr, res_arr) - if env.TARGET == "sim": - stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) - else: - stats = json.loads(remote.get_function("vta.tsim.profiler_status")()) - else: - simulator.clear_stats() - cost = time_f(data_arr, kernel_arr, res_arr) - stats = simulator.stats() - else: - cost = time_f(data_arr, kernel_arr, res_arr) - - # Check correctness - correct = False - if check_correctness: - res_orig = res_arr.numpy() - if data_pack: - res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape( - wl.batch, wl.out_filter, fout_height, fout_width - ) - res_ref = res_ref >> env.WGT_WIDTH - res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) - res_ref = res_ref.astype(env.out_dtype) 
- correct = np.allclose(res_orig, res_ref) - - gops = (num_ops / cost.mean) / float(10**9) - status = "PASSED" if correct else "FAILED" - if "arm_cpu" in target.keys: - device = "CPU" - elif "vta" in target.keys: - device = "VTA" - print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) - - return correct, cost, stats - - -@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) -def test_conv2d_transpose(device): - def _run(env, remote): - if device == "vta": - target = env.target - if env.TARGET not in ["sim", "tsim"]: - assert tvm.runtime.enabled("rpc") - program_fpga(remote, bitstream=None) - reconfig_runtime(remote) - elif device == "arm_cpu": - target = env.target_vta_cpu - with autotvm.tophub.context(target): # load pre-tuned schedule parameters - for _, wl in dcgan_wklds: - print(wl) - run_conv2d_transpose(env, remote, wl, target) - - vta.testing.run(_run) - - -if __name__ == "__main__": - test_conv2d_transpose(device="arm_cpu") - test_conv2d_transpose(device="vta") diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py deleted file mode 100644 index 45a400b24e8d..000000000000 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ /dev/null @@ -1,215 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -"""Testing topi gemm operator for VTA""" - -import os -import json -from collections import namedtuple - -import numpy as np - -import tvm -from tvm import te -from tvm import autotvm -from tvm.contrib import utils -from tvm.contrib.pickle_memoize import memoize -from tvm import topi -import tvm.topi.testing -import vta -from vta import program_fpga, reconfig_runtime -import vta.testing -from vta.testing import simulator - -# FIXME: we need a custom clip operator to circumvent a pattern detection limitation -@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return x - - -def run_gemm( - env, - remote, - target, - batch_size, - in_feat, - out_feat, - check_correctness=True, - print_ir=True, - samples=4, -): - - # Perform packing only if we are targeting the accelerator - if "arm_cpu" in target.keys: - data_pack = False - elif "vta" in target.keys: - data_pack = True - - # Derive shapes depending upon packing - a_shape = (batch_size, in_feat) - w_shape = (out_feat, in_feat) - if data_pack: - data_shape = (batch_size // env.BATCH, in_feat // env.BLOCK_IN, env.BATCH, env.BLOCK_IN) - kernel_shape = ( - out_feat // env.BLOCK_OUT, - in_feat // env.BLOCK_IN, - env.BLOCK_OUT, - env.BLOCK_IN, - ) - fcompute = vta.top.dense_packed - fschedule = vta.top.schedule_dense_packed - else: - data_shape = a_shape - kernel_shape = w_shape - fcompute = topi.x86.dense_nopack - fschedule = topi.x86.schedule_dense_nopack - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - - # Define base computation schedule - with target: - res = fcompute(data, kernel, None, env.acc_dtype) - res = topi.right_shift(res, 8) - res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) - res = topi.cast(res, env.out_dtype) - # Derive base schedule - s = fschedule([res]) - if print_ir: - print(vta.lower(s, [data, kernel, res], simple_mode=True)) - - # Derive number of ops - num_ops = 2 * batch_size * in_feat * out_feat - - # @memoize("vta.tests.test_benchmark_topi.dense.verify") - def get_ref_data(): - # derive min max for act, wgt types (max non inclusive) - a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) - w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) - a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) - w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) - - r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype( - env.acc_dtype - ) - return a_np, w_np, r_np - - # Data in original format - data_np, kernel_np, res_ref = get_ref_data() - if data_pack: - data_np = data_np.reshape( - batch_size // env.BATCH, env.BATCH, in_feat // env.BLOCK_IN, env.BLOCK_IN - ).transpose((0, 2, 1, 3)) - kernel_np = kernel_np.reshape( - out_feat // env.BLOCK_OUT, env.BLOCK_OUT, in_feat // env.BLOCK_IN, env.BLOCK_IN - ).transpose((0, 2, 1, 3)) - - # Build - if "vta" in target.keys: - mod = vta.build( - s, - [data, kernel, res], - target=tvm.target.Target(target, host=env.target_host), - name="dense", - ) - else: - mod = tvm.build( - s, - [data, kernel, res], - target=tvm.target.Target(target, 
host=env.target_host), - name="dense", - ) - temp = utils.tempdir() - mod.save(temp.relpath("dense.o")) - remote.upload(temp.relpath("dense.o")) - f = remote.load_module("dense.o") - dev = remote.device(str(target)) - - res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, dev) - kernel_arr = tvm.nd.array(kernel_np, dev) - res_arr = tvm.nd.array(res_np, dev) - time_f = f.time_evaluator("dense", dev, number=samples) - - # In vta sim mode, collect simulator runtime statistics - stats = {} - cost = None - if env.TARGET in ["sim", "tsim"]: - # Check if we're in local RPC mode (allows us to rebuild the - # runtime on the fly when varying the VTA designs) - local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) - if local_rpc: - if env.TARGET == "sim": - remote.get_function("vta.simulator.profiler_clear")() - else: - remote.get_function("vta.tsim.profiler_clear")() - cost = time_f(data_arr, kernel_arr, res_arr) - if env.TARGET == "sim": - stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) - else: - stats = json.loads(remote.get_function("vta.tsim.profiler_status")()) - else: - simulator.clear_stats() - cost = time_f(data_arr, kernel_arr, res_arr) - stats = simulator.stats() - else: - cost = time_f(data_arr, kernel_arr, res_arr) - - # Check correctness - correct = False - if check_correctness: - res_orig = res_arr.numpy() - if data_pack: - res_orig = res_orig.reshape(batch_size, out_feat) - res_ref = res_ref >> 8 - res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) - res_ref = res_ref.astype(env.out_dtype) - correct = np.allclose(res_orig, res_ref) - - gops = (num_ops / cost.mean) / float(10**9) - status = "PASSED" if correct else "FAILED" - if "arm_cpu" in target.keys: - device = "CPU" - elif "vta" in target.keys: - device = "VTA" - print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) - - return correct, cost, stats - - -def test_gemm(device="vta", batch=128, in_feat=128, out_feat=128): - def _run(env, remote): - if device == "vta": - target = env.target - if env.TARGET not in ["sim", "tsim"]: - assert tvm.runtime.enabled("rpc") - program_fpga(remote, bitstream=None) - reconfig_runtime(remote) - elif device == "arm_cpu": - target = env.target_vta_cpu - with autotvm.tophub.context(target): # load pre-tuned schedule parameters - run_gemm(env, remote, target, batch, in_feat, out_feat) - - vta.testing.run(_run) - - -if __name__ == "__main__": - test_gemm("vta", 16, 512, 1008) diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py deleted file mode 100644 index bc9efa05f329..000000000000 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ /dev/null @@ -1,315 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Testing topi group conv2d operator for VTA""" - -import json -import os - -import pytest -import numpy as np -from collections import namedtuple - -import tvm -from tvm import te -from tvm import relay -from tvm import autotvm -from tvm.contrib import utils -from tvm import topi -import tvm.topi.testing -import vta -from vta import program_fpga, reconfig_runtime -import vta.testing -from vta.testing import simulator - - -Workload = namedtuple( - "GroupConv2DWorkload", - [ - "batch", - "height", - "width", - "in_filter", - "out_filter", - "groups", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", - ], -) - -# Get batch info from env -env = vta.get_env() - -# Mobilenet (grouped variant) workloads -mobilenet_wkls = [ - ("mobilenet.D1", Workload(env.BATCH, 112, 112, 32, 32, 2, 3, 3, 1, 1, 1, 1)), - ("mobilenet.D2", Workload(env.BATCH, 112, 112, 64, 64, 4, 3, 3, 1, 1, 2, 2)), - ("mobilenet.D3", Workload(env.BATCH, 56, 56, 128, 128, 8, 3, 3, 1, 1, 1, 1)), - ("mobilenet.D4", Workload(env.BATCH, 56, 56, 128, 128, 8, 3, 3, 1, 1, 2, 2)), - ("mobilenet.D5", Workload(env.BATCH, 28, 28, 256, 256, 16, 3, 3, 1, 1, 1, 1)), - ("mobilenet.D6", Workload(env.BATCH, 28, 28, 256, 256, 16, 3, 3, 1, 1, 2, 2)), - ("mobilenet.D7", Workload(env.BATCH, 14, 14, 512, 512, 32, 3, 3, 1, 1, 1, 1)), - ("mobilenet.D8", Workload(env.BATCH, 14, 14, 512, 512, 32, 3, 3, 1, 1, 2, 2)), - ("mobilenet.D9", Workload(env.BATCH, 7, 7, 1024, 1024, 64, 3, 3, 1, 1, 1, 1)), -] - -# FIXME: we need a custom clip operator to circumvent a pattern detection limitation -@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) -def my_clip(x, a_min, a_max): - """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.tir.const(a_min, x.dtype) - const_max = tvm.tir.const(a_max, x.dtype) - x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") - x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") - return x - - -def run_group_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, samples=4): - - # Workload assertions - assert wl.hpad == wl.wpad - - # Perform packing only if we are targeting the accelerator - if "arm_cpu" in target.keys: - data_pack = False - layout = "NCHW" - fcompute = topi.nn.group_conv2d_nchw - fschedule = topi.generic.schedule_group_conv2d_nchw - elif "vta" in target.keys: - data_pack = True - layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) - fcompute = vta.top.group_conv2d_packed - fschedule = vta.top.schedule_group_conv2d_packed - - # Derive shapes depending upon packing - CI_G = wl.in_filter // wl.groups - a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, CI_G, wl.hkernel, wl.wkernel) - b_shape = (wl.batch, wl.out_filter, 1, 1) - if data_pack: - data_shape = ( - wl.batch // env.BATCH, - wl.in_filter // env.BLOCK_IN, - wl.height, - wl.width, - env.BATCH, - env.BLOCK_IN, - ) - kernel_shape = ( - wl.out_filter // env.BLOCK_OUT, - CI_G // env.BLOCK_IN, - wl.hkernel, - wl.wkernel, - env.BLOCK_OUT, - env.BLOCK_IN, - ) - bias_shape = ( - wl.batch // env.BATCH, - wl.out_filter // env.BLOCK_OUT, - 1, - 1, - env.BATCH, - env.BLOCK_OUT, - ) - else: - data_shape = a_shape - kernel_shape = w_shape - bias_shape = b_shape - data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = 
te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) - padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) - - # Define base computation schedule - with target: - res = fcompute( - data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), wl.groups, env.acc_dtype - ) - res = topi.right_shift(res, 8) - res = topi.add(res, bias) - res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) - res = topi.cast(res, env.out_dtype) - # Derive base schedule - s = fschedule([res]) - if print_ir: - print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) - - # Derive number of ops - fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1 - fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1 - num_ops = ( - 2 - * wl.batch - * fout_height - * fout_width - * wl.hkernel - * wl.wkernel - * wl.out_filter - * wl.in_filter - // wl.groups - ) - - def get_ref_data(): - # derive min max for act, wgt, and bias types (max non inclusive) - a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) - w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) - b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), 1 << ( - env.INP_WIDTH + env.WGT_WIDTH - 2 - ) - a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) - w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype) - b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype) - r_np = tvm.topi.testing.conv2d_nchw_python( - a_np.astype(env.acc_dtype), - w_np.astype(env.acc_dtype), - (wl.hstride, wl.wstride), - wl.hpad, - wl.groups, - ).astype(env.acc_dtype) - return a_np, w_np, b_np, r_np - - # Data in original format - data_np, kernel_np, bias_np, res_ref = get_ref_data() - if data_pack: - data_np = data_np.reshape( - wl.batch // env.BATCH, - env.BATCH, - wl.in_filter // env.BLOCK_IN, - env.BLOCK_IN, - wl.height, - wl.width, - ).transpose((0, 2, 4, 5, 1, 3)) - kernel_np = kernel_np.reshape( - wl.out_filter // env.BLOCK_OUT, - env.BLOCK_OUT, - CI_G // env.BLOCK_IN, - env.BLOCK_IN, - wl.hkernel, - wl.wkernel, - ).transpose((0, 2, 4, 5, 1, 3)) - bias_np = bias_np.reshape( - wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT - ) - - # Build - if "vta" in target.keys: - with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}): - mod = vta.build( - s, - [data, kernel, bias, res], - target=tvm.target.Target(target, host=env.target_host), - name="conv2d", - ) - else: - mod = tvm.build( - s, - [data, kernel, bias, res], - target=tvm.target.Target(target, host=env.target_host), - name="conv2d", - ) - temp = utils.tempdir() - mod.save(temp.relpath("conv2d.o")) - remote.upload(temp.relpath("conv2d.o")) - f = remote.load_module("conv2d.o") - dev = remote.device(str(target)) - - res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype) - data_arr = tvm.nd.array(data_np, dev) - kernel_arr = tvm.nd.array(kernel_np, dev) - bias_arr = tvm.nd.array(bias_np, dev) - res_arr = tvm.nd.array(res_np, dev) - time_f = f.time_evaluator("conv2d", dev, number=samples) - - # In vta sim mode, collect simulator runtime statistics - stats = {} - cost = None - if env.TARGET in ["sim", "tsim"]: - # Check if we're in local RPC mode (allows us to rebuild the - # runtime on the fly when varying the VTA designs) - local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) - if local_rpc: - if env.TARGET == "sim": - remote.get_function("vta.simulator.profiler_clear")() - else: - 
remote.get_function("vta.tsim.profiler_clear")() - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - if env.TARGET == "sim": - stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) - else: - stats = json.loads(remote.get_function("vta.tsim.profiler_status")()) - else: - simulator.clear_stats() - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - stats = simulator.stats() - else: - cost = time_f(data_arr, kernel_arr, bias_arr, res_arr) - - # Check correctness - correct = False - if check_correctness: - res_orig = res_arr.numpy() - if data_pack: - res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape( - wl.batch, wl.out_filter, fout_height, fout_width - ) - bias_np = bias_np.transpose((0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1) - res_ref = res_ref >> env.WGT_WIDTH - res_ref += bias_np - res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) - res_ref = res_ref.astype(env.out_dtype) - correct = np.allclose(res_orig, res_ref) - - gops = (num_ops / cost.mean) / float(10**9) - status = "PASSED" if correct else "FAILED" - if "arm_cpu" in target.keys: - device = "CPU" - elif "vta" in target.keys: - device = "VTA" - print( - "%s GROUP CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" - % (device, status, cost.mean, gops) - ) - - return correct, cost, stats - - -@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) -def test_conv2d(device): - def _run(env, remote): - if device == "vta": - target = env.target - if env.TARGET not in ["sim", "tsim"]: - assert tvm.runtime.enabled("rpc") - program_fpga(remote, bitstream=None) - reconfig_runtime(remote) - elif device == "arm_cpu": - target = env.target_vta_cpu - with autotvm.tophub.context(target): # load pre-tuned schedule parameters - for _, wl in mobilenet_wkls: - print(wl) - run_group_conv2d(env, remote, wl, target) - - vta.testing.run(_run) - - -if __name__ == "__main__": - test_conv2d(device="arm_cpu") - test_conv2d(device="vta") diff --git a/vta/tests/python/pynq/test_program_rpc.py b/vta/tests/python/pynq/test_program_rpc.py deleted file mode 100644 index 5e471531ef5b..000000000000 --- a/vta/tests/python/pynq/test_program_rpc.py +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import os -import tvm -from tvm import te -from tvm import rpc -from vta import get_bitstream_path, download_bitstream, program_fpga, reconfig_runtime - -host = os.environ.get("VTA_RPC_HOST", "pynq") -port = int(os.environ.get("VTA_RPC_PORT", "9091")) - - -def program_rpc_bitstream(path=None): - """Program the FPGA on the RPC server - - Parameters - ---------- - path : path to bitstream (optional) - """ - assert tvm.runtime.enabled("rpc") - remote = rpc.connect(host, port) - program_fpga(remote, path) - - -def reconfig_rpc_runtime(): - """Reconfig the RPC server runtime""" - assert tvm.runtime.enabled("rpc") - remote = rpc.connect(host, port) - reconfig_runtime(remote) - - -program_rpc_bitstream() -reconfig_rpc_runtime() diff --git a/vta/tests/python/unittest/test_environment.py b/vta/tests/python/unittest/test_environment.py deleted file mode 100644 index 61219b615ddc..000000000000 --- a/vta/tests/python/unittest/test_environment.py +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import vta - - -def test_env(): - env = vta.get_env() - mock = env.mock - assert mock.alu == "skip_alu" - - -def test_env_scope(): - env = vta.get_env() - cfg = env.cfg_dict - cfg["TARGET"] = "xyz" - with vta.Environment(cfg): - assert vta.get_env().TARGET == "xyz" - assert vta.get_env().TARGET == env.TARGET - - -if __name__ == "__main__": - test_env() - test_env_scope() diff --git a/vta/tests/python/unittest/test_vta_insn.py b/vta/tests/python/unittest/test_vta_insn.py deleted file mode 100644 index 12012dc322d0..000000000000 --- a/vta/tests/python/unittest/test_vta_insn.py +++ /dev/null @@ -1,569 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Unit test VTA's instructions """ -import tvm -from tvm import te -import numpy as np -from tvm import topi -from tvm.contrib import utils - -import vta -import vta.testing -from vta.testing import simulator - -np.random.seed(0xDEADB) - - -def test_save_load_out(): - """Test save/store output command""" - - def _run(env, remote): - n = 6 - x = te.placeholder((n, n, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype) - x_buf = te.compute((n, n, env.BATCH, env.BLOCK_OUT), lambda *i: x(*i), "x_buf") - # insert no-op that won't be optimized away - y_buf = te.compute((n, n, env.BATCH, env.BLOCK_OUT), lambda *i: x_buf(*i) >> 0, "y_buf") - y = te.compute( - (n, n, env.BATCH, env.BLOCK_OUT), lambda *i: y_buf(*i).astype(env.inp_dtype), "y" - ) - # schedule - s = te.create_schedule(y.op) - s[x_buf].set_scope(env.acc_scope) - s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy) - s[y_buf].set_scope(env.acc_scope) - s[y_buf].pragma(y_buf.op.axis[0], env.alu) - s[y].pragma(y.op.axis[0], env.dma_copy) - - # verification - with vta.build_config(): - m = vta.build(s, [x, y], tvm.target.Target("ext_dev", host=env.target_host)) - - if not remote: - return - temp = utils.tempdir() - m.save(temp.relpath("load_act.o")) - remote.upload(temp.relpath("load_act.o")) - f = remote.load_module("load_act.o") - # verify - dev = remote.ext_dev(0) - x_np = np.random.randint(1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype(x.dtype) - y_np = x_np.astype(y.dtype) - x_nd = tvm.nd.array(x_np, dev) - y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype) - - if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - - f(x_nd, y_nd) - - np.testing.assert_equal(y_np, y_nd.numpy()) - - if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("Save load execution statistics:") - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - - vta.testing.run(_run) - - -def test_padded_load(): - """Test padded load.""" - - def _run(env, remote): - def check_padded_load(pad_before, pad_after, test_name=None): - # declare - n = 3 - m = 5 - x = te.placeholder((n, m, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype) - x_buf = topi.nn.pad(x, pad_before, pad_after, name="y") - # insert no-op that won't be optimized away - y_buf = te.compute( - ( - n + pad_before[0] + pad_after[0], - m + pad_before[1] + pad_after[1], - env.BATCH, - env.BLOCK_OUT, - ), - lambda *i: x_buf(*i) >> 0, - "y_buf", - ) - y = te.compute( - ( - n + pad_before[0] + pad_after[0], - m + pad_before[1] + pad_after[1], - env.BATCH, - env.BLOCK_OUT, - ), - lambda *i: y_buf(*i).astype(env.inp_dtype), - "y", - ) - # schedule - s = te.create_schedule(y.op) - s[x_buf].set_scope(env.acc_scope) - s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy) - s[y_buf].set_scope(env.acc_scope) - s[y_buf].pragma(y_buf.op.axis[0], env.alu) - s[y].pragma(y.op.axis[0], env.dma_copy) - # build - with vta.build_config(): - mod = vta.build(s, [x, y], tvm.target.Target("ext_dev", host=env.target_host)) - - if not remote: - return - temp = utils.tempdir() - mod.save(temp.relpath("padded_load.o")) - remote.upload(temp.relpath("padded_load.o")) - f = remote.load_module("padded_load.o") - # verify - dev = remote.ext_dev(0) - x_np = np.random.randint(0, 10, size=(n, m, env.BATCH, env.BLOCK_OUT)).astype(x.dtype) - y_np = np.zeros( - ( - n + pad_before[0] + pad_after[0], - m + pad_before[1] + pad_after[1], - env.BATCH, - env.BLOCK_OUT, - ) - ).astype(y.dtype) - y_np[pad_before[0] : pad_before[0] + n, pad_before[1] : pad_before[1] + m, :] = x_np - x_nd = 
tvm.nd.array(x_np, dev) - y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype) - - if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - - f(x_nd, y_nd) - - np.testing.assert_equal(y_np, y_nd.numpy()) - - if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("Padded {} load execution statistics:".format(test_name)) - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - - check_padded_load([2, 0, 0, 0], [0, 0, 0, 0], test_name="Y0") - check_padded_load([0, 2, 0, 0], [0, 0, 0, 0], test_name="Y1") - check_padded_load([0, 0, 0, 0], [2, 0, 0, 0], test_name="X0") - check_padded_load([0, 0, 0, 0], [0, 2, 0, 0], test_name="X1") - check_padded_load([1, 1, 0, 0], [1, 1, 0, 0], test_name="all") - - vta.testing.run(_run) - - -def test_gemm(): - """Test GEMM.""" - - def _run(env, remote): - # declare - o = 4 - n = 1 - m = 4 - x = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="x", dtype=env.inp_dtype) - w = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="w", dtype=env.wgt_dtype) - x_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i), "x_buf") - w_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: w(*i), "w_buf") - ko = te.reduce_axis((0, n), name="ko") - ki = te.reduce_axis((0, env.BLOCK_IN), name="ki") - y_gem = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), - lambda bo, co, bi, ci: te.sum( - x_buf[bo, ko, bi, ki].astype(env.acc_dtype) - * w_buf[co, ko, ci, ki].astype(env.acc_dtype), - axis=[ko, ki], - ), - name="y_gem", - ) - y_shf = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_gem(*i) >> 8, name="y_shf" - ) - y_max = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(y_shf(*i), 0), "y_max" - ) # relu - y_min = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), - lambda *i: tvm.te.min(y_max(*i), (1 << (env.INP_WIDTH - 1)) - 1), - "y_min", - ) # relu - y = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_min(*i).astype(env.inp_dtype), name="y" - ) - - if not remote: - return - - def verify(s, name=None): - # Build with the CSE pass disabled as otherwise it would complicate the test - with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}): - mod = vta.build(s, [x, w, y], tvm.target.Target("ext_dev", host=env.target_host)) - temp = utils.tempdir() - mod.save(temp.relpath("gemm.o")) - remote.upload(temp.relpath("gemm.o")) - f = remote.load_module("gemm.o") - # verify - dev = remote.ext_dev(0) - x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype( - x.dtype - ) - w_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype( - w.dtype - ) - y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype) - x_nd = tvm.nd.array(x_np, dev) - w_nd = tvm.nd.array(w_np, dev) - y_nd = tvm.nd.array(y_np, dev) - y_np = y_np.astype(env.acc_dtype) - for b in range(o): - for i in range(m): - for j in range(n): - y_np[b, i, :] += np.dot( - x_np[b, j, :].astype(env.acc_dtype), w_np[i, j].T.astype(env.acc_dtype) - ) - y_np = np.right_shift(y_np, 8) - y_np = np.clip(y_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(y.dtype) - - if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - - f(x_nd, w_nd, y_nd) - - np.testing.assert_equal(y_np, y_nd.numpy()) - - if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("GEMM schedule:{} execution statistics:".format(name)) - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - - def test_schedule1(): - # default schedule with no 
smt - s = te.create_schedule(y.op) - # set the scope of the SRAM buffers - s[x_buf].set_scope(env.inp_scope) - s[w_buf].set_scope(env.wgt_scope) - s[y_gem].set_scope(env.acc_scope) - s[y_shf].set_scope(env.acc_scope) - s[y_max].set_scope(env.acc_scope) - s[y_min].set_scope(env.acc_scope) - # set pragmas for DMA transfer and ALU ops - s[x_buf].compute_at(s[y_gem], ko) - s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy) - s[w_buf].compute_at(s[y_gem], ko) - s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy) - s[y_shf].pragma(s[y_shf].op.axis[0], env.alu) - s[y_max].pragma(s[y_max].op.axis[0], env.alu) - s[y_min].pragma(s[y_min].op.axis[0], env.alu) - s[y].pragma(s[y].op.axis[0], env.dma_copy) - # tensorization - s[y_gem].reorder( - ko, - s[y_gem].op.axis[0], - s[y_gem].op.axis[1], - s[y_gem].op.axis[2], - s[y_gem].op.axis[3], - ki, - ) - s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm) - verify(s, name="default") - - def test_smt(): - # test smt schedule - s = te.create_schedule(y.op) - s[x_buf].set_scope(env.inp_scope) - s[w_buf].set_scope(env.wgt_scope) - s[y_gem].set_scope(env.acc_scope) - s[y_shf].set_scope(env.acc_scope) - s[y_max].set_scope(env.acc_scope) - s[y_min].set_scope(env.acc_scope) - abo, aco, abi, aci = s[y].op.axis - abo1, abo2 = s[y].split(abo, nparts=2) - s[y].bind(abo1, te.thread_axis("cthread")) - s[y_gem].compute_at(s[y], abo1) - s[y_shf].compute_at(s[y], abo1) - s[y_max].compute_at(s[y], abo1) - s[y_min].compute_at(s[y], abo1) - s[y_gem].reorder( - ko, - s[y_gem].op.axis[0], - s[y_gem].op.axis[1], - s[y_gem].op.axis[2], - s[y_gem].op.axis[3], - ki, - ) - s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm) - s[y_shf].pragma(s[y_shf].op.axis[0], env.alu) - s[y_max].pragma(s[y_max].op.axis[0], env.alu) - s[y_min].pragma(s[y_min].op.axis[0], env.alu) - s[x_buf].compute_at(s[y_gem], ko) - s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy) - s[w_buf].compute_at(s[y_gem], ko) - s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy) - s[y].pragma(abo2, env.dma_copy) - verify(s, name="smt") - - test_schedule1() - test_smt() - - vta.testing.run(_run) - - -def test_alu(): - def _run(env, remote): - def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): - """Test ALU""" - m = 8 - n = 8 - imm = np.random.randint(1, 5) - # compute - a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) - a_buf = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf" - ) # DRAM->SRAM - if use_imm: - res_buf = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), imm), "res_buf" - ) # compute - else: - b = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype) - b_buf = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf" - ) # DRAM->SRAM - res_buf = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), - lambda *i: tvm_op(a_buf(*i), b_buf(*i)), - "res_buf", - ) # compute5B - res = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), - lambda *i: res_buf(*i).astype(env.inp_dtype), - "res", - ) # SRAM->DRAM - # schedule - s = te.create_schedule(res.op) - s[a_buf].set_scope(env.acc_scope) # SRAM - s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM - s[res_buf].set_scope(env.acc_scope) # SRAM - s[res_buf].pragma(res_buf.op.axis[0], env.alu) # compute - s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM - if not use_imm: - s[b_buf].set_scope(env.acc_scope) # SRAM - s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy) # DRAM->SRAM - - if not remote: - return - - # build - with 
vta.build_config(): - if use_imm: - mod = vta.build(s, [a, res], tvm.target.Target("ext_dev", host=env.target_host)) - else: - mod = vta.build( - s, [a, b, res], tvm.target.Target("ext_dev", host=env.target_host) - ) - temp = utils.tempdir() - mod.save(temp.relpath("load_act.o")) - remote.upload(temp.relpath("load_act.o")) - f = remote.load_module("load_act.o") - # verify - dev = remote.ext_dev(0) - a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) - if use_imm: - res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm) - else: - b_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype( - b.dtype - ) - res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np) - res_np = res_np.astype(res.dtype) - a_nd = tvm.nd.array(a_np, dev) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) - - if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - - if use_imm: - f(a_nd, res_nd) - else: - b_nd = tvm.nd.array(b_np, dev) - f(a_nd, b_nd, res_nd) - - np.testing.assert_equal(res_np, res_nd.numpy()) - - if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("ALU {} execution statistics:".format(test_name)) - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - - check_alu(lambda x, y: x << y, np.left_shift, use_imm=True, test_name="SHL") - check_alu(tvm.te.max, np.maximum, use_imm=True, test_name="MAX") - check_alu(tvm.te.max, np.maximum, test_name="MAX") - check_alu(lambda x, y: x + y, use_imm=True, test_name="ADD") - check_alu(lambda x, y: x + y, test_name="ADD") - check_alu(lambda x, y: x >> y, np.right_shift, use_imm=True, test_name="SHR") - - vta.testing.run(_run) - - -def test_relu(): - """Test RELU on ALU""" - - def _run(env, remote): - m = 8 - n = 10 - # compute - a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) - a_buf = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf" - ) # DRAM->SRAM - max_buf = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(a_buf(*i), 0), "res_buf" - ) # relu - min_buf = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), - lambda *i: tvm.te.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1), - "max_buf", - ) # relu - res = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), - lambda *i: min_buf(*i).astype(env.inp_dtype), - "min_buf", - ) # SRAM->DRAM - # schedule - s = te.create_schedule(res.op) - s[a_buf].set_scope(env.acc_scope) # SRAM - s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM - s[max_buf].set_scope(env.acc_scope) # SRAM - s[min_buf].set_scope(env.acc_scope) # SRAM - s[max_buf].pragma(max_buf.op.axis[0], env.alu) # compute - s[min_buf].pragma(min_buf.op.axis[0], env.alu) # compute - s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM - # build - with vta.build_config(): - mod = vta.build(s, [a, res], tvm.target.Target("ext_dev", host=env.target_host)) - if not remote: - return - temp = utils.tempdir() - mod.save(temp.relpath("load_act.o")) - remote.upload(temp.relpath("load_act.o")) - f = remote.load_module("load_act.o") - # verify - dev = remote.ext_dev(0) - a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) - res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype) - a_nd = tvm.nd.array(a_np, dev) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) - - if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - - f(a_nd, res_nd) - 
- np.testing.assert_equal(res_np, res_nd.numpy()) - - if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("Relu execution statistics:") - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - - vta.testing.run(_run) - - -def test_shift_and_scale(): - """Test shift and scale on ALU""" - - def _run(env, remote): - m = 2 - n = 8 - imm_shift = np.random.randint(0, 8) - imm_scale = np.random.randint(1, 5) - # compute - a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) - a_buf = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf" - ) # DRAM->SRAM - res_shift = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a_buf(*i) + imm_shift, "res_shift" - ) # compute - res_scale = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_shift(*i) >> imm_scale, "res_scale" - ) # compute - res = te.compute( - (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_scale(*i).astype(env.inp_dtype), "res" - ) # SRAM->DRAM - # schedule - s = te.create_schedule(res.op) - s[a_buf].set_scope(env.acc_scope) # SRAM - s[res_shift].set_scope(env.acc_scope) # SRAM - s[res_scale].set_scope(env.acc_scope) # SRAM - s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM - s[res_shift].pragma(res_shift.op.axis[0], env.alu) # compute - s[res_scale].pragma(res_scale.op.axis[0], env.alu) # compute - s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM - # build - mod = vta.build(s, [a, res], tvm.target.Target("ext_dev", host=env.target_host)) - if not remote: - return - temp = utils.tempdir() - mod.save(temp.relpath("load_act.o")) - remote.upload(temp.relpath("load_act.o")) - f = remote.load_module("load_act.o") - # verify - dev = remote.ext_dev(0) - a_np = np.random.randint(-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype) - res_np = np.right_shift((a_np + imm_shift), imm_scale) - res_np = res_np.astype(res.dtype) - a_nd = tvm.nd.array(a_np, dev) - res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev) - - if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - - f(a_nd, res_nd) - - np.testing.assert_equal(res_np, res_nd.numpy()) - - if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("Shift and scale execution statistics:") - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - - vta.testing.run(_run) - - -def test_runtime_array(): - def _run(env, remote): - n = 100 - dev = remote.ext_dev(0) - x_np = np.random.randint(1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype("int8") - x_nd = tvm.nd.array(x_np, dev) - np.testing.assert_equal(x_np, x_nd.numpy()) - - vta.testing.run(_run) - - -if __name__ == "__main__": - test_runtime_array() - test_save_load_out() - test_padded_load() - test_gemm() - test_alu() - test_relu() - test_shift_and_scale() diff --git a/vta/tutorials/README.txt b/vta/tutorials/README.txt deleted file mode 100644 index c1ff4ca0444d..000000000000 --- a/vta/tutorials/README.txt +++ /dev/null @@ -1,5 +0,0 @@ -.. _vta-tutorials: - -VTA Tutorials -============= -This page contains tutorials about VTA and how to use TVM/Relay to target VTA. diff --git a/vta/tutorials/frontend/README.txt b/vta/tutorials/frontend/README.txt deleted file mode 100644 index 7adec27a9bc0..000000000000 --- a/vta/tutorials/frontend/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. 
_vta-tutorial-frontend: - -Compile Deep Learning Models ----------------------------- diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py deleted file mode 100644 index 0c430e8f9c6c..000000000000 --- a/vta/tutorials/frontend/deploy_detection.py +++ /dev/null @@ -1,322 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Deploy Pretrained Vision Detection Model from Darknet on VTA -============================================================ -**Author**: `Hua Jiang `_ - -This tutorial provides an end-to-end demo, on how to run Darknet YoloV3-tiny -inference onto the VTA accelerator design to perform Image detection tasks. -It showcases Relay as a front end compiler that can perform quantization (VTA -only supports int8/32 inference) as well as graph packing (in order to enable -tensorization in the core) to massage the compute graph for the hardware target. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use the autotvm package in tvm, we need to install some extra dependencies. -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install "Pillow<7" -# -# YOLO-V3-tiny Model with Darknet parsing have dependancy with CFFI and CV2 library, -# we need to install CFFI and CV2 before executing this script. -# -# .. code-block:: bash -# -# pip3 install cffi -# pip3 install opencv-python -# -# Now return to the python code. Import packages. 
- -from __future__ import absolute_import, print_function - -import sys -import os -import time -import matplotlib.pyplot as plt -import numpy as np -import tvm -import vta -from tvm import rpc, autotvm, relay -from tvm.relay.testing import yolo_detection, darknet -from tvm.relay.testing.darknet import __darknetffi__ -from tvm.contrib import graph_executor, utils -from tvm.contrib.download import download_testdata -from vta.testing import simulator -from vta.top import graph_pack - -# Make sure that TVM was compiled with RPC=1 -assert tvm.runtime.enabled("rpc") - -############################################################################## -# Download yolo net configure file, weight file, darknet library file based on -# Model Name -# ---------------------------------------------------------------------------- -MODEL_NAME = "yolov3-tiny" -REPO_URL = "https://github.com/dmlc/web-data/blob/main/darknet/" - -cfg_path = download_testdata( - "https://github.com/pjreddie/darknet/blob/master/cfg/" + MODEL_NAME + ".cfg" + "?raw=true", - MODEL_NAME + ".cfg", - module="darknet", -) -weights_path = download_testdata( - "https://pjreddie.com/media/files/" + MODEL_NAME + ".weights" + "?raw=true", - MODEL_NAME + ".weights", - module="darknet", -) - -if sys.platform in ["linux", "linux2"]: - darknet_lib_path = download_testdata( - REPO_URL + "lib/" + "libdarknet2.0.so" + "?raw=true", "libdarknet2.0.so", module="darknet" - ) -elif sys.platform == "darwin": - darknet_lib_path = download_testdata( - REPO_URL + "lib_osx/" + "libdarknet_mac2.0.so" + "?raw=true", - "libdarknet_mac2.0.so", - module="darknet", - ) -else: - raise NotImplementedError("Darknet lib is not supported on {} platform".format(sys.platform)) - -################################################## -# Download yolo categories and illustration front. -# ------------------------------------------------ -coco_path = download_testdata( - REPO_URL + "data/" + "coco.names" + "?raw=true", "coco.names", module="data" -) -font_path = download_testdata( - REPO_URL + "data/" + "arial.ttf" + "?raw=true", "arial.ttf", module="data" -) -with open(coco_path) as f: - content = f.readlines() -names = [x.strip() for x in content] - -######################################## -# Define the platform and model targets. -# -------------------------------------- -# Execute on CPU vs. VTA, and define the model. - -# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file -env = vta.get_env() -# Set ``device=arm_cpu`` to run inference on the CPU -# or ``device=vta`` to run inference on the FPGA. -device = "vta" -target = env.target if device == "vta" else env.target_vta_cpu - -pack_dict = { - "yolov3-tiny": ["nn.max_pool2d", "cast", 4, 186], -} - -# Name of Darknet model to compile -# The ``start_pack`` and ``stop_pack`` labels indicate where -# to start and end the graph packing relay pass: in other words -# where to start and finish offloading to VTA. -# the number 4 indicate the ``start_pack`` index is 4, the -# number 186 indicate the ``stop_pack index`` is 186, by using -# name and index number, here we can located to correct place -# where to start/end when there are multiple ``nn.max_pool2d`` -# or ``cast``, print(mod.astext(show_meta_data=False)) can help -# to find operator name and index information. -assert MODEL_NAME in pack_dict - -############################# -# Obtain an execution remote. -# --------------------------- -# When target is 'pynq' or other FPGA backend, reconfigure FPGA and runtime. 
-# Otherwise, if target is 'sim', execute locally. - -if env.TARGET not in ["sim", "tsim"]: - # Get remote from tracker node if environment variable is set. - # To set up the tracker, you'll need to follow the "Auto-tuning - # a convolutional network for VTA" tutorial. - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - # Otherwise if you have a device you want to program directly from - # the host, make sure you've set the variables below to the IP of - # your board. - device_host = os.environ.get("VTA_RPC_HOST", "192.168.2.99") - device_port = os.environ.get("VTA_RPC_PORT", "9091") - if not tracker_host or not tracker_port: - remote = rpc.connect(device_host, int(device_port)) - else: - remote = autotvm.measure.request_remote( - env.TARGET, tracker_host, int(tracker_port), timeout=10000 - ) - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - reconfig_start = time.time() - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - -# In simulation mode, host the RPC server locally. -else: - remote = rpc.LocalSession() - -# Get execution context from remote -ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - -##################################### -# Build the inference graph executor. -# ----------------------------------- -# Using Darknet library load downloaded vision model and compile with Relay. -# The compilation steps are: -# -# 1. Front end translation from Darknet into Relay module. -# 2. Apply 8-bit quantization: here we skip the first conv layer, -# and dense layer which will both be executed in fp32 on the CPU. -# 3. Perform graph packing to alter the data layout for tensorization. -# 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). -# 5. Perform relay build to object file. -# 6. Load the object file onto remote (FPGA device). -# 7. Generate graph executor, `m`. 
- -# Load pre-configured AutoTVM schedules -with autotvm.tophub.context(target): - net = __darknetffi__.dlopen(darknet_lib_path).load_network( - cfg_path.encode("utf-8"), weights_path.encode("utf-8"), 0 - ) - dshape = (env.BATCH, net.c, net.h, net.w) - dtype = "float32" - - # Measure build start time - build_start = time.time() - - # Start front end compilation - mod, params = relay.frontend.from_darknet(net, dtype=dtype, shape=dshape) - - if target.device_name == "vta": - # Perform quantization in Relay - # Note: We set opt_level to 3 in order to fold batch norm - with tvm.transform.PassContext(opt_level=3): - with relay.quantize.qconfig( - global_scale=23.0, - skip_conv_layers=[0], - store_lowbit_output=True, - round_for_shift=True, - ): - mod = relay.quantize.quantize(mod, params=params) - # Perform graph packing and constant folding for VTA target - mod = graph_pack( - mod["main"], - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name=pack_dict[MODEL_NAME][0], - stop_name=pack_dict[MODEL_NAME][1], - start_name_idx=pack_dict[MODEL_NAME][2], - stop_name_idx=pack_dict[MODEL_NAME][3], - ) - else: - mod = mod["main"] - - # Compile Relay program with AlterOpLayout disabled - with vta.build_config(disabled_pass={"AlterOpLayout", "tir.CommonSubexprElimTIR"}): - lib = relay.build( - mod, target=tvm.target.Target(target, host=env.target_host), params=params - ) - - # Measure Relay build time - build_time = time.time() - build_start - print(MODEL_NAME + " inference graph built in {0:.2f}s!".format(build_time)) - - # Send the inference library over to the remote RPC server - temp = utils.tempdir() - lib.export_library(temp.relpath("graphlib.tar")) - remote.upload(temp.relpath("graphlib.tar")) - lib = remote.load_module("graphlib.tar") - - # Graph executor - m = graph_executor.GraphModule(lib["default"](ctx)) - -#################################### -# Perform image detection inference. 
-# ---------------------------------- -# We run detect on an downloaded image -# Download test image -[neth, netw] = dshape[2:] -test_image = "person.jpg" -img_url = REPO_URL + "data/" + test_image + "?raw=true" -img_path = download_testdata(img_url, test_image, "data") -data = darknet.load_image(img_path, neth, netw).transpose(1, 2, 0) - -# Prepare test image for inference -plt.imshow(data) -plt.show() -data = data.transpose((2, 0, 1)) -data = data[np.newaxis, :] -data = np.repeat(data, env.BATCH, axis=0) - -# Set the network parameters and inputs -m.set_input("data", data) - -# Perform inference and gather execution statistics -# More on: :py:method:`tvm.runtime.Module.time_evaluator` -num = 4 # number of times we run module for a single measurement -rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) - -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - timer() - sim_stats = simulator.stats() - print("\nExecution statistics:") - for k, v in sim_stats.items(): - # Since we execute the workload many times, we need to normalize stats - # Note that there is always one warm up run - # Therefore we divide the overall stats by (num * rep + 1) - print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) -else: - tcost = timer() - std = np.std(tcost.results) * 1000 - mean = tcost.mean * 1000 - print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) - print("Average per sample inference time: %.2fms" % (mean / env.BATCH)) - -# Get detection results from out -thresh = 0.5 -nms_thresh = 0.45 -tvm_out = [] -for i in range(2): - layer_out = {} - layer_out["type"] = "Yolo" - # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total) - layer_attr = m.get_output(i * 4 + 3).numpy() - layer_out["biases"] = m.get_output(i * 4 + 2).numpy() - layer_out["mask"] = m.get_output(i * 4 + 1).numpy() - out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3]) - layer_out["output"] = m.get_output(i * 4).numpy().reshape(out_shape) - layer_out["classes"] = layer_attr[4] - tvm_out.append(layer_out) - thresh = 0.560 - -# Show detection results -img = darknet.load_image_color(img_path) -_, im_h, im_w = img.shape -dets = yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh, 1, tvm_out) -last_layer = net.layers[net.n - 1] -yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh) -yolo_detection.draw_detections(font_path, img, dets, thresh, names, last_layer.classes) -plt.imshow(img.transpose(1, 2, 0)) -plt.show() diff --git a/vta/tutorials/matrix_multiply.py b/vta/tutorials/matrix_multiply.py deleted file mode 100644 index 0d1167854458..000000000000 --- a/vta/tutorials/matrix_multiply.py +++ /dev/null @@ -1,474 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _basic-mat-mult: - -Simple Matrix Multiply -====================== -**Author**: `Thierry Moreau `_ - -In this tutorial, we will build on top of the :ref:`vta-get-started` tutorial -and introduce additional concepts required to implement matrix multiplication -on VTA with the TVM workflow. -""" - -###################################################################### -# RPC Setup -# --------- -# We start by programming the Pynq's FPGA and building its RPC runtime -# as we did in the VTA introductory tutorial. - -from __future__ import absolute_import, print_function - -import os -import tvm -from tvm import te -import vta -import numpy as np -from tvm import rpc -from tvm.contrib import utils -from vta.testing import simulator - -# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file -env = vta.get_env() - -# We read the Pynq RPC host IP address and port number from the OS environment -host = os.environ.get("VTA_RPC_HOST", "192.168.2.99") -port = int(os.environ.get("VTA_RPC_PORT", "9091")) - -# We configure both the bitstream and the runtime system on the Pynq -# to match the VTA configuration specified by the vta_config.json file. -if env.TARGET == "pynq" or env.TARGET == "de10nano": - - # Make sure that TVM was compiled with RPC=1 - assert tvm.runtime.enabled("rpc") - remote = rpc.connect(host, port) - - # Reconfigure the JIT runtime - vta.reconfig_runtime(remote) - - # Program the FPGA with a pre-compiled VTA bitstream. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - vta.program_fpga(remote, bitstream=None) - -# In simulation mode, host the RPC server locally. -elif env.TARGET in ["sim", "tsim"]: - remote = rpc.LocalSession() - -###################################################################### -# Computation Declaration -# ----------------------- -# In this example we describe a simple matrix multiplication addition, which -# requires multiple computation stages, as shown in the dataflow diagram below. -# First we describe the input tensors :code:`A` and :code:`B` that are living -# in main memory. -# Second, we need to declare intermediate tensors :code:`A_buf` and -# :code:`B_buf`, which will live in VTA's on-chip buffers. -# Having this extra computational stage allows us to explicitly -# stage cached reads and writes. -# Third, we describe the matrix multiplication computation over -# :code:`A_buf` and :code:`B_buf` to produce the product matrix :code:`C_buf`. -# The last operation is a cast and copy back to DRAM, into results tensor -# :code:`C`. -# -# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/gemm_dataflow.png -# :align: center - -###################################################################### -# Data Layout -# ~~~~~~~~~~~ -# We describe the placeholder tensors :code:`A`, and :code:`B` in a tiled data -# format to match the data layout requirements imposed by the VTA tensor core. - -###################################################################### -# .. note:: -# -# **Data Tiling** -# -# One source of complexity when targeting accelerators is to make sure -# that the data layout matches the layout imposed by the accelerator design. 
-# VTA is designed around a *tensor core* that performs, one matrix-matrix -# operation per cycle between an activation matrix and a weight matrix, -# adding the result matrix to an accumulator matrix, as shown in the -# figure below. -# -# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/tensor_core.png -# :align: center -# :width: 480px -# -# The dimensions of that matrix-matrix multiplication are specified in -# the :code:`vta_config.json` configuration file. -# The activation matrix has a :code:`(BATCH, BLOCK_IN)` shape -# and the transposed weight matrix has a :code:`(BLOCK_OUT, BLOCK_IN)` shape, -# thus inferring that the resulting output matrix has a -# :code:`(BATCH, BLOCK_OUT)` shape. -# Consequently input and output tensors processed by VTA need to be -# tiled according to these aforementioned dimension. -# -# The diagram below shows the impact of data tiling on a matrix that is -# originally of shape (4, 8). -# Tiling by a (2, 2) tile shape ensures that data within each tile is -# contiguous. -# The resulting tiled tensor has a shape of (2, 4, 2, 2). -# -# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/data_tiling.png -# :align: center -# :width: 480px -# -# We first define the variables :code:`m`, :code:`n`, :code:`o` to represent -# the shape of the matrix multiplication. These variables are multiplicative -# factors over the :code:`BLOCK_OUT`, :code:`BLOCK_IN`, and :code:`BATCH` -# tensor dimensions respectively. -# By default, the configuration file sets :code:`BATCH`, :code:`BLOCK_IN`, and -# :code:`BLOCK_OUT` to be 1, 16 and 16 respectively (:code:`BATCH` being set to -# 1 implies that our compute building block is vector-matrix multiply). -# - -###################################################################### -# .. note:: -# -# **Data Types** -# -# It's important to not only match the inner-tile -# dimension of VTA's tensor core, but also to match the specific data types -# expected by VTA. -# VTA for now only supports fixed point data types, which integer width is -# specified in the :code:`vta_config.json` file by :code:`INP_WIDTH` and -# :code:`WGT_WIDTH` for the activations and weights data types respectively. -# In addition, the accumulator data type integer width is specified by -# :code:`ACC_WIDTH`. -# -# By default, the configuration file sets :code:`INP_WIDTH` -# and :code:`WGT_WIDTH` to 8. -# The accumulator width :code:`ACC_WIDTH` is set to 32, in order to avoid -# overflow during accumulation. -# As a result, :code:`env.inp_dtype` and :code:`env.wgt_dtype` are all -# narrow 8-bit integers, while :code:`env.acc_dtype` is a standard 32-bit -# integer. 
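# For concreteness, a small NumPy sketch (illustrative only) of the tiling
# described in the note above: a (4, 8) matrix tiled by a (2, 2) tile becomes a
# (2, 4, 2, 2) tensor in which each (2, 2) tile is grouped together, matching
# the data-tiling figure. The reshape/transpose below performs exactly that
# layout change.
import numpy as np

_mat = np.arange(4 * 8).reshape(4, 8)
# Split the rows into 2 tiles of 2 and the columns into 4 tiles of 2,
# then bring the two tile indices to the front.
_tiled = _mat.reshape(2, 2, 4, 2).transpose(0, 2, 1, 3)
assert _tiled.shape == (2, 4, 2, 2)
assert (_tiled[0, 0] == [[0, 1], [8, 9]]).all()  # top-left 2x2 tile of _mat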
- -# Output channel factor m - total 16x16=256 output channels -m = 16 -# Input channel factor n - total 16x16=256 input channels -n = 16 -# Batch factor o (we use single batch inference) -o = 1 -# A placeholder tensor in tiled data format -A = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="A", dtype=env.inp_dtype) -# B placeholder tensor in tiled data format -B = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="B", dtype=env.wgt_dtype) -# A copy buffer -A_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: A(*i), "A_buf") -# B copy buffer -B_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: B(*i), "B_buf") - -###################################################################### -# Matrix Multiplication -# ~~~~~~~~~~~~~~~~~~~~~ -# Now we're ready to describe the matrix multiplication result tensor :code:`C`, -# with another compute operation. -# The compute function takes the shape of the tensor, as well as a lambda -# function that describes the computation rule for each position of the tensor. -# -# In order to implement matrix multiplication, the lambda function needs to -# include a reduction formula over the input channel dimension axes. -# To create a reduction formula, we can declare a reduction axis using -# :code:`te.reduce_axis`, which takes in the range of reductions. -# :code:`te.sum` takes in the expression to be reduced as well as -# the reduction axes to compute the sum of value over all k in the declared -# ranges. -# -# Note that the reduction needs to be performed over 32-bit :code:`env.acc_dtype` -# accumulator data types. -# -# No computation happens during this phase, as we are only declaring how -# the computation should be done. - -# Outer input feature reduction axis -ko = te.reduce_axis((0, n), name="ko") -# Inner input feature reduction axis -ki = te.reduce_axis((0, env.BLOCK_IN), name="ki") -# Describe the in-VTA matrix multiplication -C_buf = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), - lambda bo, co, bi, ci: te.sum( - A_buf[bo, ko, bi, ki].astype(env.acc_dtype) * B_buf[co, ko, ci, ki].astype(env.acc_dtype), - axis=[ko, ki], - ), - name="C_buf", -) - -###################################################################### -# Casting the Results -# ~~~~~~~~~~~~~~~~~~~ -# After the computation is done, we'll need to send the results computed by VTA -# back to main memory. - -###################################################################### -# .. note:: -# -# **Memory Store Restrictions** -# -# One specificity of VTA is that it only supports DRAM stores in the narrow -# :code:`env.inp_dtype` data type format. -# This lets us reduce the data footprint for memory transfers, but also lets -# us quantize the wide accumulator data type down to a data format that -# matches the input activation data type. -# This means that in the context of neural network inference, the outputs -# of a given layer after activation can be consumed directly by the next -# layer. -# -# We perform one last typecast operation to the narrow -# input activation data format. - -# Cast to output type, and send to main memory -C = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name="C" -) - -###################################################################### -# This concludes the computation declaration part of this tutorial. 
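# A side note on the final cast, as a plain NumPy sketch (the values here are
# arbitrary): casting the 32-bit accumulator straight down to a narrow integer
# type simply wraps around on overflow, which is why the GEMM unit test and the
# 2D convolution tutorial insert shift and clip stages before the cast.
import numpy as np

_acc = np.array([300, -200, 17], dtype="int32")
print(_acc.astype("int8"))                           # wraps around: [44, 56, 17]
print(np.clip(_acc >> 3, -128, 127).astype("int8"))  # shift + clip stays in range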
- -###################################################################### -# Scheduling the Computation -# -------------------------- -# While the above lines describes the computation rule, we can obtain -# :code:`C` in many ways. -# TVM asks the user to provide an implementation of the computation called -# *schedule*. -# -# A schedule is a set of transformations to an original computation that -# transforms the implementation of the computation without affecting -# correctness. -# This simple VTA programming tutorial aims to demonstrate basic schedule -# transformations that will map the original schedule down to VTA hardware -# primitives. - - -###################################################################### -# Default Schedule -# ~~~~~~~~~~~~~~~~ -# After we construct the schedule, by default the schedule computes -# :code:`C` in the following way: - -# Let's take a look at the generated schedule -s = te.create_schedule(C.op) -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# Although this schedule makes sense, it won't compile to VTA. -# In order to obtain correct code generation, we need to apply scheduling -# primitives and code annotation that will transform the schedule into -# one that can be directly lowered onto VTA hardware intrinsics. -# Those include: -# -# - DMA copy operations which will take globally-scoped tensors and copy -# those into locally-scoped tensors. -# - Tensor operations that will perform the matrix multiplication. - -###################################################################### -# Buffer Scopes -# ~~~~~~~~~~~~~ -# First, we set the scope of the buffers to tell TVM that these buffers -# will be living in the VTA's on-chip SRAM caches. -# Below, we tell TVM that :code:`A_buf`, :code:`B_buf`, :code:`C_buf` -# will respectively live in VTA's on-chip input, weight and accumulator -# memory. - -###################################################################### -# .. note:: -# -# **VTA's On-Chip SRAMs** -# -# VTA has three different memory scopes, each corresponding to different -# on-chip SRAM buffers. -# -# - :code:`env.inp_scope`: Input buffer, which is a read-only SRAM buffer -# that stores input matrices of shape :code:`(env.BATCH, env.BLOCK_IN)` -# of type :code:`env.inp_dtype`. The input buffer contains -# `2 ^ LOG_INP_BUFF_SIZE` matrix elements (as specified in the -# :code:`vta_config.json` file). -# - :code:`env.wgt_scope`: Weight buffer, which is a read-only SRAM buffer -# that stores weight matrices of shape :code:`(env.BLOCK_OUT, env.BLOCK_IN)` -# of type :code:`env.wgt_dtype`. The weight buffer contains -# `2 ^ LOG_WGT_BUFF_SIZE` matrix elements. -# - :code:`env.acc_scope`: Accumulator buffer, which is a read/write SRAM -# buffer that stores accumulator matrices of shape -# :code:`(env.BATCH, env.BLOCK_OUT)` of type :code:`env.acc_dtype`. -# The accumulator buffer is VTA's general purpose register file: it holds -# both intermediate results of convolutions and matrix multiplications -# as well as intermediate results of pooling, batch normalization, and -# activation layers. The accumulator buffer contains -# `2 ^ LOG_ACC_BUFF_SIZE` matrix elements. 
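# To put rough numbers on the note above, the size in bytes of a single tile in
# each scope can be estimated from the shape and data-width parameters. This is
# a back-of-the-envelope sketch; it assumes ``env`` exposes ``ACC_WIDTH`` the
# same way it exposes ``INP_WIDTH`` and ``WGT_WIDTH`` elsewhere in these
# tutorials.
_inp_tile_bytes = env.BATCH * env.BLOCK_IN * env.INP_WIDTH // 8
_wgt_tile_bytes = env.BLOCK_OUT * env.BLOCK_IN * env.WGT_WIDTH // 8
_acc_tile_bytes = env.BATCH * env.BLOCK_OUT * env.ACC_WIDTH // 8
print("approx. bytes per tile (inp, wgt, acc):",
      _inp_tile_bytes, _wgt_tile_bytes, _acc_tile_bytes)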
- -# Set the intermediate tensor's scope to VTA's on-chip buffers -s[A_buf].set_scope(env.inp_scope) -s[B_buf].set_scope(env.wgt_scope) -s[C_buf].set_scope(env.acc_scope) - -###################################################################### -# DMA Transfers -# ~~~~~~~~~~~~~ -# We need to schedule DMA transfers to move data living in DRAM to -# and from the VTA on-chip buffers. -# This can be achieved using the :code:`compute_at` schedule primitive -# which nests the copying of the buffers into the computation loop -# that performs the matrix multiplication. -# -# We insert :code:`dma_copy` pragmas to indicate to the compiler -# that the copy operations will be performed in bulk via DMA, -# which is common in hardware accelerators. -# Finally, we print the temporary schedule to observe the effects of -# moving the copy operations into the matrix multiplication loop. - -# Move buffer copy into matrix multiply loop -s[A_buf].compute_at(s[C_buf], ko) -s[B_buf].compute_at(s[C_buf], ko) - -# Tag the buffer copies with the DMA pragma to insert a DMA transfer -s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy) -s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy) -s[C].pragma(s[C].op.axis[0], env.dma_copy) - -# Let's take a look at the transformed schedule -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# Tensorization -# ~~~~~~~~~~~~~ -# The last step of the schedule transformation consists in applying -# *tensorization* to our schedule. -# Tensorization is analogous to vectorization, but extends the concept -# to a higher-dimensional unit of computation. -# Consequently, tensorization imposes data layout constraints as discussed -# earlier when declaring the data layout input placeholders. -# We've already arranged our tensors in a tiled format, so the next thing -# we need to perform is loop reordering to accommodate for tensorization. -# -# Here we choose to move the outermost reduction axis all the way out. -# This dictates that we first iterate over input channels, then batch -# dimensions, and finally output channels. -# Lastly, we apply the tensorization scheduling primitive :code:`tensorize` -# along the outer axis of the inner-most matrix matrix multiplication tensor -# block. -# We print the finalized schedule that is ready for code-generation -# by the VTA runtime JIT compiler. - -s[C_buf].reorder( - ko, s[C_buf].op.axis[0], s[C_buf].op.axis[1], s[C_buf].op.axis[2], s[C_buf].op.axis[3], ki -) -s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm) - -# Let's take a look at the finalized schedule -print(vta.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# This concludes the scheduling portion of this tutorial. - -###################################################################### -# TVM Compilation -# --------------- -# After we have finished specifying the schedule, we can compile it -# into a TVM function. - -# Build GEMM VTA kernel -my_gemm = vta.build( - s, [A, B, C], tvm.target.Target("ext_dev", host=env.target_host), name="my_gemm" -) - -# Write the compiled module into an object file. 
-temp = utils.tempdir() -my_gemm.save(temp.relpath("gemm.o")) - -# Send the executable over RPC -remote.upload(temp.relpath("gemm.o")) - -# Load the compiled module -f = remote.load_module("gemm.o") - -###################################################################### -# Running the Function -# -------------------- -# The compiled TVM function uses a concise C API and can be invoked from -# code language. -# -# TVM provides an array API in python to aid quick testing and prototyping. -# The array API is based on `DLPack `_ standard. -# -# - We first create a remote context (for remote execution on the Pynq). -# - Then :code:`tvm.nd.array` formats the data accordingly. -# - :code:`f()` runs the actual computation. -# - :code:`numpy()` copies the result array back in a format that can be -# interpreted. -# - -# Get the remote device context -ctx = remote.ext_dev(0) - -# Initialize the A and B arrays randomly in the int range of (-128, 128] -A_orig = np.random.randint(-128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype) -B_orig = np.random.randint(-128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype) - -# Apply packing to the A and B arrays from a 2D to a 4D packed layout -A_packed = A_orig.reshape(o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3)) -B_packed = B_orig.reshape(m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose((0, 2, 1, 3)) - -# Format the input/output arrays with tvm.nd.array to the DLPack standard -A_nd = tvm.nd.array(A_packed, ctx) -B_nd = tvm.nd.array(B_packed, ctx) -C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype), ctx) - -# Clear stats -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - -# Invoke the module to perform the computation -f(A_nd, B_nd, C_nd) - -###################################################################### -# Verifying Correctness -# --------------------- -# Compute the reference result with numpy and assert that the output of the -# matrix multiplication indeed is correct - -# Compute reference result with numpy -C_ref = np.dot(A_orig.astype(env.acc_dtype), B_orig.T.astype(env.acc_dtype)).astype(C.dtype) -C_ref = C_ref.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3)) -np.testing.assert_equal(C_ref, C_nd.numpy()) - -# Print stats -if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("Execution statistics:") - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - -print("Successful matrix multiply test!") - -###################################################################### -# Summary -# ------- -# This tutorial showcases the TVM workflow to implement a simple matrix -# multiplication example on VTA. -# The general workflow includes: -# -# - Programming the FPGA with the VTA bitstream over RPC. -# - Describing matrix multiplication via a series of computations. -# - Describing how we want to perform the computation using schedule primitives. -# - Compiling the function to the VTA target. -# - Running the compiled module and verifying it against a numpy implementation. 
-# diff --git a/vta/tutorials/optimize/README.txt b/vta/tutorials/optimize/README.txt deleted file mode 100644 index b051548c5351..000000000000 --- a/vta/tutorials/optimize/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Optimize Tensor Operators -------------------------- diff --git a/vta/tutorials/optimize/convolution_opt.py b/vta/tutorials/optimize/convolution_opt.py deleted file mode 100644 index 521a73ab510d..000000000000 --- a/vta/tutorials/optimize/convolution_opt.py +++ /dev/null @@ -1,458 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -2D Convolution Optimization -=========================== -**Author**: `Thierry Moreau `_ - -This tutorial provides an overview on how to use TVM to map a 2D convolution -workload efficiently on the VTA design. -We recommend covering the :ref:`vta-mat-mult-opt` tutorial first. - -2D convolution is dominant in most computer vision deep neural networks. -In this tutorial, we will demonstrate TVM schedule optimizations to map -2D convolution operators in NCHW layout onto VTA. -We also introduce the notion of latency hiding, which allows us to -maximize VTA's compute and memory resource utilization. -""" - -###################################################################### -# RPC Setup -# --------- -# We start by programming the Pynq's FPGA and building its RPC runtime. - -from __future__ import absolute_import, print_function - -import os -import tvm -import tvm.testing -from tvm import te -import vta -import numpy as np - -from tvm import rpc -from tvm.contrib import utils -from vta.testing import simulator - -# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file -env = vta.get_env() - -# We read the Pynq RPC host IP address and port number from the OS environment -host = os.environ.get("VTA_RPC_HOST", "192.168.2.99") -port = int(os.environ.get("VTA_RPC_PORT", "9091")) - -# We configure both the bitstream and the runtime system on the Pynq -# to match the VTA configuration specified by the vta_config.json file. -if env.TARGET == "pynq": - - # Make sure that TVM was compiled with RPC=1 - assert tvm.runtime.enabled("rpc") - remote = rpc.connect(host, port) - - # Reconfigure the JIT runtime - vta.reconfig_runtime(remote) - - # Program the FPGA with a pre-compiled VTA bitstream. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - vta.program_fpga(remote, bitstream=None) - -# In simulation mode, host the RPC server locally. -elif env.TARGET in ["sim", "tsim"]: - remote = rpc.LocalSession() - -###################################################################### -# Computation Declaration -# ----------------------- -# As a first step, we need to describe our 2D convolution computation -# in NCHW format. 
-# -# We define the 2D convolution shape by the batch size, -# spatial dimensions, input channels, output channels, kernel dimensions, -# kernel dimensions, padding dimensions, and stride dimensions. -# -# We pick the shape of the 9th convolutional layer of the ResNet-18 -# architecture as our convolution workload parameters. -# -# We've added extra operators to the 2D convolution that apply -# shifting and clipping to the output in order to mimic a fixed-point -# convolution followed by a rectified linear activation. -# We describe the TVM dataflow graph of the 2D convolution layer below: -# -# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/conv2d_dataflow.png -# :align: center -# -# This computation is intentionally too large to fit onto VTA's on-chip -# buffers all at once. Therefore in the scheduling phase we'll -# rely on computation blocking strategies to break the computation down into -# manageable chunks. -# -# .. note:: -# -# *Spatial padding* -# -# Note that we'll need to import the TOPI library to apply spatial padding -# on the input feature map tensor. -# Spatial padding facilitates blocking in the context of 2D convolutions -# due to the fact that the same (x, y) spatial location of the input -# feature map of any given layer is read more than once if the convolution -# kernel window size is greater than one. -# On CPUs, and GPUs, one way to increase efficiency of memory accesses -# when parallelizing work is spatial packing, which requires data re-layout. -# VTA load DMA engine can insert padding automatically so that the original -# input feature map does not have to be re-packed in memory. -# -# We show the effect of VTA's on the fly spatial padding when data is being -# loaded from DRAM into VTA's SRAM, following a 2D strided and padded memory -# read. -# -# .. 
image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/padding.png -# :align: center -# :width: 480px - -from tvm import topi - -# 2D convolution layer dimensions taken from ResNet-18 architecture -# (9th convolutional layer) -batch_size = 1 -height = 14 -width = 14 -in_channels = 256 -out_channels = 256 -kernel_h = 3 -kernel_w = 3 -pad_h = 1 -pad_w = 1 -stride_h = 1 -stride_w = 1 -assert batch_size % env.BATCH == 0 -assert in_channels % env.BLOCK_IN == 0 -assert out_channels % env.BLOCK_OUT == 0 - -# Input feature map: (N, IC, H, W, n, ic) -data_shape = ( - batch_size // env.BATCH, - in_channels // env.BLOCK_IN, - height, - width, - env.BATCH, - env.BLOCK_IN, -) -# Kernel: (OC, IC, H, W, oc, ic) -kernel_shape = ( - out_channels // env.BLOCK_OUT, - in_channels // env.BLOCK_IN, - kernel_h, - kernel_w, - env.BLOCK_OUT, - env.BLOCK_IN, -) -# Derive output feature map dimensions -fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1 -fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1 -# Output feature map: (N, OC, H, W, n, oc) -output_shape = ( - batch_size // env.BATCH, - out_channels // env.BLOCK_OUT, - fout_height, - fout_width, - env.BATCH, - env.BLOCK_OUT, -) - -# Convolution reduction axes -dy = te.reduce_axis((0, kernel_h), name="dy") -dx = te.reduce_axis((0, kernel_w), name="dx") -ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name="ic") -ic_tns = te.reduce_axis((0, env.BLOCK_IN), name="ic_tns") - -# Input placeholder tensors -data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) -kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - -# Copy buffers: -# Apply spatial padding to input feature map -data_buf = topi.nn.pad(data, [0, 0, pad_h, pad_w, 0, 0], name="data_buf") -kernel_buf = te.compute(kernel_shape, lambda *i: kernel(*i), "kernel_buf") - -# Declare 2D convolution -res_conv = te.compute( - output_shape, - lambda bo, co, i, j, bi, ci: te.sum( - data_buf[bo, ic, i * stride_h + dy, j * stride_w + dx, bi, ic_tns].astype(env.acc_dtype) - * kernel_buf[co, ic, dy, dx, ci, ic_tns].astype(env.acc_dtype), - axis=[ic, dy, dx, ic_tns], - ), - name="res_conv", -) - -# Add shift stage for fix-point normalization -res_shr = te.compute(output_shape, lambda *i: res_conv(*i) >> 8, name="res_shr") - -# Apply clipping between (0, input max value) -inp_max = (1 << (env.INP_WIDTH - 1)) - 1 -res_max = te.compute(output_shape, lambda *i: tvm.te.max(res_shr(*i), 0), "res_max") -res_min = te.compute(output_shape, lambda *i: tvm.te.min(res_max(*i), inp_max), "res_min") - -# Result Tensor -res = te.compute(output_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res") - - -###################################################################### -# Scheduling the Computation -# -------------------------- -# We'll look at a set of schedule transformations necessary to map the -# 2D convolution onto VTA in an efficient fashion. -# Those include: -# -# - Computation blocking -# - Virtual threading to increase compute utilization -# - Lowering to VTA hardware intrinsics - -# Create TVM schedule -s = te.create_schedule(res.op) -# Let's look at the default TVM schedule -print(tvm.lower(s, [data, kernel, res], simple_mode=True)) - -###################################################################### -# Blocking the Computation -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# The 2D convolution is by default too large for activations or kernel weights -# to fit on VTA's on-chip buffers all at once. 
-# We apply blocking along input channels, output channels, and along -# the height spatial dimensions. -# We don't apply blocking along the width spatial dimension since it's -# the innermost dimension in the NCHW layout (and consequently to increase -# locality, it's best not to block along the innermost dimension). - -# Let's define tiling sizes -b_block = 1 // env.BATCH -oc_block = 128 // env.BLOCK_OUT -ic_block = 16 // env.BLOCK_IN -h_block = 7 -w_block = 14 - -# Tile the output tensor along the spatial and output channel dimensions -# (since by default we are doing single batch inference, the split along -# the batch dimension has no effect) -b, oc, y, x, b_tns, oc_tns = s[res].op.axis -b_out, b_inn = s[res].split(b, factor=b_block) -oc_out, oc_inn = s[res].split(oc, factor=oc_block) -y_out, y_inn = s[res].split(y, factor=h_block) -x_out, x_inn = s[res].split(x, factor=w_block) -s[res].reorder(b_out, oc_out, y_out, x_out, b_inn, oc_inn, y_inn, x_inn, b_tns, oc_tns) - -# Move intermediate computation into each output compute tile -s[res_conv].compute_at(s[res], x_out) -s[res_shr].compute_at(s[res], x_out) -s[res_max].compute_at(s[res], x_out) -s[res_min].compute_at(s[res], x_out) - -# Apply additional loop split along reduction axis (input channel) -b_inn, oc_inn, y_inn, x_inn, b_tns, oc_tns = s[res_conv].op.axis -ic_out, ic_inn = s[res_conv].split(ic, factor=ic_block) - -# Reorder axes. -# 1) Group the VTA tensor axes in the inner most position: b_tns, oc_tns, ic_tns -# to allow TVM to tensorize. -# 2) We move the ic_out axis all the way out of the convolution loop to block -# along the reduction axis. -# 3) Now we re-order the block axes: b_inn, oc_inn, y_inn, x_inn, ic_inn, dy, dx. -# VTA runtime/hardware requires us to write to a different output feature map -# location for every VTA tensor operation. -# This restriction requires us to order one of oc_inn, y_inn or x_inn right -# before b_tns, since they all affect output feature map indexing. -# Therefore, we choose to bring x_inn inside as shown below. -s[res_conv].reorder(ic_out, b_inn, oc_inn, y_inn, ic_inn, dy, dx, x_inn, b_tns, oc_tns, ic_tns) - -###################################################################### -# Virtual Threading -# ~~~~~~~~~~~~~~~~~ -# Virtual threading is a mechanism that increases task-level pipeline -# parallelism in the VTA hardware design. -# Put it another way, it increases compute resource utilization by hiding -# memory access latency. -# -# In the implementation below, virtual threading distributes work across two -# threads split along the output channel axis. -# We show how work is split when computing the 2D convolution in the figure -# below. -# -# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/virtual_threading.png -# :align: center -# :width: 480px - -# VTA only supports 2 virtual threads -v_threads = 2 - -# Perform virtual thread split along output channel outer axis -_, tx = s[res].split(oc_out, factor=v_threads) -s[res].reorder(tx, b_out) -s[res].bind(tx, te.thread_axis("cthread")) - -# Let's look at the current TVM schedule after blocking and virtual threading -print(tvm.lower(s, [data, kernel, res], simple_mode=True)) - -###################################################################### -# Lowering Copies to DMA Transfers -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Next we set the buffer scopes to the corresponding on-chip VTA SRAM buffers. 
-# We move the load loops into the 2D convolution computation loop to stage -# memory loads such that they fit in the on-chip SRAM buffers. -# Finally we annotate the load/store loop outer axes with the DMA copy pragma -# to perform bulk memory transfers on VTA. - -# Set scope of SRAM buffers -s[data_buf].set_scope(env.inp_scope) -s[kernel_buf].set_scope(env.wgt_scope) -s[res_conv].set_scope(env.acc_scope) -s[res_shr].set_scope(env.acc_scope) -s[res_min].set_scope(env.acc_scope) -s[res_max].set_scope(env.acc_scope) - -# Block data and kernel cache reads -s[data_buf].compute_at(s[res_conv], ic_out) -s[kernel_buf].compute_at(s[res_conv], ic_out) - -# Use DMA copy pragma on DRAM->SRAM operations -s[data_buf].pragma(s[data_buf].op.axis[0], env.dma_copy) -s[kernel_buf].pragma(s[kernel_buf].op.axis[0], env.dma_copy) - -# Use DMA copy pragma on SRAM->DRAM operation in each result block -# (this implies that these copies should be performed along b_inn, -# or result axis 4) -s[res].pragma(s[res].op.axis[4], env.dma_copy) - -###################################################################### -# Lowering Computation to VTA Compute Intrinsics -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# The last phase is to lower the computation loops down to VTA hardware -# intrinsics by mapping the 2D convolution to tensor intrinsics, -# and mapping the shift, and clipping computation to the vector ALU. - -# Apply tensorization over the batch tensor tile axis -s[res_conv].tensorize(b_tns, env.gemm) - -# Add an ALU pragma over the shift and clipping operations -s[res_shr].pragma(s[res_shr].op.axis[0], env.alu) -s[res_min].pragma(s[res_min].op.axis[0], env.alu) -s[res_max].pragma(s[res_max].op.axis[0], env.alu) - -# Let's look at the final lowered TVM schedule after lowering memory -# loads/stores down to DMA copy intrinsics, and the computation down to -# VTA compute intrinsics. -print(vta.lower(s, [data, kernel, res], simple_mode=True)) - -###################################################################### -# TVM Compilation and Verification -# -------------------------------- -# After specifying the schedule, we can compile it into a TVM function. -# We save the module so we can send it over RPC. -# We run the function and verify it against a numpy implementation to -# ensure correctness. 
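#
# As an aside, the NCHW to NCHWnc packing used in the verification code further
# below can be illustrated with plain numpy. This is a standalone sketch with
# hypothetical tile sizes (a batch tile of 1 and a channel tile of 4) picked for
# readability; the actual tile sizes come from the VTA configuration.

import numpy as np

tile_n, tile_c = 1, 4
pack_n, pack_c, pack_h, pack_w = 1, 8, 2, 2
nchw = np.arange(pack_n * pack_c * pack_h * pack_w).reshape(pack_n, pack_c, pack_h, pack_w)

# Pack (N, C, H, W) into (N//n, C//c, H, W, n, c), mirroring the packing applied
# to the convolution inputs below
nchwnc = nchw.reshape(
    pack_n // tile_n, tile_n, pack_c // tile_c, tile_c, pack_h, pack_w
).transpose((0, 2, 4, 5, 1, 3))

# Undo the packing to confirm the re-layout is lossless
restored = nchwnc.transpose((0, 4, 1, 5, 2, 3)).reshape(pack_n, pack_c, pack_h, pack_w)
assert (restored == nchw).all()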
- -# This library facilitates 2D convolution testing -from tvm.topi.testing import conv2d_nchw_python - -# Compile the TVM module -with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}): - my_conv = vta.build( - s, [data, kernel, res], tvm.target.Target("ext_dev", host=env.target_host), name="my_conv" - ) -temp = utils.tempdir() -my_conv.save(temp.relpath("conv2d.o")) -remote.upload(temp.relpath("conv2d.o")) -f = remote.load_module("conv2d.o") - -# Get the remote device context -ctx = remote.ext_dev(0) - -# Initialize the data and kernel arrays randomly in the int range -# of (-128, 128] in NCHW layout -data_np = np.random.randint(-128, 128, size=(batch_size, in_channels, height, width)).astype( - data.dtype -) -kernel_np = np.random.randint( - -128, 128, size=(out_channels, in_channels, kernel_h, kernel_w) -).astype(kernel.dtype) - -# Apply packing to the data and kernel arrays from a 2D NCHW -# to a 4D NCHWnc packed layout -data_packed = data_np.reshape( - batch_size // env.BATCH, env.BATCH, in_channels // env.BLOCK_IN, env.BLOCK_IN, height, width -).transpose((0, 2, 4, 5, 1, 3)) - -kernel_packed = kernel_np.reshape( - out_channels // env.BLOCK_OUT, - env.BLOCK_OUT, - in_channels // env.BLOCK_IN, - env.BLOCK_IN, - kernel_h, - kernel_w, -).transpose((0, 2, 4, 5, 1, 3)) - -# Format the input/output arrays with tvm.nd.array to the DLPack standard -data_nd = tvm.nd.array(data_packed, ctx) -kernel_nd = tvm.nd.array(kernel_packed, ctx) -res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx) - -# Clear stats -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - -# Invoke the module to perform the computation -f(data_nd, kernel_nd, res_nd) - -# Verify against numpy implementation -res_ref = conv2d_nchw_python( - data_np.astype(env.acc_dtype), - kernel_np.astype(env.acc_dtype), - (stride_h, stride_w), - (pad_h, pad_w), -).astype(env.acc_dtype) -res_ref = res_ref >> env.INP_WIDTH -res_ref = np.clip(res_ref, 0, inp_max) -res_ref = res_ref.astype(res.dtype) -res_ref = res_ref.reshape( - ( - batch_size // env.BATCH, - env.BATCH, - out_channels // env.BLOCK_OUT, - env.BLOCK_OUT, - fout_height, - fout_width, - ) -).transpose((0, 2, 4, 5, 1, 3)) -tvm.testing.assert_allclose(res_ref, res_nd.numpy()) - -# Print stats -if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("Execution statistics:") - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - -print("Successful 2D convolution test!") - -###################################################################### -# Summary -# ------- -# This tutorial demonstrates how TVM scheduling primitives can be used to -# lower 2D convolution onto hardware accelerator intrinsics, making -# use of hardware specific optimizations, such as latency hiding with -# virtual threading. -# diff --git a/vta/tutorials/optimize/matrix_multiply_opt.py b/vta/tutorials/optimize/matrix_multiply_opt.py deleted file mode 100644 index b470475b16e7..000000000000 --- a/vta/tutorials/optimize/matrix_multiply_opt.py +++ /dev/null @@ -1,374 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _vta-mat-mult-opt: - -Matrix Multiply Blocking -======================== -**Author**: `Thierry Moreau `_ - -This tutorial provides an overview on how to use TVM to map matrix -multiplication efficiently on the VTA design. -We recommend covering the :ref:`basic-mat-mult` tutorial first. - -In this tutorial, we will demonstrate TVM schedule optimizations to break large -neural network operators down onto smaller blocks to achieve computation within -limited hardware accelerator resources. -""" - -###################################################################### -# RPC Setup -# --------- -# We start by programming the Pynq's FPGA and building its RPC runtime. - -from __future__ import absolute_import, print_function - -import os -import tvm -from tvm import te -import vta -import numpy as np -from tvm import rpc -from tvm.contrib import utils -from vta.testing import simulator - -# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file -env = vta.get_env() - -# We read the Pynq RPC host IP address and port number from the OS environment -host = os.environ.get("VTA_RPC_HOST", "192.168.2.99") -port = int(os.environ.get("VTA_RPC_PORT", "9091")) - -# We configure both the bitstream and the runtime system on the Pynq -# to match the VTA configuration specified by the vta_config.json file. -if env.TARGET == "pynq": - - # Make sure that TVM was compiled with RPC=1 - assert tvm.runtime.enabled("rpc") - remote = rpc.connect(host, port) - - # Reconfigure the JIT runtime - vta.reconfig_runtime(remote) - - # Program the FPGA with a pre-compiled VTA bitstream. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - vta.program_fpga(remote, bitstream=None) - -# In simulation mode, host the RPC server locally. -elif env.TARGET in ["sim", "tsim"]: - remote = rpc.LocalSession() - -###################################################################### -# Computation Declaration -# ----------------------- -# As a first step, we need to describe our matrix multiplication computation. -# We define the matrix multiplication as the computation one would find in a -# fully connected layer, defined by its batch size, input channels, and output -# channels. -# These have to be integer multiples of the VTA tensor shape: -# :code:`BATCH`, :code:`BLOCK_IN`, and :code:`BLOCK_OUT` respectively. -# -# We've added extra operators to the matrix multiplication that apply -# shifting and clipping to the output in order to mimic a fixed-point -# matrix multiplication followed by a rectified linear activation. -# We describe the TVM dataflow graph of the fully connected layer below: -# -# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/fc_dataflow.png -# :align: center -# -# This computation is intentionally too large to fit onto VTA's on-chip -# buffers all at once. Therefore in the scheduling phase we'll -# rely on computation blocking strategies to break the computation down into -# manageable chunks. 
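#
# Before declaring the computation, here is a plain numpy sketch of the blocking
# idea this tutorial builds towards: reducing over the 1024 input channels in
# 256-wide chunks and accumulating partial sums gives exactly the same result as
# one large reduction. The sizes below are illustrative and match the workload
# declared next.

import numpy as np

sketch_in, sketch_out, sketch_block = 1024, 1024, 256
a_skt = np.random.randint(-128, 128, size=(1, sketch_in)).astype("int32")
w_skt = np.random.randint(-128, 128, size=(sketch_out, sketch_in)).astype("int32")

# Unblocked reference: one big reduction over all input channels
full_skt = a_skt.dot(w_skt.T)

# Blocked version: accumulate 256-channel partial products, as the blocked
# schedule will do once the reduction axis is split
acc_skt = np.zeros((1, sketch_out), dtype="int64")
for start in range(0, sketch_in, sketch_block):
    acc_skt += a_skt[:, start : start + sketch_block].dot(
        w_skt[:, start : start + sketch_block].T
    )

assert (acc_skt == full_skt).all()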
- -# Fully connected layer dimensions: 1024 x 1024 -batch_size = 1 -in_channels = 1024 -out_channels = 1024 -assert batch_size % env.BATCH == 0 -assert in_channels % env.BLOCK_IN == 0 -assert out_channels % env.BLOCK_OUT == 0 - -# Let's derive the tiled input tensor shapes -data_shape = (batch_size // env.BATCH, in_channels // env.BLOCK_IN, env.BATCH, env.BLOCK_IN) -weight_shape = ( - out_channels // env.BLOCK_OUT, - in_channels // env.BLOCK_IN, - env.BLOCK_OUT, - env.BLOCK_IN, -) -output_shape = (batch_size // env.BATCH, out_channels // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT) -num_ops = in_channels * out_channels * batch_size * 2 - -# Reduction axes -ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name="ic") -ic_tns = te.reduce_axis((0, env.BLOCK_IN), name="ic_tns") - -# Input placeholder tensors -data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) -weight = te.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype) - -# Copy buffers -data_buf = te.compute(data_shape, lambda *i: data(*i), "data_buf") -weight_buf = te.compute(weight_shape, lambda *i: weight(*i), "weight_buf") - -# Declare matrix multiply computation -res_gemm = te.compute( - output_shape, - lambda bo, co, bi, ci: te.sum( - data_buf[bo, ic, bi, ic_tns].astype(env.acc_dtype) - * weight_buf[co, ic, ci, ic_tns].astype(env.acc_dtype), - axis=[ic, ic_tns], - ), - name="res_gem", -) - -# Add shift stage for fix-point normalization -res_shr = te.compute(output_shape, lambda *i: res_gemm(*i) >> env.INP_WIDTH, name="res_shr") - -# Apply clipping between (0, input max value) -inp_max = (1 << (env.INP_WIDTH - 1)) - 1 -res_max = te.compute(output_shape, lambda *i: tvm.te.max(res_shr(*i), 0), "res_max") -res_min = te.compute(output_shape, lambda *i: tvm.te.min(res_max(*i), inp_max), "res_min") - -# Apply typecast to input data type before sending results back -res = te.compute(output_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res") - -###################################################################### -# Scheduling the Computation -# -------------------------- -# We'll look at a set of schedule transformations necessary to map the -# matrix multiplications onto VTA in an efficient fashion. -# Those include: -# -# - Computation blocking -# - Lowering to VTA hardware intrinsics - - -# Create TVM schedule -s = te.create_schedule(res.op) -# Let's look at the default TVM schedule -print(tvm.lower(s, [data, weight, res], simple_mode=True)) - -###################################################################### -# Blocking the Computation -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# The matrix multiplication is by default too large for activations or weights -# to fit on VTA's on-chip buffers all at once. -# We block the (1, 1024) by (1024, 1024) matrix multiplication into -# smaller (1, 256) by (256, 256) matrix multiplications so the intermediate -# tensors can fit on the accelerator's on-chip SRAM. -# This approach is similar to blocking techniques applied to CPUs and GPUs in -# order to increase cache hit rate. -# -# We perform blocking along each axes (the batch axis being untouched since -# we are performing singe-batch inference). -# We also leave the inner-most tensorization axes as-is in order to allow -# TVM to pattern-match tensorization. -# We show the outcome of blocking on the computation schedule in the diagram -# below: -# -# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/blocking.png -# :align: center -# :width: 480px -# -# .. 
note::
-#
-#    The code after loop splitting and reordering is equivalent to the following
-#    pseudo-code. We ignore the batch axis since we are only performing single-batch
-#    inference in this example:
-#
-#    .. code-block:: c
-#
-#       for (int oc_out = 0; oc_out < 4; ++oc_out) {
-#         // Initialization loop
-#         for (int oc_inn = 0; oc_inn < 16; ++oc_inn) {
-#           for (int oc_tns = 0; oc_tns < 16; ++oc_tns) {
-#             int j = (oc_out * 16 + oc_inn) * 16 + oc_tns;
-#             C[0][j] = 0;
-#           }
-#         }
-#         for (int ic_out = 0; ic_out < 4; ++ic_out) {
-#           // Block loop
-#           for (int oc_inn = 0; oc_inn < 16; ++oc_inn) {
-#             for (int ic_inn = 0; ic_inn < 16; ++ic_inn) {
-#               // Tensorization loop
-#               for (int oc_tns = 0; oc_tns < 16; ++oc_tns) {
-#                 for (int ic_tns = 0; ic_tns < 16; ++ic_tns) {
-#                   int i = (ic_out * 16 + ic_inn) * 16 + ic_tns;
-#                   int j = (oc_out * 16 + oc_inn) * 16 + oc_tns;
-#                   C[0][j] = C[0][j] + A[0][i] * B[j][i];
-#                 }
-#               }
-#             }
-#           }
-#         }
-#       }
-
-# Let's define tiling sizes (expressed in multiples of VTA tensor shape size)
-b_block = 1 // env.BATCH
-i_block = 256 // env.BLOCK_IN
-o_block = 256 // env.BLOCK_OUT
-
-# Tile the output tensor along the batch and output channel dimensions
-# (since by default we are doing single-batch inference, the split along
-# the batch dimension has no effect)
-b, oc, b_tns, oc_tns = s[res].op.axis
-b_out, b_inn = s[res].split(b, b_block)
-oc_out, oc_inn = s[res].split(oc, o_block)
-s[res].reorder(b_out, oc_out, b_inn, oc_inn)
-
-# Move intermediate computation into each output compute tile
-s[res_gemm].compute_at(s[res], oc_out)
-s[res_shr].compute_at(s[res], oc_out)
-s[res_max].compute_at(s[res], oc_out)
-s[res_min].compute_at(s[res], oc_out)
-
-# Apply additional loop split along reduction axis (input channel)
-b_inn, oc_inn, b_tns, oc_tns = s[res_gemm].op.axis
-ic_out, ic_inn = s[res_gemm].split(ic, i_block)
-
-# Reorder axes. We move the ic_out axis all the way out of the GEMM
-# loop to block along the reduction axis
-s[res_gemm].reorder(ic_out, b_inn, oc_inn, ic_inn, b_tns, oc_tns, ic_tns)
-
-# Let's look at the current TVM schedule after blocking
-print(tvm.lower(s, [data, weight, res], simple_mode=True))
-
-######################################################################
-# Lowering Copies to DMA Transfers
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# Next we set the buffer scopes to the corresponding on-chip VTA SRAM buffers.
-# We move the load loops into the matrix multiply computation loop to stage
-# memory loads such that they fit in the on-chip SRAM buffers.
-# Finally we annotate the load/store loop outer axes with the DMA copy pragma
-# to perform bulk memory transfers on VTA.
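#
# As a rough sanity check that the staged blocks can fit on chip, the
# back-of-the-envelope arithmetic below uses the 256-wide blocking above with
# int8 inputs/weights and int32 accumulation. The printed footprints should be
# compared against the SRAM buffer capacities set in vta_config.json; they are
# not hard-coded here because they depend on the chosen configuration.

blk_in, blk_out, blk_batch = 256, 256, 1
inp_block_bytes = blk_batch * blk_in * 1   # one (1, 256) int8 activation tile
wgt_block_bytes = blk_out * blk_in * 1     # one (256, 256) int8 weight tile
acc_block_bytes = blk_batch * blk_out * 4  # one (1, 256) int32 partial-sum tile

print("activation block footprint: %d bytes" % inp_block_bytes)   # 256 bytes
print("weight block footprint:     %d bytes" % wgt_block_bytes)   # 65536 bytes
print("accumulator footprint:      %d bytes" % acc_block_bytes)   # 1024 bytes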
- -# Set scope of SRAM buffers -s[data_buf].set_scope(env.inp_scope) -s[weight_buf].set_scope(env.wgt_scope) -s[res_gemm].set_scope(env.acc_scope) -s[res_shr].set_scope(env.acc_scope) -s[res_min].set_scope(env.acc_scope) -s[res_max].set_scope(env.acc_scope) - -# Block data and weight cache reads -s[data_buf].compute_at(s[res_gemm], ic_out) -s[weight_buf].compute_at(s[res_gemm], ic_out) - -# Use DMA copy pragma on DRAM->SRAM operations -s[data_buf].pragma(s[data_buf].op.axis[0], env.dma_copy) -s[weight_buf].pragma(s[weight_buf].op.axis[0], env.dma_copy) - -# Use DMA copy pragma on SRAM->DRAM operation -# (this implies that these copies should be performed along b_inn, -# or result axis 2) -s[res].pragma(s[res].op.axis[2], env.dma_copy) - -###################################################################### -# Lowering Computation to VTA Compute Intrinsics -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# The last phase is to lower the computation loops down to VTA hardware -# intrinsics by mapping the matrix multiplication to tensor intrinsics, -# and mapping the shift, and clipping computation to the vector ALU. - -# Apply tensorization over the batch tensor tile axis -s[res_gemm].tensorize(b_tns, env.gemm) - -# Add an ALU pragma over the shift and clipping operations -s[res_shr].pragma(s[res_shr].op.axis[0], env.alu) -s[res_min].pragma(s[res_min].op.axis[0], env.alu) -s[res_max].pragma(s[res_max].op.axis[0], env.alu) - -# Let's look at the final lowered TVM schedule after lowering memory -# loads/stores down to DMA copy intrinsics, and the computation down to -# VTA compute intrinsics. -print(vta.lower(s, [data, weight, res], simple_mode=True)) - -###################################################################### -# TVM Compilation and Verification -# -------------------------------- -# After specifying the schedule, we can compile it into a TVM function. -# We save the module so we can send it over RPC. -# We run the function and verify it against a numpy implementation to -# ensure correctness. 
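#
# The fixed-point "requantization" that follows the GEMM (shift right, clip to
# the activation range, cast down) is easy to mirror in numpy, and the reference
# check below applies exactly these steps. This standalone sketch assumes an
# 8-bit activation width, which is the value of env.INP_WIDTH in the default
# configuration.

import numpy as np

sketch_width = 8
sketch_max = (1 << (sketch_width - 1)) - 1  # 127 for an 8-bit activation

acc_vals = np.array([-70000, -3, 0, 5000, 70000], dtype="int32")
requantized = np.clip(acc_vals >> sketch_width, 0, sketch_max).astype("int8")
print(requantized)  # values: 0, 0, 0, 19, 127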
- -# Compile the TVM module -my_gemm = vta.build( - s, [data, weight, res], tvm.target.Target("ext_dev", host=env.target_host), name="my_gemm" -) -temp = utils.tempdir() -my_gemm.save(temp.relpath("gemm.o")) -remote.upload(temp.relpath("gemm.o")) -f = remote.load_module("gemm.o") - -# Get the remote device context -ctx = remote.ext_dev(0) - -# Initialize the data and weight arrays randomly in the int range of (-128, 128] -data_np = np.random.randint(-128, 128, size=(batch_size, in_channels)).astype(data.dtype) -weight_np = np.random.randint(-128, 128, size=(out_channels, in_channels)).astype(weight.dtype) - -# Apply packing to the data and weight arrays from a 2D to a 4D packed layout -data_packed = data_np.reshape( - batch_size // env.BATCH, env.BATCH, in_channels // env.BLOCK_IN, env.BLOCK_IN -).transpose((0, 2, 1, 3)) -weight_packed = weight_np.reshape( - out_channels // env.BLOCK_OUT, env.BLOCK_OUT, in_channels // env.BLOCK_IN, env.BLOCK_IN -).transpose((0, 2, 1, 3)) - -# Format the input/output arrays with tvm.nd.array to the DLPack standard -data_nd = tvm.nd.array(data_packed, ctx) -weight_nd = tvm.nd.array(weight_packed, ctx) -res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx) - -# Clear stats -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - -# Invoke the module to perform the computation -f(data_nd, weight_nd, res_nd) - -# Verify against numpy implementation -res_ref = np.dot(data_np.astype(env.acc_dtype), weight_np.T.astype(env.acc_dtype)) -res_ref = res_ref >> env.INP_WIDTH -res_ref = np.clip(res_ref, 0, inp_max) -res_ref = res_ref.astype(res.dtype) -res_ref = res_ref.reshape( - batch_size // env.BATCH, env.BATCH, out_channels // env.BLOCK_OUT, env.BLOCK_OUT -).transpose((0, 2, 1, 3)) -np.testing.assert_equal(res_ref, res_nd.numpy()) - -# Print stats -if env.TARGET in ["sim", "tsim"]: - sim_stats = simulator.stats() - print("Execution statistics:") - for k, v in sim_stats.items(): - print("\t{:<16}: {:>16}".format(k, v)) - -print("Successful blocked matrix multiply test!") - -###################################################################### -# Summary -# ------- -# This tutorial demonstrates how TVM scheduling primitives can achieve -# computation blocking for a matrix multiplication example. -# This allows us to map arbitrarily large computation onto limited -# hardware accelerator resources. -# diff --git a/vta/tutorials/vta_get_started.py b/vta/tutorials/vta_get_started.py deleted file mode 100644 index 3482258dece8..000000000000 --- a/vta/tutorials/vta_get_started.py +++ /dev/null @@ -1,405 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. 
_vta-get-started: - -Get Started with VTA -==================== -**Author**: `Thierry Moreau `_ - -This is an introduction tutorial on how to use TVM to program the VTA design. - -In this tutorial, we will demonstrate the basic TVM workflow to implement -a vector addition on the VTA design's vector ALU. -This process includes specific scheduling transformations necessary to lower -computation down to low-level accelerator operations. - -To begin, we need to import TVM which is our deep learning optimizing compiler. -We also need to import the VTA python package which contains VTA specific -extensions for TVM to target the VTA design. -""" -from __future__ import absolute_import, print_function - -import os -import tvm -from tvm import te -import vta -import numpy as np - -###################################################################### -# Loading in VTA Parameters -# ~~~~~~~~~~~~~~~~~~~~~~~~~ -# VTA is a modular and customizable design. Consequently, the user -# is free to modify high-level hardware parameters that affect -# the hardware design layout. -# These parameters are specified in the :code:`vta_config.json` file by their -# :code:`log2` values. -# These VTA parameters can be loaded with the :code:`vta.get_env` -# function. -# -# Finally, the TVM target is also specified in the :code:`vta_config.json` file. -# When set to *sim*, execution will take place inside of a behavioral -# VTA simulator. -# If you want to run this tutorial on the Pynq FPGA development platform, -# follow the *VTA Pynq-Based Testing Setup* guide. - -env = vta.get_env() - -###################################################################### -# FPGA Programming -# ---------------- -# When targeting the Pynq FPGA development board, we need to configure -# the board with a VTA bitstream. - -# We'll need the TVM RPC module and the VTA simulator module -from tvm import rpc -from tvm.contrib import utils -from vta.testing import simulator - -# We read the Pynq RPC host IP address and port number from the OS environment -host = os.environ.get("VTA_RPC_HOST", "192.168.2.99") -port = int(os.environ.get("VTA_RPC_PORT", "9091")) - -# We configure both the bitstream and the runtime system on the Pynq -# to match the VTA configuration specified by the vta_config.json file. -if env.TARGET == "pynq" or env.TARGET == "de10nano": - - # Make sure that TVM was compiled with RPC=1 - assert tvm.runtime.enabled("rpc") - remote = rpc.connect(host, port) - - # Reconfigure the JIT runtime - vta.reconfig_runtime(remote) - - # Program the FPGA with a pre-compiled VTA bitstream. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - vta.program_fpga(remote, bitstream=None) - -# In simulation mode, host the RPC server locally. -elif env.TARGET in ("sim", "tsim", "intelfocl"): - remote = rpc.LocalSession() - - if env.TARGET in ["intelfocl"]: - # program intelfocl aocx - vta.program_fpga(remote, bitstream="vta.bitstream") - -###################################################################### -# Computation Declaration -# ----------------------- -# As a first step, we need to describe our computation. -# TVM adopts tensor semantics, with each intermediate result -# represented as multi-dimensional array. The user needs to describe -# the computation rule that generates the output tensors. -# -# In this example we describe a vector addition, which requires multiple -# computation stages, as shown in the dataflow diagram below. 
-# First we describe the input tensors :code:`A` and :code:`B` that are living -# in main memory. -# Second, we need to declare intermediate tensors :code:`A_buf` and -# :code:`B_buf`, which will live in VTA's on-chip buffers. -# Having this extra computational stage allows us to explicitly -# stage cached reads and writes. -# Third, we describe the vector addition computation which will -# add :code:`A_buf` to :code:`B_buf` to produce :code:`C_buf`. -# The last operation is a cast and copy back to DRAM, into results tensor -# :code:`C`. -# -# .. image:: https://raw.githubusercontent.com/uwsampl/web-data/main/vta/tutorial/vadd_dataflow.png -# :align: center - -###################################################################### -# Input Placeholders -# ~~~~~~~~~~~~~~~~~~ -# We describe the placeholder tensors :code:`A`, and :code:`B` in a tiled data -# format to match the data layout requirements imposed by the VTA vector ALU. -# -# For VTA's general purpose operations such as vector adds, the tile size is -# :code:`(env.BATCH, env.BLOCK_OUT)`. -# The dimensions are specified in -# the :code:`vta_config.json` configuration file and are set by default to -# a (1, 16) vector. -# -# In addition, A and B's data types also needs to match the :code:`env.acc_dtype` -# which is set by the :code:`vta_config.json` file to be a 32-bit integer. - -# Output channel factor m - total 64 x 16 = 1024 output channels -m = 64 -# Batch factor o - total 1 x 1 = 1 -o = 1 -# A placeholder tensor in tiled data format -A = te.placeholder((o, m, env.BATCH, env.BLOCK_OUT), name="A", dtype=env.acc_dtype) -# B placeholder tensor in tiled data format -B = te.placeholder((o, m, env.BATCH, env.BLOCK_OUT), name="B", dtype=env.acc_dtype) - -###################################################################### -# Copy Buffers -# ~~~~~~~~~~~~ -# One specificity of hardware accelerators, is that on-chip memory has to be -# explicitly managed. -# This means that we'll need to describe intermediate tensors :code:`A_buf` -# and :code:`B_buf` that can have a different memory scope than the original -# placeholder tensors :code:`A` and :code:`B`. -# -# Later in the scheduling phase, we can tell the compiler that :code:`A_buf` -# and :code:`B_buf` will live in the VTA's on-chip buffers (SRAM), while -# :code:`A` and :code:`B` will live in main memory (DRAM). -# We describe A_buf and B_buf as the result of a compute -# operation that is the identity function. -# This can later be interpreted by the compiler as a cached read operation. - -# A copy buffer -A_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: A(*i), "A_buf") -# B copy buffer -B_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: B(*i), "B_buf") - -###################################################################### -# Vector Addition -# ~~~~~~~~~~~~~~~ -# Now we're ready to describe the vector addition result tensor :code:`C`, -# with another compute operation. -# The compute function takes the shape of the tensor, as well as a lambda -# function that describes the computation rule for each position of the tensor. -# -# No computation happens during this phase, as we are only declaring how -# the computation should be done. 
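#
# To make the "declaration only" point concrete, the standalone sketch below
# runs the same declare / schedule / build / execute flow on the host CPU with
# no VTA involved (it assumes TVM was built with the LLVM backend). Nothing is
# computed until the built module is actually invoked.

import numpy as np
import tvm
from tvm import te

sketch_n = 16
X = te.placeholder((sketch_n,), name="X", dtype="float32")
Y = te.placeholder((sketch_n,), name="Y", dtype="float32")
Z = te.compute((sketch_n,), lambda i: X[i] + Y[i], name="Z")  # purely symbolic

sketch_sched = te.create_schedule(Z.op)
sketch_mod = tvm.build(sketch_sched, [X, Y, Z], target="llvm")

cpu_dev = tvm.cpu(0)
x_nd = tvm.nd.array(np.arange(sketch_n, dtype="float32"), cpu_dev)
y_nd = tvm.nd.array(np.ones(sketch_n, dtype="float32"), cpu_dev)
z_nd = tvm.nd.array(np.zeros(sketch_n, dtype="float32"), cpu_dev)
sketch_mod(x_nd, y_nd, z_nd)  # the computation happens here, not at declaration
np.testing.assert_allclose(z_nd.numpy(), x_nd.numpy() + y_nd.numpy())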
- -# Describe the in-VTA vector addition -C_buf = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), - lambda *i: A_buf(*i).astype(env.acc_dtype) + B_buf(*i).astype(env.acc_dtype), - name="C_buf", -) - -###################################################################### -# Casting the Results -# ~~~~~~~~~~~~~~~~~~~ -# After the computation is done, we'll need to send the results computed by VTA -# back to main memory. - -###################################################################### -# .. note:: -# -# **Memory Store Restrictions** -# -# One specificity of VTA is that it only supports DRAM stores in the narrow -# :code:`env.inp_dtype` data type format. -# This lets us reduce the data footprint for memory transfers (more on this -# in the basic matrix multiply example). -# -# We perform one last typecast operation to the narrow -# input activation data format. - -# Cast to output type, and send to main memory -C = te.compute( - (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name="C" -) - -###################################################################### -# This concludes the computation declaration part of this tutorial. - - -###################################################################### -# Scheduling the Computation -# -------------------------- -# While the above lines describes the computation rule, we can obtain -# :code:`C` in many ways. -# TVM asks the user to provide an implementation of the computation called -# *schedule*. -# -# A schedule is a set of transformations to an original computation that -# transforms the implementation of the computation without affecting -# correctness. -# This simple VTA programming tutorial aims to demonstrate basic schedule -# transformations that will map the original schedule down to VTA hardware -# primitives. - - -###################################################################### -# Default Schedule -# ~~~~~~~~~~~~~~~~ -# After we construct the schedule, by default the schedule computes -# :code:`C` in the following way: - -# Let's take a look at the generated schedule -s = te.create_schedule(C.op) - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# Although this schedule makes sense, it won't compile to VTA. -# In order to obtain correct code generation, we need to apply scheduling -# primitives and code annotation that will transform the schedule into -# one that can be directly lowered onto VTA hardware intrinsics. -# Those include: -# -# - DMA copy operations which will take globally-scoped tensors and copy -# those into locally-scoped tensors. -# - Vector ALU operations that will perform the vector add. - -###################################################################### -# Buffer Scopes -# ~~~~~~~~~~~~~ -# First, we set the scope of the copy buffers to indicate to TVM that these -# intermediate tensors will be stored in the VTA's on-chip SRAM buffers. -# Below, we tell TVM that :code:`A_buf`, :code:`B_buf`, :code:`C_buf` -# will live in VTA's on-chip *accumulator buffer* which serves as -# VTA's general purpose register file. 
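#
# For comparison with the explicit copy stages declared above, generic TE
# schedules can create a staged read in a particular memory scope with
# cache_read. The standalone sketch below targets no accelerator and uses the
# generic "local" scope purely to show the idiom; on VTA we keep the explicit
# A_buf/B_buf stages and tag them with set_scope, as the next lines do.

import tvm
from tvm import te

cache_n = 64
P = te.placeholder((cache_n,), name="P", dtype="float32")
Q = te.compute((cache_n,), lambda i: P[i] * 2.0, name="Q")

cache_sched = te.create_schedule(Q.op)
P_cached = cache_sched.cache_read(P, "local", [Q])  # staged copy of P in "local" scope
print(tvm.lower(cache_sched, [P, Q], simple_mode=True))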
-# -# Set the intermediate tensors' scope to VTA's on-chip accumulator buffer -s[A_buf].set_scope(env.acc_scope) -s[B_buf].set_scope(env.acc_scope) -s[C_buf].set_scope(env.acc_scope) - -###################################################################### -# DMA Transfers -# ~~~~~~~~~~~~~ -# We need to schedule DMA transfers to move data living in DRAM to -# and from the VTA on-chip buffers. -# We insert :code:`dma_copy` pragmas to indicate to the compiler -# that the copy operations will be performed in bulk via DMA, -# which is common in hardware accelerators. - -# Tag the buffer copies with the DMA pragma to map a copy loop to a -# DMA transfer operation -s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy) -s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy) -s[C].pragma(s[C].op.axis[0], env.dma_copy) - -###################################################################### -# ALU Operations -# ~~~~~~~~~~~~~~ -# VTA has a vector ALU that can perform vector operations on tensors -# in the accumulator buffer. -# In order to tell TVM that a given operation needs to be mapped to the -# VTA's vector ALU, we need to explicitly tag the vector addition loop -# with an :code:`env.alu` pragma. - -# Tell TVM that the computation needs to be performed -# on VTA's vector ALU -s[C_buf].pragma(C_buf.op.axis[0], env.alu) - -# Let's take a look at the finalized schedule -print(vta.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# This concludes the scheduling portion of this tutorial. - -###################################################################### -# TVM Compilation -# --------------- -# After we have finished specifying the schedule, we can compile it -# into a TVM function. By default TVM compiles into a type-erased -# function that can be directly called from python side. -# -# In the following line, we use :code:`tvm.build` to create a function. -# The build function takes the schedule, the desired signature of the -# function(including the inputs and outputs) as well as target language -# we want to compile to. -# -my_vadd = vta.build( - s, [A, B, C], tvm.target.Target("ext_dev", host=env.target_host), name="my_vadd" -) - -###################################################################### -# Saving the Module -# ~~~~~~~~~~~~~~~~~ -# TVM lets us save our module into a file so it can loaded back later. This -# is called ahead-of-time compilation and allows us to save some compilation -# time. -# More importantly, this allows us to cross-compile the executable on our -# development machine and send it over to the Pynq FPGA board over RPC for -# execution. - -# Write the compiled module into an object file. -temp = utils.tempdir() -my_vadd.save(temp.relpath("vadd.o")) - -# Send the executable over RPC -remote.upload(temp.relpath("vadd.o")) - -###################################################################### -# Loading the Module -# ~~~~~~~~~~~~~~~~~~ -# We can load the compiled module from the file system to run the code. - -f = remote.load_module("vadd.o") - -###################################################################### -# Running the Function -# -------------------- -# The compiled TVM function uses a concise C API and can be invoked from -# any language. -# -# TVM provides an array API in python to aid quick testing and prototyping. -# The array API is based on `DLPack `_ standard. -# -# - We first create a remote context (for remote execution on the Pynq). 
-# - Then :code:`tvm.nd.array` formats the data accordingly.
-# - :code:`f()` runs the actual computation.
-# - :code:`numpy()` copies the result array back into a format that can be
-#   interpreted.
-#
-
-# Get the remote device context
-ctx = remote.ext_dev(0)
-
-# Initialize the A and B arrays randomly in the int range of [-128, 128)
-A_orig = np.random.randint(-128, 128, size=(o * env.BATCH, m * env.BLOCK_OUT)).astype(A.dtype)
-B_orig = np.random.randint(-128, 128, size=(o * env.BATCH, m * env.BLOCK_OUT)).astype(B.dtype)
-
-# Apply packing to the A and B arrays from a 2D to a 4D packed layout
-A_packed = A_orig.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
-B_packed = B_orig.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
-
-# Format the input/output arrays with tvm.nd.array to the DLPack standard
-A_nd = tvm.nd.array(A_packed, ctx)
-B_nd = tvm.nd.array(B_packed, ctx)
-C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype), ctx)
-
-# Invoke the module to perform the computation
-f(A_nd, B_nd, C_nd)
-
-######################################################################
-# Verifying Correctness
-# ---------------------
-# Compute the reference result with numpy and assert that the output of the
-# vector addition is indeed correct.
-
-# Compute reference result with numpy
-C_ref = (A_orig.astype(env.acc_dtype) + B_orig.astype(env.acc_dtype)).astype(C.dtype)
-C_ref = C_ref.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
-np.testing.assert_equal(C_ref, C_nd.numpy())
-print("Successful vector add test!")
-
-######################################################################
-# Summary
-# -------
-# This tutorial provides a walk-through of TVM for programming the
-# deep learning accelerator VTA with a simple vector addition example.
-# The general workflow includes:
-#
-# - Programming the FPGA with the VTA bitstream over RPC.
-# - Describing the vector add computation via a series of computations.
-# - Describing how we want to perform the computation using schedule primitives.
-# - Compiling the function to the VTA target.
-# - Running the compiled module and verifying it against a numpy implementation.
-#
-# You are more than welcome to check out other examples and tutorials
-# to learn more about the supported operations, schedule primitives,
-# and other features supported by TVM to program VTA.
-#