diff --git a/vta/tutorials/autotvm/tune_relay_vta_with_one_board.py b/vta/tutorials/autotvm/tune_relay_vta_with_one_board.py
new file mode 100644
index 000000000000..ab509298ebb1
--- /dev/null
+++ b/vta/tutorials/autotvm/tune_relay_vta_with_one_board.py
@@ -0,0 +1,510 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-tuning a convolutional network on VTA
+==========================================
+**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_
+
+Auto-tuning for a specific accelerator design is critical for getting the best
+performance for any given operator. This tutorial showcases how to tune a
+whole convolutional network on VTA.
+
+The operator implementation for VTA in TVM is written in template form.
+The template has many tunable knobs (tile factor, virtual threads, etc).
+We will tune all convolution operators in the neural network. After tuning,
+we produce a log file which stores the best schedule parameters for all tuned
+operators. When the TVM compiler compiles these operators, it will query this
+log file to get the best knob parameters.
+
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado mxnet requests "Pillow<7"
+#
+# To make TVM run faster during tuning, it is recommended to use cython
+# as the FFI of TVM. In the root directory of TVM, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+from mxnet.gluon.model_zoo import vision
+import numpy as np
+from PIL import Image
+
+from tvm import topi
+import tvm
+from tvm import te
+from tvm import rpc, autotvm, relay
+from tvm.contrib import graph_runtime, util, download
+from tvm.autotvm.measure.measure_methods import request_remote
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+
+import vta
+from vta.testing import simulator
+from vta.top import graph_pack
+
+#################################################################
+# Compile network
+# ---------------
+# Perform VTA-specific compilation with Relay from a Gluon model
+
+
+def compile_network(env, target, model, start_pack, stop_pack):
+
+    # Populate the shape and data type dictionary
+    dtype_dict = {"data": "float32"}
+    shape_dict = {"data": (env.BATCH, 3, 224, 224)}
+
+    # Get an off-the-shelf gluon model and convert it to Relay
+    gluon_model = vision.get_model(model, pretrained=True)
+    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)
+
+    # Update shape and type dictionary
+    shape_dict.update({k: v.shape for k, v in params.items()})
+    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
+
+    # Perform quantization in Relay
+    # Note: We set opt_level to 3 in order to fold batch norm
+    with tvm.transform.PassContext(opt_level=3):
+        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
+            mod = relay.quantize.quantize(mod, params=params)
+
+    # Perform graph packing and constant folding for VTA target
+    if target.device_name == "vta":
+        assert env.BLOCK_IN == env.BLOCK_OUT
+        relay_prog = graph_pack(
+            mod["main"],
+            env.BATCH,
+            env.BLOCK_OUT,
+            env.WGT_WIDTH,
+            start_name=start_pack,
+            stop_name=stop_pack,
+        )
+    else:
+        # On the CPU target, keep the quantized Relay program as-is
+        relay_prog = mod["main"]
+
+    return relay_prog, params
+
+
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses an RPC session to communicate with Pynq boards.
+# During tuning, the tuner will send the generated code to the board and
+# measure the speed of the code on the board.
+#
+# To scale up tuning, TVM uses an RPC Tracker to manage multiple devices.
+# The RPC Tracker is a centralized controller node. We can register all devices to
+# the tracker. For example, if we have 10 Pynq boards, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is:
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register devices to RPC Tracker
+# -------------------------------
+# Now we can register our devices to the tracker. The first step is to
+# build the TVM runtime for the Pynq devices.
+#
+# Follow :ref:`vta-index`
+# to build the TVM runtime on the device. Then register the device to the tracker with:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=pynq
+#
+# (replace :code:`[HOST_IP]` with the IP address of your host machine)
+#
+# After registering devices, we can confirm them by querying the rpc_tracker:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have one Pynq board registered, the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    pynq         1      1     0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate tuning.
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations.
+# Here we use a Pynq-Z1 board as an example.
+
+# Tracker host and port can be set by your environment
+tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0")
+tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
+
+# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
+env = vta.get_env()
+
+# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+# Set ``device=arm_cpu`` to run inference on the CPU
+# or ``device=vta`` to run inference on the FPGA.
+device = "vta"
+target = env.target if device == "vta" else env.target_vta_cpu
+
+# Name of Gluon model to compile
+# The ``start_pack`` and ``stop_pack`` labels indicate where
+# to start and end the graph packing relay pass: in other words
+# where to start and finish offloading to VTA.
+network = "resnet18_v1"
+start_pack = "nn.max_pool2d"
+stop_pack = "nn.global_avg_pool2d"
+
+# Tuning option
+log_file = "%s.%s.log" % (device, network)
+tuning_option = {
+    "log_filename": log_file,
+    "tuner": "random",
+    "n_trial": 1000,
+    "early_stopping": None,
+    "measure_option": autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.RPCRunner(
+            env.TARGET,
+            host=tracker_host,
+            port=tracker_port,
+            number=5,
+            timeout=60,
+            check_correctness=True,
+        ),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default values provided here work well.
+#   If you have enough time budget, you can set :code:`n_trial` and
+#   :code:`early_stopping` to larger values, which makes the tuning run longer.
+#   If your device is under-powered or your conv2d operators are large, consider
+#   setting a longer timeout.
+#
+
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+#
+# Given that the tuning will be done on Pynq FPGA boards, make sure that
+# the ``TARGET`` entry in the ``vta_config.json`` file is set to ``pynq``.
+
+
+# You can skip the implementation of this function for this tutorial.
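+# In short: it creates a temporary log file, instantiates the requested tuner
+# for each task, optionally warm-starts the tuner from the records collected
+# so far (transfer learning), and finally keeps only the best record per
+# workload in ``log_filename``.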
+def tune_tasks(
+    tasks,
+    measure_option,
+    tuner="xgb",
+    n_trial=1000,
+    early_stopping=None,
+    log_filename="tuning.log",
+    use_transfer_learning=True,
+):
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
+
+        # create tuner
+        if tuner == "xgb" or tuner == "xgb-rank":
+            tuner_obj = XGBTuner(tsk, loss_type="rank")
+        elif tuner == "xgb_knob":
+            tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob")
+        elif tuner == "ga":
+            tuner_obj = GATuner(tsk, pop_size=50)
+        elif tuner == "random":
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == "gridsearch":
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        tsk_trial = min(n_trial, len(tsk.config_space))
+        tuner_obj.tune(
+            n_trial=tsk_trial,
+            early_stopping=early_stopping,
+            measure_option=measure_option,
+            callbacks=[
+                autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
+                autotvm.callback.log_to_file(tmp_log_file),
+            ],
+        )
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Register VTA-specific tuning tasks
+
+
+def register_vta_tuning_tasks():
+    from tvm.autotvm.task import TaskExtractEnv
+
+    @tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
+    def my_clip(x, a_min, a_max):
+        """Unlike topi's current clip, put min and max into two stages."""
+        const_min = tvm.tir.const(a_min, x.dtype)
+        const_max = tvm.tir.const(a_max, x.dtype)
+        x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
+        x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
+        return x
+
+    # init autotvm env to register VTA operator
+    TaskExtractEnv()
+
+    @autotvm.template("conv2d_packed.vta")
+    def _topi_nn_conv2d(*args, **kwargs):
+        assert not kwargs, "Do not support kwargs in template function call"
+        A, W = args[:2]
+
+        with tvm.target.vta():
+            res = vta.top.conv2d_packed(*args, **kwargs)
+            res = topi.right_shift(res, 8)
+            res = my_clip(res, 0, 127)
+            res = topi.cast(res, "int8")
+
+        if tvm.target.Target.current().device_name == "vta":
+            s = vta.top.schedule_conv2d_packed([res])
+        else:
+            s = te.create_schedule([res.op])
+        return s, [A, W, res]
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
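+# The function below ties everything together: it extracts the packed conv2d
+# tasks from the Relay program, (optionally) tunes them, programs the FPGA
+# when running on real hardware, compiles the network with the best schedules
+# found, and times inference on the device.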
+
+
+def tune_and_evaluate(tuning_opt):
+
+    # Register VTA tuning tasks
+    register_vta_tuning_tasks()
+
+    # Perform task extraction on Relay program
+    print("Extract tasks...")
+    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
+    mod = tvm.IRModule.from_expr(relay_prog)
+    tasks = autotvm.task.extract_from_program(
+        mod,
+        params=params,
+        ops=(relay.op.get("nn.conv2d"),),
+        target=target,
+        target_host=env.target_host,
+    )
+
+    # filter out non-packed conv2d tasks
+    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))
+
+    # We should have extracted 10 convolution tasks
+    assert len(tasks) == 10
+    print("Extracted {} conv2d tasks:".format(len(tasks)))
+    for tsk in tasks:
+        inp = tsk.args[0][1]
+        wgt = tsk.args[1][1]
+        batch = inp[0] * inp[4]
+        in_filter = inp[1] * inp[5]
+        out_filter = wgt[0] * wgt[4]
+        height, width = inp[2], inp[3]
+        hkernel, wkernel = wgt[2], wgt[3]
+        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
+        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
+        print(
+            "({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
+                batch,
+                height,
+                width,
+                in_filter,
+                out_filter,
+                hkernel,
+                wkernel,
+                hpad,
+                wpad,
+                hstride,
+                wstride,
+            )
+        )
+
+    # We do not run the tuning on our web server since it takes too long.
+    # Comment out the following line to run it by yourself.
+    return
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # evaluate with tuning history
+    if env.TARGET != "sim":
+        # Get remote from fleet node
+        remote = autotvm.measure.request_remote(
+            env.TARGET, tracker_host, tracker_port, timeout=10000
+        )
+        # Reconfigure the JIT runtime and FPGA.
+        vta.reconfig_runtime(remote)
+        vta.program_fpga(remote, bitstream=None)
+    else:
+        # In simulation mode, host the RPC server locally.
+        remote = rpc.LocalSession()
+
+    # compile kernels with history best records
+    with autotvm.tophub.context(target, extra_files=[log_file]):
+        # Compile network
+        print("Compile...")
+        if target.device_name != "vta":
+            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
+                lib = relay.build(
+                    relay_prog, target=target, params=params, target_host=env.target_host
+                )
+        else:
+            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
+                lib = relay.build(
+                    relay_prog, target=target, params=params, target_host=env.target_host
+                )
+
+        # Export library
+        print("Upload...")
+        temp = util.tempdir()
+        lib.save(temp.relpath("graphlib.o"))
+        remote.upload(temp.relpath("graphlib.o"))
+        lib = remote.load_module("graphlib.o")
+
+        # Generate the graph runtime
+        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
+        m = graph_runtime.GraphModule(lib["default"](ctx))
+
+        # Set the network input (the parameters were bundled into the module at build time)
+        image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32"))
+        m.set_input("data", image)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
+        tcost = timer()
+        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
+        print(
+            "Mean inference time (std dev): %.2f ms (%.2f ms)"
+            % (np.mean(prof_res), np.std(prof_res))
+        )
+
+
+# Run the tuning and evaluate the results
+tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract features from them,
+# so a high-performance CPU is recommended.
+# One sample output is listed below.
+# It takes about 2 hours on a 16T CPU and 6 Pynq boards.
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    [Warning] Invalid shape during AutoTVM task creation
+#    Extracted 10 conv2d tasks:
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (32, 16, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (16, 8, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (16, 8, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (8, 4, 1, 1, 16, 16), 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (8, 4, 1, 1, 16, 16, 'int8'), (2, 2), (0, 0), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (4, 4, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (4, 4, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (8, 8, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (8, 8, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 4, 56, 56, 1, 16), 'int8'), ('TENSOR', (8, 4, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 4, 56, 56, 1, 16, 'int8'), (8, 4, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (16, 16, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (16, 16, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 8, 28, 28, 1, 16), 'int8'), ('TENSOR', (16, 8, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 8, 28, 28, 1, 16, 'int8'), (16, 8, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 32, 7, 7, 1, 16), 'int8'), ('TENSOR', (32, 32, 3, 3, 16, 16), 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 32, 7, 7, 1, 16, 'int8'), (32, 32, 3, 3, 16, 16, 'int8'), (1, 1), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#    Task(func_name=topi_nn_conv2d, args=(('TENSOR', (1, 16, 14, 14, 1, 16), 'int8'), ('TENSOR', (32, 16, 3, 3, 16, 16), 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'), kwargs={}, workload=('conv2d', (1, 16, 14, 14, 1, 16, 'int8'), (32, 16, 3, 3, 16, 16, 'int8'), (2, 2), (1, 1), (1, 1), 'NCHW1n16c', 'int32'))
+#    Tuning...
+#    [Task 1/10] Current/Best: 0.72/ 23.24 GFLOPS | Progress: (480/1000) | 640.31 s Done.
+#    [Task 2/10] Current/Best: 0.00/ 27.69 GFLOPS | Progress: (576/1000) | 810.09 s Done.
+#    [Task 3/10] Current/Best: 0.00/ 22.97 GFLOPS | Progress: (1000/1000) | 1125.37 s Done.
+#    [Task 4/10] Current/Best: 0.00/ 31.26 GFLOPS | Progress: (1000/1000) | 1025.52 s Done.
+#    [Task 5/10] Current/Best: 0.00/ 15.15 GFLOPS | Progress: (1000/1000) | 1236.58 s Done.
+#    [Task 6/10] Current/Best: 0.00/ 22.74 GFLOPS | Progress: (1000/1000) | 906.60 s Done.
+#    [Task 7/10] Current/Best: 0.00/ 15.27 GFLOPS | Progress: (1000/1000) | 1056.25 s Done.
+#    [Task 8/10] Current/Best: 0.00/ 2.18 GFLOPS | Progress: (1000/1000) | 2275.29 s Done.
+#    [Task 9/10] Current/Best: 2.23/ 3.99 GFLOPS | Progress: (1000/1000) | 2527.25 s Done.
+#    [Task 10/10] Current/Best: 1.56/ 6.32 GFLOPS | Progress: (480/1000) | 1304.84 s Done.
+#    Compile...
+#    Upload...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 621.79 ms (0.14 ms)

+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto-tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines at the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
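+
+######################################################################
+#
+# .. note:: **Inspecting the tuning log**
+#
+#   As a quick sanity check, you can also load the records back from
+#   ``log_file`` and print the best measured cost per workload. This is a
+#   minimal sketch that relies only on ``autotvm.record`` and the imports
+#   already made above; it is not required by the tutorial pipeline:
+#
+#   .. code-block:: python
+#
+#      best = {}
+#      for inp, res in autotvm.record.load_from_file(log_file):
+#          if res.error_no != 0:
+#              continue  # skip failed measurements
+#          cost = np.mean(res.costs)  # mean runtime in seconds
+#          wkl = inp.task.workload
+#          if wkl not in best or cost < best[wkl]:
+#              best[wkl] = cost
+#      for wkl, cost in best.items():
+#          print(wkl, cost)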