From 9afb05d25c142957f6536b31defd851c24c2d596 Mon Sep 17 00:00:00 2001 From: huajsj Date: Wed, 11 Aug 2021 01:37:04 -0700 Subject: [PATCH 1/4] [VTA] Make vta graph_pack compatible with latest TVM, and bring back object detection tutorials. --- vta/python/vta/top/graphpack.py | 33 ++- vta/tutorials/frontend/deploy_detection.py | 324 +++++++++++++++++++++ 2 files changed, 351 insertions(+), 6 deletions(-) create mode 100644 vta/tutorials/frontend/deploy_detection.py diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index a982b88b75e8..f15e4922b4a8 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -56,13 +56,24 @@ def _pack_batch_channel(data, dshape, bfactor, cfactor): return data -def _unpack_batch_channel(data, old_shape): +def _unpack_batch_channel(data, old_shape, unpack_transpose=False): """Unpack the data channel dimension.""" - data = op.transpose(data, axes=(0, 4, 1, 5, 2, 3)) + if unpack_transpose: + data = op.transpose(data, axes=(0, 4, 1, 5, 2, 3)) data = op.reshape(data, newshape=old_shape) return data +def _channel_const_match(channel_length, cfactor_out): + """Round up the channel constant if the value is not divisible by cfactor_out""" + diff = int(channel_length) % cfactor_out + if diff != 0: + diff = cfactor_out - diff + channel_length = channel_length + diff + + return diff, channel_length + + def _const_shape_match(data, dshape, cfactor_out): """Pad the constant if the shape[0] not divisible by cfactor_out.""" assert len(dshape) == 3 @@ -299,6 +310,7 @@ def __init__(self, bfactor, cfactor, weight_bits): self.upsampling = op.op.get("nn.upsampling") self.reshape = op.op.get("reshape") self.number_of_conv2d = 0 + self.unpack_transpose = True super().__init__() def visit_call(self, call): @@ -319,7 +331,7 @@ def visit_call(self, call): self.start_pack = False data = args[0] data_shape = _get_tensor_shape(call.args[0]) - return _unpack_batch_channel(data, data_shape) + return _unpack_batch_channel(data, data_shape, self.unpack_transpose) if self.start_pack: # Operator cases if call.op == self.conv2d and odtype == "int32": @@ -429,12 +441,12 @@ def visit_call(self, call): if len(pad_width) == 6: pass elif len(pad_width) == 4: - (data,) = args + (data, pad_value) = args new_pad_width = [] new_pad_width.extend(pad_width) for _ in range(2): new_pad_width.append([0, 0]) - return op.nn.pad(data, pad_value=call.attrs.pad_value, pad_width=new_pad_width) + return op.nn.pad(data, pad_value=pad_value, pad_width=new_pad_width) elif call.op == self.upsampling: (data,) = args scale_h = call.attrs.scale_h @@ -445,8 +457,17 @@ def visit_call(self, call): return op.nn.upsampling(data, scale_h, scale_w, data_layout, method, align_corners) elif call.op == self.reshape and len(input_types[0].shape) == 4: (data,) = args + self.unpack_transpose = False data = op.transpose(data, axes=(0, 4, 1, 5, 2, 3)) - return op.reshape(data, [int(x) for x in input_types[0].shape]) + new_shape = [int(x) for x in input_types[0].shape] + # Check whether the reshape matches the channel-padded shape + pad, new_shape[1] = _channel_const_match(new_shape[1], self.cfactor) + data = op.reshape(data, new_shape) + # Remove the channel padding + if pad != 0: + new_pad_width = [[0, 0], [0, -pad], [0, 0], [0, 0]] + data = op.nn.pad(data, pad_width=new_pad_width) + return data return relay.Call(self.visit(call.op), args, call.attrs) diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py new file mode 100644 index
000000000000..e80b93c3bc6a --- /dev/null +++ b/vta/tutorials/frontend/deploy_detection.py @@ -0,0 +1,324 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Deploy Pretrained Vision Detection Model from Darknet on VTA +============================================================= +**Author**: `Hua Jiang `_ + +This tutorial provides an end-to-end demo of how to run Darknet YOLOv3-tiny +inference on the VTA accelerator design to perform image detection tasks. +It showcases Relay as a front end compiler that can perform quantization (VTA +only supports int8/32 inference) as well as graph packing (in order to enable +tensorization in the core) to massage the compute graph for the hardware target. +""" + +###################################################################### +# Install dependencies +# -------------------- +# To use the autotvm package in tvm, we need to install some extra dependencies. +# (change "3" to "2" if you use python2): +# +# .. code-block:: bash +# +# pip3 install "Pillow<7" +# +# The YOLOv3-tiny model parsed with Darknet depends on the CFFI and CV2 libraries, +# so we need to install CFFI and CV2 before executing this script. +# +# .. code-block:: bash +# +# pip3 install cffi +# pip3 install opencv-python +# +# Now return to the python code. Import packages.
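Note: the "graph packing" mentioned above, together with the graphpack.py changes in this patch, moves activations from NCHW into a blocked NCHWnc layout so VTA can work on (BATCH, BLOCK_OUT) tiles, and the new ``_channel_const_match`` helper rounds a channel count up to the next multiple of the block factor so that a ``reshape`` on a non-divisible shape can be padded and later cropped (the negative pad width in the hunk above). The sketch below is NumPy-only and illustrative rather than the Relay implementation; it assumes the default VTA factors BATCH=1 and BLOCK_OUT=16 and a made-up activation shape, and derives the packing transpose as the inverse of the unpack axes (0, 4, 1, 5, 2, 3) shown in the patch.

.. code-block:: python

    import numpy as np

    # Illustration only: NCHW <-> blocked NCHWnc round trip (not the Relay code).
    bfactor, cfactor = 1, 16                               # assumed BATCH / BLOCK_OUT
    x = np.random.rand(1, 256, 14, 14).astype("float32")   # hypothetical activation
    n, c, h, w = x.shape

    # Pack: written as the inverse of the unpack transpose used in graphpack.py.
    packed = x.reshape(n // bfactor, bfactor, c // cfactor, cfactor, h, w)
    packed = packed.transpose(0, 2, 4, 5, 1, 3)
    print(packed.shape)  # (1, 16, 14, 14, 1, 16)

    # Unpack: same axes as _unpack_batch_channel(..., unpack_transpose=True).
    restored = packed.transpose(0, 4, 1, 5, 2, 3).reshape(n, c, h, w)
    assert np.array_equal(restored, x)

    # Channel rounding, mirroring the new _channel_const_match helper: a
    # 255-channel YOLO detection head is padded to 256 and cropped afterwards.
    def channel_const_match(channel_length, cfactor_out):
        diff = int(channel_length) % cfactor_out
        if diff != 0:
            diff = cfactor_out - diff
        return diff, channel_length + diff

    print(channel_const_match(255, 16))  # (1, 256)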
+ +from __future__ import absolute_import, print_function + +import sys +import os +import time +import matplotlib.pyplot as plt +import numpy as np +import tvm +import vta +from tvm import rpc, autotvm, relay +from tvm.relay.testing import yolo_detection, darknet +from tvm.relay.testing.darknet import __darknetffi__ +from tvm.contrib import graph_runtime, utils +from tvm.contrib.download import download_testdata +from vta.testing import simulator +from vta.top import graph_pack + +# Make sure that TVM was compiled with RPC=1 +assert tvm.runtime.enabled("rpc") + +############################################################################## +# Download yolo net config file, weight file, darknet library file based on +# Model Name +# ---------------------------------------------------------------------------- +MODEL_NAME = "yolov3-tiny" +REPO_URL = "https://github.com/dmlc/web-data/blob/master/darknet/" + +cfg_path = download_testdata( + "https://github.com/pjreddie/darknet/blob/master/cfg/" + MODEL_NAME + ".cfg" + "?raw=true", + MODEL_NAME + ".cfg", + module="darknet", +) +weights_path = download_testdata( + "https://pjreddie.com/media/files/" + MODEL_NAME + ".weights" + "?raw=true", + MODEL_NAME + ".weights", + module="darknet", +) + +if sys.platform in ["linux", "linux2"]: + darknet_lib_path = download_testdata( + REPO_URL + "lib/" + "libdarknet2.0.so" + "?raw=true", "libdarknet2.0.so", module="darknet" + ) +elif sys.platform == "darwin": + darknet_lib_path = download_testdata( + REPO_URL + "lib_osx/" + "libdarknet_mac2.0.so" + "?raw=true", + "libdarknet_mac2.0.so", + module="darknet", + ) +else: + raise NotImplementedError("Darknet lib is not supported on {} platform".format(sys.platform)) + +################################################## +# Download yolo categories and illustration font. +# ------------------------------------------------ +coco_path = download_testdata( + REPO_URL + "data/" + "coco.names" + "?raw=true", "coco.names", module="data" +) +font_path = download_testdata( + REPO_URL + "data/" + "arial.ttf" + "?raw=true", "arial.ttf", module="data" +) +with open(coco_path) as f: + content = f.readlines() +names = [x.strip() for x in content] + +######################################## +# Define the platform and model targets. +# -------------------------------------- +# Execute on CPU vs. VTA, and define the model. + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu + +pack_dict = { + "yolov3-tiny": ["nn.max_pool2d", "cast", 4, 186], +} + +# Name of Darknet model to compile +# The ``start_pack`` and ``stop_pack`` labels indicate where +# to start and end the graph packing relay pass: in other words +# where to start and finish offloading to VTA. +# The number 4 indicates that the ``start_pack`` index is 4 and the +# number 186 indicates that the ``stop_pack`` index is 186; the +# name and index number together locate the correct place to +# start/end packing when there are multiple ``nn.max_pool2d`` +# or ``cast`` operators. print(mod.astext(show_meta_data=False)) can help +# to find operator names and index information. +assert MODEL_NAME in pack_dict + +############################# +# Obtain an execution remote. +# --------------------------- +# When target is 'pynq' or other FPGA backend, reconfigure FPGA and runtime.
+# Otherwise, if target is 'sim', execute locally. + +if env.TARGET not in ["sim", "tsim"]: + # Get remote from tracker node if environment variable is set. + # To set up the tracker, you'll need to follow the "Auto-tuning + # a convolutional network for VTA" tutorial. + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = os.environ.get("TVM_TRACKER_PORT", None) + # Otherwise if you have a device you want to program directly from + # the host, make sure you've set the variables below to the IP of + # your board. + device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") + device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") + if not tracker_host or not tracker_port: + remote = rpc.connect(device_host, int(device_port)) + else: + remote = autotvm.measure.request_remote( + env.TARGET, tracker_host, int(tracker_port), timeout=10000 + ) + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + reconfig_start = time.time() + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + +# In simulation mode, host the RPC server locally. +else: + remote = rpc.LocalSession() + +# Get execution context from remote +ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) + +#################################### +# Build the inference graph runtime. +# ---------------------------------- +# Using Darknet library load downloaded vision model and compile with Relay. +# The compilation steps are: +# +# 1. Front end translation from Darknet into Relay module. +# 2. Apply 8-bit quantization: here we skip the first conv layer, +# and dense layer which will both be executed in fp32 on the CPU. +# 3. Perform graph packing to alter the data layout for tensorization. +# 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). +# 5. Perform relay build to object file. +# 6. Load the object file onto remote (FPGA device). +# 7. Generate graph runtime, `m`. 
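Note: the ``pack_dict`` entry above encodes where graph packing starts and stops (``nn.max_pool2d`` at index 4, ``cast`` at index 186 for yolov3-tiny). Those indices are read off the textual dump of the quantized Relay module, as the comment suggests via ``print(mod.astext(show_meta_data=False))``. The helper below is a hypothetical sketch of that inspection step, not part of the tutorial; it assumes ``mod`` is the quantized module produced in the build section that follows.

.. code-block:: python

    # Hypothetical helper: print every line of the Relay text that calls the
    # given operator, so the indices around the intended start_pack/stop_pack
    # points can be read off manually.
    def show_op_occurrences(mod, op_name):
        for line in mod.astext(show_meta_data=False).splitlines():
            if op_name + "(" in line:
                print(line.strip())

    # Usage sketch, after relay.quantize.quantize() but before graph_pack():
    #   show_op_occurrences(mod, "nn.max_pool2d")  # candidates for start_pack
    #   show_op_occurrences(mod, "cast")           # candidates for stop_pack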
+ +# Load pre-configured AutoTVM schedules +with autotvm.tophub.context(target): + net = __darknetffi__.dlopen(darknet_lib_path).load_network( + cfg_path.encode("utf-8"), weights_path.encode("utf-8"), 0 + ) + dshape = (env.BATCH, net.c, net.h, net.w) + dtype = "float32" + + # Measure build start time + build_start = time.time() + + # Start front end compilation + mod, params = relay.frontend.from_darknet(net, dtype=dtype, shape=dshape) + + if target.device_name == "vta": + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig( + global_scale=23.0, + skip_conv_layers=[0], + store_lowbit_output=True, + round_for_shift=True, + ): + mod = relay.quantize.quantize(mod, params=params) + # Perform graph packing and constant folding for VTA target + mod = graph_pack( + mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=pack_dict[MODEL_NAME][0], + stop_name=pack_dict[MODEL_NAME][1], + start_name_idx=pack_dict[MODEL_NAME][2], + stop_name_idx=pack_dict[MODEL_NAME][3], + ) + else: + mod = mod["main"] + + # Compile Relay program with AlterOpLayout disabled + with vta.build_config(disabled_pass={"AlterOpLayout"}): + graph, lib, params = relay.build( + mod, target=target, params=params, target_host=env.target_host + ) + + # Measure Relay build time + build_time = time.time() - build_start + print(MODEL_NAME + " inference graph built in {0:.2f}s!".format(build_time)) + + # Send the inference library over to the remote RPC server + temp = utils.tempdir() + lib.save(temp.relpath("graphlib.o")) + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # Graph runtime + m = graph_runtime.create(graph, lib, ctx) + +#################################### +# Perform image detection inference. 
+# ---------------------------------- +# We run detection on a downloaded image +# Download test image +[neth, netw] = dshape[2:] +test_image = "person.jpg" +img_url = REPO_URL + "data/" + test_image + "?raw=true" +img_path = download_testdata(img_url, test_image, "data") +data = darknet.load_image(img_path, neth, netw).transpose(1, 2, 0) + +# Prepare test image for inference +plt.imshow(data) +plt.show() +data = data.transpose((2, 0, 1)) +data = data[np.newaxis, :] +data = np.repeat(data, env.BATCH, axis=0) + +# Set the network parameters and inputs +m.set_input("data", data) +m.set_input(**params) + +# Perform inference and gather execution statistics +# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator +num = 1 # number of times we run module for a single measurement +rep = 1 # number of measurements (we derive std dev from this) +timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) + +if env.TARGET in ["sim", "tsim"]: + simulator.clear_stats() + timer() + sim_stats = simulator.stats() + print("\nExecution statistics:") + for k, v in sim_stats.items(): + # Since we execute the workload many times, we need to normalize stats + # Note that there is always one warm up run + # Therefore we divide the overall stats by (num * rep + 1) + print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) +else: + tcost = timer() + std = np.std(tcost.results) * 1000 + mean = tcost.mean * 1000 + print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) + print("Average per sample inference time: %.2fms" % (mean / env.BATCH)) + +# Get detection results from the output +thresh = 0.5 +nms_thresh = 0.45 +tvm_out = [] +for i in range(2): + layer_out = {} + layer_out["type"] = "Yolo" + # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total) + layer_attr = m.get_output(i * 4 + 3).asnumpy() + layer_out["biases"] = m.get_output(i * 4 + 2).asnumpy() + layer_out["mask"] = m.get_output(i * 4 + 1).asnumpy() + out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3]) + layer_out["output"] = m.get_output(i * 4).asnumpy().reshape(out_shape) + layer_out["classes"] = layer_attr[4] + tvm_out.append(layer_out) + thresh = 0.560 + +# Show detection results +img = darknet.load_image_color(img_path) +_, im_h, im_w = img.shape +dets = yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh, 1, tvm_out) +last_layer = net.layers[net.n - 1] +yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh) +yolo_detection.show_detections(img, dets, thresh, names, last_layer.classes) +yolo_detection.draw_detections(font_path, img, dets, thresh, names, last_layer.classes) +plt.imshow(img.transpose(1, 2, 0)) +plt.show() From 07ed4337aa15326144621656e55ee65f2148217c Mon Sep 17 00:00:00 2001 From: huajsj Date: Thu, 12 Aug 2021 11:16:22 -0700 Subject: [PATCH 2/4] remove deploy_detection.py. --- vta/tutorials/frontend/deploy_detection.py | 324 ------------------ .../frontend/legacy/deploy_detection.py | 6 +- 2 files changed, 3 insertions(+), 327 deletions(-) delete mode 100644 vta/tutorials/frontend/deploy_detection.py diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py deleted file mode 100644 index e80b93c3bc6a..000000000000 --- a/vta/tutorials/frontend/deploy_detection.py +++ /dev/null @@ -1,324 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements.
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Deploy Pretrained Vision Detection Model from Darknet on VTA -================================================ -**Author**: `Hua Jiang `_ - -This tutorial provides an end-to-end demo, on how to run Darknet YoloV3-tiny -inference onto the VTA accelerator design to perform Image detection tasks. -It showcases Relay as a front end compiler that can perform quantization (VTA -only supports int8/32 inference) as well as graph packing (in order to enable -tensorization in the core) to massage the compute graph for the hardware target. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use the autotvm package in tvm, we need to install some extra dependencies. -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install "Pillow<7" -# -# YOLO-V3-tiny Model with Darknet parsing have dependancy with CFFI and CV2 library, -# we need to install CFFI and CV2 before executing this script. -# -# pip3 install "Pillow<7" -# -# pip3 install cffi -# pip3 install opencv-python -# -# Now return to the python code. Import packages. 
- -from __future__ import absolute_import, print_function - -import sys -import os -import time -import matplotlib.pyplot as plt -import numpy as np -import tvm -import vta -from tvm import rpc, autotvm, relay -from tvm.relay.testing import yolo_detection, darknet -from tvm.relay.testing.darknet import __darknetffi__ -from tvm.contrib import graph_runtime, graph_runtime, utils -from tvm.contrib.download import download_testdata -from vta.testing import simulator -from vta.top import graph_pack - -# Make sure that TVM was compiled with RPC=1 -assert tvm.runtime.enabled("rpc") - -############################################################################## -# Download yolo net configure file, weight file, darknet library file based on -# Model Name -# ---------------------------------------------------------------------------- -MODEL_NAME = "yolov3-tiny" -REPO_URL = "https://github.com/dmlc/web-data/blob/master/darknet/" - -cfg_path = download_testdata( - "https://github.com/pjreddie/darknet/blob/master/cfg/" + MODEL_NAME + ".cfg" + "?raw=true", - MODEL_NAME + ".cfg", - module="darknet", -) -weights_path = download_testdata( - "https://pjreddie.com/media/files/" + MODEL_NAME + ".weights" + "?raw=true", - MODEL_NAME + ".weights", - module="darknet", -) - -if sys.platform in ["linux", "linux2"]: - darknet_lib_path = download_testdata( - REPO_URL + "lib/" + "libdarknet2.0.so" + "?raw=true", "libdarknet2.0.so", module="darknet" - ) -elif sys.platform == "darwin": - darknet_lib_path = download_testdata( - REPO_URL + "lib_osx/" + "libdarknet_mac2.0.so" + "?raw=true", - "libdarknet_mac2.0.so", - module="darknet", - ) -else: - raise NotImplementedError("Darknet lib is not supported on {} platform".format(sys.platform)) - -################################################## -# Download yolo categories and illustration front. -# ------------------------------------------------ -coco_path = download_testdata( - REPO_URL + "data/" + "coco.names" + "?raw=true", "coco.names", module="data" -) -font_path = download_testdata( - REPO_URL + "data/" + "arial.ttf" + "?raw=true", "arial.ttf", module="data" -) -with open(coco_path) as f: - content = f.readlines() -names = [x.strip() for x in content] - -######################################## -# Define the platform and model targets. -# -------------------------------------- -# Execute on CPU vs. VTA, and define the model. - -# Load VTA parameters from the vta/config/vta_config.json file -env = vta.get_env() -# Set ``device=arm_cpu`` to run inference on the CPU -# or ``device=vta`` to run inference on the FPGA. -device = "vta" -target = env.target if device == "vta" else env.target_vta_cpu - -pack_dict = { - "yolov3-tiny": ["nn.max_pool2d", "cast", 4, 186], -} - -# Name of Darknet model to compile -# The ``start_pack`` and ``stop_pack`` labels indicate where -# to start and end the graph packing relay pass: in other words -# where to start and finish offloading to VTA. -# the number 4 indicate the the ``start_pack`` index is 4, the -# number 185 indicate the ``stop_pack index`` is 185, by using -# name and index number, here we can located to correct place -# where to start/end when there are multiple ``nn.max_pool2d`` -# or ``cast``, print(mod.astext(show_meta_data=False)) can help -# to find operator name and index information. -assert MODEL_NAME in pack_dict - -############################# -# Obtain an execution remote. -# --------------------------- -# When target is 'pynq' or other FPGA backend, reconfigure FPGA and runtime. 
-# Otherwise, if target is 'sim', execute locally. - -if env.TARGET not in ["sim", "tsim"]: - # Get remote from tracker node if environment variable is set. - # To set up the tracker, you'll need to follow the "Auto-tuning - # a convolutional network for VTA" tutorial. - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - # Otherwise if you have a device you want to program directly from - # the host, make sure you've set the variables below to the IP of - # your board. - device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") - device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") - if not tracker_host or not tracker_port: - remote = rpc.connect(device_host, int(device_port)) - else: - remote = autotvm.measure.request_remote( - env.TARGET, tracker_host, int(tracker_port), timeout=10000 - ) - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - reconfig_start = time.time() - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - -# In simulation mode, host the RPC server locally. -else: - remote = rpc.LocalSession() - -# Get execution context from remote -ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - -#################################### -# Build the inference graph runtime. -# ---------------------------------- -# Using Darknet library load downloaded vision model and compile with Relay. -# The compilation steps are: -# -# 1. Front end translation from Darknet into Relay module. -# 2. Apply 8-bit quantization: here we skip the first conv layer, -# and dense layer which will both be executed in fp32 on the CPU. -# 3. Perform graph packing to alter the data layout for tensorization. -# 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). -# 5. Perform relay build to object file. -# 6. Load the object file onto remote (FPGA device). -# 7. Generate graph runtime, `m`. 
- -# Load pre-configured AutoTVM schedules -with autotvm.tophub.context(target): - net = __darknetffi__.dlopen(darknet_lib_path).load_network( - cfg_path.encode("utf-8"), weights_path.encode("utf-8"), 0 - ) - dshape = (env.BATCH, net.c, net.h, net.w) - dtype = "float32" - - # Measure build start time - build_start = time.time() - - # Start front end compilation - mod, params = relay.frontend.from_darknet(net, dtype=dtype, shape=dshape) - - if target.device_name == "vta": - # Perform quantization in Relay - # Note: We set opt_level to 3 in order to fold batch norm - with relay.build_config(opt_level=3): - with relay.quantize.qconfig( - global_scale=23.0, - skip_conv_layers=[0], - store_lowbit_output=True, - round_for_shift=True, - ): - mod = relay.quantize.quantize(mod, params=params) - # Perform graph packing and constant folding for VTA target - mod = graph_pack( - mod["main"], - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name=pack_dict[MODEL_NAME][0], - stop_name=pack_dict[MODEL_NAME][1], - start_name_idx=pack_dict[MODEL_NAME][2], - stop_name_idx=pack_dict[MODEL_NAME][3], - ) - else: - mod = mod["main"] - - # Compile Relay program with AlterOpLayout disabled - with vta.build_config(disabled_pass={"AlterOpLayout"}): - graph, lib, params = relay.build( - mod, target=target, params=params, target_host=env.target_host - ) - - # Measure Relay build time - build_time = time.time() - build_start - print(MODEL_NAME + " inference graph built in {0:.2f}s!".format(build_time)) - - # Send the inference library over to the remote RPC server - temp = utils.tempdir() - lib.save(temp.relpath("graphlib.o")) - remote.upload(temp.relpath("graphlib.o")) - lib = remote.load_module("graphlib.o") - - # Graph runtime - m = graph_runtime.create(graph, lib, ctx) - -#################################### -# Perform image detection inference. 
-# ---------------------------------- -# We run detect on an downloaded image -# Download test image -[neth, netw] = dshape[2:] -test_image = "person.jpg" -img_url = REPO_URL + "data/" + test_image + "?raw=true" -img_path = download_testdata(img_url, test_image, "data") -data = darknet.load_image(img_path, neth, netw).transpose(1, 2, 0) - -# Prepare test image for inference -plt.imshow(data) -plt.show() -data = data.transpose((2, 0, 1)) -data = data[np.newaxis, :] -data = np.repeat(data, env.BATCH, axis=0) - -# Set the network parameters and inputs -m.set_input("data", data) -m.set_input(**params) - -# Perform inference and gather execution statistics -# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator -num = 1 # number of times we run module for a single measurement -rep = 1 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) - -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - timer() - sim_stats = simulator.stats() - print("\nExecution statistics:") - for k, v in sim_stats.items(): - # Since we execute the workload many times, we need to normalize stats - # Note that there is always one warm up run - # Therefore we divide the overall stats by (num * rep + 1) - print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) -else: - tcost = timer() - std = np.std(tcost.results) * 1000 - mean = tcost.mean * 1000 - print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) - print("Average per sample inference time: %.2fms" % (mean / env.BATCH)) - -# Get detection results from out -thresh = 0.5 -nms_thresh = 0.45 -tvm_out = [] -for i in range(2): - layer_out = {} - layer_out["type"] = "Yolo" - # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total) - layer_attr = m.get_output(i * 4 + 3).asnumpy() - layer_out["biases"] = m.get_output(i * 4 + 2).asnumpy() - layer_out["mask"] = m.get_output(i * 4 + 1).asnumpy() - out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3]) - layer_out["output"] = m.get_output(i * 4).asnumpy().reshape(out_shape) - layer_out["classes"] = layer_attr[4] - tvm_out.append(layer_out) - thresh = 0.560 - -# Show detection results -img = darknet.load_image_color(img_path) -_, im_h, im_w = img.shape -dets = yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh, 1, tvm_out) -last_layer = net.layers[net.n - 1] -yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh) -yolo_detection.show_detections(img, dets, thresh, names, last_layer.classes) -yolo_detection.draw_detections(font_path, img, dets, thresh, names, last_layer.classes) -plt.imshow(img.transpose(1, 2, 0)) -plt.show() diff --git a/vta/tutorials/frontend/legacy/deploy_detection.py b/vta/tutorials/frontend/legacy/deploy_detection.py index 1d78786848e7..be8c52e96112 100644 --- a/vta/tutorials/frontend/legacy/deploy_detection.py +++ b/vta/tutorials/frontend/legacy/deploy_detection.py @@ -212,7 +212,7 @@ # Note: We set opt_level to 3 in order to fold batch norm with tvm.transform.PassContext(opt_level=3): with relay.quantize.qconfig( - global_scale=33.0, + global_scale=23.0, skip_conv_layers=[0], store_lowbit_output=True, round_for_shift=True, @@ -272,8 +272,8 @@ # Perform inference and gather execution statistics # More on: :py:method:`tvm.runtime.Module.time_evaluator` -num = 4 # number of times we run module for a single measurement -rep = 3 # number of measurements (we derive std dev from this) 
+num = 1 # number of times we run module for a single measurement +rep = 1 # number of measurements (we derive std dev from this) timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) if env.TARGET in ["sim", "tsim"]: From 37dc3f71e78367d526e5d1e8dbadc9262f86bbc7 Mon Sep 17 00:00:00 2001 From: huajsj Date: Thu, 12 Aug 2021 11:19:04 -0700 Subject: [PATCH 3/4] move out deploy_detection.py from legacy folder. --- vta/tutorials/frontend/{legacy => }/deploy_detection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename vta/tutorials/frontend/{legacy => }/deploy_detection.py (98%) diff --git a/vta/tutorials/frontend/legacy/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py similarity index 98% rename from vta/tutorials/frontend/legacy/deploy_detection.py rename to vta/tutorials/frontend/deploy_detection.py index be8c52e96112..48a97bc918eb 100644 --- a/vta/tutorials/frontend/legacy/deploy_detection.py +++ b/vta/tutorials/frontend/deploy_detection.py @@ -272,8 +272,8 @@ # Perform inference and gather execution statistics # More on: :py:method:`tvm.runtime.Module.time_evaluator` -num = 1 # number of times we run module for a single measurement -rep = 1 # number of measurements (we derive std dev from this) +num = 4 # number of times we run module for a single measurement +rep = 3 # number of measurements (we derive std dev from this) timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) if env.TARGET in ["sim", "tsim"]: From 0c19d88db3e350bcaade2d05878abeae47e0dca5 Mon Sep 17 00:00:00 2001 From: huajsj Date: Thu, 12 Aug 2021 15:59:30 -0700 Subject: [PATCH 4/4] fix build error. --- vta/tutorials/frontend/deploy_detection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py index 48a97bc918eb..771801851a48 100644 --- a/vta/tutorials/frontend/deploy_detection.py +++ b/vta/tutorials/frontend/deploy_detection.py @@ -177,9 +177,9 @@ # Get execution context from remote ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) -#################################### +##################################### # Build the inference graph executor. -# ---------------------------------- +# ----------------------------------- # Using Darknet library load downloaded vision model and compile with Relay. # The compilation steps are: # @@ -191,7 +191,6 @@ # 5. Perform relay build to object file. # 6. Load the object file onto remote (FPGA device). # 7. Generate graph executor, `m`. -# # Load pre-configured AutoTVM schedules with autotvm.tophub.context(target):
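A closing note on the ``num``/``rep`` values adjusted in the later patches: ``time_evaluator("run", ctx, number=num, repeat=rep)`` returns one timing per repeat in ``results`` (each entry already averaged over ``num`` runs), and the tutorial's mean/std are derived from those values. The sketch below shows that arithmetic with made-up timings; the numbers and the ``batch`` value are illustrative only.

.. code-block:: python

    import numpy as np

    # Pretend per-repeat timings (seconds) returned with rep = 3; each entry is
    # already an average over `num` runs of the module.
    results = [0.091, 0.094, 0.090]
    mean_ms = np.mean(results) * 1000  # corresponds to tcost.mean * 1000
    std_ms = np.std(results) * 1000    # corresponds to np.std(tcost.results) * 1000
    batch = 1                          # env.BATCH in the default VTA config (assumed)
    print("inference: %.2fms (std = %.2f) for %d samples" % (mean_ms, std_ms, batch))
    print("per sample: %.2fms" % (mean_ms / batch))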