From 004b759f179a2049c7778f57f349c7f8eff50a96 Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Wed, 4 Mar 2020 16:51:02 +0800 Subject: [PATCH 01/44] Added support of Intel OpenCL for FPGA devices --- cmake/modules/VTA.cmake | 8 + vta/python/vta/environment.py | 2 +- vta/python/vta/testing/simulator.py | 2 +- vta/runtime/runtime.cc | 1 - vta/src/intelfocl/AOCLUtils/aocl_utils.h | 32 ++ vta/src/intelfocl/AOCLUtils/opencl.cpp | 555 ++++++++++++++++++++++ vta/src/intelfocl/AOCLUtils/opencl.h | 122 +++++ vta/src/intelfocl/AOCLUtils/options.cpp | 105 ++++ vta/src/intelfocl/AOCLUtils/options.h | 137 ++++++ vta/src/intelfocl/AOCLUtils/scoped_ptrs.h | 165 +++++++ vta/src/intelfocl/intelfocl_device.cc | 181 +++++++ vta/src/intelfocl/intelfocl_device.h | 53 +++ vta/src/intelfocl/intelfocl_driver.cc | 74 +++ vta/src/pynq/pynq_driver.cc | 167 +++++++ 14 files changed, 1601 insertions(+), 3 deletions(-) create mode 100644 vta/src/intelfocl/AOCLUtils/aocl_utils.h create mode 100644 vta/src/intelfocl/AOCLUtils/opencl.cpp create mode 100644 vta/src/intelfocl/AOCLUtils/opencl.h create mode 100644 vta/src/intelfocl/AOCLUtils/options.cpp create mode 100644 vta/src/intelfocl/AOCLUtils/options.h create mode 100644 vta/src/intelfocl/AOCLUtils/scoped_ptrs.h create mode 100644 vta/src/intelfocl/intelfocl_device.cc create mode 100644 vta/src/intelfocl/intelfocl_device.h create mode 100644 vta/src/intelfocl/intelfocl_driver.cc create mode 100644 vta/src/pynq/pynq_driver.cc diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index d9508470c0a2..33fe0016fe4a 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -99,6 +99,11 @@ elseif(PYTHON) find_library(__cma_lib NAMES cma PATH /usr/lib) elseif(${VTA_TARGET} STREQUAL "de10nano") # DE10-Nano rules file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc) + elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules + file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) + file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cpp) + list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) + list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc vta/src/vmem/virtual_memory.h) endif() # Target lib: vta add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) @@ -117,6 +122,9 @@ elseif(PYTHON) target_include_directories(vta PUBLIC 3rdparty) target_include_directories(vta PUBLIC "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") + elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules + target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") + target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() endif() diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index e68f098ba53f..c556352e4539 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -241,7 +241,7 @@ def target_host(self): return "llvm -target=armv7-none-linux-gnueabihf" if self.TARGET == "ultra96": return "llvm -target=aarch64-linux-gnu" - if self.TARGET in ["sim", "tsim"]: + if self.TARGET in ["sim", "tsim", "intelfocl"]: return "llvm" raise ValueError("Unknown target %s" % self.TARGET) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index 16827c4ab079..bf89107f9f79 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -25,7 +25,7 @@ def _load_sw(): """Load hardware library for 
simulator.""" env = get_env() - lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" + lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" if env.TARGET == "sim" else "libvta" # Load driver library lib_driver = find_libvta(lib_driver_name, optional=True) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 49fe9c557336..b1d3ad424d6e 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -413,7 +413,6 @@ class UopQueue : public BaseQueue { kernel->sram_begin_ = 0; kernel->sram_end_ = 0; } - cache_.clear(); cache_idx_ = 0; BaseQueue::Reset(); diff --git a/vta/src/intelfocl/AOCLUtils/aocl_utils.h b/vta/src/intelfocl/AOCLUtils/aocl_utils.h new file mode 100644 index 000000000000..70e0fc6bcc0a --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/aocl_utils.h @@ -0,0 +1,32 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Main include file for AOCLUtils. Includes all other utility header files. + +#ifndef AOCL_UTILS_H +#define AOCL_UTILS_H + +#include "opencl.h" +#include "scoped_ptrs.h" +#include "options.h" + +#endif + diff --git a/vta/src/intelfocl/AOCLUtils/opencl.cpp b/vta/src/intelfocl/AOCLUtils/opencl.cpp new file mode 100644 index 000000000000..04d989d7c9ea --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/opencl.cpp @@ -0,0 +1,555 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include + +#ifdef _WIN32 // Windows +#include +#else // Linux +#include +#include // readlink, chdir +#endif + +namespace aocl_utils { + +static const char *const VERSION_STR = "161"; + +////////////////////////////////////////// +// Host allocation functions for alignment +////////////////////////////////////////// + +// This is the minimum alignment requirement to ensure DMA can be used. +const unsigned AOCL_ALIGNMENT = 64; + +#ifdef _WIN32 // Windows +void *alignedMalloc(size_t size) { + return _aligned_malloc (size, AOCL_ALIGNMENT); +} + +void alignedFree(void * ptr) { + _aligned_free(ptr); +} +#else // Linux +void *alignedMalloc(size_t size) { + void *result = NULL; + int rc; + rc = posix_memalign (&result, AOCL_ALIGNMENT, size); + (void) rc; + return result; +} + +void alignedFree(void * ptr) { + free (ptr); +} +#endif + +/////////////////////////////// +// Error functions +/////////////////////////////// + +// Print the error associciated with an error code +void printError(cl_int error) { + // Print error message + switch(error) + { + case -1: + printf("CL_DEVICE_NOT_FOUND "); + break; + case -2: + printf("CL_DEVICE_NOT_AVAILABLE "); + break; + case -3: + printf("CL_COMPILER_NOT_AVAILABLE "); + break; + case -4: + printf("CL_MEM_OBJECT_ALLOCATION_FAILURE "); + break; + case -5: + printf("CL_OUT_OF_RESOURCES "); + break; + case -6: + printf("CL_OUT_OF_HOST_MEMORY "); + break; + case -7: + printf("CL_PROFILING_INFO_NOT_AVAILABLE "); + break; + case -8: + printf("CL_MEM_COPY_OVERLAP "); + break; + case -9: + printf("CL_IMAGE_FORMAT_MISMATCH "); + break; + case -10: + printf("CL_IMAGE_FORMAT_NOT_SUPPORTED "); + break; + case -11: + printf("CL_BUILD_PROGRAM_FAILURE "); + break; + case -12: + printf("CL_MAP_FAILURE "); + break; + case -13: + printf("CL_MISALIGNED_SUB_BUFFER_OFFSET "); + break; + case -14: + printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST "); + break; + + case -30: + printf("CL_INVALID_VALUE "); + break; + case -31: + printf("CL_INVALID_DEVICE_TYPE "); + break; + case -32: + printf("CL_INVALID_PLATFORM "); + break; + case -33: + printf("CL_INVALID_DEVICE "); + break; + case -34: + printf("CL_INVALID_CONTEXT "); + break; + case -35: + printf("CL_INVALID_QUEUE_PROPERTIES "); + break; + case -36: + printf("CL_INVALID_COMMAND_QUEUE "); + break; + case -37: + printf("CL_INVALID_HOST_PTR "); + break; + case -38: + printf("CL_INVALID_MEM_OBJECT "); + break; + case -39: + printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR "); + break; + case -40: + printf("CL_INVALID_IMAGE_SIZE "); + break; + case -41: + printf("CL_INVALID_SAMPLER "); + break; + case -42: + printf("CL_INVALID_BINARY "); + break; + case -43: + printf("CL_INVALID_BUILD_OPTIONS "); + break; + case -44: + printf("CL_INVALID_PROGRAM "); + break; + case -45: + printf("CL_INVALID_PROGRAM_EXECUTABLE "); + break; + case -46: + printf("CL_INVALID_KERNEL_NAME "); + break; + case -47: + printf("CL_INVALID_KERNEL_DEFINITION "); + break; + case -48: + printf("CL_INVALID_KERNEL "); + break; + case -49: + printf("CL_INVALID_ARG_INDEX "); + break; + case -50: + 
printf("CL_INVALID_ARG_VALUE "); + break; + case -51: + printf("CL_INVALID_ARG_SIZE "); + break; + case -52: + printf("CL_INVALID_KERNEL_ARGS "); + break; + case -53: + printf("CL_INVALID_WORK_DIMENSION "); + break; + case -54: + printf("CL_INVALID_WORK_GROUP_SIZE "); + break; + case -55: + printf("CL_INVALID_WORK_ITEM_SIZE "); + break; + case -56: + printf("CL_INVALID_GLOBAL_OFFSET "); + break; + case -57: + printf("CL_INVALID_EVENT_WAIT_LIST "); + break; + case -58: + printf("CL_INVALID_EVENT "); + break; + case -59: + printf("CL_INVALID_OPERATION "); + break; + case -60: + printf("CL_INVALID_GL_OBJECT "); + break; + case -61: + printf("CL_INVALID_BUFFER_SIZE "); + break; + case -62: + printf("CL_INVALID_MIP_LEVEL "); + break; + case -63: + printf("CL_INVALID_GLOBAL_WORK_SIZE "); + break; + default: + printf("UNRECOGNIZED ERROR CODE (%d)", error); + } +} + +// Print line, file name, and error code if there is an error. Exits the +// application upon error. +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...) { + // If not successful + if(error != CL_SUCCESS) { + // Print line and file + printf("ERROR: "); + printError(error); + printf("\nLocation: %s:%d\n", file, line); + + // Print custom message. + va_list vl; + va_start(vl, msg); + vprintf(msg, vl); + printf("\n"); + va_end(vl); + + // Cleanup and bail. + cleanup(); + exit(error); + } +} + +// Sets the current working directory to be the same as the directory +// containing the running executable. +bool setCwdToExeDir() { +#ifdef _WIN32 // Windows + HMODULE hMod = GetModuleHandle(NULL); + char path[MAX_PATH]; + GetModuleFileNameA(hMod, path, MAX_PATH); + +#else // Linux + // Get path of executable. + char path[300]; + ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1); + if(n == -1) { + return false; + } + path[n] = 0; +#endif + + // Find the last '\' or '/' and terminate the path there; it is now + // the directory containing the executable. + size_t i; + for(i = strlen(path) - 1; i > 0 && path[i] != '/' && path[i] != '\\'; --i); + path[i] = '\0'; + + // Change the current directory. +#ifdef _WIN32 // Windows + SetCurrentDirectoryA(path); +#else // Linux + int rc; + rc = chdir(path); + (void) rc; +#endif + + return true; +} + +// Searches all platforms for the first platform whose name +// contains the search string (case-insensitive). +cl_platform_id findPlatform(const char *platform_name_search) { + cl_int status; + + std::string search = platform_name_search; + std::transform(search.begin(), search.end(), search.begin(), tolower); + + // Get number of platforms. + cl_uint num_platforms; + status = clGetPlatformIDs(0, NULL, &num_platforms); + checkError(status, "Query for number of platforms failed"); + + // Get a list of all platform ids. + scoped_array pids(num_platforms); + status = clGetPlatformIDs(num_platforms, pids, NULL); + checkError(status, "Query for all platform ids failed"); + + // For each platform, get name and compare against the search string. + for(unsigned i = 0; i < num_platforms; ++i) { + std::string name = getPlatformName(pids[i]); + + // Convert to lower case. + std::transform(name.begin(), name.end(), name.begin(), tolower); + + if(name.find(search) != std::string::npos) { + // Found! + return pids[i]; + } + } + + // No platform found. + return NULL; +} + +// Returns the platform name. 
+std::string getPlatformName(cl_platform_id pid) { + cl_int status; + + size_t sz; + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz); + checkError(status, "Query for platform name size failed"); + + scoped_array name(sz); + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL); + checkError(status, "Query for platform name failed"); + + return name.get(); +} + +// Returns the device name. +std::string getDeviceName(cl_device_id did) { + cl_int status; + + size_t sz; + status = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &sz); + checkError(status, "Failed to get device name size"); + + scoped_array name(sz); + status = clGetDeviceInfo(did, CL_DEVICE_NAME, sz, name, NULL); + checkError(status, "Failed to get device name"); + + return name.get(); +} + +// Returns the list of all devices. +cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) { + cl_int status; + + status = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices); + checkError(status, "Query for number of devices failed"); + + cl_device_id *dids = new cl_device_id[*num_devices]; + status = clGetDeviceIDs(pid, dev_type, *num_devices, dids, NULL); + checkError(status, "Query for device ids"); + + return dids; +} + +// Create a program for all devices associated with the context. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) { + // Early exit for potentially the most common way to fail: AOCX does not exist. + if(!fileExists(binary_file_name)) { + printf("AOCX file '%s' does not exist.\n", binary_file_name); + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + // Load the binary. + size_t binary_size; + scoped_array binary(loadBinaryFile(binary_file_name, &binary_size)); + if(binary == NULL) { + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + scoped_array binary_lengths(num_devices); + scoped_array binaries(num_devices); + for(unsigned i = 0; i < num_devices; ++i) { + binary_lengths[i] = binary_size; + binaries[i] = binary; + } + + cl_int status; + scoped_array binary_status(num_devices); + + cl_program program = clCreateProgramWithBinary(context, num_devices, devices, binary_lengths, + (const unsigned char **) binaries.get(), binary_status, &status); + checkError(status, "Failed to create program with binary"); + for(unsigned i = 0; i < num_devices; ++i) { + checkError(binary_status[i], "Failed to load binary for device"); + } + + return program; +} + +// Loads a file in binary form. 
+unsigned char *loadBinaryFile(const char *file_name, size_t *size) { + // Open the File + FILE* fp; +#ifdef _WIN32 + if(fopen_s(&fp, file_name, "rb") != 0) { + return NULL; + } +#else + fp = fopen(file_name, "rb"); + if(fp == 0) { + return NULL; + } +#endif + + // Get the size of the file + fseek(fp, 0, SEEK_END); + *size = ftell(fp); + + // Allocate space for the binary + unsigned char *binary = new unsigned char[*size]; + + // Go back to the file start + rewind(fp); + + // Read the file into the binary + if(fread((void*)binary, *size, 1, fp) == 0) { + delete[] binary; + fclose(fp); + return NULL; + } + + return binary; +} + +bool fileExists(const char *file_name) { +#ifdef _WIN32 // Windows + DWORD attrib = GetFileAttributesA(file_name); + return (attrib != INVALID_FILE_ATTRIBUTES && !(attrib & FILE_ATTRIBUTE_DIRECTORY)); +#else // Linux + return access(file_name, R_OK) != -1; +#endif +} + +std::string getBoardBinaryFile(const char *prefix, cl_device_id device) { + // First check if .aocx exists. Use it if it does. + std::string file_name = std::string(prefix) + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + + // Now get the name of the board. For Intel(R) FPGA SDK for OpenCL(TM) boards, + // the name of the device is presented as: + // : ... + std::string device_name = getDeviceName(device); + + // Now search for the " :" in the device name. + size_t end = device_name.find(" :"); + if(end != std::string::npos) { + std::string board_name(device_name, 0, end); + + // Look for a AOCX with the name __.aocx. + file_name = std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + } + + // At this point just use .aocx. This file doesn't exist + // and this should trigger an error later. + return std::string(prefix) + ".aocx"; +} + +// High-resolution timer. +double getCurrentTimestamp() { +#ifdef _WIN32 // Windows + // Use the high-resolution performance counter. + + static LARGE_INTEGER ticks_per_second = {}; + if(ticks_per_second.QuadPart == 0) { + // First call - get the frequency. 
+ QueryPerformanceFrequency(&ticks_per_second); + } + + LARGE_INTEGER counter; + QueryPerformanceCounter(&counter); + + double seconds = double(counter.QuadPart) / double(ticks_per_second.QuadPart); + return seconds; +#else // Linux + timespec a; + clock_gettime(CLOCK_MONOTONIC, &a); + return (double(a.tv_nsec) * 1.0e-9) + double(a.tv_sec); +#endif +} + +cl_ulong getStartEndTime(cl_event event) { + cl_int status; + + cl_ulong start, end; + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + return end - start; +} + +cl_ulong getStartEndTime(cl_event *events, unsigned num_events) { + cl_int status; + + cl_ulong min_start = 0; + cl_ulong max_end = 0; + for(unsigned i = 0; i < num_events; ++i) { + cl_ulong start, end; + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + if(i == 0) { + min_start = start; + max_end = end; + } + else { + if(start < min_start) { + min_start = start; + } + if(end > max_end) { + max_end = end; + } + } + } + + return max_end - min_start; +} + +void waitMilliseconds(unsigned ms) { +#ifdef _WIN32 // Windows + Sleep(ms); +#else // Linux + timespec sleeptime = {0, 0}; + sleeptime.tv_sec = ms / 1000; + sleeptime.tv_nsec = long(ms % 1000) * 1000000L; // convert to nanoseconds + nanosleep(&sleeptime, NULL); +#endif +} + +void oclContextCallback(const char *errinfo, const void *, size_t, void *) { + printf("Context callback: %s\n", errinfo); +} + +} // ns aocl_utils + diff --git a/vta/src/intelfocl/AOCLUtils/opencl.h b/vta/src/intelfocl/AOCLUtils/opencl.h new file mode 100644 index 000000000000..4aa5348b67b1 --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/opencl.h @@ -0,0 +1,122 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// OpenCL utility functions. 
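+// Covers platform and device discovery, program creation from .aocx binaries,
+// aligned host allocation, error checking, and event-based profiling helpers.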
+ +#ifndef AOCL_UTILS_OPENCL_H +#define AOCL_UTILS_OPENCL_H + +#include +#include +#include +#include + +#include "CL/opencl.h" + +// This is assumed to be externally provided by the application. +extern void cleanup(); + +namespace aocl_utils { + +// Host allocation functions +void *alignedMalloc(size_t size); +void alignedFree(void *ptr); + +// Error functions +void printError(cl_int error); +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...); // does not return +#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__) + +// Sets the current working directory to the same directory that contains +// this executable. Returns true on success. +bool setCwdToExeDir(); + +// Find a platform that contains the search string in its name (case-insensitive match). +// Returns NULL if no match is found. +cl_platform_id findPlatform(const char *platform_name_search); + +// Returns the name of the platform. +std::string getPlatformName(cl_platform_id pid); + +// Returns the name of the device. +std::string getDeviceName(cl_device_id did); + +// Returns an array of device ids for the given platform and the +// device type. +// Return value must be freed with delete[]. +cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices); + +// Create a OpenCL program from a binary file. +// The program is created for all given devices associated with the context. The same +// binary is used for all devices. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices); + +// Load binary file. +// Return value must be freed with delete[]. +unsigned char *loadBinaryFile(const char *file_name, size_t *size); + +// Checks if a file exists. +bool fileExists(const char *file_name); + +// Returns the path to the AOCX file to use for the given device. +// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM). +// It uses the device name to get the board name and then looks for a +// corresponding AOCX file. Specifically, it gets the device name and +// extracts the board name assuming the device name has the following format: +// : ... +// +// Then the AOCX file is __.aocx. If this +// file does not exist, then the file name defaults to .aocx. +std::string getBoardBinaryFile(const char *prefix, cl_device_id device); + +// Returns the time from a high-resolution timer in seconds. This value +// can be used with a value returned previously to measure a high-resolution +// time difference. +double getCurrentTimestamp(); + +// Returns the difference between the CL_PROFILING_COMMAND_END and +// CL_PROFILING_COMMAND_START values of a cl_event object. +// This requires that the command queue associated with the event be created +// with the CL_QUEUE_PROFILING_ENABLE property. +// +// The return value is in nanoseconds. +cl_ulong getStartEndTime(cl_event event); + +// Returns the maximum time span for the given set of events. +// The time span starts at the earliest event start time. +// The time span ends at the latest event end time. +cl_ulong getStartEndTime(cl_event *events, unsigned num_events); + +// Wait for the specified number of milliseconds. +void waitMilliseconds(unsigned ms); + +// OpenCL context callback function that simply prints the error information +// to stdout (via printf). 
+void oclContextCallback(const char *errinfo, const void *, size_t, void *); + +} // ns aocl_utils + +#endif + diff --git a/vta/src/intelfocl/AOCLUtils/options.cpp b/vta/src/intelfocl/AOCLUtils/options.cpp new file mode 100644 index 000000000000..05d025b43faf --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/options.cpp @@ -0,0 +1,105 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include +#include +#include + +namespace aocl_utils { + +Options::Options() { +} + +Options::Options(int num, char *argv[]) { + addFromCommandLine(num, argv); +} + +bool Options::has(const std::string &name) const { + return m_options.find(name) != m_options.end(); +} + +std::string &Options::get(const std::string &name) { + return m_options[name]; +} + +const std::string &Options::get(const std::string &name) const { + OptionMap::const_iterator it = m_options.find(name); + if(it == m_options.end()) { + errorNonExistent(name); + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); + } + return it->second; +} + +void Options::addFromCommandLine(int num, char *argv[]) { + for(int i = 1; i < num; ++i) { + const std::string arg = argv[i]; + + // Look for the first '-'. + if(arg.size() > 1 && arg[0] == '-') { + size_t eq = arg.find('='); + size_t name_start = 1; + + // Check if there's a second '-'. + if(arg.size() > 2 && arg[1] == '-') { + name_start = 2; + } + + if(eq == std::string::npos) { + // No '='; treat as a boolean option. + set(arg.substr(name_start), true); + } + else if(eq == name_start) { + // No name?! + errorNameless(); + } + else { + set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1)); + } + } + else { + // Not an option. 
+ m_nonoptions.push_back(arg); + } + } +} + +void Options::errorNameless() const { + std::cerr << "No name provided for option.\n"; + exit(1); +} + +void Options::errorNonExistent(const std::string &name) const { + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); +} + +void Options::errorWrongType(const std::string &name) const { + std::cerr << "Value for option '" << name << "' is not of the right type (value = '" + << get(name) << "').\n"; + exit(1); +} + +} // ns aocl_utils + diff --git a/vta/src/intelfocl/AOCLUtils/options.h b/vta/src/intelfocl/AOCLUtils/options.h new file mode 100644 index 000000000000..78d34605e60e --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/options.h @@ -0,0 +1,137 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Declares a utility class used to parse command-line options. + +#ifndef AOCL_UTILS_OPTIONS_H +#define AOCL_UTILS_OPTIONS_H + +#include +#include +#include +#include + +namespace aocl_utils { + +class Options { +public: + typedef std::vector StringVec; + + Options(); + Options(int num, char *argv[]); + + bool has(const std::string &name) const; + std::string &get(const std::string &name); // will create an empty option if it does not exist + const std::string &get(const std::string &name) const; // error if option does not exist + + void set(const std::string &name, const std::string &value) { get(name) = value; } + + // Command line options must be of the following form: + // [-]-name (indicates option exists) + // [-]-name=value + // + // This function assumes that the values are from main(int, char *). + // This means that the argv[0] is skipped. + void addFromCommandLine(int num, char *argv[]); + + // This templated function converts the option value to the given type. + // An assert is raised if the conversion fails. + template + T get(const std::string &name) const; + + template + void set(const std::string &name, const T &value); + + // Non-options are arguments processed in addFromCommandLine + // that were not recognized as options. 
+ const StringVec &getNonOptions() const { return m_nonoptions; } + size_t getNonOptionCount() const { return m_nonoptions.size(); } + const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; } + +private: + typedef std::map OptionMap; + + // Displays an error message indicating that a nameless option + // was provided. + void errorNameless() const; + + // Displays an error message indicating that the given option + // has the wrong type and then exits with an error code. + void errorWrongType(const std::string &name) const; + + // Displays an error message indicating that the given option + // does not exist and then exits with an error code. + void errorNonExistent(const std::string &name) const; + + OptionMap m_options; + StringVec m_nonoptions; + + Options(const Options &); // not implemented + void operator =(const Options &); // not implemented +}; + +template +T Options::get(const std::string &name) const { + std::stringstream ss; + ss << get(name); + + T v; + ss >> v; + if(ss.fail() || !ss.eof()) { + // Failed to parse or did not consume the whole string value. + errorWrongType(name); + } + return v; +} + +// Specialization for bool. +template<> +inline bool Options::get(const std::string &name) const { + if(has(name)) { + const std::string &v = get(name); + if(v == "1") { + return true; + } + } + return false; +} + +// Specialization for std::string. Simply returns the option string. +// Requires specialization because using stringstream to read the string +// will stop at the first whitespace character (which is wrong). +template<> +inline std::string Options::get(const std::string &name) const { + return get(name); +} + +// This assumes the type T can be serialized to a string and back (when get +// is called). +template +void Options::set(const std::string &name, const T &value) { + std::stringstream ss; + ss << value; + set(name, ss.str()); +} + +} // ns aocl_utils + +#endif + diff --git a/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h b/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h new file mode 100644 index 000000000000..b11085c5226e --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h @@ -0,0 +1,165 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Scoped pointer definitions. 
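+// Defines scoped_ptr, scoped_array and scoped_aligned_ptr (plus
+// scoped_SVM_aligned_ptr when USE_SVM_API == 1): non-copyable RAII wrappers
+// that release their pointee on destruction or reset().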
+ +#ifndef AOCL_UTILS_SCOPED_PTRS_H +#define AOCL_UTILS_SCOPED_PTRS_H + +namespace aocl_utils { + +// Interface is essentially the combination of std::auto_ptr and boost's smart pointers, +// along with some small extensions (auto conversion to T*). + +// scoped_ptr: assumes pointer was allocated with operator new; destroys with operator delete +template +class scoped_ptr { +public: + typedef scoped_ptr this_type; + + scoped_ptr() : m_ptr(NULL) {} + scoped_ptr(T *ptr) : m_ptr(ptr) {} + ~scoped_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { delete m_ptr; m_ptr = ptr; } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_ptr(const this_type &); + this_type &operator =(const this_type &); +}; + +// scoped_array: assumes pointer was allocated with operator new[]; destroys with operator delete[] +// Also supports allocation/reset with a number, which is the number of +// elements of type T. +template +class scoped_array { +public: + typedef scoped_array this_type; + + scoped_array() : m_ptr(NULL) {} + scoped_array(T *ptr) : m_ptr(NULL) { reset(ptr); } + explicit scoped_array(size_t n) : m_ptr(NULL) { reset(n); } + ~scoped_array() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { delete[] m_ptr; m_ptr = ptr; } + void reset(size_t n) { reset(new T[n]); } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_array(const this_type &); + this_type &operator =(const this_type &); +}; + +// scoped_aligned_ptr: assumes pointer was allocated with alignedMalloc; destroys with alignedFree +// Also supports allocation/reset with a number, which is the number of +// elements of type T +template +class scoped_aligned_ptr { +public: + typedef scoped_aligned_ptr this_type; + + scoped_aligned_ptr() : m_ptr(NULL) {} + scoped_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); } + explicit scoped_aligned_ptr(size_t n) : m_ptr(NULL) { reset(n); } + ~scoped_aligned_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { if(m_ptr) alignedFree(m_ptr); m_ptr = ptr; } + void reset(size_t n) { reset((T*) alignedMalloc(sizeof(T) * n)); } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_aligned_ptr(const this_type &); + this_type &operator =(const this_type &); +}; + +#if USE_SVM_API == 1 +// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree +// Also supports allocation/reset with a number, which is the number of +// elements of type T +template +class scoped_SVM_aligned_ptr { +public: + typedef scoped_SVM_aligned_ptr this_type; + + scoped_SVM_aligned_ptr() : m_ptr(NULL) {} + scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL) 
{ reset(ptr); } + explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); } + ~scoped_SVM_aligned_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; } + void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + cl_context m_ctx; + + // noncopyable + scoped_SVM_aligned_ptr(const this_type &); + this_type &operator =(const this_type &); +}; +#endif /* USE_SVM_API == 1 */ + +} // ns aocl_utils + +#endif + diff --git a/vta/src/intelfocl/intelfocl_device.cc b/vta/src/intelfocl/intelfocl_device.cc new file mode 100644 index 000000000000..5eb1519b1124 --- /dev/null +++ b/vta/src/intelfocl/intelfocl_device.cc @@ -0,0 +1,181 @@ +#include +#include +#include "intelfocl_device.h" +#include "AOCLUtils/aocl_utils.h" + +#define MEM_ALIGNMENT (1024) + +#define CL_STATUS_SUCCESS(x) ((x) == CL_SUCCESS) + +void cleanup() {} + +int IntelFOCLDevice::init(size_t mem_size, std::string aocx_file) +{ + cl_int status; + cl_device_id device; + cl_platform_id platform; + unsigned int argi; + bool focl_device_avail; + unsigned int num_devices; + aocl_utils::scoped_array devices; + + platform = aocl_utils::findPlatform("Intel(R) FPGA SDK for OpenCL(TM)"); + CHECK(platform) << "Unable to find Intel(R) FPGA OpenCL platform"; + + devices.reset(aocl_utils::getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices)); + focl_device_avail = false; + for ( unsigned int i = 0; i < num_devices; i ++ ) + { + device = devices[i]; + _context = clCreateContext(NULL, 1, &device, &aocl_utils::oclContextCallback, NULL, &status); + if ( CL_STATUS_SUCCESS(status) ) + { + focl_device_avail = true; + LOG(INFO) << "Using device: " << aocl_utils::getDeviceName(device); + break; + } + } + CHECK(focl_device_avail) << "No FPGA device available"; + num_devices = 1; + + LOG(INFO) << "Using AOCX: " << aocx_file; + _program = aocl_utils::createProgramFromBinary(_context, aocx_file.c_str(), &device, num_devices); + status = clBuildProgram(_program, 0, NULL, "", NULL, NULL); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to build program"; + + for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) + { + _kernels[i] = clCreateKernel(_program, kernel_names[i].c_str(), &status); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create kernel"; + _queues[i] = clCreateCommandQueue(_context, device, 0, &status); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create command queue"; + } + + _mem = clCreateBuffer(_context, CL_MEM_READ_WRITE, mem_size, NULL, &status); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create buffer mem"; + mem_chunk_t init_chunk = {.offset = 0, .size = mem_size, .occupied = false}; + _mem_chunks.push_back(init_chunk); + + argi = 1; + status = clSetKernelArg(_kernels[KERNEL_FETCH], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + argi = 0; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + 
CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + + return 0; +} + +ifocl_mem_off_t IntelFOCLDevice::alloc(size_t size) +{ + auto iter = _mem_chunks.begin(); + size_t aligned_size = ((size + MEM_ALIGNMENT - 1) / MEM_ALIGNMENT) * MEM_ALIGNMENT; + + while ( iter != _mem_chunks.end() && (iter->occupied || (iter->size < aligned_size)) ) + { + iter++; + } + + if ( iter == _mem_chunks.end() ) return IFOCL_MEM_OFF_ERR; + + iter->occupied = true; + if ( iter->size != aligned_size ) + { + mem_chunk_t rem = {iter->offset + aligned_size, iter->size - aligned_size, false}; + iter->size = aligned_size; + _mem_chunks.insert(std::next(iter), rem); + } + + return iter->offset; +} + +void IntelFOCLDevice::free(ifocl_mem_off_t offset) +{ + auto iter = _mem_chunks.begin(); + while ( iter != _mem_chunks.end() && iter->offset < offset ) iter++; + + if ( iter == _mem_chunks.end() || iter->offset != offset || !iter->occupied ) + { + return; + } + + iter->occupied = false; + if ( iter != _mem_chunks.begin() && !std::prev(iter)->occupied ) iter--; + + while ( std::next(iter) != _mem_chunks.end() && !std::next(iter)->occupied ) + { + iter->size += std::next(iter)->size; + _mem_chunks.erase(std::next(iter)); + } +} + + +void IntelFOCLDevice::write_mem(ifocl_mem_off_t offset, const void *buf, size_t nbyte) +{ + cl_int status = clEnqueueWriteBuffer(_queues[0], _mem, CL_TRUE, offset, nbyte, buf, 0, NULL, NULL); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue write buffer"; +} + +void IntelFOCLDevice::read_mem(ifocl_mem_off_t offset, void *buf, size_t nbyte) +{ + cl_int status = clEnqueueReadBuffer(_queues[0], _mem, CL_TRUE, offset, nbyte, buf, 0, NULL, NULL); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue read buffer"; +}; + +int IntelFOCLDevice::execute_instructions(ifocl_mem_off_t offset, size_t count) +{ + cl_int status; + unsigned int argi; + unsigned int insn_offset = offset / VTA_INS_ELEM_BYTES; + unsigned int insn_count = count; + const size_t global_work_size = 1; + + argi = 0; + status = clSetKernelArg(_kernels[KERNEL_FETCH], argi, sizeof(unsigned int), &insn_count); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + argi = 2; + status = clSetKernelArg(_kernels[KERNEL_FETCH], argi, sizeof(unsigned int), &insn_offset); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + + for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) + { + status = clEnqueueNDRangeKernel(_queues[i], _kernels[i], 1, NULL, &global_work_size, NULL, 0, NULL, NULL); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue kernel"; + } + + for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) + { + status = clFinish(_queues[i]); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to clFinish"; + } + + return 0; +}; + +void IntelFOCLDevice::deinit() +{ + for ( unsigned int i = 0; i < NUM_OCL_KERNELS; i++ ) + { + clReleaseKernel(_kernels[i]); + clReleaseCommandQueue(_queues[i]); + } + + clReleaseMemObject(_mem); + + clReleaseProgram(_program); + + clReleaseContext(_context); +} + 
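+// The destructor delegates to deinit(), which releases the kernels, command
+// queues, device buffer, program and context created in init().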
+IntelFOCLDevice::~IntelFOCLDevice() +{ + deinit(); +} diff --git a/vta/src/intelfocl/intelfocl_device.h b/vta/src/intelfocl/intelfocl_device.h new file mode 100644 index 000000000000..6c53a4d47323 --- /dev/null +++ b/vta/src/intelfocl/intelfocl_device.h @@ -0,0 +1,53 @@ +#ifndef VTA_INTEL_FOCL_DEVICE_H_ +#define VTA_INTEL_FOCL_DEVICE_H_ + +#include +#include + +#include "CL/opencl.h" + +#define NUM_OCL_KERNELS 3 +enum kernel_index {KERNEL_FETCH, KERNEL_COMPUTE, KERNEL_PROFILE}; +static std::string kernel_names[3] = {"fetch", "compute", "profile"}; + +typedef size_t ifocl_mem_off_t; +#define IFOCL_MEM_OFF_ERR (SIZE_MAX) + +typedef struct +{ + ifocl_mem_off_t offset; + size_t size; + bool occupied; +} mem_chunk_t; + +class IntelFOCLDevice { + private: + cl_context _context; + cl_program _program; + cl_mem _mem; + cl_kernel _kernels[NUM_OCL_KERNELS]; + cl_command_queue _queues[NUM_OCL_KERNELS]; + std::list _mem_chunks; + + public: + IntelFOCLDevice() { init(4*1024*1024*1024ULL, "vta_opencl.aocx"); } + + int init(size_t mem_size, std::string aocx_file); + + ifocl_mem_off_t alloc(size_t size); + + void free(ifocl_mem_off_t offset); + + void write_mem(ifocl_mem_off_t offset, const void *buf, size_t nbyte); + + void read_mem(ifocl_mem_off_t offset, void *buf, size_t nbyte); + + int execute_instructions(ifocl_mem_off_t offset, size_t count); + + void deinit(); + + ~IntelFOCLDevice(); +}; + +#endif // VTA_INTEL_FOCL_DEVICE_H_ + diff --git a/vta/src/intelfocl/intelfocl_driver.cc b/vta/src/intelfocl/intelfocl_driver.cc new file mode 100644 index 000000000000..a8db9cd0e394 --- /dev/null +++ b/vta/src/intelfocl/intelfocl_driver.cc @@ -0,0 +1,74 @@ +#include +#include +#include +#include "intelfocl_device.h" + +#define MEM_ADDR_IDENTIFIER (0x18000000) + +static IntelFOCLDevice focl_device; + +static inline void* mem_get_addr(ifocl_mem_off_t offset) +{ + void *ret = (void *) (offset + MEM_ADDR_IDENTIFIER); + return ret; +} + +static inline ifocl_mem_off_t mem_get_offset(const void *addr) +{ + ifocl_mem_off_t ret = (ifocl_mem_off_t) addr - MEM_ADDR_IDENTIFIER; + return ret; +} + +void* VTAMemAlloc(size_t size, int cached) { + (void) cached; + ifocl_mem_off_t offset = focl_device.alloc(size); + if ( offset == IFOCL_MEM_OFF_ERR ) return NULL; + void *addr = mem_get_addr(offset); + return addr; +} + +void VTAMemFree(void *buf) { + ifocl_mem_off_t offset = mem_get_offset(buf); + focl_device.free(offset); +} + +vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { + ifocl_mem_off_t offset = mem_get_offset(buf); + return (vta_phy_addr_t) offset; +} + +void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { + ifocl_mem_off_t dst_offset = mem_get_offset(dst); + focl_device.write_mem(dst_offset, src, size); +} + +void VTAMemCopyToHost(void* dst, const void* src, size_t size) { + ifocl_mem_off_t src_offset = mem_get_offset(src); + focl_device.read_mem(src_offset, dst, size); +} + +void VTAFlushCache(void * offset, vta_phy_addr_t buf, int size) { + std::cout << "VTAFlushCache not implemented for Intel OpenCL for FPGA devices" << std::endl; +} + +void VTAInvalidateCache(void * offset, vta_phy_addr_t buf, int size) { + std::cout << "VTAInvalidateCache not implemented for Intel OpenCL for FPGA devices" << std::endl; +} + +VTADeviceHandle VTADeviceAlloc() { + return (VTADeviceHandle) &focl_device; +} + +void VTADeviceFree(VTADeviceHandle handle) { + (void) handle; +} + +int VTADeviceRun(VTADeviceHandle handle, + vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles) +{ + (void) wait_cycles; + 
ifocl_mem_off_t offset = (ifocl_mem_off_t) insn_phy_addr; + return focl_device.execute_instructions(offset, insn_count); +} diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc new file mode 100644 index 000000000000..518b6c368926 --- /dev/null +++ b/vta/src/pynq/pynq_driver.cc @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * \file pynq_driver.c + * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io). + */ + +#include +#include +#include +#include "pynq_driver.h" + + +void* VTAMemAlloc(size_t size, int cached) { + assert(size <= VTA_MAX_XFER); + // Rely on the pynq-specific cma library + return cma_alloc(size, cached); +} + +void VTAMemFree(void* buf) { + // Rely on the pynq-specific cma library + cma_free(buf); +} + +vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { + return cma_get_phy_addr(buf); +} + +void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { + // For SoC-based FPGAs that used shared memory with the CPU, use memcopy() + memcpy(dst, src, size); +} + +void VTAMemCopyToHost(void* dst, const void* src, size_t size) { + // For SoC-based FPGAs that used shared memory with the CPU, use memcopy() + memcpy(dst, src, size); +} + +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { + // Call the cma_flush_cache on the CMA buffer + // so that the FPGA can read the buffer data. + cma_flush_cache(vir_addr, phy_addr, size); +} + +void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { + // Call the cma_invalidate_cache on the CMA buffer + // so that the host needs to read the buffer data. 
+ cma_invalidate_cache(vir_addr, phy_addr, size); +} + +void *VTAMapRegister(uint32_t addr) { + // Align the base address with the pages + uint32_t virt_base = addr & ~(getpagesize() - 1); + // Calculate base address offset w.r.t the base address + uint32_t virt_offset = addr - virt_base; + // Open file and mmap + uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC); + return mmap(NULL, + (VTA_IP_REG_MAP_RANGE + virt_offset), + PROT_READ|PROT_WRITE, + MAP_SHARED, + mmap_file, + virt_base); +} + +void VTAUnmapRegister(void *vta) { + // Unmap memory + int status = munmap(vta, VTA_IP_REG_MAP_RANGE); + assert(status == 0); +} + +void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) { + *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)) = val; +} + +uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { + return *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)); +} + +class VTADevice { + public: + VTADevice() { + // VTA stage handles + vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR); + vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR); + vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR); + vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR); + } + + ~VTADevice() { + // Close VTA stage handle + VTAUnmapRegister(vta_fetch_handle_); + VTAUnmapRegister(vta_load_handle_); + VTAUnmapRegister(vta_compute_handle_); + VTAUnmapRegister(vta_store_handle_); + } + + int Run(vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles) { + VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count); + VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr); + VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0); + + // VTA start + VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); + VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART); + VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART); + VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART); + + // Allow device to respond + struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 }; + nanosleep(&ts, &ts); + + // Loop until the VTA is done + unsigned t, flag = 0; + for (t = 0; t < wait_cycles; ++t) { + flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET); + if (flag == VTA_DONE) break; + std::this_thread::yield(); + } + // Report error if timeout + return t < wait_cycles ? 
0 : 1; + } + + private: + // VTA handles (register maps) + void* vta_fetch_handle_{nullptr}; + void* vta_load_handle_{nullptr}; + void* vta_compute_handle_{nullptr}; + void* vta_store_handle_{nullptr}; +}; + +VTADeviceHandle VTADeviceAlloc() { + return new VTADevice(); +} + +void VTADeviceFree(VTADeviceHandle handle) { + delete static_cast(handle); +} + +int VTADeviceRun(VTADeviceHandle handle, + vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles) { + return static_cast(handle)->Run( + insn_phy_addr, insn_count, wait_cycles); +} From 3e51e493f07c41d75baf9ae05fcd1342b72d2375 Mon Sep 17 00:00:00 2001 From: zhanghao Date: Mon, 9 Mar 2020 09:42:36 +0800 Subject: [PATCH 02/44] put resnet18 middle layers to run on vta - add load acc_int8 in simulation - remove copy op - add vta schedule - add always 32-bits --- .gitignore | 3 + python/tvm/autotvm/measure/measure_methods.py | 5 +- python/tvm/autotvm/tuner/tuner.py | 13 ++-- python/tvm/contrib/util.py | 1 + python/tvm/relay/op/_tensor.py | 4 +- python/tvm/relay/op/op.py | 7 ++ python/tvm/relay/quantize/_partition.py | 12 +++ src/relay/backend/compile_engine.cc | 4 +- src/relay/quantize/realize.cc | 31 ++++++-- topi/python/topi/generic/injective.py | 4 + vta/python/vta/environment.py | 2 + vta/python/vta/top/graphpack.py | 7 +- vta/python/vta/top/op.py | 3 +- vta/python/vta/top/vta_conv2d.py | 77 +++++++++++++++++++ vta/runtime/runtime.cc | 54 +++++++------ 15 files changed, 183 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index b9357018a64c..dd3634bf2bb0 100644 --- a/.gitignore +++ b/.gitignore @@ -233,3 +233,6 @@ conda/pkg # antlr files *.tokens *.interp + +*log* +*.txt diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index b8969f55c00a..7f915132fdc8 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -468,8 +468,9 @@ def run_through_rpc(measure_input, build_result, measure_input.target.device_name == 'vta': # pylint: disable=import-outside-toplevel from vta import program_fpga, reconfig_runtime - program_fpga(remote, None) - reconfig_runtime(remote) + # FIXME(zhanghao): remove this + # program_fpga(remote, None) + # reconfig_runtime(remote) remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 2441a4ae642f..4f984aae701f 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -161,12 +161,13 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr logger.debug("Early stopped. Best iter: %d.", self.best_iter) break - if error_ct > 150: - logging.basicConfig() - logger.warning("Too many errors happen in the tuning. Now is in debug mode") - logger.setLevel(logging.DEBUG) - else: - logger.setLevel(old_level) + # NOTE(zhanghao): comment out as it will raise too many logs + # if error_ct > 150: + # logging.basicConfig() + # logger.warning("Too many errors happen in the tuning. 
Now is in debug mode") + # logger.setLevel(logging.DEBUG) + # else: + # logger.setLevel(old_level) GLOBAL_SCOPE.in_tuning = False del measure_batch diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 8f6dfc7f28ec..474741fc1e35 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -19,6 +19,7 @@ import contextlib import datetime import os +import sys import tempfile import threading import shutil diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index cd9e4ed050d2..cca44429f7df 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -51,7 +51,9 @@ register_broadcast_schedule("sign") register_broadcast_schedule("abs") register_broadcast_schedule("tanh") -register_broadcast_schedule("add") +# NOTE(zhanghao): use customized add schedule +register_schedule("add", schedule_add) +# register_broadcast_schedule("add") register_broadcast_schedule("subtract") register_broadcast_schedule("multiply") register_broadcast_schedule("divide") diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 7fad9a258f2b..f2428f9db2ef 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -394,6 +394,13 @@ def register_external_compiler(op_name, fexternal=None, level=10): return tvm.ir.register_op_attr(op_name, "FTVMExternalCompiler", fexternal, level) + +def schedule_add(attrs, outputs, target): + """Generic schedule for add.""" + with target: + return topi.generic.schedule_add(outputs) + + @tvm._ffi.register_func("relay.op.compiler._lower") def _lower(name, schedule, inputs, outputs): return lower(schedule, list(inputs) + list(outputs), name=name) diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index a607f4ea50b8..bba5a6d842f9 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -144,3 +144,15 @@ def multiply_partition_function(ref_call, new_args, ctx): return QPartitionExpr(_forward_op(ref_call, [lhs, rhs])) assert (not lhs_cond) and (not rhs_cond) return None + +# @register_partition_function("nn.global_avg_pool2d") +# def global_avg_pool2d_partition_function(ref_call, new_args, ctx): +# cond, expr = partition_expr_check(new_args[0]) +# eprint("global_avg_pool2d partition") +# if cond: +# expr = stop_fusion(new_args[0].realize()) +# return _forward_op(ref_call, [expr]) +# else: +# expr = stop_fusion(QPartitionExpr(new_args[0]).realize()) +# return _forward_op(ref_call, [expr]) +# return None diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 3687b75c8ce8..3b0b1b39c62c 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -123,7 +123,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> readable_name_stream_ << "fused"; cache_node->outputs = this->VisitExpr(prim_func->body); auto candidate_name = readable_name_stream_.str(); - constexpr static size_t kMaxFuncNameLength = 80; + constexpr static size_t kMaxFuncNameLength = 800; if (candidate_name.size() > kMaxFuncNameLength) { std::stringstream truncated_name; truncated_name << candidate_name.substr(0, kMaxFuncNameLength); @@ -343,7 +343,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> auto cache_node = make_object(); cache_node->outputs = VisitExpr(prim_func->body); auto candidate_name = readable_name_stream_.str(); - constexpr static size_t kMaxFuncNameLength = 80; + constexpr static size_t kMaxFuncNameLength = 800; if 
(candidate_name.size() > kMaxFuncNameLength) { std::stringstream truncated_name; truncated_name << candidate_name.substr(0, kMaxFuncNameLength); diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 49d1e522f7d7..41680b655a66 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -312,19 +312,38 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args CHECK_EQ(ref_args.size(), args.size()); DataType dtype; - if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { - dtype = cfg->dtype_input; - } else { - dtype = cfg->dtype_activation; - } + // FIXME(zhanghao): force to use add(int32, int32) in order to put in VTA ALU + // but this may be not necessary for other devices + // if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { + // dtype = cfg->dtype_input; + // } else { + // dtype = cfg->dtype_activation; + // } + dtype = cfg->dtype_activation; for (size_t i = 0; i < ret.size(); ++i) { auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { - ret.Set(i, Cast(ret[i], dtype)); + auto new_arg = Cast(ret[i], dtype); + + // NOTE(zhanghao) + // if you want to let cpu to do all the cast, use the following code + // ret.Set(i, StopFusion(new_arg)); + + // do not fuse float32 cast + if (nptrs[i]->dtype == DataType::Float(32)) { + ret.Set(i, StopFusion(new_arg)); + } else { + ret.Set(i, new_arg); + } } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) && ref_arg->attrs.as()->kind == kQInput) { auto new_arg = Cast(ret[i], cfg->dtype_input); new_arg = StopFusion(new_arg); + + // NOTE(zhanghao) + // if you want to let cpu to do all the cast, use the following code + // ret.Set(i, StopFusion(Cast(new_arg, dtype))); + ret.Set(i, Cast(new_arg, dtype)); } } diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index fa6aee4864ec..8aae9a3c5f14 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -63,5 +63,9 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s +@tvm.target.generic_func +def schedule_add(outs): + return schedule_injective(outs) + schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index c556352e4539..9f82d65f1d4e 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -62,11 +62,13 @@ class DevContext(object): MEM_ID_INP = 2 MEM_ID_ACC = 3 MEM_ID_OUT = 4 + MEM_ID_ACC_8 = 5 # VTA ALU Opcodes ALU_OPCODE_MIN = 0 ALU_OPCODE_MAX = 1 ALU_OPCODE_ADD = 2 ALU_OPCODE_SHR = 3 + # ALU_OPCODE_CAST = 4 # Task queue id (pipeline stage) QID_LOAD_INP = 1 QID_LOAD_WGT = 1 diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 231d40033350..f6b22ce67ce5 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -317,7 +317,12 @@ def visit_call(self, call): elif self.start_pack and call.op == op.op.get('cast') and \ input_types[0].dtype == 'int32': cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) - return relay.Call(op.op.get('copy'), [cast]) + # zhanghao: force separate cast and copy (to let copy do on cpu) + # cast = relay.Call(op.op.get('annotation.stop_fusion'), [cast]) + + # zhanghao: remove the redudant copy + # return relay.Call(op.op.get('copy'), [cast]) + return cast elif call.op == self.pad: pad_width = call.attrs.pad_width if len(pad_width) == 6: diff --git a/vta/python/vta/top/op.py 
b/vta/python/vta/top/op.py index 2198ed4c191f..010daaedf2bc 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -35,7 +35,8 @@ # override to force partition at copy -reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) +# TODO(zhanghao): remove all copy +# reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) # add clip vta strategy def compute_clip_vta(attrs, inputs, output_type): diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 5b23ddeba1c1..2f30aba45d10 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -25,6 +25,9 @@ from .util import is_packed_layout from ..environment import get_env +from tvm.relay import op as Op +from tvm.contrib.util import eprint + @autotvm.register_topi_compute("conv2d_packed.vta") def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): @@ -33,6 +36,7 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty raise topi.InvalidShapeError() assert dilation == (1, 1) + eprint("data.shape, kernel.shape", data.shape, kernel.shape) if padding[0]: pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") else: @@ -63,6 +67,79 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty return res + +# FIXME(zhanghao): move this code to a proper location +@topi.generic.schedule_add.register(["vta"]) +def _schedule_add(outs): + eprint("schedule_add vta") + assert len(outs) == 1 + + def is_cast_op(op): + # return op.same_as(Op.op.get("cast")) + # FIXME(zhanghao): find a better way to do compare + return op.name == 'T_cast' + + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + output = outs[0] + s = tvm.create_schedule([x.op for x in outs]) + tvm.schedule.AutoInlineInjective(s) + # s[output].fuse(s[output].op.axis) + + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.tensor.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + # only put the int-related ops to vta + if "int" in output.dtype: + env = get_env() + for eo in ewise_ops: + eprint("add ewise_ops ", eo) + s[eo].set_scope(env.acc_scope) + s[eo].pragma(s[eo].op.axis[0], env.alu) + s[eo].compute_at(s[output], s[output].op.axis[-2]) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + eprint("add dma_copy", consumer, tensor, tensor.op) + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + for tensor in cache_read_ewise: + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + s[tensor].compute_at(s[output], s[output].op.axis[-2]) + + for op in const_ops: + s[op].compute_inline() + + s[output].pragma(s[output].op.axis[-1], env.dma_copy) + + return s + + @autotvm.register_topi_schedule("conv2d_packed.vta") def schedule_conv2d_packed(cfg, outs): """Schedule packed conv2d""" diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index b1d3ad424d6e..314eb46fcf56 
100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -608,29 +608,29 @@ class InsnQueue : public BaseQueue { CommitPendingPop(kComputeStage); } // Helper function: Get Opcode string - const char* getOpcodeString(int opcode, bool use_imm) { - // The string name - if (opcode == VTA_ALU_OPCODE_MIN) { - if (use_imm) { - return "min imm"; - } else { - return "min"; - } - } else if (opcode == VTA_ALU_OPCODE_MAX) { - if (use_imm) { - return "max imm"; - } else { - return "max"; - } - } else if (opcode == VTA_ALU_OPCODE_ADD) { - if (use_imm) { - return "add imm"; - } else { - return "add"; + std::string getOpcodeString(int opcode, bool use_imm, int64_t imm) { + // The string name + if (opcode == VTA_ALU_OPCODE_MIN) { + if (use_imm) { + return std::string("min imm ") + std::to_string(imm); + } else { + return "min"; + } + } else if (opcode == VTA_ALU_OPCODE_MAX) { + if (use_imm) { + return (std::string("max imm ") + std::to_string(imm)); + } else { + return "max"; + } + } else if (opcode == VTA_ALU_OPCODE_ADD) { + if (use_imm) { + return (std::string("add imm ") + std::to_string(imm)); + } else { + return "add"; + } + } else if (opcode == VTA_ALU_OPCODE_SHR) { + return (std::string("shr ") + std::to_string(imm)); } - } else if (opcode == VTA_ALU_OPCODE_SHR) { - return "shr"; - } return "unknown op"; } @@ -692,6 +692,7 @@ class InsnQueue : public BaseQueue { if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n"); if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n"); if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n"); + if (c.mem.memory_type == VTA_MEM_ID_ACC_8) printf("ACC 8\n"); } if (c.mem.opcode == VTA_OPCODE_STORE) { printf("STORE:\n"); @@ -724,7 +725,7 @@ class InsnQueue : public BaseQueue { static_cast(c.gemm.src_factor_in), static_cast(c.gemm.dst_factor_in)); } else if (c.mem.opcode == VTA_OPCODE_ALU) { // Print instruction field information - printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); + printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm, c.alu.imm).c_str()); printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); @@ -829,7 +830,7 @@ class InsnQueue : public BaseQueue { } // Get stage of the memory static PipelineStage GetMemPipelineStage(int memory_type) { - if (memory_type == VTA_MEM_ID_ACC) return kComputeStage; + if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8) return kComputeStage; if (memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -839,7 +840,7 @@ class InsnQueue : public BaseQueue { if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage; if (insn->opcode == VTA_OPCODE_LOAD) { if (insn->x_size == 0) return kNoneStage; - if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage; + if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8) return kComputeStage; if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -922,6 +923,9 @@ class CommandQueue { case VTA_MEM_ID_OUT: elem_bytes = VTA_OUT_ELEM_BYTES; break; + case VTA_MEM_ID_ACC_8: + elem_bytes = VTA_ACC_ELEM_BYTES / 4; + break; default: LOG(FATAL) << "Memory id not recognized:" << memory_id; break; From 082f64e561b7834055c6b32119e02e0ef0af1329 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 9 Mar 2020 11:19:39 +0800 Subject: [PATCH 03/44] adapt to the code base --- 
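Note on the schedule registration touched by the previous two commits: they route the quantized "add" through TVM's generic-function dispatch so a VTA-specific schedule is picked when compiling under the VTA target, while other targets keep an injective-style fallback. The snippet below is a minimal illustrative sketch of that dispatch pattern only, assuming a TVM build of this era (tvm.target.generic_func, te.create_schedule); the names schedule_add_sketch and _schedule_add_vta_sketch are hypothetical placeholders, not identifiers added by these patches.

    # Illustrative sketch, not part of the patch series.
    import tvm
    from tvm import te

    @tvm.target.generic_func
    def schedule_add_sketch(outs):
        # Fallback schedule used for any target without a specialized override.
        outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
        return te.create_schedule([x.op for x in outs])

    @schedule_add_sketch.register(["vta"])
    def _schedule_add_vta_sketch(outs):
        # Override selected when the enclosing `with target:` scope carries the
        # "vta" key, mirroring how the relay-level schedule_add wrapper calls
        # topi.generic.schedule_add inside `with target:`.
        outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
        return te.create_schedule([x.op for x in outs])
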
python/tvm/relay/op/_tensor.py | 2 +- python/tvm/relay/op/op.py | 14 +++ python/tvm/relay/op/strategy/generic.py | 7 ++ vta/python/vta/top/vta_conv2d.py | 144 ++++++++++++------------ 4 files changed, 94 insertions(+), 73 deletions(-) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index cca44429f7df..44d0a60227d6 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -52,7 +52,7 @@ register_broadcast_schedule("abs") register_broadcast_schedule("tanh") # NOTE(zhanghao): use customized add schedule -register_schedule("add", schedule_add) +register_add_schedule("add") # register_broadcast_schedule("add") register_broadcast_schedule("subtract") register_broadcast_schedule("multiply") diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index f2428f9db2ef..5056825d007c 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -240,6 +240,20 @@ def register_injective_schedule(op_name, level=10): return register_schedule(op_name, _schedule_injective, level) +def register_add_schedule(op_name, level=10): + """Register schedule function for add. + + Parameters + ---------- + op_name : str + The name of the op. + + level : int + The priority level + """ + return register_schedule(op_name, _schedule_add, level) + + def register_broadcast_schedule(op_name, level=10): """Register broadcast schedule function for an op. diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 4fa2b11d554d..3d24cdf73e9d 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -69,6 +69,12 @@ def schedule_injective(attrs, outs, target): with target: return topi.generic.schedule_injective(outs) +@generic_func +def schedule_add(attrs, outputs, target): + """Generic schedule for add.""" + with target: + return topi.generic.schedule_add(outputs) + @generic_func def schedule_reduce(attrs, outs, target): """Schedule reduction ops""" @@ -77,6 +83,7 @@ def schedule_reduce(attrs, outs, target): _op._schedule_injective = schedule_injective _op._schedule_reduce = schedule_reduce +_op._schedule_add = schedule_add # concatenate @generic_func diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 2f30aba45d10..44430b9123c7 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -68,78 +68,6 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty return res -# FIXME(zhanghao): move this code to a proper location -@topi.generic.schedule_add.register(["vta"]) -def _schedule_add(outs): - eprint("schedule_add vta") - assert len(outs) == 1 - - def is_cast_op(op): - # return op.same_as(Op.op.get("cast")) - # FIXME(zhanghao): find a better way to do compare - return op.name == 'T_cast' - - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - output = outs[0] - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) - # s[output].fuse(s[output].op.axis) - - ewise_inputs = [] - ewise_ops = [] - const_ops = [] - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - elif not is_cast_op(op): - ewise_ops.append(op) - - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.PlaceholderOp): - ewise_inputs.append((op, tensor)) - elif is_cast_op(tensor.op) and not op.same_as(output.op): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - 
else: - for tensor in op.input_tensors: - if (not isinstance(tensor.op, tvm.tensor.PlaceholderOp)) \ - and (not is_cast_op(tensor.op)): - _traverse(tensor.op) - - op = output.op - _traverse(op) - # only put the int-related ops to vta - if "int" in output.dtype: - env = get_env() - for eo in ewise_ops: - eprint("add ewise_ops ", eo) - s[eo].set_scope(env.acc_scope) - s[eo].pragma(s[eo].op.axis[0], env.alu) - s[eo].compute_at(s[output], s[output].op.axis[-2]) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - eprint("add dma_copy", consumer, tensor, tensor.op) - cache_read_ewise.append( - s.cache_read(tensor, env.acc_scope, [consumer])) - - for tensor in cache_read_ewise: - s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - s[tensor].compute_at(s[output], s[output].op.axis[-2]) - - for op in const_ops: - s[op].compute_inline() - - s[output].pragma(s[output].op.axis[-1], env.dma_copy) - - return s - - @autotvm.register_topi_schedule("conv2d_packed.vta") def schedule_conv2d_packed(cfg, outs): """Schedule packed conv2d""" @@ -261,3 +189,75 @@ def _traverse(op): s[output].pragma(x_co1, env.dma_copy) return s + + +# FIXME(zhanghao): move this code to a proper location +@topi.generic.schedule_add.register(["vta"]) +def _schedule_add(outs): + eprint("schedule_add vta") + assert len(outs) == 1 + + def is_cast_op(op): + # return op.same_as(Op.op.get("cast")) + # FIXME(zhanghao): find a better way to do compare + return op.name == 'T_cast' + + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + output = outs[0] + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) + # s[output].fuse(s[output].op.axis) + + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + # only put the int-related ops to vta + if "int" in output.dtype: + env = get_env() + for eo in ewise_ops: + eprint("add ewise_ops ", eo) + s[eo].set_scope(env.acc_scope) + s[eo].pragma(s[eo].op.axis[0], env.alu) + s[eo].compute_at(s[output], s[output].op.axis[-2]) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + eprint("add dma_copy", consumer, tensor, tensor.op) + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + for tensor in cache_read_ewise: + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + s[tensor].compute_at(s[output], s[output].op.axis[-2]) + + for op in const_ops: + s[op].compute_inline() + + s[output].pragma(s[output].op.axis[-1], env.dma_copy) + + return s From b6bc82a6ac598234b2a6c01a02fc68cf730b4b2c Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 18 Mar 2020 16:45:07 +0800 Subject: [PATCH 04/44] auto device_copy feature for vta --- include/tvm/relay/transform.h | 28 +++++ python/tvm/relay/quantize/_partition.py | 23 ++-- python/tvm/relay/transform/transform.py | 4 + src/relay/backend/build_module.cc | 8 ++ src/relay/transforms/device_annotation.cc | 114 
++++++++++++++++- src/tir/transforms/lower_tvm_builtin.cc | 21 ++-- vta.resnet18_v1.log-manual-formatv0_2 | 10 ++ vta/python/vta/top/graphpack.py | 116 +++++++++++++++++- vta/runtime/runtime.cc | 27 +++- .../frontend/deploy_classification.py | 23 +++- 10 files changed, 338 insertions(+), 36 deletions(-) create mode 100644 vta.resnet18_v1.log-manual-formatv0_2 diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index b287c053e8a9..61eb6dd50ce2 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -116,6 +116,16 @@ TVM_DLL Pass FuseOps(int fuse_opt_level = -1); */ TVM_DLL Pass RewriteAnnotatedOps(int fallback_device); +/*! + * \brief add device_copy if two adjacent nodes are on different devices + * + * \param expr The expression. + * + * \return The updated program. + */ +TVM_DLL Pass AddDeviceCopyOps(); + + /*! * \brief turn a dataflow graph into Administrative Normal Form, or A-Normal Form (ANF). * @@ -418,6 +428,24 @@ TVM_DLL Expr ForwardRewrite(const Expr& expr, const FForwardRewrite& rewrite_fun */ TVM_DLL Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device); +/*! + * \brief add device_copy if two adjacent nodes are on different devices + * + * \param expr The expression. + * + * \return The updated program. + */ +TVM_DLL Expr AddDeviceCopyOps(const Expr& expr); + +/*! + * \brief Fuse operations into expr into seperate functions. + * + * \param fuse_opt_level Optimization level. If it is -1 it will be inferred from pass context. + * + * \return The pass. + */ +TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level, const IRModule& module); + /*! * \brief Turn an expression into continuation passing style(CPS). * diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index bba5a6d842f9..6ff2a8be0b4a 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -145,14 +145,15 @@ def multiply_partition_function(ref_call, new_args, ctx): assert (not lhs_cond) and (not rhs_cond) return None -# @register_partition_function("nn.global_avg_pool2d") -# def global_avg_pool2d_partition_function(ref_call, new_args, ctx): -# cond, expr = partition_expr_check(new_args[0]) -# eprint("global_avg_pool2d partition") -# if cond: -# expr = stop_fusion(new_args[0].realize()) -# return _forward_op(ref_call, [expr]) -# else: -# expr = stop_fusion(QPartitionExpr(new_args[0]).realize()) -# return _forward_op(ref_call, [expr]) -# return None + +# add cast after the relu op to make it run on vta +@register_partition_function("nn.global_avg_pool2d") +def global_avg_pool2d_partition_function(ref_call, new_args, ctx): + cond, expr = partition_expr_check(new_args[0]) + if cond: + expr = new_args[0].realize() + return _forward_op(ref_call, [expr]) + else: + expr = QPartitionExpr(new_args[0]).realize() + return _forward_op(ref_call, [expr]) + return None diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index 8f4ec1046500..d1a93fd5f9b8 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -441,6 +441,10 @@ def RewriteAnnotatedOps(fallback_device): return _ffi_api.RewriteDeviceAnnotation(fallback_device) +def AddDeviceCopy(): + return _transform.AddDeviceCopy() + + def ToANormalForm(): """Turn Graph Normal Form expression into A Normal Form Expression. The scope of the root expression is the global scope. 
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index f9ce24d410b7..cbe4ae2d4256 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -419,6 +419,14 @@ class RelayBuildModule : public runtime::ModuleNode { // Get the updated function. auto func = Downcast(relay_module->Lookup("main")); + // do extra pass to check to insert device_copy if necessary + if (targets_.size() > 1) { + func = Downcast(relay::AddDeviceCopyOps(func)); + // we have to do fuseops again as we may add new device_copy ops + func = Downcast(relay::FuseOps(func, -1, relay_module)); + func = Downcast(relay::InferType(func, relay_module)); + } + // Generate code for the updated function. graph_codegen_ = std::unique_ptr(new GraphCodegen()); graph_codegen_->Init(nullptr, targets_); diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 39cf563f730a..5a87b06b0540 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -60,8 +60,10 @@ bool IsDeviceCopyNode(const ExprNode* node) { class ValidateAnnotation : private ExprVisitor { public: - static std::unordered_map Validate(const Expr& expr) { - ValidateAnnotation valid; + ValidateAnnotation(int fallback_device): fallback_device_(fallback_device) {} + + static std::unordered_map Validate(const Expr& expr, int fallback_device) { + ValidateAnnotation valid(fallback_device); valid(expr); return valid.annotation_map_; } @@ -80,12 +82,24 @@ class ValidateAnnotation : private ExprVisitor { CHECK_EQ(call_node->args.size(), 1U); const auto* node = call_node->args[0].operator->(); + // LOG(WARNING) << "annotated node, device_type = " << device_type << " : " << GetRef(node); if (annotation_map_.count(node)) { CHECK_EQ(annotation_map_.at(node), device_type) << "An expression node can only be annotated to one device."; } else { annotation_map_.insert({node, GetDeviceId(call_node)}); } + + // FIXME(zhanghao): find a better way + // here assume there are max two device types + if (device_type == fallback_device_ && extra_device_ && extra_device_ != fallback_device_) { + const auto* child = GetRef(node).as()->args[0].operator->(); + // here we mark as negative to indicate this is for copy from only + int ext_dev = -extra_device_; + annotation_map_.insert({child, ext_dev}); + } + + if (device_type != fallback_device_) extra_device_ = device_type; } } @@ -109,6 +123,8 @@ class ValidateAnnotation : private ExprVisitor { } std::unordered_map annotation_map_; + int fallback_device_ = 0; + int extra_device_ = 0; }; // Replace the use of an expression with the output of a `copy_device` operator @@ -122,7 +138,7 @@ class RewriteAnnotation : public ExprMutator { public: Expr Rewrite(const Expr& expr, int fallback_device) { fallback_device_ = fallback_device; - annotation_map_ = ValidateAnnotation::Validate(expr); + annotation_map_ = ValidateAnnotation::Validate(expr, fallback_device); return this->VisitExpr(expr); } @@ -229,6 +245,7 @@ class RewriteAnnotation : public ExprMutator { CHECK(dit != annotation_map_.end()) << "Device copy op is not required when both src and dst ops are not " "annotated."; + // LOG(WARNING) << "Create device copy " << fallback_device_ << " to " << dit->second << ": " << src; return CreateDeviceCopy(src, fallback_device_, dit->second); } else { const auto dit = annotation_map_.find(dst); @@ -244,10 +261,15 @@ class RewriteAnnotation : public ExprMutator { if (annotation_map_.count(dst)) { return src_dev_type 
!= annotation_map_.at(dst); } else { - return src_dev_type != fallback_device_; + // TODO(zhanghao): for now, we only make a device_copy when dst is "on_device" marked + // This allows us to do a start-end mark (mark two points) + // to mark all the middle ops with a device_type + return false; + // return src_dev_type != fallback_device_; } } else { - if (annotation_map_.count(dst)) { + // if annotation value < 0, it means this is for "copy from" only + if (annotation_map_.count(dst) && annotation_map_.at(dst) > 0) { // Though data copy op could be inserted whenever the `src` and `dst` // ops are annotated to different devices, it leads to high overhead. // @@ -494,6 +516,66 @@ class DeviceInfo { Map device_map_; }; + +class AddDeviceCopy : public ExprMutator { + public: + Expr Rewrite(const Expr& expr) { + device_map_ = DeviceInfo::GetDeviceMap(expr); + return this->Mutate(expr); + } + + private: + // add device copy if two nodes not on the same device + Expr VisitExpr_(const CallNode* call_node) override { + auto func_node = call_node->op.as(); + bool src_is_copy_node = false; + if (func_node && IsDeviceCopyNode(func_node->body.as())) { + // LOG(WARNING) << "DeviceCopy skip device_copy node"; + src_is_copy_node = true; + } + + tvm::Array call_args; + auto call_expr = GetRef(call_node); + CHECK(device_map_.count(call_expr)); + + for (auto& arg: call_node->args) { + CHECK(device_map_.count(arg)); + bool dst_is_copy_node = false; + if (auto arg_node = arg.as()) { + auto func_node = arg_node->op.as(); + if (func_node && IsDeviceCopyNode(func_node->body.as())) { + // LOG(WARNING) << "DeviceCopy skip dst device_copy node"; + dst_is_copy_node = true; + } + } + + int src_dev_type = device_map_.count(arg) ? device_map_[arg]->value : 1; + int dst_dev_type = device_map_.count(call_expr) ? 
device_map_[call_expr]->value : 1; + if (!src_is_copy_node && !dst_is_copy_node && src_dev_type != dst_dev_type) { + // LOG(WARNING) << "Not consistent device type, src = " << src_dev_type << ":" << arg; + // LOG(WARNING) << "Not consistent device type, dst = " << dst_dev_type << ":" << call_expr; + auto attrs = make_object(); + attrs->src_dev_type = src_dev_type; + attrs->dst_dev_type = dst_dev_type; + static const Op& op = Op::Get("device_copy"); + Call device_copy = CallNode::make(op, {this->Mutate(arg)}, Attrs(attrs), {}); + device_copy->checked_type_ = arg->checked_type_; + call_args.push_back(device_copy); + } else { + call_args.push_back(this->Mutate(arg)); + } + } + + auto ret = CallNode::make(call_node->op, call_args, call_node->attrs, call_node->type_args); + // manually add the checked_type_ + // alternatively, can call InferType Pass after this + ret->checked_type_ = call_node->checked_type_; + return ret; + } + + Map device_map_; +}; + Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { RewriteAnnotation rewrote = RewriteAnnotation(); Expr new_expr = rewrote.Rewrite(expr, fallback_device); @@ -541,7 +623,15 @@ Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { } } -Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } +Expr AddDeviceCopyOps(const Expr& expr) { + auto rewrote = AddDeviceCopy(); + Expr new_expr = rewrote.Rewrite(expr); + return new_expr; +} + +Map CollectDeviceInfo(const Expr& expr) { + return DeviceInfo::GetDeviceMap(expr); +} Map CollectDeviceAnnotationOps(const Expr& expr) { return AnnotatationVisitor::GetAnnotations(expr); @@ -564,6 +654,18 @@ Pass RewriteAnnotatedOps(int fallback_device) { TVM_REGISTER_GLOBAL("relay._transform.RewriteDeviceAnnotation").set_body_typed(RewriteAnnotatedOps); +Pass AddDeviceCopyOps() { + runtime::TypedPackedFunc pass_func = + [=](Function f, IRModule m, PassContext pc) { + return Downcast(AddDeviceCopyOps(f)); + }; + return CreateFunctionPass(pass_func, 1, "AddDeviceCopyOps", + {tir::StringImmNode::make("InferType")}); +} + +TVM_REGISTER_GLOBAL("relay._transform.AddDeviceCopy") +.set_body_typed(AddDeviceCopyOps); + } // namespace transform } // namespace relay diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 7611e0fcc8b3..386e9885807b 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -86,16 +86,17 @@ class BuiltinLower : public StmtExprMutator { op = stmt.as(); // Get constant allocation bound. 
int64_t nbytes = GetVectorBytes(op->dtype); - if (device_type_.defined()) { - if (const auto* dev_type = device_type_.as()) { - if (dev_type->value == kDLCPU) { - int32_t constant_size = op->constant_allocation_size(); - if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { - return stmt; - } - } - } - } + // FIXME(zhanghao): remove special handling for kDLCPU + // if (device_type_.defined()) { + // if (arith::GetConst(device_type_, &dev_type)) { + // if (dev_type == kDLCPU) { + // int32_t constant_size = op->constant_allocation_size(); + // if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { + // return stmt; + // } + // } + // } + // } PrimExpr total_bytes = make_const(op->extents[0].dtype(), nbytes); for (size_t i = 0; i < op->extents.size(); ++i) { total_bytes = total_bytes * op->extents[i]; diff --git a/vta.resnet18_v1.log-manual-formatv0_2 b/vta.resnet18_v1.log-manual-formatv0_2 new file mode 100644 index 000000000000..7b3c9d61a318 --- /dev/null +++ b/vta.resnet18_v1.log-manual-formatv0_2 @@ -0,0 +1,10 @@ +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [16, 8, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 131, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 8]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.0014505], 0, 1.328160047531128, 1578987870.726089], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 16, 7, 7, 1, 32], "int8"], ["TENSOR", [16, 16, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 163, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 8]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.002734464], 0, 1.7085223197937012, 1578988000.5012062], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [8, 4, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 302, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.0008805], 0, 1.2376818656921387, 1578988097.9650147], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [8, 8, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 143, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 14]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.001309522], 0, 1.3671045303344727, 1578988174.358436], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu 
-model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [4, 2, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 177, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.00079938], 0, 1.1500802040100098, 1578988361.3194962], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [4, 4, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 681, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 14]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 1]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 2]]}, "result": [[0.001198882], 0, 1.2445652484893799, 1578988503.2178001], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [2, 2, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 570, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 4]], ["tile_w", "sp", [-1, 56]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 2]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 2]]}, "result": [[0.001230756], 0, 1.4033727645874023, 1578988610.0491438], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [4, 2, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 176, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.000339938], 0, 1.025542974472046, 1578988875.3407557], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [8, 4, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 299, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.000387532], 0, 1.095754861831665, 1578988972.0000997], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [16, 8, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 67, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 16]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.000294566], 0, 0.9454472064971924, 1578989137.6281488], 
"version": 0.2, "tvm_version": "0.7.dev0"} diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index f6b22ce67ce5..0aff565cdf2e 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -21,6 +21,7 @@ from tvm import relay from tvm.relay import op, transform from tvm.relay import ExprMutator +from tvm.contrib.util import eprint def run_opt_pass(expr, opt_pass): """Exectue a relay pass.""" @@ -174,6 +175,95 @@ def _operator_idx_inc(expr, count_meta, operator_current_idx): operator_current_idx = operator_current_idx + 1 return operator_current_idx + +class ExprDeviceAnnot(ExprMutator): + """Visitor to perform graph annotation on an AST. + + Parameters + ---------- + start: int + the start location to mark run on vta (inclusive) + end: int + the end location to mark run on vta (exclusive) + + Returns + --------- + None + """ + def __init__(self, start=-1, end=-1): + self.ext_ctx = tvm.context("ext_dev") + self.cpu_ctx = tvm.context("cpu") + self.counter = -1 + self.start = start + self.end = end + super().__init__() + + def visit_call(self, call): + """ Visit the children. """ + # First visit the children. + oshape = _get_tensor_shape(call) + odtype = _get_tensor_type(call) + input_types = [arg.checked_type for arg in call.args] + args = [self.visit(arg) for arg in call.args] + + self.counter += 1 + if self.counter == self.start: + ret = relay.Call(call.op, args, call.attrs) + ret = relay.annotation.on_device(ret, self.ext_ctx) + eprint("add on_device {}: {}".format("ext", ret)) + return ret + elif self.counter == self.end: + ret = relay.Call(call.op, args, call.attrs) + ret = relay.annotation.on_device(ret, self.cpu_ctx) + eprint("add on_device {}: {}".format("cpu", ret)) + return ret + +# if call.op == self.global_avg_pool2d: +# eprint("graphpack call = ", call) +# eprint("graphpack call annot relu, ", args[0]) +# ret = relay.Call(call.op, args, call.attrs) +# ret = relay.annotation.on_device(ret, self.cpu_ctx) +# return ret +# +# if call.op == self.conv2d and odtype == 'int32': +# if not self.first_conv2d: +# ret = relay.Call(call.op, args, call.attrs) +# ret = relay.annotation.on_device(ret, self.ext_ctx) +# eprint("graphpack call conv2d", type(ret.op), ret.op, type(ret), ret) +# self.first_conv2d = True +# return ret + + return relay.Call( + self.visit(call.op), + args, + call.attrs) + + +class ExprLocater(ExprMutator): + """Visitor to locate op on an AST. + """ + def __init__(self): + self.counter = -1 + self.op2nodes = {} + super().__init__() + + def visit_call(self, call): + """ Visit the children. """ + # First visit the children. + args = [self.visit(arg) for arg in call.args] + + self.counter += 1 + if call.op in self.op2nodes: + self.op2nodes[call.op].append(self.counter) + else: + self.op2nodes[call.op] = [self.counter] + + return relay.Call( + self.visit(call.op), + args, + call.attrs) + + class ExprPack(ExprMutator): """Visitor to perform graph packing on an AST. 
""" @@ -468,4 +558,28 @@ def graph_pack(expr, weight_bits) expr = packer.visit(expr) assert not packer.start_pack - return run_opt_pass(expr, transform.InferType()) + expr = run_opt_pass(expr, transform.InferType()) + + expr_locator = ExprLocater() + expr_locator.visit(expr) + + # from the second conv2d to the global_avg_pool2d, all will run on vta + conv2d = op.op.get("nn.conv2d") + avg_pool2d = op.op.get("nn.global_avg_pool2d") + start = expr_locator.op2nodes[conv2d][1] + # preceeding the nn.global_avg_pool2d, it will look like this + # + # %310 = annotation.stop_fusion(%309) /* ty=Tensor[(1, 16, 7, 7, 1, 32), int8] */; + # %311 = cast(%310, dtype="int32") /* ty=Tensor[(1, 16, 7, 7, 1, 32), int32] */; + # %312 = transpose(%311, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 16, 32, 7, 7), int32] */; + # %313 = reshape(%312, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */; + # %314 = nn.global_avg_pool2d(%313) /* ty=Tensor[(1, 512, 1, 1), int32] */; + # + # we mark the preceeding three ops also on cpu device + end = expr_locator.op2nodes[avg_pool2d][0] - 3 + + device_annot = ExprDeviceAnnot(start=start, end=end) + expr = device_annot.visit(expr) + ret = run_opt_pass(expr, transform.InferType()) + + return ret diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 314eb46fcf56..c5c8ba44a0a2 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -36,6 +36,7 @@ #include #include #include +#include namespace vta { @@ -101,6 +102,8 @@ struct DataBuffer { DataBuffer* buffer = new DataBuffer(); buffer->data_ = data; buffer->phy_addr_ = VTAMemGetPhyAddr(data); + + allocated_.insert(buffer); return buffer; } /*! @@ -108,6 +111,7 @@ struct DataBuffer { * \param buffer The buffer to be freed. */ static void Free(DataBuffer* buffer) { + allocated_.erase(buffer); VTAMemFree(buffer->data_); delete buffer; } @@ -117,7 +121,12 @@ struct DataBuffer { * \return The corresponding data buffer header. */ static DataBuffer* FromHandle(const void* buffer) { - return const_cast(reinterpret_cast(buffer)); + if (allocated_.count(buffer)) { + return const_cast( + reinterpret_cast(buffer)); + } else { + return nullptr; + } } private: @@ -125,8 +134,13 @@ struct DataBuffer { void* data_; /*! \brief The physical address of the buffer, excluding header. */ vta_phy_addr_t phy_addr_; + + static std::set allocated_; }; +// init static member +std::set DataBuffer::allocated_; + /*! * \brief Micro op kernel. * Contains functions to construct the kernel with prefix Push. 
@@ -1207,10 +1221,12 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (kind_mask & 2) { from_buffer = vta::DataBuffer::FromHandle(from); from = from_buffer->virt_addr(); + // LOG(WARNING) << "BufferCopy from " << from << ", from_offset " << from_offset << ", size = " << size; } if (kind_mask & 1) { to_buffer = vta::DataBuffer::FromHandle(to); to = to_buffer->virt_addr(); + // LOG(WARNING) << "BufferCopy to " << to << ", to_offset " << to_offset << ", size = " << size; } if (from_buffer) { @@ -1234,8 +1250,15 @@ void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) { static_cast(cmd)->SetDebugFlag(debug_flag); } +// TODO(zhanghao): now we do the check here +// it would be better to do the check in ir_pass before adding the "VTABufferCPUPtr" void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) { - return vta::DataBuffer::FromHandle(buffer)->virt_addr(); + auto data_buf = vta::DataBuffer::FromHandle(buffer); + if (data_buf) { + return data_buf->virt_addr(); + } else { // it is a raw ptr allocated by CPU + return buffer; + } } void VTAWriteBarrier(VTACommandHandle cmd, void* buffer, uint32_t elem_bits, uint32_t start, diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 3a367851ed25..56abe6f70b76 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -59,6 +59,7 @@ import vta from vta.testing import simulator from vta.top import graph_pack +from tvm.contrib.util import eprint # Make sure that TVM was compiled with RPC=1 assert tvm.runtime.enabled("rpc") @@ -75,6 +76,11 @@ # or ``device=vta`` to run inference on the FPGA. device = "vta" target = env.target if device == "vta" else env.target_vta_cpu +# multiple targets to run both on cpu and vta +targets = { + "cpu": env.target_vta_cpu, + "ext_dev": env.target +} # Dictionary lookup for when to start/end bit packing pack_dict = { @@ -130,7 +136,8 @@ remote = rpc.LocalSession() # Get execution context from remote -ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +ctxes = [remote.ext_dev(0), remote.cpu(0)] ###################################################################### # Build the inference graph runtime @@ -149,7 +156,8 @@ # # Load pre-configured AutoTVM schedules -with autotvm.tophub.context(target): +log_file = "%s.%s.log-manual-formatv0_2" % (device, model) +with autotvm.tophub.context(target, extra_files=[log_file]): # Populate the shape and data type dictionary for ImageNet classifier input dtype_dict = {"data": 'float32'} @@ -163,6 +171,7 @@ # Start front end compilation mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + eprint("from_mxnet mod = ", mod) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -175,6 +184,7 @@ with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): mod = relay.quantize.quantize(mod, params=params) + eprint("done quantize", mod) # Perform graph packing and constant folding for VTA target assert env.BLOCK_IN == env.BLOCK_OUT relay_prog = graph_pack( @@ -184,6 +194,7 @@ env.WGT_WIDTH, start_name=pack_dict[model][0], stop_name=pack_dict[model][1]) + eprint("done graphpack ", relay_prog) else: relay_prog = mod["main"] @@ -196,7 +207,7 @@ else: with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( - relay_prog, target=target, + relay_prog, 
target=targets, params=params, target_host=env.target_host) # Measure Relay build time @@ -210,7 +221,7 @@ lib = remote.load_module("graphlib.o") # Graph runtime - m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, ctxes) ###################################################################### # Perform image classification inference @@ -245,10 +256,10 @@ m.set_input('data', image) # Perform inference and gather execution statistics -# More on: :py:method:`tvm.runtime.Module.time_evaluator` +# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator num = 4 # number of times we run module for a single measurement rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) +timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() From ef153e253bacb2443fd092694b759f5f5aa1cfbd Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Tue, 24 Mar 2020 14:19:10 +0800 Subject: [PATCH 05/44] bugfix for AddDeviceCopy pass; add Mul for vta simulation --- include/tvm/relay/attrs/device_copy.h | 1 + python/tvm/relay/op/_tensor.py | 5 ++++- python/tvm/relay/quantize/_partition.py | 4 +++- src/relay/transforms/device_annotation.cc | 10 ++++++---- vta/python/vta/environment.py | 2 +- vta/runtime/runtime.cc | 2 ++ 6 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/tvm/relay/attrs/device_copy.h b/include/tvm/relay/attrs/device_copy.h index 7da92b3ff763..c4a60c827048 100644 --- a/include/tvm/relay/attrs/device_copy.h +++ b/include/tvm/relay/attrs/device_copy.h @@ -37,6 +37,7 @@ namespace relay { struct DeviceCopyAttrs : public tvm::AttrsNode { int dst_dev_type; int src_dev_type; + bool used_for_propagate = true; TVM_DECLARE_ATTRS(DeviceCopyAttrs, "relay.attrs.DeviceCopyAttrs") { TVM_ATTR_FIELD(src_dev_type) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 44d0a60227d6..4f409ff4538f 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -82,7 +82,10 @@ register_broadcast_schedule("isinf") register_injective_schedule("maximum") register_injective_schedule("minimum") -register_injective_schedule("right_shift") +# NOTE(zhanghao): use customized add schedule +# TODO(zhanghao): change the schedule name +register_add_schedule("right_shift") +# register_injective_schedule("right_shift") register_injective_schedule("left_shift") register_injective_schedule("shape_of") register_injective_schedule("ndarray_size") diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index 6ff2a8be0b4a..315986d55607 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -21,6 +21,7 @@ from .. import analysis as _analysis from . import _quantize from .quantize import _forward_op +from tvm.contrib.util import eprint def register_partition_function(op_name, frewrite=None, level=10): return tvm.ir.register_op_attr(op_name, "FQPartitionRewrite", frewrite, level) @@ -81,7 +82,7 @@ def add_partition_generic(ref_call, new_args, ctx): # ... lhs = new_args[0].realize() rhs = new_args[1].realize() - return _forward_op(ref_call, [lhs, rhs]) + return QPartitionExpr(_forward_op(ref_call, [lhs, rhs])) if not lhs_cond and rhs_cond: # - introduced by residual connection in ResNet # ... 
@@ -141,6 +142,7 @@ def multiply_partition_function(ref_call, new_args, ctx): rhs_cond, rhs = partition_expr_check(new_args[1]) if lhs_cond: # introduced by bn: multiply(out, scale) + lhs = new_args[0].realize() return QPartitionExpr(_forward_op(ref_call, [lhs, rhs])) assert (not lhs_cond) and (not rhs_cond) return None diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 5a87b06b0540..2d53751665da 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -82,7 +82,7 @@ class ValidateAnnotation : private ExprVisitor { CHECK_EQ(call_node->args.size(), 1U); const auto* node = call_node->args[0].operator->(); - // LOG(WARNING) << "annotated node, device_type = " << device_type << " : " << GetRef(node); + // LOG(WARNING) << "annotated node, device_type = " << device_type << " : " << GetRef(node).as()->op; if (annotation_map_.count(node)) { CHECK_EQ(annotation_map_.at(node), device_type) << "An expression node can only be annotated to one device."; @@ -245,7 +245,7 @@ class RewriteAnnotation : public ExprMutator { CHECK(dit != annotation_map_.end()) << "Device copy op is not required when both src and dst ops are not " "annotated."; - // LOG(WARNING) << "Create device copy " << fallback_device_ << " to " << dit->second << ": " << src; + // LOG(WARNING) << "Create device copy " << fallback_device_ << " to " << dit->second << ": " << src.as()->op; return CreateDeviceCopy(src, fallback_device_, dit->second); } else { const auto dit = annotation_map_.find(dst); @@ -552,11 +552,13 @@ class AddDeviceCopy : public ExprMutator { int src_dev_type = device_map_.count(arg) ? device_map_[arg]->value : 1; int dst_dev_type = device_map_.count(call_expr) ? device_map_[call_expr]->value : 1; if (!src_is_copy_node && !dst_is_copy_node && src_dev_type != dst_dev_type) { - // LOG(WARNING) << "Not consistent device type, src = " << src_dev_type << ":" << arg; - // LOG(WARNING) << "Not consistent device type, dst = " << dst_dev_type << ":" << call_expr; + // auto arg_call = arg.as(); + // LOG(WARNING) << "Not consistent device type, src = " << src_dev_type << ":" << (arg_call ? 
arg_call->op : arg); + // LOG(WARNING) << "Not consistent device type, dst = " << dst_dev_type << ":" << call_node->op; auto attrs = make_object(); attrs->src_dev_type = src_dev_type; attrs->dst_dev_type = dst_dev_type; + attrs->used_for_propagate = false; static const Op& op = Op::Get("device_copy"); Call device_copy = CallNode::make(op, {this->Mutate(arg)}, Attrs(attrs), {}); device_copy->checked_type_ = arg->checked_type_; diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 9f82d65f1d4e..3aa63cbb3415 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -68,7 +68,7 @@ class DevContext(object): ALU_OPCODE_MAX = 1 ALU_OPCODE_ADD = 2 ALU_OPCODE_SHR = 3 - # ALU_OPCODE_CAST = 4 + ALU_OPCODE_MUL = 4 # Task queue id (pipeline stage) QID_LOAD_INP = 1 QID_LOAD_WGT = 1 diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index c5c8ba44a0a2..4ebf7bdab450 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -644,6 +644,8 @@ class InsnQueue : public BaseQueue { } } else if (opcode == VTA_ALU_OPCODE_SHR) { return (std::string("shr ") + std::to_string(imm)); + } else if (opcode == VTA_ALU_OPCODE_MUL) { + return "mul"; } return "unknown op"; From 87461d10973b0643f29a87f3717b85689ada2052 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 26 Mar 2020 16:21:35 +0800 Subject: [PATCH 06/44] intelfocl support in samples --- vta/python/vta/testing/util.py | 2 +- vta/python/vta/top/vta_conv2d.py | 19 ++++++++++++++++--- .../integration/test_benchmark_topi_conv2d.py | 6 +++--- .../frontend/deploy_classification.py | 9 +++++++-- vta/tutorials/vta_get_started.py | 2 +- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index afbf00ddac8c..83da2e157164 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -32,7 +32,7 @@ def run(run_func): """ env = get_env() - if env.TARGET in ["sim", "tsim"]: + if env.TARGET in ["sim", "tsim", "intelfocl"]: # Talk to local RPC if necessary to debug RPC server. # Compile vta on your host with make at the root. # Make sure TARGET is set to "sim" in the config.json file. 
diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 44430b9123c7..c87a89ecfe80 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -235,6 +235,19 @@ def _traverse(op): op = output.op _traverse(op) + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + + x_co_max = topi.util.get_const_int(x_bo.dom.extent) + x_i_max = topi.util.get_const_int(x_i.dom.extent) + x_j_max = topi.util.get_const_int(x_j.dom.extent) + + # TODO(zhanghao): auto-tune + x_co0, x_co1 = s[output].split(x_co, factor=1) + x_i0, x_i1 = s[output].split(x_i, factor=min(28, x_i_max)) + x_j0, x_j1 = s[output].split(x_j, factor=min(14, x_j_max)) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + # only put the int-related ops to vta if "int" in output.dtype: env = get_env() @@ -242,7 +255,7 @@ def _traverse(op): eprint("add ewise_ops ", eo) s[eo].set_scope(env.acc_scope) s[eo].pragma(s[eo].op.axis[0], env.alu) - s[eo].compute_at(s[output], s[output].op.axis[-2]) + s[eo].compute_at(s[output], store_pt) # cache read input cache_read_ewise = [] @@ -253,11 +266,11 @@ def _traverse(op): for tensor in cache_read_ewise: s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - s[tensor].compute_at(s[output], s[output].op.axis[-2]) + s[tensor].compute_at(s[output], store_pt) for op in const_ops: s[op].compute_inline() - s[output].pragma(s[output].op.axis[-1], env.dma_copy) + s[output].pragma(x_co1, env.dma_copy) return s diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index b3c36e85d56b..ea6b9cf1e9da 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -240,18 +240,18 @@ def test_conv2d(device): def _run(env, remote): if device == "vta": target = env.target - if env.TARGET not in ["sim", "tsim"]: + if env.TARGET not in ["sim", "tsim", "intelfocl"]: assert tvm.runtime.enabled("rpc") program_fpga(remote, bitstream=None) reconfig_runtime(remote) elif device == "arm_cpu": target = env.target_vta_cpu - with autotvm.tophub.context(target): # load pre-tuned schedule parameters + with autotvm.tophub.context(target, extra_files = ['vta.resnet18_v1.log-manual-formatv0_2']): # load pre-tuned schedule parameters for _, wl in resnet_wkls: print(wl) run_conv2d(env, remote, wl, target) vta.testing.run(_run) if __name__ == "__main__": - test_conv2d(device="arm_cpu") + # test_conv2d(device="arm_cpu") test_conv2d(device="vta") diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 56abe6f70b76..30fe7f2b0b06 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -105,7 +105,7 @@ # When target is 'pynq', reconfigure FPGA and runtime. # Otherwise, if target is 'sim', execute locally. -if env.TARGET not in ["sim", "tsim"]: +if env.TARGET not in ["sim", "tsim", "intelfocl"]: # Get remote from tracker node if environment variable is set. # To set up the tracker, you'll need to follow the "Auto-tuning @@ -127,7 +127,12 @@ # by passing the path to the bitstream file instead of None. 
reconfig_start = time.time() vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) + # vta.program_fpga(remote, bitstream=None) + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.program_fpga(remote, bitstream) + reconfig_time = time.time() - reconfig_start print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) diff --git a/vta/tutorials/vta_get_started.py b/vta/tutorials/vta_get_started.py index ab416874b71b..8ac7307f5a05 100644 --- a/vta/tutorials/vta_get_started.py +++ b/vta/tutorials/vta_get_started.py @@ -91,7 +91,7 @@ vta.program_fpga(remote, bitstream=None) # In simulation mode, host the RPC server locally. -elif env.TARGET == "sim": +elif env.TARGET in ("sim", "tsim", "intelfocl"): remote = rpc.LocalSession() ###################################################################### From bd79e8398ee33ae4eac6837d7362040519a8f873 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Fri, 3 Apr 2020 12:04:24 +0800 Subject: [PATCH 07/44] sync all insts and uops in one batch --- src/relay/backend/graph_plan_memory.cc | 57 ++++++------ vta/python/vta/environment.py | 2 +- vta/runtime/runtime.cc | 118 +++++++++++++++++++++++-- vta/runtime/runtime.h | 2 +- 4 files changed, 140 insertions(+), 39 deletions(-) diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 820e17f8a498..8ebf9847c3a7 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -309,34 +309,35 @@ class StorageAllocator : public StorageAllocaBaseVisitor { if (match_range_ == 0) { return this->Alloc(prototype, size); } - auto begin = free_.lower_bound(size / match_range_); - auto mid = free_.lower_bound(size); - auto end = free_.upper_bound(size * match_range_); - // search for memory blocks larger than requested - for (auto it = mid; it != end; ++it) { - StorageToken* tok = it->second; - if (tok->device_type != prototype->device_type) continue; - CHECK_EQ(tok->ref_counter, 0); - // Use exect matching strategy - tok->max_bytes = std::max(size, tok->max_bytes); - tok->ref_counter = prototype->ref_counter; - // find a exact match, erase from map and return - free_.erase(it); - return tok; - } - // then search for memory blocks smaller than requested space - for (auto it = mid; it != begin;) { - --it; - StorageToken* tok = it->second; - if (tok->device_type != prototype->device_type) continue; - CHECK_EQ(tok->ref_counter, 0); - // Use exect matching strategy - tok->max_bytes = std::max(size, tok->max_bytes); - tok->ref_counter = prototype->ref_counter; - // erase from map and return - free_.erase(it); - return tok; - } + // TODO(zhanghao): to avoid overwrite shared storage when we copy all the instructions in a single batch + // auto begin = free_.lower_bound(size / match_range_); + // auto mid = free_.lower_bound(size); + // auto end = free_.upper_bound(size * match_range_); + // // search for memory blocks larger than requested + // for (auto it = mid; it != end; ++it) { + // StorageToken *tok = it->second; + // if (tok->device_type != prototype->device_type) continue; + // CHECK_EQ(tok->ref_counter, 0); + // // Use exect matching strategy + // tok->max_bytes = std::max(size, tok->max_bytes); + // tok->ref_counter = prototype->ref_counter; + // // find a exact match, erase from map and return + // free_.erase(it); + // return tok; + // } + // // then search for memory blocks smaller than requested space + // for (auto it = mid; it != begin;) { + 
// --it; + // StorageToken *tok = it->second; + // if (tok->device_type != prototype->device_type) continue; + // CHECK_EQ(tok->ref_counter, 0); + // // Use exect matching strategy + // tok->max_bytes = std::max(size, tok->max_bytes); + // tok->ref_counter = prototype->ref_counter; + // // erase from map and return + // free_.erase(it); + // return tok; + // } // cannot find anything return a new one. return this->Alloc(prototype, size); } diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 3aa63cbb3415..548dc03aae78 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -297,7 +297,7 @@ def coproc_sync(op): return tvm.tir.call_extern( "int32", "VTASynchronize", get_env().dev.command_handle, - tvm.runtime.const(1<<31, dtype="uint32")) + tvm.runtime.const(1<<31, dtype="uint32"), True) @tvm.register_func("tvm.intrin.rule.default.vta.coproc_dep_push") diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 4ebf7bdab450..3e42727c5bfc 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -37,6 +37,8 @@ #include #include #include +#include +#include namespace vta { @@ -48,6 +50,72 @@ static const bool kBufferCoherent = VTA_COHERENT_ACCESSES; /*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */ static const bool kAlwaysCache = true; +template +class AlignmentAllocator { +public: + typedef T value_type; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + typedef T * pointer; + typedef const T * const_pointer; + + typedef T & reference; + typedef const T & const_reference; + + public: + inline AlignmentAllocator () throw () { } + + template + inline AlignmentAllocator (const AlignmentAllocator &) throw () { } + + inline ~AlignmentAllocator () throw () { } + + inline pointer adress (reference r) { + return &r; + } + + inline const_pointer adress (const_reference r) const { + return &r; + } + + inline pointer allocate (size_type n) { + return (pointer)memalign(N, n*sizeof(value_type)); + } + + inline void deallocate (pointer p, size_type) { + free(p); + } + + inline void construct (pointer p, const value_type & wert) { + new (p) value_type (wert); + } + + inline void destroy (pointer p) { + p->~value_type (); + } + + inline size_type max_size () const throw () { + return size_type (-1) / sizeof (value_type); + } + + template + struct rebind { + typedef AlignmentAllocator other; + }; + + bool operator!=(const AlignmentAllocator& other) const { + return !(*this == other); + } + + // Returns true if and only if storage allocated from *this + // can be deallocated from other, and vice versa. + // Always returns true for stateless allocators. + bool operator==(const AlignmentAllocator& other) const { + return true; + } +}; + /*! * \brief Data buffer represents data on CMA. */ @@ -84,6 +152,7 @@ struct DataBuffer { */ void MemCopyFromHost(void* dst, const void* src, size_t size) { VTAMemCopyFromHost(dst, src, size); + } /*! * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. 
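For context, the following is a usage sketch of the AlignmentAllocator defined above, assuming its two template parameters are the element type and the alignment in bytes (the angle brackets of the template header are stripped in this diff). The next hunk plugs it into the queues' dram_buffer_ so that the DRAM-side staging buffer handed to VTAMemCopyFromHost is 64-byte aligned (illustrative only, not part of the patch):

// Sketch only; assumes AlignmentAllocator<T, N> from above is in scope.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<uint64_t, AlignmentAllocator<uint64_t, 64> > buf;
  buf.resize(1024);  // backing storage comes from memalign(64, ...) in allocate()
  assert(reinterpret_cast<uintptr_t>(buf.data()) % 64 == 0);
  return 0;
}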
@@ -343,7 +412,7 @@ class BaseQueue { // End location of current SRAM write in FIFO mode uint32_t sram_end_{0}; // The buffer in DRAM - std::vector dram_buffer_; + std::vector> dram_buffer_; // FPGA accessible buffer void* fpga_buff_{NULL}; // Physical address of the FPGA buffer @@ -443,13 +512,33 @@ class UopQueue : public BaseQueue { } CHECK(buff_size <= kMaxBytes); // Move kernel contents to FPGA readable buffer + // uint32_t offset = 0; + // for (uint32_t i = 0; i < cache_.size(); ++i) { + // uint32_t ksize = cache_[i]->size() * kElemBytes; + // VTAMemCopyFromHost(static_cast(fpga_buff_) + offset, + // cache_[i]->data(), + // ksize); + // // Update offset + // offset += ksize; + // } + + // merge all the cache entries and do CopyFromHost once + uint32_t total_size = 0; + for (uint32_t i = 0; i < cache_.size(); ++i) { + uint32_t ksize = cache_[i]->size() * kElemBytes; + total_size += ksize; + } + + char *lbuf = (char*)memalign(64, total_size); uint32_t offset = 0; for (uint32_t i = 0; i < cache_.size(); ++i) { uint32_t ksize = cache_[i]->size() * kElemBytes; - VTAMemCopyFromHost(static_cast(fpga_buff_) + offset, cache_[i]->data(), ksize); - // Update offset + memcpy(lbuf + offset, cache_[i]->data(), ksize); offset += ksize; } + VTAMemCopyFromHost(static_cast(fpga_buff_), lbuf, total_size); + free(lbuf); + // Flush if we're using a shared memory system // and if interface is non-coherent if (!coherent_ && always_cache_) { @@ -904,6 +993,8 @@ class InsnQueue : public BaseQueue { int pending_pop_next_[4]; static constexpr int kElemBytes = sizeof(VTAGenericInsn); static constexpr int kMaxElems = kMaxBytes / kElemBytes; + + friend class CommandQueue; }; /*! @@ -1011,7 +1102,16 @@ class CommandQueue { } } - void Synchronize(uint32_t wait_cycles) { + void Synchronize(uint32_t wait_cycles, bool skip=true) { + // FIXME(zhanghao): It is required to use force_serial + // by using skip and sync at the final layer, we can avoid do DeviceCopy every time + if (skip) { + if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { + LOG(ERROR) << "Synchronizing all in one round requires to use force_serial to make things right"; + } + return; + } + // Insert dependences to force serialization if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) { insn_queue_.RewriteForceSerial(); @@ -1223,16 +1323,16 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (kind_mask & 2) { from_buffer = vta::DataBuffer::FromHandle(from); from = from_buffer->virt_addr(); - // LOG(WARNING) << "BufferCopy from " << from << ", from_offset " << from_offset << ", size = " << size; } if (kind_mask & 1) { to_buffer = vta::DataBuffer::FromHandle(to); to = to_buffer->virt_addr(); - // LOG(WARNING) << "BufferCopy to " << to << ", to_offset " << to_offset << ", size = " << size; } if (from_buffer) { // This is an FPGA to host mem transfer + // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly + VTASynchronize(VTATLSCommandHandle(), 1<<31, false); from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, static_cast(from) + from_offset, size); @@ -1323,6 +1423,6 @@ int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) { return 0; } -void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) { - static_cast(cmd)->Synchronize(wait_cycles); -} +void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip) { + static_cast(cmd)-> + Synchronize(wait_cycles, skip); } diff --git a/vta/runtime/runtime.h 
b/vta/runtime/runtime.h index 24ebb8e1247b..360970118144 100644 --- a/vta/runtime/runtime.h +++ b/vta/runtime/runtime.h @@ -251,7 +251,7 @@ TVM_DLL int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid); * \param wait_cycles The limit of poll cycles. * */ -TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles); +TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip=true); #ifdef __cplusplus } From f8eaef9d40436c97005fbf2e43a0a9778d422a35 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 8 Apr 2020 16:11:47 +0800 Subject: [PATCH 08/44] support for static auto-tune --- python/tvm/autotvm/measure/measure_methods.py | 110 +++++- vta/python/vta/top/graphpack.py | 51 +-- vta/runtime/runtime.cc | 369 +++++++++++++----- vta/runtime/runtime.h | 1 + vta/tutorials/autotvm/tune_relay_vta.py | 31 +- 5 files changed, 430 insertions(+), 132 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 7f915132fdc8..43ee291bfdd9 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -32,6 +32,7 @@ import tempfile import numpy as np +import json import tvm._ffi import tvm.ir.transform @@ -47,6 +48,8 @@ from .measure import MeasureResult, MeasureErrorNo, Builder, Runner from .local_executor import LocalExecutor +from tvm.contrib.util import eprint + logger = logging.getLogger('autotvm') class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))): @@ -186,6 +189,16 @@ def __init__(self, timeout=10, n_parallel=None, number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1, check_correctness=False): + static_tune = os.getenv("TVM_STATIC_TUNE") + if static_tune: + if n_parallel is None or n_parallel > 1: + print("static tune only allows n_parallel == 1") + n_parallel = 1 + + if check_correctness == True: + print("static tune does not support check_correctness") + check_correctness = False + super(RPCRunner, self).__init__(timeout, n_parallel) self.key = key @@ -369,7 +382,15 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti measure_input.target.device_name == 'vta': # pylint: disable=import-outside-toplevel import vta - func = vta.build(s, args, target_host=task.target_host) + + static_tune = os.getenv("TVM_STATIC_TUNE") + if static_tune: + debug_flag = 1 << 6 + else: + debug_flag = 0 + + with vta.build_config(debug_flag=debug_flag): + func = vta.build(s, args, target_host=task.target_host) else: with tvm.ir.transform.PassContext(config=opts): func = build(s, args, target_host=task.target_host) @@ -419,6 +440,63 @@ def _wrapped(measure_input, tmp_dir, **kwargs): return _wrapped +def cal_cost(insn): + """ + Cal the runtime cost statically + + Parameters + ------------ + insn: the insn (json) + + Returns + ------------ + the cost in s + """ + def alu_imm_cost(outer, inner, uops): + return 0.00001 + + def alu_cost(outer, inner, uops): + return 0.00001 + + def gemm_cost(outer, inner, uops): + return 0.00001 + + def load_inp_cost(y, x): + return 0.00001 + + def load_uop_cost(y, x): + return 0.00001 + + def load_wgt_cost(y, x): + return 0.00001 + + def store_cost(y, x): + return 0.00001 + + if insn['type'] == "ALU": + return alu_cost(insn['outer_loop'], insn['inner_loop'], + insn['range'][1] - insn['range'][0]) + elif insn['type'] == "ALU IMM": + return alu_imm_cost(insn['outer_loop'], insn['inner_loop'], + insn['range'][1] - insn['range'][0]) + elif insn['type'] == "GEMM": + 
return gemm_cost(insn['outer_loop'], insn['inner_loop'], + insn['range'][1] - insn['range'][0]) + elif insn['name'] == "LOAD INP": + return load_inp_cost(insn['y'][0], insn['x'][0]) + elif insn['name'] == "LOAD WGT": + return load_wgt_cost(insn['y'][0], insn['x'][0]) + elif insn['name'] == "LOAD UOP": + return load_uop_cost(insn['y'][0], insn['x'][0]) + elif insn['type'] == "STORE": + return store_cost(insn['y'][0], insn['x'][0]) + elif insn['type'] == "NOP": + return 0 + else: + print("Unknown op type: {}".format(insn['type'])) + return 0 + + def run_through_rpc(measure_input, build_result, number, repeat, min_repeat_ms, cooldown_interval, remote_args, ref_input=None, ref_output=None): @@ -460,6 +538,7 @@ def run_through_rpc(measure_input, build_result, tic = time.time() errno = MeasureErrorNo.NO_ERROR + static_tune = os.getenv("TVM_STATIC_TUNE") try: # upload built module remote = request_remote(*remote_args) @@ -474,8 +553,6 @@ def run_through_rpc(measure_input, build_result, remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) - time_f = func.time_evaluator( - func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) # set input if ref_input: @@ -487,12 +564,25 @@ def run_through_rpc(measure_input, build_result, args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() - costs = time_f(*args).results + if static_tune is None: + time_f = func.time_evaluator( + func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) + costs = time_f(*args).results + + # clean up remote files + remote.remove(build_result.filename) + remote.remove(os.path.splitext(build_result.filename)[0] + '.so') + remote.remove('') + else: + func(*args) + cost = 0 + insn_dump = os.getenv('TVM_INSN_DUMP', "insn.dump") + with open(insn_dump) as json_file: + insns = json.load(json_file) + for insn in insns: + cost += cal_cost(insn) - # clean up remote files - remote.remove(build_result.filename) - remote.remove(os.path.splitext(build_result.filename)[0] + '.so') - remote.remove('') + costs = [cost] * repeat if len(costs) > 2: # remove largest and smallest value to reduce variance costs = list(costs) @@ -540,6 +630,10 @@ def request_remote(device_key, host=None, port=None, priority=1, timeout=60): ------ session: RPCSession """ + static_tune = os.getenv("TVM_STATIC_TUNE") + if static_tune: + return _rpc.LocalSession() + # connect to the tracker host = host or os.environ['TVM_TRACKER_HOST'] port = port or int(os.environ['TVM_TRACKER_PORT']) diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 0aff565cdf2e..cdfd3c4281e2 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -507,7 +507,7 @@ def graph_pack(expr, stop_name="nn.global_avg_pool2d", start_name_idx=None, stop_name_idx=None, - count_meta=False): + count_meta=False, device_annot=True): """Pack the graph into batch&channel packed format. 
Parameters @@ -560,26 +560,29 @@ def graph_pack(expr, assert not packer.start_pack expr = run_opt_pass(expr, transform.InferType()) - expr_locator = ExprLocater() - expr_locator.visit(expr) - - # from the second conv2d to the global_avg_pool2d, all will run on vta - conv2d = op.op.get("nn.conv2d") - avg_pool2d = op.op.get("nn.global_avg_pool2d") - start = expr_locator.op2nodes[conv2d][1] - # preceeding the nn.global_avg_pool2d, it will look like this - # - # %310 = annotation.stop_fusion(%309) /* ty=Tensor[(1, 16, 7, 7, 1, 32), int8] */; - # %311 = cast(%310, dtype="int32") /* ty=Tensor[(1, 16, 7, 7, 1, 32), int32] */; - # %312 = transpose(%311, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 16, 32, 7, 7), int32] */; - # %313 = reshape(%312, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */; - # %314 = nn.global_avg_pool2d(%313) /* ty=Tensor[(1, 512, 1, 1), int32] */; - # - # we mark the preceeding three ops also on cpu device - end = expr_locator.op2nodes[avg_pool2d][0] - 3 - - device_annot = ExprDeviceAnnot(start=start, end=end) - expr = device_annot.visit(expr) - ret = run_opt_pass(expr, transform.InferType()) - - return ret + if device_annot: + expr_locator = ExprLocater() + expr_locator.visit(expr) + + # from the second conv2d to the global_avg_pool2d, all will run on vta + conv2d = op.op.get("nn.conv2d") + avg_pool2d = op.op.get("nn.global_avg_pool2d") + start = expr_locator.op2nodes[conv2d][1] + # preceeding the nn.global_avg_pool2d, it will look like this + # + # %310 = annotation.stop_fusion(%309) /* ty=Tensor[(1, 16, 7, 7, 1, 32), int8] */; + # %311 = cast(%310, dtype="int32") /* ty=Tensor[(1, 16, 7, 7, 1, 32), int32] */; + # %312 = transpose(%311, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 16, 32, 7, 7), int32] */; + # %313 = reshape(%312, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */; + # %314 = nn.global_avg_pool2d(%313) /* ty=Tensor[(1, 512, 1, 1), int32] */; + # + # we mark the preceeding three ops also on cpu device + end = expr_locator.op2nodes[avg_pool2d][0] - 3 + + device_annot = ExprDeviceAnnot(start=start, end=end) + expr = device_annot.visit(expr) + ret = run_opt_pass(expr, transform.InferType()) + + return ret + else: + return expr diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 3e42727c5bfc..911aac301ae6 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -40,6 +40,10 @@ #include #include +#include +#include +#include + namespace vta { // Avoid bad configurations. @@ -151,8 +155,12 @@ struct DataBuffer { * Bytes. */ void MemCopyFromHost(void* dst, const void* src, size_t size) { + // struct timespec start, stop; + // clock_gettime(CLOCK_REALTIME, &start); VTAMemCopyFromHost(dst, src, size); - + // clock_gettime(CLOCK_REALTIME, &stop); + // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; + // LOG(WARNING) << "DataBuffer VTAMemCopyFromHost: " << elapsed << " us"; } /*! * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. 
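For context, the hunks below add GetOpName/GetOpcodeName helpers and a JSON mode to DumpInsn, which writes one record per instruction to the file named by the TVM_INSN_DUMP environment variable. A hedged sketch of folding such a dump into the static cost estimate used for auto-tuning above; the direct import of cal_cost is hypothetical, since the patch defines it inline in measure_methods.py (illustrative only, not part of the patch):

# Sketch only.
import json
import os

from vta_cost import cal_cost  # hypothetical import; cal_cost is defined in the measure_methods.py change above

with open(os.getenv("TVM_INSN_DUMP", "insn.dump")) as json_file:
    insns = json.load(json_file)

total_cost = sum(cal_cost(insn) for insn in insns)
print("statically estimated runtime: {:.6f} s".format(total_cost))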
@@ -739,8 +747,79 @@ class InsnQueue : public BaseQueue { return "unknown op"; } + + std::string GetOpName(const union VTAInsn& c) { + switch (c.mem.opcode) { + case VTA_OPCODE_LOAD: + if (c.mem.x_size == 0) { + if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { + return "NOP-COMPUTE-STAGE"; + } else { + return "NOP-MEMORY-STAGE"; + } + } else { + if (c.mem.memory_type == VTA_MEM_ID_UOP) { + return "LOAD UOP"; + } else if (c.mem.memory_type == VTA_MEM_ID_WGT) { + return "LOAD WGT"; + } else if (c.mem.memory_type == VTA_MEM_ID_INP) { + return "LOAD INP"; + } else if (c.mem.memory_type == VTA_MEM_ID_ACC) { + return "LOAD ACC"; + } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8) { + return "LOAD ACC 8"; + } else { + return "LOAD"; + } + } + case VTA_OPCODE_STORE: + if (c.mem.x_size == 0) { + return "NOP-STORE-STAGE"; + } else { + return "STORE"; + } + case VTA_OPCODE_GEMM: + return "GEMM"; + case VTA_OPCODE_ALU: + return "ALU - " + getOpcodeString(c.alu.alu_opcode, c.alu.use_imm, c.alu.imm); + case VTA_OPCODE_FINISH: + return "FINISH"; + default: + return "Not recogonized"; + } + } + + std::string GetOpcodeName(const union VTAInsn& c) { + switch (c.mem.opcode) { + case VTA_OPCODE_LOAD: + if (c.mem.x_size == 0) { + return "NOP"; + } else { + return "LOAD"; + } + case VTA_OPCODE_STORE: + if (c.mem.x_size == 0) { + return "NOP"; + } else { + return "STORE"; + } + case VTA_OPCODE_GEMM: + return "GEMM"; + case VTA_OPCODE_ALU: + if (c.alu.use_imm) { + return "ALU IMM"; + } else { + return "ALU"; + } + case VTA_OPCODE_FINISH: + return "NOP"; + default: + return "Unknown"; + } + } + // Dump instructions in the queue - void DumpInsn() { + void DumpInsn(FILE* out = stderr, bool json=false) { // Keep tabs on dependence queues int l2g_queue = 0; int g2l_queue = 0; @@ -751,98 +830,158 @@ class InsnQueue : public BaseQueue { // Iterate over all instructions int insn_count = count(); const VTAGenericInsn* insn = data(); - printf("There are %u instructions\n", insn_count); + rapidjson::StringBuffer s; + rapidjson::Writer writer(s); + + if (!json) { + fprintf(out, "There are %u instructions\n", insn_count); + } else { + writer.StartArray(); + } + for (int i = 0; i < insn_count; ++i) { // Fetch instruction and decode opcode c.generic = insn[i]; - printf("INSTRUCTION %u: ", i); + if (json) { + writer.StartObject(); + writer.Key("name"); + writer.String(GetOpName(c).c_str()); + + writer.Key("type"); + writer.String(GetOpcodeName(c).c_str()); + + writer.Key("pop_prev"); + writer.Int(c.mem.pop_prev_dep); + writer.Key("pop_next"); + writer.Int(c.mem.pop_next_dep); + writer.Key("push_prev"); + writer.Int(c.mem.push_prev_dep); + writer.Key("push_next"); + writer.Int(c.mem.push_next_dep); + } else { + fprintf(out, "INSTRUCTION %u: ", i); + fprintf(out, "%s\n", GetOpName(c).c_str()); + + fprintf(out, "\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + } + if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { - if (c.mem.x_size == 0) { - if (c.mem.opcode == VTA_OPCODE_STORE) { - printf("NOP-STORE-STAGE\n"); - } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { - printf("NOP-COMPUTE-STAGE\n"); - } else { - printf("NOP-MEMORY-STAGE\n"); - } - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - 
static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - // Count status in queues - if (c.mem.opcode == VTA_OPCODE_STORE) { - CHECK(c.mem.pop_next_dep == false); - CHECK(c.mem.push_next_dep == false); - if (c.mem.pop_prev_dep) g2s_queue--; - if (c.mem.push_prev_dep) s2g_queue++; - } else if (c.mem.opcode == VTA_OPCODE_LOAD && - (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) { - CHECK(c.mem.pop_prev_dep == false); - CHECK(c.mem.push_prev_dep == false); - if (c.mem.pop_next_dep) g2l_queue--; - if (c.mem.push_next_dep) l2g_queue++; - } else { - if (c.mem.pop_prev_dep) l2g_queue--; - if (c.mem.push_prev_dep) g2l_queue++; - if (c.mem.pop_next_dep) s2g_queue--; - if (c.mem.push_next_dep) g2s_queue++; - } - printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); - printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); - continue; - } - // Print instruction field information - if (c.mem.opcode == VTA_OPCODE_LOAD) { - printf("LOAD "); - if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n"); - if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n"); - if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n"); - if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n"); - if (c.mem.memory_type == VTA_MEM_ID_ACC_8) printf("ACC 8\n"); - } - if (c.mem.opcode == VTA_OPCODE_STORE) { - printf("STORE:\n"); + if (json) { + writer.Key("dram"); + writer.Uint64(c.mem.dram_base); + writer.Key("sram"); + writer.Uint64(c.mem.sram_base); + + writer.Key("y"); + writer.StartArray(); + writer.Uint64(c.mem.y_size); + writer.Uint64(c.mem.y_pad_0); + writer.Uint64(c.mem.y_pad_1); + writer.EndArray(); + + writer.Key("x"); + writer.StartArray(); + writer.Uint64(c.mem.x_size); + writer.Uint64(c.mem.x_pad_0); + writer.Uint64(c.mem.x_pad_1); + writer.Uint64(c.mem.x_stride); + writer.EndArray(); + } else { + fprintf(out, "\tDRAM: 0x%08x, SRAM:0x%04x\n", + static_cast(c.mem.dram_base), + static_cast(c.mem.sram_base)); + fprintf(out, "\ty: size=%d, pad=[%d, %d]\n", + static_cast(c.mem.y_size), + static_cast(c.mem.y_pad_0), + static_cast(c.mem.y_pad_1)); + fprintf(out, "\tx: size=%d, stride=%d, pad=[%d, %d]\n", + static_cast(c.mem.x_size), + static_cast(c.mem.x_stride), + static_cast(c.mem.x_pad_0), + static_cast(c.mem.x_pad_1)); } - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", static_cast(c.mem.dram_base), - static_cast(c.mem.sram_base)); - printf("\ty: size=%d, pad=[%d, %d]\n", static_cast(c.mem.y_size), - static_cast(c.mem.y_pad_0), static_cast(c.mem.y_pad_1)); - printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", static_cast(c.mem.x_size), - static_cast(c.mem.x_stride), static_cast(c.mem.x_pad_0), - static_cast(c.mem.x_pad_1)); } else if (c.mem.opcode == VTA_OPCODE_GEMM) { - // Print instruction field information - printf("GEMM\n"); - - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\treset_out: %d\n", static_cast(c.gemm.reset_reg)); - printf("\trange (%d, %d)\n", static_cast(c.gemm.uop_bgn), - static_cast(c.gemm.uop_end)); - printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", - static_cast(c.gemm.iter_out), static_cast(c.gemm.wgt_factor_out), - 
static_cast(c.gemm.src_factor_out), static_cast(c.gemm.dst_factor_out)); - printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", - static_cast(c.gemm.iter_in), static_cast(c.gemm.wgt_factor_in), - static_cast(c.gemm.src_factor_in), static_cast(c.gemm.dst_factor_in)); + if (json) { + writer.Key("reset_out"); + writer.Int(c.gemm.reset_reg); + writer.Key("range"); + writer.StartArray(); + writer.Int(c.gemm.uop_bgn); + writer.Int(c.gemm.uop_end); + writer.EndArray(); + + writer.Key("outer_loop"); + writer.StartArray(); + writer.Int(c.gemm.iter_out); + writer.Int(c.gemm.wgt_factor_out), + writer.Int(c.gemm.src_factor_out), + writer.Int(c.gemm.dst_factor_out); + writer.EndArray(); + + writer.Key("inner_loop"); + writer.StartArray(); + writer.Int(c.gemm.iter_in); + writer.Int(c.gemm.wgt_factor_in), + writer.Int(c.gemm.src_factor_in), + writer.Int(c.gemm.dst_factor_in); + writer.EndArray(); + } else { + fprintf(out, "\treset_out: %d\n", static_cast(c.gemm.reset_reg)); + fprintf(out, "\trange (%d, %d)\n", + static_cast(c.gemm.uop_bgn), + static_cast(c.gemm.uop_end)); + fprintf(out, "\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", + static_cast(c.gemm.iter_out), + static_cast(c.gemm.wgt_factor_out), + static_cast(c.gemm.src_factor_out), + static_cast(c.gemm.dst_factor_out)); + fprintf(out, "\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", + static_cast(c.gemm.iter_in), + static_cast(c.gemm.wgt_factor_in), + static_cast(c.gemm.src_factor_in), + static_cast(c.gemm.dst_factor_in)); + } } else if (c.mem.opcode == VTA_OPCODE_ALU) { - // Print instruction field information - printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm, c.alu.imm).c_str()); - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\treset_out: %d\n", static_cast(c.alu.reset_reg)); - printf("\trange (%d, %d)\n", static_cast(c.alu.uop_bgn), - static_cast(c.alu.uop_end)); - printf("\touter loop - iter: %d, dst: %d, src: %d\n", static_cast(c.alu.iter_out), - static_cast(c.alu.dst_factor_out), static_cast(c.alu.src_factor_out)); - printf("\tinner loop - iter: %d, dst: %d, src: %d\n", static_cast(c.alu.iter_in), - static_cast(c.alu.dst_factor_in), static_cast(c.alu.src_factor_in)); - } else if (c.mem.opcode == VTA_OPCODE_FINISH) { - printf("FINISH\n"); + if (json) { + writer.Key("reset_out"); + writer.Int(c.alu.reset_reg); + writer.Key("range"); + writer.StartArray(); + writer.Int(c.alu.uop_bgn); + writer.Int(c.alu.uop_end); + writer.EndArray(); + + writer.Key("outer_loop"); + writer.StartArray(); + writer.Int(c.alu.iter_out); + writer.Int(c.alu.dst_factor_out), + writer.Int(c.alu.src_factor_out), + writer.EndArray(); + + writer.Key("inner_loop"); + writer.StartArray(); + writer.Int(c.alu.iter_in); + writer.Int(c.alu.dst_factor_in); + writer.Int(c.alu.src_factor_in), + writer.EndArray(); + } else { + fprintf(out, "\treset_out: %d\n", static_cast(c.alu.reset_reg)); + fprintf(out, "\trange (%d, %d)\n", + static_cast(c.alu.uop_bgn), + static_cast(c.alu.uop_end)); + fprintf(out, "\touter loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_out), + static_cast(c.alu.dst_factor_out), + static_cast(c.alu.src_factor_out)); + fprintf(out, "\tinner loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_in), + static_cast(c.alu.dst_factor_in), + static_cast(c.alu.src_factor_in)); + } } // Count status in queues @@ -871,8 
+1010,27 @@ class InsnQueue : public BaseQueue { if (c.gemm.pop_next_dep) s2g_queue--; if (c.gemm.push_next_dep) g2s_queue++; } - printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); - printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); + if (json) { + writer.Key("l2g_queue"); + writer.Int(l2g_queue); + writer.Key("g2l_queue"); + writer.Int(g2l_queue); + writer.Key("s2g_queue"); + writer.Int(s2g_queue); + writer.Key("g2s_queue"); + writer.Int(g2s_queue); + + writer.EndObject(); + } else { + fprintf(out, "\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); + fprintf(out, "\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); + } + } + + if (json) { + writer.EndArray(); + auto str = s.GetString(); + fwrite(str, 1, s.GetSize(), out); } } // Commit all pending pop of corresponding stage @@ -1103,11 +1261,27 @@ class CommandQueue { } void Synchronize(uint32_t wait_cycles, bool skip=true) { + if (debug_flag_ & VTA_DEBUG_AUTO_TUNE) { + const char* insn_file = std::getenv("TVM_INSN_DUMP"); + if (insn_file == nullptr) { + insn_file = "insn.dump"; + } + FILE* out = fopen(insn_file, "w+"); + if (out) { + insn_queue_.DumpInsn(out, true); + fclose(out); + } else { + LOG(ERROR) << insn_file << " open failed"; + } + return; + } + // FIXME(zhanghao): It is required to use force_serial // by using skip and sync at the final layer, we can avoid do DeviceCopy every time if (skip) { if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { - LOG(ERROR) << "Synchronizing all in one round requires to use force_serial to make things right"; + LOG(ERROR) << + "Synchronizing all in one round requires to use force_serial to make things right"; } return; } @@ -1130,8 +1304,18 @@ class CommandQueue { // Check if there are no instruction to execute at all if (insn_queue_.count() == 0) return; // Synchronization for the queues + // struct timespec start, stop; + // clock_gettime(CLOCK_REALTIME, &start); uop_queue_.AutoReadBarrier(); + // clock_gettime(CLOCK_REALTIME, &stop); + // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; + // LOG(WARNING) << "UopQueue VTAMemCopyFromHost: " << elapsed << " us"; + + // clock_gettime(CLOCK_REALTIME, &start); insn_queue_.AutoReadBarrier(); + // clock_gettime(CLOCK_REALTIME, &stop); + // elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; + // LOG(WARNING) << "InsnQueue VTAMemCopyFromHost: " << elapsed << " us"; // Dump instructions if debug enabled if (debug_flag_ & VTA_DEBUG_DUMP_INSN) { insn_queue_.DumpInsn(); @@ -1332,7 +1516,12 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (from_buffer) { // This is an FPGA to host mem transfer // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly + // struct timespec start, stop; + // clock_gettime(CLOCK_REALTIME, &start); VTASynchronize(VTATLSCommandHandle(), 1<<31, false); + // clock_gettime(CLOCK_REALTIME, &stop); + // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; + // LOG(WARNING) << "Final Synchronize: " << elapsed << " us"; from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, static_cast(from) + from_offset, size); diff --git a/vta/runtime/runtime.h b/vta/runtime/runtime.h index 360970118144..22cf15a91503 100644 --- a/vta/runtime/runtime.h +++ b/vta/runtime/runtime.h @@ -41,6 +41,7 @@ extern "C" { #define 
VTA_DEBUG_SKIP_READ_BARRIER (1 << 3) #define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4) #define VTA_DEBUG_FORCE_SERIAL (1 << 5) +#define VTA_DEBUG_AUTO_TUNE (1 << 6) /*! * \brief Allocate data buffer. diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index a92b1ee5d90b..bc819c20d470 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -76,7 +76,7 @@ # Perform vta-specific compilation with Relay from a Gluon model -def compile_network(env, target, model, start_pack, stop_pack): +def compile_network(env, target, model, start_pack, stop_pack, device_annot=False): # Populate the shape and data type dictionary dtype_dict = {"data": 'float32'} @@ -104,7 +104,8 @@ def compile_network(env, target, model, start_pack, stop_pack): env.BLOCK_OUT, env.WGT_WIDTH, start_name=start_pack, - stop_name=stop_pack) + stop_name=stop_pack, + device_annot=device_annot) return relay_prog, params @@ -341,8 +342,11 @@ def tune_and_evaluate(tuning_opt): tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() @@ -382,12 +386,14 @@ def tune_and_evaluate(tuning_opt): # We do not run the tuning in our webpage server since it takes too long. # Comment the following line to run it by yourself. - return + # return # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_opt) + # recompile the programs with device annotations + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack, device_annot=True) # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[log_file]): # Compile network @@ -395,14 +401,18 @@ def tune_and_evaluate(tuning_opt): if target.device_name != "vta": with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build(relay_prog, - target=target, - params=params, - target_host=env.target_host) + target=target, + params=params, + target_host=env.target_host) else: + targets = { + "cpu": env.target_vta_cpu, + "ext_dev": env.target + } with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, - target=target, + target=targets, params=params, target_host=env.target_host) @@ -415,7 +425,8 @@ def tune_and_evaluate(tuning_opt): # Generate the graph runtime ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - m = graph_runtime.create(graph, lib, ctx) + ctxes = [ctx, remote.cpu(0)] + m = graph_runtime.create(graph, lib, ctxes) # upload parameters to device image = tvm.nd.array( From 82cbd4f41b21443fd7bfab04dd96b6d13c55ef98 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 9 Apr 2020 11:07:41 +0800 Subject: [PATCH 09/44] update cost calculation formula --- python/tvm/autotvm/measure/measure_methods.py | 110 +++++++++++++----- 1 file changed, 78 insertions(+), 32 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 43ee291bfdd9..c545fb7aa23c 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -452,46 +452,92 @@ def cal_cost(insn): ------------ the cost in s """ - def 
alu_imm_cost(outer, inner, uops): - return 0.00001 - - def alu_cost(outer, inner, uops): - return 0.00001 - - def gemm_cost(outer, inner, uops): - return 0.00001 - - def load_inp_cost(y, x): - return 0.00001 - - def load_uop_cost(y, x): - return 0.00001 - - def load_wgt_cost(y, x): - return 0.00001 - - def store_cost(y, x): - return 0.00001 + factor = 1000000.0 + def alu_imm_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = x + 46 + return cycles / factor + + def alu_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = 2 * x + 46 + return cycles / factor + + def gemm_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = x + 80 + return cycles / factor + + def load_acc_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = x + 150 + return cycles / factor + + def load_acc8_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 1.2 * x + 150 + return cycles / factor + + def load_inp_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = (x_size + x_pad_0 + x_pad_1) * (y_size + y_pad_0 + y_pad_1) + cycles = 1.1 * x + 150 + return cycles / factor + + def load_uop_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 1.1 * x + 150 + return cycles / factor + + def load_wgt_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 17 * x + 150 + return cycles / factor + + def store_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = x + 150 + return cycles / factor + + def nop_cost(name): + if name == "NOP-COMPUTE-STAGE": + return 38 / factor + elif name == "NOP-MEMORY-STAGE": + return 50 / factor + elif name == "NOP-STORE-STAGE": + return 39 / factor + else: + print("Unknown nop op {}".format(name)) + return 0 if insn['type'] == "ALU": - return alu_cost(insn['outer_loop'], insn['inner_loop'], - insn['range'][1] - insn['range'][0]) + return alu_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) elif insn['type'] == "ALU IMM": - return alu_imm_cost(insn['outer_loop'], insn['inner_loop'], - insn['range'][1] - insn['range'][0]) + return alu_imm_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) elif insn['type'] == "GEMM": - return gemm_cost(insn['outer_loop'], insn['inner_loop'], - insn['range'][1] - insn['range'][0]) + return gemm_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) elif insn['name'] == "LOAD INP": - return load_inp_cost(insn['y'][0], insn['x'][0]) + return load_inp_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) elif insn['name'] == "LOAD WGT": - return load_wgt_cost(insn['y'][0], insn['x'][0]) + return load_wgt_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) elif insn['name'] == "LOAD UOP": - return load_uop_cost(insn['y'][0], insn['x'][0]) - elif insn['type'] == "STORE": - return store_cost(insn['y'][0], insn['x'][0]) + return load_uop_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD ACC": + return load_acc_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD ACC 8": + return load_acc8_cost(insn['y'][0], insn['y'][1], 
insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "STORE": + return store_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) elif insn['type'] == "NOP": - return 0 + return nop_cost(insn['name']) else: print("Unknown op type: {}".format(insn['type'])) return 0 From a810b853d4d17ef974cfede11c47596cf7101d08 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 9 Apr 2020 16:02:38 +0800 Subject: [PATCH 10/44] bugfix for vta add schedule --- vta/python/vta/top/vta_conv2d.py | 80 ++++++++++++++++---------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index c87a89ecfe80..5c856384c605 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -208,48 +208,48 @@ def is_cast_op(op): te.schedule.AutoInlineInjective(s) # s[output].fuse(s[output].op.axis) - ewise_inputs = [] - ewise_ops = [] - const_ops = [] - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - elif not is_cast_op(op): - ewise_ops.append(op) - - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - elif is_cast_op(tensor.op) and not op.same_as(output.op): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - for tensor in op.input_tensors: - if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ - and (not is_cast_op(tensor.op)): - _traverse(tensor.op) - - op = output.op - _traverse(op) - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - - x_co_max = topi.util.get_const_int(x_bo.dom.extent) - x_i_max = topi.util.get_const_int(x_i.dom.extent) - x_j_max = topi.util.get_const_int(x_j.dom.extent) - - # TODO(zhanghao): auto-tune - x_co0, x_co1 = s[output].split(x_co, factor=1) - x_i0, x_i1 = s[output].split(x_i, factor=min(28, x_i_max)) - x_j0, x_j1 = s[output].split(x_j, factor=min(14, x_j_max)) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - # only put the int-related ops to vta if "int" in output.dtype: + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + + x_co_max = topi.util.get_const_int(x_bo.dom.extent) + x_i_max = topi.util.get_const_int(x_i.dom.extent) + x_j_max = topi.util.get_const_int(x_j.dom.extent) + + # TODO(zhanghao): auto-tune + x_co0, x_co1 = s[output].split(x_co, factor=1) + x_i0, x_i1 = s[output].split(x_i, factor=min(28, x_i_max)) + x_j0, x_j1 = s[output].split(x_j, factor=min(14, x_j_max)) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + env = get_env() for eo in ewise_ops: eprint("add ewise_ops ", eo) From 3a8e244f345a707940fd0b827bf7f0c8482ca34b Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Fri, 
10 Apr 2020 12:16:33 +0800 Subject: [PATCH 11/44] bugfix for insn buffer overflow --- vta/runtime/runtime.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 911aac301ae6..39038da00b51 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -1474,12 +1474,12 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: // one pending store, one pending load, and one uop - if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { + if ((insn_queue_.count() + 5) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { this->AutoSync(); } } // Auto sync when instruction overflow - void AutoSync() { this->Synchronize(1 << 31); } + void AutoSync() { this->Synchronize(1 << 31, false); } // Internal debug flag int debug_flag_{0}; From 5c7ead70ee6c47881edeb8e176380c9740f4616b Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 13 Apr 2020 11:13:27 +0800 Subject: [PATCH 12/44] tune vta relay refine --- vta/tutorials/autotvm/tune_relay_vta.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index bc819c20d470..7e537fae9128 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -195,7 +195,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals # The ``start_pack`` and ``stop_pack`` labels indicate where # to start and end the graph packing relay pass: in other words # where to start and finish offloading to VTA. -network = "resnet18_v1" +network = "resnet50_v2" start_pack = "nn.max_pool2d" stop_pack = "nn.global_avg_pool2d" @@ -368,7 +368,7 @@ def tune_and_evaluate(tuning_opt): tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) # We should have extracted 10 convolution tasks - assert len(tasks) == 10 + # assert len(tasks) == 10 print("Extracted {} conv2d tasks:".format(len(tasks))) for tsk in tasks: inp = tsk.args[0][1] @@ -392,10 +392,11 @@ def tune_and_evaluate(tuning_opt): print("Tuning...") tune_tasks(tasks, **tuning_opt) - # recompile the programs with device annotations - relay_prog, params = compile_network(env, target, network, start_pack, stop_pack, device_annot=True) # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[log_file]): + # recompile the programs with device annotations + print("Recompile") + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack, device_annot=True) # Compile network print("Compile...") if target.device_name != "vta": @@ -409,7 +410,7 @@ def tune_and_evaluate(tuning_opt): "cpu": env.target_vta_cpu, "ext_dev": env.target } - with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + with vta.build_config(opt_level=3, debug_flag=32, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, target=targets, From cc96cbbbc4bdf5bddb5d157e949d5c9904cfb0d1 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 13 Apr 2020 13:24:45 +0800 Subject: [PATCH 13/44] separate cost function from general method_methods --- python/tvm/autotvm/measure/measure_methods.py | 115 ++---------------- vta/config/vta_cost.py | 102 ++++++++++++++++ 2 files changed, 114 insertions(+), 103 deletions(-) create mode 100644 vta/config/vta_cost.py diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index c545fb7aa23c..f32725c2e9cc 
100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -33,6 +33,8 @@ import numpy as np import json +import sys +from importlib import import_module import tvm._ffi import tvm.ir.transform @@ -440,109 +442,6 @@ def _wrapped(measure_input, tmp_dir, **kwargs): return _wrapped -def cal_cost(insn): - """ - Cal the runtime cost statically - - Parameters - ------------ - insn: the insn (json) - - Returns - ------------ - the cost in s - """ - factor = 1000000.0 - def alu_imm_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = x + 46 - return cycles / factor - - def alu_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = 2 * x + 46 - return cycles / factor - - def gemm_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = x + 80 - return cycles / factor - - def load_acc_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = x + 150 - return cycles / factor - - def load_acc8_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 1.2 * x + 150 - return cycles / factor - - def load_inp_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = (x_size + x_pad_0 + x_pad_1) * (y_size + y_pad_0 + y_pad_1) - cycles = 1.1 * x + 150 - return cycles / factor - - def load_uop_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 1.1 * x + 150 - return cycles / factor - - def load_wgt_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 17 * x + 150 - return cycles / factor - - def store_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = x + 150 - return cycles / factor - - def nop_cost(name): - if name == "NOP-COMPUTE-STAGE": - return 38 / factor - elif name == "NOP-MEMORY-STAGE": - return 50 / factor - elif name == "NOP-STORE-STAGE": - return 39 / factor - else: - print("Unknown nop op {}".format(name)) - return 0 - - if insn['type'] == "ALU": - return alu_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['type'] == "ALU IMM": - return alu_imm_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['type'] == "GEMM": - return gemm_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['name'] == "LOAD INP": - return load_inp_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD WGT": - return load_wgt_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD UOP": - return load_uop_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD ACC": - return load_acc_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD ACC 8": - return load_acc8_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "STORE": - return store_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['type'] == "NOP": - return nop_cost(insn['name']) - else: - print("Unknown op type: {}".format(insn['type'])) - return 0 - - def 
run_through_rpc(measure_input, build_result, number, repeat, min_repeat_ms, cooldown_interval, remote_args, ref_input=None, ref_output=None): @@ -623,6 +522,12 @@ def run_through_rpc(measure_input, build_result, func(*args) cost = 0 insn_dump = os.getenv('TVM_INSN_DUMP', "insn.dump") + insn_cost_file = os.getenv('TVM_INSN_COST', "cost.py") + path, filename = os.path.split(insn_cost_file) + sys.path.append(path) + module_path = filename[:-3] # remove the .py suffix + module = import_module(module_path) + cal_cost = getattr(module, "cal_cost") with open(insn_dump) as json_file: insns = json.load(json_file) for insn in insns: @@ -649,6 +554,10 @@ def run_through_rpc(measure_input, build_result, msg = msg[:msg.index("CUDA Source")] costs = (RuntimeError(msg[:1024]),) errno = MeasureErrorNo.RUNTIME_DEVICE + except Exception as exc: + costs = (exc,) + errno = MeasureErrorNo.UNKNOWN_ERROR + tstamp = time.time() time.sleep(cooldown_interval) return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp) diff --git a/vta/config/vta_cost.py b/vta/config/vta_cost.py new file mode 100644 index 000000000000..9e1d7389b8c3 --- /dev/null +++ b/vta/config/vta_cost.py @@ -0,0 +1,102 @@ +# cost function for intelfocl 32*32 gemm version +def cal_cost(insn): + """ + Cal the runtime cost statically + + Parameters + ------------ + insn: the insn (json) + + Returns + ------------ + the cost in s + """ + factor = 1000000.0 + def alu_imm_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = x + 46 + return cycles / factor + + def alu_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = 2 * x + 46 + return cycles / factor + + def gemm_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = x + 80 + return cycles / factor + + def load_acc_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = x + 150 + return cycles / factor + + def load_acc8_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 1.2 * x + 150 + return cycles / factor + + def load_inp_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = (x_size + x_pad_0 + x_pad_1) * (y_size + y_pad_0 + y_pad_1) + cycles = 1.1 * x + 150 + return cycles / factor + + def load_uop_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 1.1 * x + 150 + return cycles / factor + + def load_wgt_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 17 * x + 150 + return cycles / factor + + def store_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = x + 150 + return cycles / factor + + def nop_cost(name): + if name == "NOP-COMPUTE-STAGE": + return 38 / factor + elif name == "NOP-MEMORY-STAGE": + return 50 / factor + elif name == "NOP-STORE-STAGE": + return 39 / factor + else: + print("Unknown nop op {}".format(name)) + return 0 + + if insn['type'] == "ALU": + return alu_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) + elif insn['type'] == "ALU IMM": + return alu_imm_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) + elif insn['type'] == "GEMM": + return gemm_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) + elif insn['name'] == "LOAD INP": + return load_inp_cost(insn['y'][0], insn['y'][1], insn['y'][2], + 
insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD WGT": + return load_wgt_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD UOP": + return load_uop_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD ACC": + return load_acc_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD ACC 8": + return load_acc8_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "STORE": + return store_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['type'] == "NOP": + return nop_cost(insn['name']) + else: + print("Unknown op type: {}".format(insn['type'])) + return 0 From d880b3ba6e66f7ea9e5535591571e309747f9247 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 16 Apr 2020 14:35:14 +0800 Subject: [PATCH 14/44] vta mobilenetG prediction script --- python/tvm/relay/testing/mobilenet.py | 50 +++-- vta/tutorials/frontend/deploy_mobilenet.py | 226 +++++++++++++++++++++ 2 files changed, 258 insertions(+), 18 deletions(-) create mode 100644 vta/tutorials/frontend/deploy_mobilenet.py diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py index d5a4d5f1e08f..e83336525ea7 100644 --- a/python/tvm/relay/testing/mobilenet.py +++ b/python/tvm/relay/testing/mobilenet.py @@ -44,20 +44,22 @@ def conv_block(data, name, channels, kernel_size=(3, 3), strides=(1, 1), def separable_conv_block(data, name, depthwise_channels, pointwise_channels, kernel_size=(3, 3), downsample=False, padding=(1, 1), - epsilon=1e-5, layout='NCHW', dtype="float32"): + epsilon=1e-5, layout='NCHW', dtype="float32", depthwise_group_factor=1): """Helper function to get a separable conv block""" if downsample: strides = (2, 2) else: strides = (1, 1) # depthwise convolution + bn + relu - wshape = (depthwise_channels, 1) + kernel_size + wshape = (depthwise_channels, depthwise_group_factor) + kernel_size weight = relay.var(name + "_weight", shape=wshape, dtype=dtype) + depthwise_group_factor = min(depthwise_group_factor, depthwise_channels) + groups = int(depthwise_channels/depthwise_group_factor) conv1 = layers.conv2d( data=data, weight=weight, channels=depthwise_channels, - groups=depthwise_channels, + groups=groups, kernel_size=kernel_size, strides=strides, padding=padding, @@ -82,47 +84,59 @@ def separable_conv_block(data, name, depthwise_channels, pointwise_channels, def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), - dtype='float32', alpha=1.0, is_shallow=False, layout='NCHW'): + dtype='float32', alpha=1.0, is_shallow=False, layout='NCHW', + depthwise_group_factor=1): """Function to construct a MobileNet""" data = relay.var("data", shape=data_shape, dtype=dtype) body = conv_block(data, 'conv_block_1', int(32*alpha), strides=(2, 2), layout=layout) body = separable_conv_block(body, 'separable_conv_block_1', int(32*alpha), int(64*alpha), layout=layout, - dtype=dtype) + dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_2', int(64*alpha), int(128*alpha), downsample=True, - layout=layout, dtype=dtype) + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_3', int(128*alpha), int(128*alpha), layout=layout, - dtype=dtype) + dtype=dtype, 
+ depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_4', int(128*alpha), int(256*alpha), downsample=True, - layout=layout, dtype=dtype) + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_5', int(256*alpha), int(256*alpha), layout=layout, - dtype=dtype) + dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_6', int(256*alpha), int(512*alpha), downsample=True, - layout=layout, dtype=dtype) + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) if is_shallow: body = separable_conv_block(body, 'separable_conv_block_7', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype) + downsample=True, layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_8', int(1024*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype) + downsample=True, layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) else: for i in range(7, 12): body = separable_conv_block(body, 'separable_conv_block_%d' % i, int(512*alpha), int(512*alpha), - layout=layout, dtype=dtype) + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_12', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype) + downsample=True, layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_13', - int(1024*alpha), int(1024*alpha), - layout=layout, dtype=dtype) + int(1024*alpha), int(1024*alpha), + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) pool = relay.nn.global_avg_pool2d(data=body, layout=layout) flatten = relay.nn.batch_flatten(data=pool) weight = relay.var('fc_weight') @@ -134,7 +148,7 @@ def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), - dtype='float32', layout='NCHW'): + dtype='float32', layout='NCHW', depthwise_group_factor=1): """Get benchmark workload for mobilenet Parameters @@ -166,5 +180,5 @@ def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), data_shape = tuple([batch_size] + list(image_shape)) net = mobile_net(num_classes=num_classes, data_shape=data_shape, dtype=dtype, alpha=1.0, is_shallow=False, - layout=layout) + layout=layout, depthwise_group_factor=depthwise_group_factor) return create_workload(net) diff --git a/vta/tutorials/frontend/deploy_mobilenet.py b/vta/tutorials/frontend/deploy_mobilenet.py new file mode 100644 index 000000000000..8a94a588741e --- /dev/null +++ b/vta/tutorials/frontend/deploy_mobilenet.py @@ -0,0 +1,226 @@ +from __future__ import absolute_import, print_function + +import argparse, json, os, requests, sys, time +from io import BytesIO +from os.path import join, isfile +from PIL import Image + +from mxnet.gluon.model_zoo import vision +import numpy as np +from matplotlib import pyplot as plt + +import tvm +from tvm import te +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime +from tvm.relay import transform +import tvm.relay.testing + +import vta +from vta.testing import simulator +from vta.top import graph_pack +from tvm.contrib.util import eprint 
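# Note on the grouped depthwise convolution this tutorial relies on: the
# mobilenet.py change above widens each depthwise group from 1 input channel
# to ``depthwise_group_factor`` input channels, so that each group is wide
# enough to map onto VTA's input tiling (this script passes ``env.BLOCK_IN``
# for that parameter further down). The helper below is an illustrative
# sketch of the shape arithmetic from ``separable_conv_block``; it is not
# part of the patched model builder.
def _grouped_depthwise_shapes(depthwise_channels, depthwise_group_factor,
                              kernel_size=(3, 3)):
    """Return (weight_shape, groups) as computed in separable_conv_block."""
    depthwise_group_factor = min(depthwise_group_factor, depthwise_channels)
    wshape = (depthwise_channels, depthwise_group_factor) + kernel_size
    groups = int(depthwise_channels / depthwise_group_factor)
    return wshape, groups

# Example: 64 depthwise channels with BLOCK_IN == 16 gives weight shape
# (64, 16, 3, 3) and groups == 4, instead of 64 single-channel groups.
assert _grouped_depthwise_shapes(64, 16) == ((64, 16, 3, 3), 4)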
+ +# Make sure that TVM was compiled with RPC=1 +assert tvm.runtime.enabled("rpc") + +###################################################################### +# Define the platform and model targets +# ------------------------------------- +# Execute on CPU vs. VTA, and define the model. + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu +# multiple targets to run both on cpu and vta +targets = { + "cpu": env.target_vta_cpu, + "ext_dev": env.target +} + +model = "mobilenetG" + +###################################################################### +# Obtain an execution remote +# -------------------------- +# When target is 'pynq', reconfigure FPGA and runtime. +# Otherwise, if target is 'sim', execute locally. + +if env.TARGET not in ["sim", "tsim", "intelfocl"]: + + # Get remote from tracker node if environment variable is set. + # To set up the tracker, you'll need to follow the "Auto-tuning + # a convolutional network for VTA" tutorial. + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = os.environ.get("TVM_TRACKER_PORT", None) + # Otherwise if you have a device you want to program directly from + # the host, make sure you've set the variables below to the IP of + # your board. + device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") + device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") + if not tracker_host or not tracker_port: + remote = rpc.connect(device_host, int(device_port)) + else: + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) + + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + reconfig_start = time.time() + vta.reconfig_runtime(remote) + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.program_fpga(remote, bitstream) + + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + +# In simulation mode, host the RPC server locally. 
+else: + remote = rpc.LocalSession() + +# Get execution context from remote +# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +ctxes = [remote.ext_dev(0), remote.cpu(0)] + +# Load pre-configured AutoTVM schedules +with autotvm.tophub.context(target): + + # Populate the shape and data type dictionary for ImageNet classifier input + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # get the mobilenet model + mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32", + depthwise_group_factor=env.BLOCK_IN) + + # Measure build start time + build_start = time.time() + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + if target.device_name == "vta": + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig(global_scale=8.0, + skip_conv_layers=[0]): + mod = relay.quantize.quantize(mod, params=params) + # Perform graph packing and constant folding for VTA target + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name="nn.conv2d", + stop_name="nn.global_avg_pool2d") + else: + relay_prog = mod["main"] + + # Compile Relay program with AlterOpLayout disabled + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name != "vta": + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + else: + with vta.build_config(debug_flag=32): + graph, lib, params = relay.build( + relay_prog, target=targets, + params=params, target_host=env.target_host) + + # Measure Relay build time + build_time = time.time() - build_start + print(model + " inference graph built in {0:.2f}s!".format(build_time)) + + # Graph runtime + m = graph_runtime.create(graph, lib, ctxes) + +###################################################################### +# Perform image classification inference +# -------------------------------------- +# We run classification on an image sample from ImageNet +# We just need to download the categories files, `synset.txt` +# and an input test image. 
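# The preprocessing a few lines below follows the standard ImageNet recipe on
# the 0-255 pixel scale: subtract the per-channel mean, divide by the
# per-channel std, reorder HWC -> NCHW, then repeat the sample ``env.BATCH``
# times to match VTA's batch tiling. The helper below is an illustrative,
# standalone sketch of those same steps and is not called by this script.
import numpy as np  # also imported at the top of this script

def _preprocess_imagenet(img_hwc, batch):
    """img_hwc: (224, 224, 3) RGB array with values in [0, 255]."""
    x = np.array(img_hwc, dtype="float32")
    x -= np.array([123., 117., 104.])       # per-channel mean
    x /= np.array([58.395, 57.12, 57.375])  # per-channel std
    x = x.transpose((2, 0, 1))              # HWC -> CHW
    x = x[np.newaxis, :]                    # add batch axis -> (1, 3, 224, 224)
    return np.repeat(x, batch, axis=0)      # tile to (batch, 3, 224, 224)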
+ +# Download ImageNet categories +categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" +categ_fn = "synset.txt" +download.download(join(categ_url, categ_fn), categ_fn) +synset = eval(open(categ_fn).read()) + +# Download test image +image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' +image_fn = 'cat.png' +download.download(image_url, image_fn) + +# Prepare test image for inference +image = Image.open(image_fn).resize((224, 224)) +plt.imshow(image) +plt.show() +image = np.array(image) - np.array([123., 117., 104.]) +image /= np.array([58.395, 57.12, 57.375]) +image = image.transpose((2, 0, 1)) +image = image[np.newaxis, :] +image = np.repeat(image, env.BATCH, axis=0) + +# Set the network parameters and inputs +m.set_input(**params) +m.set_input('data', image) + +# Perform inference and gather execution statistics +# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator +num = 3 # number of times we run module for a single measurement +rep = 3 # number of measurements (we derive std dev from this) +timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) + +if env.TARGET in ["sim", "tsim"]: + simulator.clear_stats() + timer() + + sim_stats = simulator.stats() + print("\nExecution statistics:") + for k, v in sim_stats.items(): + # Since we execute the workload many times, we need to normalize stats + # Note that there is always one warm up run + # Therefore we divide the overall stats by (num * rep + 1) + print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) +else: + tcost = timer() + std = np.std(tcost.results) * 1000 + mean = tcost.mean * 1000 + print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) + print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) + +# Get classification results +tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) +output = tvm_output.asnumpy() +for b in range(env.BATCH): + top_categories = np.argsort(tvm_output.asnumpy()[b]) + # print("top_categories = ", top_categories) + # Report top-5 classification results + print("\n{} prediction for sample {}".format(model, b)) + print("\t#1:", synset[top_categories[-1]], output[b][top_categories[-1]]) + print("\t#2:", synset[top_categories[-2]], output[b][top_categories[-2]]) + print("\t#3:", synset[top_categories[-3]], output[b][top_categories[-3]]) + print("\t#4:", synset[top_categories[-4]], output[b][top_categories[-4]]) + print("\t#5:", synset[top_categories[-5]], output[b][top_categories[-5]]) + # This just checks that one of the 5 top categories + # is one variety of cat; this is by no means an accurate + # assessment of how quantization affects classification + # accuracy but is meant to catch changes to the + # quantization pass that would accuracy in the CI. 
+ cat_detected = False + for k in top_categories[-5:]: + if "cat" in synset[k]: + cat_detected = True + assert(cat_detected) From f80c3e05adf22f07b22debe988fa1d60e184a89a Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 20 Apr 2020 16:03:01 +0800 Subject: [PATCH 15/44] quickfix for auto-tune segfault --- python/tvm/autotvm/measure/measure_methods.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index f32725c2e9cc..666d307247c1 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -85,6 +85,8 @@ class LocalBuilder(Builder): If is callable, use it as custom build function, expect lib_format field. """ def __init__(self, timeout=10, n_parallel=None, build_func='default'): + # FIXME(zhanghao): quickfix - use single thread. otherwise may cause seg fault + n_parallel = 1 super(LocalBuilder, self).__init__(timeout, n_parallel) if isinstance(build_func, str): From dadf0459b4b1e7cb878cbef94ea15aae1b6cdb4a Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 6 May 2020 13:26:41 +0800 Subject: [PATCH 16/44] add dcgan support (simulation) --- python/tvm/relay/op/strategy/generic.py | 6 +- python/tvm/relay/quantize/_annotate.py | 25 +++ python/tvm/relay/quantize/_partition.py | 13 ++ python/tvm/relay/quantize/quantize.py | 14 +- src/arith/detect_linear_equation.cc | 16 ++ src/relay/quantize/realize.cc | 37 ++++ src/tir/transforms/inject_copy_intrin.cc | 14 +- vta/python/vta/top/graphpack.py | 35 ++-- vta/python/vta/top/vta_conv2d.py | 24 ++- vta/python/vta/top/vta_conv2d_transpose.py | 21 ++- vta/tutorials/frontend/deploy_dcgan.py | 186 +++++++++++++++++++++ 11 files changed, 358 insertions(+), 33 deletions(-) create mode 100644 vta/tutorials/frontend/deploy_dcgan.py diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 3d24cdf73e9d..48944474e272 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -330,8 +330,10 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype): out = topi_compute( inputs[0], inputs[1], strides, padding, out_dtype) output_padding = get_const_tuple(attrs.output_padding) - out = topi.nn.pad(out, [0, 0, 0, 0], - [0, 0, output_padding[0], output_padding[1]]) + if output_padding[0] != 0 or output_padding[1] != 0: + pad_before = [0] * len(out.shape) + pad_after = [0, 0, output_padding[0], output_padding[1]] + [0] * (len(out.shape) - 4) + out = topi.nn.pad(out, pad_before, pad_after) return [out] return compute_conv2d_transpose diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 952a86466300..08930527b443 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -173,6 +173,30 @@ def conv2d_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) +@register_annotate_function("nn.conv2d_transpose") +def conv2d_transpose_rewrite(ref_call, new_args, ctx): + """Rewrite function for conv2d_transpose. Lhs of conv will be quantized to + input field, and rhs of conv will be quantized to weight field. 
+ Output would be in activation field""" + if quantize_context().check_to_skip(ref_call): + return None + + lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) + rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) + + if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: + lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) + + assert rhs_kind is None + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) + + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + + +# TODO(tmoreau89,ziheng) need to include an option to turn off dense quant +# @register_annotate_function("nn.dense") @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of @@ -281,6 +305,7 @@ def identity_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(ret_expr, x_kind) +register_annotate_function("reshape", identity_rewrite) register_annotate_function("clip", identity_rewrite) register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", identity_rewrite) diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index 315986d55607..f26e88301894 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -53,6 +53,19 @@ def conv2d_partition_function(ref_call, new_args, ctx): return QPartitionExpr(ret) +@register_partition_function("nn.conv2d_transpose") +def conv2d_partition_function(ref_call, new_args, ctx): + """Rewrite function for conv2d for partition""" + data_cond, data = partition_expr_check(new_args[0]) + kernel_cond, kernel = partition_expr_check(new_args[1]) + + assert not kernel_cond + if data_cond: + data = new_args[0].realize() + ret = _forward_op(ref_call, [data, kernel]) + return QPartitionExpr(ret) + + def identity_partition_function(ref_call, new_args, ctx): cond, expr = partition_expr_check(new_args[0]) if cond: diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 28ebf7f3032b..b7371a3c3068 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -209,10 +209,18 @@ def check_to_skip(self, ref_call): # check skip conv layers skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers] if self._conv2d_counter in skipped_indices: - if ref_call.op.name == 'nn.conv2d': + if ref_call.op.name == 'nn.conv2d' or ref_call.op.name == 'nn.conv2d_transpose': self._conv2d_counter += 1 - return True - if ref_call.op.name == 'nn.conv2d': + return True + else: + # counter is 0 before visiting the first conv2d + # if the first conv2d is skipped, all ops before it will also be skipped + # otherwise, we don't skip until the counter become +1 + if self._conv2d_counter == 0: + return True + else: + return False + if ref_call.op.name == 'nn.conv2d' or ref_call.op.name == 'nn.conv2d_transpose': self._conv2d_counter += 1 return False diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc index f0634feac083..18d28b53a431 100644 --- a/src/arith/detect_linear_equation.cc +++ b/src/arith/detect_linear_equation.cc @@ -71,6 +71,16 @@ class LinearEqDetector : public ExprFunctora, op->a); + LinearEqEntry b = VisitExpr(op->b, op->b); + LinearEqEntry ret; + ret.base = FloorDivCombine(a.base, b.base); + ret.coeff = FloorDivCombine(a.coeff, b.coeff); + return ret; + } + LinearEqEntry 
VisitExpr_(const SubNode* op, const PrimExpr& e) final { if (fail_) return LinearEqEntry(); LinearEqEntry a = VisitExpr(op->a, op->a); @@ -138,6 +148,12 @@ class LinearEqDetector : public ExprFunctor DetectLinearEquation(const PrimExpr& e, const Array& vars) { diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 41680b655a66..07e61de82958 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -220,6 +220,41 @@ Expr Conv2dRealize(const Call& ref_call, const Array& new_args, const Obje RELAY_REGISTER_OP("nn.conv2d").set_attr("FQRealizeRewrite", Conv2dRealize); +Expr Conv2dTransposeRealize(const Call& ref_call, + const Array& new_args, + const ObjectRef& ctx) { + const QConfig& cfg = QConfig::Current(); + CHECK_EQ(new_args.size(), 2); + if (!new_args[0]->IsInstance() || !new_args[1]->IsInstance()) { + return Expr(nullptr); + } + const auto* lhs = new_args[0].as(); + CHECK(lhs); + const auto* rhs = new_args[1].as(); + CHECK(rhs); + + Expr ldata = lhs->data; + if (lhs->dtype != cfg->dtype_input) { + ldata = Cast(ldata, cfg->dtype_input); + } + Expr rdata = Cast(rhs->data, cfg->dtype_weight); + + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_object(); + *attrs = *ref_attrs; + DataType out_dtype = cfg->dtype_activation; + attrs->out_dtype = out_dtype; + + Expr ret = CallNode::make(ref_call->op, + {ldata, rdata}, Attrs(attrs), ref_call->type_args); + Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); + Expr dom_scale = FoldConstantOpt(mul); + return QRealizeIntExprNode::make(ret, dom_scale, out_dtype); +} + +RELAY_REGISTER_OP("nn.conv2d_transpose") +.set_attr("FQRealizeRewrite", Conv2dTransposeRealize); + Expr DenseRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); CHECK_EQ(new_args.size(), 2); @@ -435,6 +470,8 @@ Expr IdentityRealize(const Call& ref_call, const Array& new_args, const Ob RELAY_REGISTER_OP("nn.relu").set_attr("FQRealizeRewrite", IdentityRealize); +RELAY_REGISTER_OP("reshape").set_attr("FQRealizeRewrite", IdentityRealize); + RELAY_REGISTER_OP("strided_slice").set_attr("FQRealizeRewrite", IdentityRealize); RELAY_REGISTER_OP("annotation.stop_fusion") diff --git a/src/tir/transforms/inject_copy_intrin.cc b/src/tir/transforms/inject_copy_intrin.cc index b27459f4bd45..279274632648 100644 --- a/src/tir/transforms/inject_copy_intrin.cc +++ b/src/tir/transforms/inject_copy_intrin.cc @@ -80,7 +80,19 @@ class CopyIntrinInjector : public StmtMutator { } // for now only support true condition matching if (has_cond) { - load = sel_true_value.Eval().as(); + auto true_val = sel_true_value.Eval(); + + // TODO(zhanghao): we do cond unfold one more further + // this is used to lift the pad(dilate) to one load op + // However, ignoring false condition may cause incorrect results + PVar sel_cond_extra, sel_true_value_extra, sel_false_value_extra; + bool has_cond_extra = if_then_else(sel_cond_extra, sel_true_value_extra, sel_false_value_extra).Match(true_val) || + select(sel_cond_extra, sel_true_value_extra, sel_false_value_extra).Match(true_val); + if (has_cond_extra) { + load = sel_true_value_extra.Eval().as(); + } else { + load = true_val.as(); + } } // cast can be part of the pattern if (cast != nullptr) { diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index cdfd3c4281e2..9cdc355f6c64 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -94,7 +94,7 @@ def _weight_shape_match_transpose(data, 
dshape, channels, cfactor_out): if pad_width != 0: pad_width = cfactor_out - pad_width data = op.nn.pad(data, [[0, 0], [0, pad_width], [0, 0], [0, 0]]) - dshape = tuple(dshape[0], [dshape[1] + pad_width, dshape[2], dshape[3]]) + dshape = tuple([dshape[0]] + [dshape[1] + pad_width, dshape[2], dshape[3]]) if channels_pad != 0: channels = channels + (cfactor_out - channels_pad) @@ -252,11 +252,12 @@ def visit_call(self, call): # First visit the children. args = [self.visit(arg) for arg in call.args] + odtype = _get_tensor_type(call) self.counter += 1 - if call.op in self.op2nodes: - self.op2nodes[call.op].append(self.counter) + if (call.op, odtype) in self.op2nodes: + self.op2nodes[(call.op, odtype)].append(self.counter) else: - self.op2nodes[call.op] = [self.counter] + self.op2nodes[(call.op, odtype)] = [self.counter] return relay.Call( self.visit(call.op), @@ -550,7 +551,8 @@ def graph_pack(expr, The transformed expression. """ assert isinstance(expr, relay.Function) - assert ((start_name != stop_name) or (start_name_idx < stop_name_idx)) + assert ((start_name != stop_name) or (start_name_idx is None != stop_name_idx is None) or \ + (not (start_name_idx is None and stop_name_idx is None)) or (start_name_idx < stop_name_idx)) expr = get_subgraph(expr, start_name, stop_name, start_name_idx, stop_name_idx, count_meta) expr = run_opt_pass(expr, transform.InferType()) packer = ExprPack( @@ -564,20 +566,17 @@ def graph_pack(expr, expr_locator = ExprLocater() expr_locator.visit(expr) - # from the second conv2d to the global_avg_pool2d, all will run on vta + # from the first int conv2d to the last int stop_fusion, all will run on vta conv2d = op.op.get("nn.conv2d") - avg_pool2d = op.op.get("nn.global_avg_pool2d") - start = expr_locator.op2nodes[conv2d][1] - # preceeding the nn.global_avg_pool2d, it will look like this - # - # %310 = annotation.stop_fusion(%309) /* ty=Tensor[(1, 16, 7, 7, 1, 32), int8] */; - # %311 = cast(%310, dtype="int32") /* ty=Tensor[(1, 16, 7, 7, 1, 32), int32] */; - # %312 = transpose(%311, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 16, 32, 7, 7), int32] */; - # %313 = reshape(%312, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */; - # %314 = nn.global_avg_pool2d(%313) /* ty=Tensor[(1, 512, 1, 1), int32] */; - # - # we mark the preceeding three ops also on cpu device - end = expr_locator.op2nodes[avg_pool2d][0] - 3 + conv2d_transpose = op.op.get("nn.conv2d_transpose") + stop_fusion = op.op.get("annotation.stop_fusion") + if (conv2d, "int32") in expr_locator.op2nodes: + start = expr_locator.op2nodes[(conv2d, "int32")][0] + else: + start = expr_locator.op2nodes[(conv2d_transpose, "int32")][0] + + # we mark the next op to the last stop_fusion on cpu device + end = expr_locator.op2nodes[(stop_fusion, "int8")][-1] + 1 device_annot = ExprDeviceAnnot(start=start, end=end) expr = device_annot.visit(expr) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 5c856384c605..7ef71074caa4 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -36,7 +36,6 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty raise topi.InvalidShapeError() assert dilation == (1, 1) - eprint("data.shape, kernel.shape", data.shape, kernel.shape) if padding[0]: pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") else: @@ -194,7 +193,6 @@ def _traverse(op): # FIXME(zhanghao): move this code to a proper location @topi.generic.schedule_add.register(["vta"]) def 
_schedule_add(outs): - eprint("schedule_add vta") assert len(outs) == 1 def is_cast_op(op): @@ -245,14 +243,29 @@ def _traverse(op): # TODO(zhanghao): auto-tune x_co0, x_co1 = s[output].split(x_co, factor=1) - x_i0, x_i1 = s[output].split(x_i, factor=min(28, x_i_max)) - x_j0, x_j1 = s[output].split(x_j, factor=min(14, x_j_max)) + + from functools import reduce + def factors(n): + return sorted(set(reduce(list.__add__, + ([i, n//i] for i in range(1, int(n**0.5) + 1) if n % i == 0)))) + + # FIXME(zhanghao): use auto-tune + i_factors = factors(x_i_max) + i_factor = i_factors[-1] + if i_factor > 28: + i_factor = i_factors[-2] + + j_factors = factors(x_j_max) + j_factor = j_factors[-1] + if j_factor > 14: + j_factor = j_factors[-2] + x_i0, x_i1 = s[output].split(x_i, factor=i_factor) + x_j0, x_j1 = s[output].split(x_j, factor=j_factor) s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) store_pt = x_j0 env = get_env() for eo in ewise_ops: - eprint("add ewise_ops ", eo) s[eo].set_scope(env.acc_scope) s[eo].pragma(s[eo].op.axis[0], env.alu) s[eo].compute_at(s[output], store_pt) @@ -260,7 +273,6 @@ def _traverse(op): # cache read input cache_read_ewise = [] for consumer, tensor in ewise_inputs: - eprint("add dma_copy", consumer, tensor, tensor.op) cache_read_ewise.append( s.cache_read(tensor, env.acc_scope, [consumer])) diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py index 4f213f64d0da..15383e557c3b 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -77,6 +77,7 @@ def schedule_conv2d_transpose_packed(cfg, outs): """Schedule packed conv2d_transpose""" assert len(outs) == 1 output = outs[0] + const_ops = [] ewise_inputs = [] ewise_ops = [] conv2d_res = [] @@ -86,7 +87,10 @@ def schedule_conv2d_transpose_packed(cfg, outs): def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - ewise_ops.append(op) + if not op.axis: + const_ops.append(op) + else: + ewise_ops.append(op) for tensor in op.input_tensors: if isinstance(tensor.op, tvm.te.PlaceholderOp): ewise_inputs.append((op, tensor)) @@ -116,8 +120,16 @@ def _traverse(op): data, kernel = conv2d_stage.op.input_tensors if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: temp = data.op.input_tensors[0] - pad_data = data - data = temp + # FIXME(zhanghao): force merge pad(dilate(xx)) to one load op + # this may cause results in-correct + # disable for now + if False and isinstance(temp.op, tvm.te.ComputeOp) and ("pad" in temp.op.tag or temp.op.name == "DilatedInput"): + pad_data = data + data = temp.op.input_tensors[0] + s[temp.op].compute_inline() + else: + pad_data = data + data = temp else: pad_data = None @@ -142,6 +154,9 @@ def _traverse(op): s[op].set_scope(env.acc_scope) s[op].pragma(s[op].op.axis[0], env.alu) + for op in const_ops: + s[op].compute_inline() + # tile x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) diff --git a/vta/tutorials/frontend/deploy_dcgan.py b/vta/tutorials/frontend/deploy_dcgan.py new file mode 100644 index 000000000000..95a3731f98f9 --- /dev/null +++ b/vta/tutorials/frontend/deploy_dcgan.py @@ -0,0 +1,186 @@ +from __future__ import absolute_import, print_function + +import argparse, json, os, requests, sys, time +from io import BytesIO +from os.path import join, isfile +from PIL import Image + +from mxnet.gluon.model_zoo import vision +import numpy as np +from matplotlib import pyplot as plt + 
+import tvm +from tvm import te +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime +from tvm.relay import transform +import tvm.relay.testing + +import vta +from vta.testing import simulator +from vta.top import graph_pack +from tvm.contrib.util import eprint + +# Make sure that TVM was compiled with RPC=1 +assert tvm.runtime.enabled("rpc") + +###################################################################### +# Define the platform and model targets +# ------------------------------------- +# Execute on CPU vs. VTA, and define the model. + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu +# multiple targets to run both on cpu and vta +targets = { + "cpu": env.target_vta_cpu, + "ext_dev": env.target +} + +model = "DCGAN" + +###################################################################### +# Obtain an execution remote +# -------------------------- +# When target is 'pynq', reconfigure FPGA and runtime. +# Otherwise, if target is 'sim', execute locally. + +if env.TARGET not in ["sim", "tsim", "intelfocl"]: + + # Get remote from tracker node if environment variable is set. + # To set up the tracker, you'll need to follow the "Auto-tuning + # a convolutional network for VTA" tutorial. + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = os.environ.get("TVM_TRACKER_PORT", None) + # Otherwise if you have a device you want to program directly from + # the host, make sure you've set the variables below to the IP of + # your board. + device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") + device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") + if not tracker_host or not tracker_port: + remote = rpc.connect(device_host, int(device_port)) + else: + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) + + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + reconfig_start = time.time() + vta.reconfig_runtime(remote) + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.program_fpga(remote, bitstream) + + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + +# In simulation mode, host the RPC server locally. 
+else: + remote = rpc.LocalSession() + +# Get execution context from remote +# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +ctxes = [remote.ext_dev(0), remote.cpu(0)] + +# Load pre-configured AutoTVM schedules +with autotvm.tophub.context(target): + + # Populate the shape and data type dictionary for ImageNet classifier input + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 100)} + + # get the mobilenet model + mod, params = relay.testing.dcgan.get_workload(batch_size=1, dtype="float32", oshape=(3, 64, 64)) + + # Measure build start time + build_start = time.time() + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + if target.device_name == "vta": + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig(global_scale=8.0, + skip_conv_layers=[3]): + mod = relay.quantize.quantize(mod, params=params) + # Perform graph packing and constant folding for VTA target + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name="cast", + stop_name="cast", stop_name_idx=52, device_annot=True) + else: + relay_prog = mod["main"] + + # Compile Relay program with AlterOpLayout disabled + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name != "vta": + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + else: + with vta.build_config(debug_flag=38): + graph, lib, params = relay.build( + relay_prog, target=targets, + params=params, target_host=env.target_host) + + # Measure Relay build time + build_time = time.time() - build_start + print(model + " inference graph built in {0:.2f}s!".format(build_time)) + + # Graph runtime + m = graph_runtime.create(graph, lib, ctxes) + +image = np.zeros((1, 100), dtype=np.float32) +eprint("image", image.dtype, image) +image = np.repeat(image, env.BATCH, axis=0) + +# Set the network parameters and inputs +m.set_input(**params) +m.set_input('data', image) + +# Perform inference and gather execution statistics +# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator +num = 3 # number of times we run module for a single measurement +rep = 3 # number of measurements (we derive std dev from this) +timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) + +if env.TARGET in ["sim", "tsim"]: + simulator.clear_stats() + # timer() + m['run']() + + sim_stats = simulator.stats() + print("\nExecution statistics:") + for k, v in sim_stats.items(): + # Since we execute the workload many times, we need to normalize stats + # Note that there is always one warm up run + # Therefore we divide the overall stats by (num * rep + 1) + print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) +else: + m['run']() + print("Run done") + # tcost = timer() + # std = np.std(tcost.results) * 1000 + # mean = tcost.mean * 1000 + # print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) + # print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) + +# Get classification results +tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 3, 64, 64), "float32", remote.cpu(0))) +output = tvm_output.asnumpy() +for b in range(env.BATCH): + print(tvm_output.asnumpy()[b]) From 
bb3dc0eb570f96a5a5407885a12d0cf7e4b1e95b Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 6 May 2020 15:33:24 +0800 Subject: [PATCH 17/44] make sync in batch as an option --- vta/runtime/runtime.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 39038da00b51..cedecc59ba55 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -1278,7 +1278,8 @@ class CommandQueue { // FIXME(zhanghao): It is required to use force_serial // by using skip and sync at the final layer, we can avoid do DeviceCopy every time - if (skip) { + const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + if (sync_once && skip) { if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { LOG(ERROR) << "Synchronizing all in one round requires to use force_serial to make things right"; @@ -1518,7 +1519,8 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly // struct timespec start, stop; // clock_gettime(CLOCK_REALTIME, &start); - VTASynchronize(VTATLSCommandHandle(), 1<<31, false); + const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + if (sync_once) VTASynchronize(VTATLSCommandHandle(), 1<<31, false); // clock_gettime(CLOCK_REALTIME, &stop); // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; // LOG(WARNING) << "Final Synchronize: " << elapsed << " us"; From cb464779f42698d0fc6faa52c2927e551e0493a4 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 11 May 2020 09:51:49 +0800 Subject: [PATCH 18/44] quickfix for buffer overflow --- vta/python/vta/top/vta_conv2d.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 7ef71074caa4..40e2530ef63c 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -252,13 +252,16 @@ def factors(n): # FIXME(zhanghao): use auto-tune i_factors = factors(x_i_max) i_factor = i_factors[-1] - if i_factor > 28: - i_factor = i_factors[-2] + while i_factor > 28: + del i_factors[-1] + i_factor = i_factors[-1] j_factors = factors(x_j_max) j_factor = j_factors[-1] - if j_factor > 14: - j_factor = j_factors[-2] + while j_factor > 14: + del j_factors[-1] + j_factor = j_factors[-1] + x_i0, x_i1 = s[output].split(x_i, factor=i_factor) x_j0, x_j1 = s[output].split(x_j, factor=j_factor) s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) From 4ede46679fc2bb33b6a0eb9984ebf5b1eedbfb87 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 11 May 2020 18:02:38 +0800 Subject: [PATCH 19/44] bugfix for allocated_ destructor order --- vta/runtime/runtime.cc | 44 +++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index cedecc59ba55..92d5ab06cc8f 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -43,6 +43,8 @@ #include #include #include +#include +#include namespace vta { @@ -120,10 +122,39 @@ class AlignmentAllocator { } }; +class DeviceAllocStat { + public: + void AddAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + allocated_.insert(ptr); + } + + bool CheckAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + return allocated_.count(ptr); + } + + void DelAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + allocated_.erase(ptr); + } + + private: + std::set 
allocated_; + std::mutex mtx_; +}; + +// here we use a global variable to memorize the allocation stats +static std::shared_ptr alloc_stat(new DeviceAllocStat()); + /*! * \brief Data buffer represents data on CMA. */ struct DataBuffer { + DataBuffer() { + alloc_stat_ = alloc_stat; + } + /*! \return Virtual address of the data. */ void* virt_addr() const { return data_; } /*! \return Physical address of the data. */ @@ -180,7 +211,7 @@ struct DataBuffer { buffer->data_ = data; buffer->phy_addr_ = VTAMemGetPhyAddr(data); - allocated_.insert(buffer); + alloc_stat->AddAlloc(buffer); return buffer; } /*! @@ -188,7 +219,7 @@ struct DataBuffer { * \param buffer The buffer to be freed. */ static void Free(DataBuffer* buffer) { - allocated_.erase(buffer); + alloc_stat->DelAlloc(buffer); VTAMemFree(buffer->data_); delete buffer; } @@ -198,7 +229,7 @@ struct DataBuffer { * \return The corresponding data buffer header. */ static DataBuffer* FromHandle(const void* buffer) { - if (allocated_.count(buffer)) { + if (alloc_stat->CheckAlloc(buffer)) { return const_cast( reinterpret_cast(buffer)); } else { @@ -212,12 +243,11 @@ struct DataBuffer { /*! \brief The physical address of the buffer, excluding header. */ vta_phy_addr_t phy_addr_; - static std::set allocated_; + // a copy of global shared_ptr instance + // to avoid the global instance is destructed before there are still some pending DataBuffers not destructed + std::shared_ptr alloc_stat_; }; -// init static member -std::set DataBuffer::allocated_; - /*! * \brief Micro op kernel. * Contains functions to construct the kernel with prefix Push. From 4f375d5cd205f51c635d01eeb4d278f92e4992fc Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 14 May 2020 17:57:43 +0800 Subject: [PATCH 20/44] refine device annotation --- include/tvm/relay/attrs/device_copy.h | 1 - python/tvm/relay/op/_tensor.py | 3 + src/relay/op/annotation/annotation.cc | 8 +- src/relay/transforms/device_annotation.cc | 92 +++++++++-------------- vta/python/vta/top/graphpack.py | 48 +++++++----- 5 files changed, 74 insertions(+), 78 deletions(-) diff --git a/include/tvm/relay/attrs/device_copy.h b/include/tvm/relay/attrs/device_copy.h index c4a60c827048..7da92b3ff763 100644 --- a/include/tvm/relay/attrs/device_copy.h +++ b/include/tvm/relay/attrs/device_copy.h @@ -37,7 +37,6 @@ namespace relay { struct DeviceCopyAttrs : public tvm::AttrsNode { int dst_dev_type; int src_dev_type; - bool used_for_propagate = true; TVM_DECLARE_ATTRS(DeviceCopyAttrs, "relay.attrs.DeviceCopyAttrs") { TVM_ATTR_FIELD(src_dev_type) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 4f409ff4538f..1dd431ac2785 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -92,6 +92,9 @@ register_broadcast_schedule("fast_exp") register_broadcast_schedule("fast_tanh") register_broadcast_schedule("fast_erf") +# a fake on_device schedule. 
+# this will not be used in actual computation as on_device will be removed during DeviceAnnotation pass +register_injective_schedule("on_device") # zeros diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 6be9b0d4a3d5..4db3f930d3b5 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -54,7 +54,13 @@ RELAY_REGISTER_OP("on_device") .add_type_rel("Identity", IdentityRel) .set_attr("TOpPattern", kOpaque) .set_attr("TOpIsStateful", false) - .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); + .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("FTVMCompute", + [] (const Attrs& attrs, + const Array& inputs, + const Type& out_type) -> Array { + return {topi::identity(inputs[0])}; + }); Expr StopFusion(Expr data) { static const Op& op = Op::Get("annotation.stop_fusion"); diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 2d53751665da..4862a999b85c 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -90,15 +90,6 @@ class ValidateAnnotation : private ExprVisitor { annotation_map_.insert({node, GetDeviceId(call_node)}); } - // FIXME(zhanghao): find a better way - // here assume there are max two device types - if (device_type == fallback_device_ && extra_device_ && extra_device_ != fallback_device_) { - const auto* child = GetRef(node).as()->args[0].operator->(); - // here we mark as negative to indicate this is for copy from only - int ext_dev = -extra_device_; - annotation_map_.insert({child, ext_dev}); - } - if (device_type != fallback_device_) extra_device_ = device_type; } } @@ -261,11 +252,7 @@ class RewriteAnnotation : public ExprMutator { if (annotation_map_.count(dst)) { return src_dev_type != annotation_map_.at(dst); } else { - // TODO(zhanghao): for now, we only make a device_copy when dst is "on_device" marked - // This allows us to do a start-end mark (mark two points) - // to mark all the middle ops with a device_type - return false; - // return src_dev_type != fallback_device_; + return src_dev_type != fallback_device_; } } else { // if annotation value < 0, it means this is for "copy from" only @@ -408,22 +395,38 @@ class DeviceInfo { } void VisitExpr_(const ConstantNode* cn) final { - post_dfs_order_.push_back(std::make_pair(cn, has_copy_)); + device_tag_[cn] = dev_type_; } void VisitExpr_(const CallNode* call) final { // Skip annotation nodes. 
if (!IsOnDeviceNode(call)) { - if (GetDeviceCopyNode(call)) { + if (const auto* node = GetDeviceCopyNode(call)) { + CHECK(node->IsInstance()); + const auto* call_node = static_cast(node); + auto attrs = call_node->attrs.as(); + num_device_copy_ops_++; bool has_copy_prev = has_copy_; has_copy_ = true; - ExprVisitor::VisitExpr_(call); - post_dfs_order_.push_back(std::make_pair(call, has_copy_)); + dev_type_ = attrs->src_dev_type; + for (auto& arg : call->args) { + Visit(arg); + // restore the type for remaining arguments + dev_type_ = attrs->src_dev_type; + } + device_tag_[call] = attrs->dst_dev_type; + // update the out_dev_type_, which should be the dst_dev_type of last copy + out_dev_type_ = attrs->dst_dev_type; has_copy_ = has_copy_prev; } else { - ExprVisitor::VisitExpr_(call); - post_dfs_order_.push_back(std::make_pair(call, has_copy_)); + for (auto& arg : call->args) { + int cur_dev_type = dev_type_; + Visit(arg); + // restore the type for remaining arguments + dev_type_ = cur_dev_type; + } + device_tag_[call] = dev_type_; } } } @@ -436,22 +439,24 @@ class DeviceInfo { void VisitExpr_(const TupleGetItemNode* op) final { ExprVisitor::VisitExpr_(op); } void VisitExpr_(const VarNode* vn) final { - post_dfs_order_.push_back(std::make_pair(vn, has_copy_)); + device_tag_[vn] = dev_type_; } void VisitExpr_(const LetNode* ln) final { ExprVisitor::VisitExpr_(ln); - post_dfs_order_.push_back(std::make_pair(ln, has_copy_)); + device_tag_[ln] = dev_type_; } void VisitExpr_(const IfNode* in) final { ExprVisitor::VisitExpr_(in); - post_dfs_order_.push_back(std::make_pair(in, has_copy_)); + device_tag_[in] = dev_type_; } int num_device_copy_ops_{0}; bool has_copy_ = false; - std::vector> post_dfs_order_; + int dev_type_ = -1; + int out_dev_type_ = -1; + std::unordered_map device_tag_; friend DeviceInfo; }; @@ -477,39 +482,14 @@ class DeviceInfo { } void PropagateDeviceId() { - // Bottom-up propagation. - int out_dev_type = BottomUpPropagation(); - // propagation for remained nodes. 
- FillPropagation(out_dev_type); - } - - int BottomUpPropagation() { - const CallNode* last_copy_node = nullptr; - int cur_dev_type = -1; - int out_dev_type = -1; - for (auto it = post_visitor_.post_dfs_order_.crbegin(); - it != post_visitor_.post_dfs_order_.crend(); ++it) { - if (const auto* node = GetDeviceCopyNode(it->first)) { - CHECK(node->IsInstance()); - last_copy_node = static_cast(node); - const auto* attrs = last_copy_node->attrs.as(); - cur_dev_type = attrs->src_dev_type; - if (out_dev_type == -1) out_dev_type = attrs->dst_dev_type; - if (it->second) device_map_.Set(GetRef(it->first), attrs->dst_dev_type); - } else if (last_copy_node) { - Expr expr = GetRef(it->first); - CHECK_EQ(device_map_.count(expr), 0U); - if (it->second) device_map_.Set(expr, cur_dev_type); + int out_dev_type = post_visitor_.out_dev_type_; + for (auto& it : post_visitor_.device_tag_) { + if (it.second != -1) { + device_map_.Set(GetRef(it.first), it.second); + } else { + device_map_.Set(GetRef(it.first), out_dev_type); } } - return out_dev_type; - } - - void FillPropagation(int out_dev_type) { - for (const auto& it : post_visitor_.post_dfs_order_) { - Expr expr = GetRef(it.first); - if (!it.second) device_map_.Set(expr, out_dev_type); - } } PostDfsOrderVisitor post_visitor_; @@ -517,6 +497,7 @@ class DeviceInfo { }; +// TODO(zhanghao): consider to remove this as I think it is not necessary for now class AddDeviceCopy : public ExprMutator { public: Expr Rewrite(const Expr& expr) { @@ -558,7 +539,6 @@ class AddDeviceCopy : public ExprMutator { auto attrs = make_object(); attrs->src_dev_type = src_dev_type; attrs->dst_dev_type = dst_dev_type; - attrs->used_for_propagate = false; static const Op& op = Op::Get("device_copy"); Call device_copy = CallNode::make(op, {this->Mutate(arg)}, Attrs(attrs), {}); device_copy->checked_type_ = arg->checked_type_; diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 9cdc355f6c64..ac4c8aac4539 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -193,6 +193,7 @@ class ExprDeviceAnnot(ExprMutator): def __init__(self, start=-1, end=-1): self.ext_ctx = tvm.context("ext_dev") self.cpu_ctx = tvm.context("cpu") + self.cast = op.op.get("cast") self.counter = -1 self.start = start self.end = end @@ -210,33 +211,40 @@ def visit_call(self, call): if self.counter == self.start: ret = relay.Call(call.op, args, call.attrs) ret = relay.annotation.on_device(ret, self.ext_ctx) - eprint("add on_device {}: {}".format("ext", ret)) return ret elif self.counter == self.end: ret = relay.Call(call.op, args, call.attrs) ret = relay.annotation.on_device(ret, self.cpu_ctx) - eprint("add on_device {}: {}".format("cpu", ret)) return ret + elif self.counter > self.start and self.counter < self.end: + ret = relay.Call(call.op, args, call.attrs) -# if call.op == self.global_avg_pool2d: -# eprint("graphpack call = ", call) -# eprint("graphpack call annot relu, ", args[0]) -# ret = relay.Call(call.op, args, call.attrs) -# ret = relay.annotation.on_device(ret, self.cpu_ctx) -# return ret -# -# if call.op == self.conv2d and odtype == 'int32': -# if not self.first_conv2d: -# ret = relay.Call(call.op, args, call.attrs) -# ret = relay.annotation.on_device(ret, self.ext_ctx) -# eprint("graphpack call conv2d", type(ret.op), ret.op, type(ret), ret) -# self.first_conv2d = True -# return ret + # skip the float op, i.e., float->int cast + if self.is_float_op(call): + return ret - return relay.Call( - self.visit(call.op), - args, - call.attrs) + return 
relay.annotation.on_device(ret, self.ext_ctx) + + return relay.Call(self.visit(call.op), args, call.attrs) + + def is_float_op(self, call): + """check if this op belongs to a float op + in general, float op's odtype is float; + a special case is float->int cast, which follow this op sequence: + multiply(float) -> round(float) -> clip(float) -> cast(int); + """ + args = call.args + odtype = _get_tensor_type(call) + op = call.op + + if odtype == "float32": + return True + elif op == self.cast: + idtype = _get_tensor_type(args[0]) + if idtype == "float32": + return True + + return False class ExprLocater(ExprMutator): From d16d5ecdb92affae76b1887fb8d5f7eae8096da2 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 21 May 2020 13:28:08 +0800 Subject: [PATCH 21/44] auto-tune for vta alu ops --- python/tvm/autotvm/task/space.py | 2 +- python/tvm/autotvm/task/topi_integration.py | 8 +- python/tvm/autotvm/tuner/callback.py | 4 +- python/tvm/relay/op/_tensor.py | 9 +- python/tvm/relay/op/op.py | 20 -- python/tvm/relay/op/strategy/generic.py | 15 +- src/relay/backend/compile_engine.cc | 4 +- topi/python/topi/generic/injective.py | 4 - vta/python/vta/top/op.py | 152 +++++++++ vta/python/vta/top/vta_conv2d.py | 98 ------ vta/tutorials/autotvm/tune_alu_vta.py | 341 ++++++++++++++++++++ 11 files changed, 513 insertions(+), 144 deletions(-) create mode 100644 vta/tutorials/autotvm/tune_alu_vta.py diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index fbf474fc4df7..53ed78a7570d 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -779,7 +779,7 @@ def _add_new_transform(self, space_class, name, axes, policy, **kwargs): return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))] def __len__(self): - if self._length is None: + if self._length is None or self._length <= 1: self._length = int(np.prod([len(x) for x in self.space_map.values()])) return self._length diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 59e77f7d0098..d7fa69d571b1 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -215,7 +215,7 @@ def _decorate(topi_schedule): @_register_task_schedule(task_name) def wrapper(outs, *args, **kwargs): """wrapper function for topi schedule""" - workload = get_workload(outs) + workload = get_workload(outs, task_name) if workload is None: raise RuntimeError("Cannot find workload in attribute of this schedule") tgt = _target.Target.current() @@ -227,14 +227,16 @@ def wrapper(outs, *args, **kwargs): return _decorate -def get_workload(outs): +def get_workload(outs, task_name=None): """Retrieve the workload from outputs""" def traverse(tensors): """traverse all ops to find attached workload""" for t in tensors: op = t.op if 'workload' in op.attrs: - return args_to_workload(op.attrs['workload']) + ret = args_to_workload(op.attrs['workload']) + if ret[0] == task_name: + return ret wkl = traverse(op.input_tensors) if wkl: return wkl diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index cfc1b2c38f85..6c53be582b40 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -137,7 +137,7 @@ def __del__(self): format_si_prefix(0, si_prefix) if logger.level < logging.DEBUG: # only print progress bar in non-debug mode - sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) ' + sys.stdout.write('\r%s Current/Best: 
%7.4f/%7.4f GFLOPS | Progress: (%d/%d) ' '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic)) sys.stdout.flush() @@ -153,7 +153,7 @@ def _callback(tuner, inputs, results): ctx.cur_flops = flops ctx.best_flops = tuner.best_flops - sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f %sFLOPS | Progress: (%d/%d) ' + sys.stdout.write('\r%s Current/Best: %7.4f/%7.4f %sFLOPS | Progress: (%d/%d) ' '| %.2f s' % (prefix, format_si_prefix(ctx.cur_flops, si_prefix), format_si_prefix(ctx.best_flops, si_prefix), si_prefix, diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 1dd431ac2785..4c3a2378e9d4 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -51,9 +51,7 @@ register_broadcast_schedule("sign") register_broadcast_schedule("abs") register_broadcast_schedule("tanh") -# NOTE(zhanghao): use customized add schedule -register_add_schedule("add") -# register_broadcast_schedule("add") +register_broadcast_schedule("add") register_broadcast_schedule("subtract") register_broadcast_schedule("multiply") register_broadcast_schedule("divide") @@ -82,10 +80,7 @@ register_broadcast_schedule("isinf") register_injective_schedule("maximum") register_injective_schedule("minimum") -# NOTE(zhanghao): use customized add schedule -# TODO(zhanghao): change the schedule name -register_add_schedule("right_shift") -# register_injective_schedule("right_shift") +register_injective_schedule("right_shift") register_injective_schedule("left_shift") register_injective_schedule("shape_of") register_injective_schedule("ndarray_size") diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 5056825d007c..8ef51cf595fc 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -240,20 +240,6 @@ def register_injective_schedule(op_name, level=10): return register_schedule(op_name, _schedule_injective, level) -def register_add_schedule(op_name, level=10): - """Register schedule function for add. - - Parameters - ---------- - op_name : str - The name of the op. - - level : int - The priority level - """ - return register_schedule(op_name, _schedule_add, level) - - def register_broadcast_schedule(op_name, level=10): """Register broadcast schedule function for an op. 
@@ -409,12 +395,6 @@ def register_external_compiler(op_name, fexternal=None, level=10): -def schedule_add(attrs, outputs, target): - """Generic schedule for add.""" - with target: - return topi.generic.schedule_add(outputs) - - @tvm._ffi.register_func("relay.op.compiler._lower") def _lower(name, schedule, inputs, outputs): return lower(schedule, list(inputs) + list(outputs), name=name) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 48944474e272..025d67630cf9 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -33,6 +33,14 @@ def wrapper(attrs, outs, target): return topi_schedule(outs) return wrapper + +def wrap_topi_compute(topi_compute): + """Wrap TOPI schedule which doesn't use attrs""" + def wrapper(attrs, inputs, out_type): + return [topi_compute(*inputs)] + return wrapper + + def get_conv2d_in_channels(data_shape, data_layout): """Get conv2d input channels""" data_shape = get_const_tuple(data_shape) @@ -69,12 +77,6 @@ def schedule_injective(attrs, outs, target): with target: return topi.generic.schedule_injective(outs) -@generic_func -def schedule_add(attrs, outputs, target): - """Generic schedule for add.""" - with target: - return topi.generic.schedule_add(outputs) - @generic_func def schedule_reduce(attrs, outs, target): """Schedule reduction ops""" @@ -83,7 +85,6 @@ def schedule_reduce(attrs, outs, target): _op._schedule_injective = schedule_injective _op._schedule_reduce = schedule_reduce -_op._schedule_add = schedule_add # concatenate @generic_func diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 3b0b1b39c62c..37fb0108f111 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -230,7 +230,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> << "Two complicated op in a primitive function " << " master=" << master_op_ << " current=" << op; } - if (op_pattern >= master_op_pattern_) { + if (op_pattern > master_op_pattern_) { master_op_ = op; master_attrs_ = call_node->attrs; master_op_pattern_ = op_pattern; @@ -288,7 +288,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> tvm::Target target_; Op master_op_; Attrs master_attrs_; - int master_op_pattern_{0}; + int master_op_pattern_{-1}; OpImplementation master_implementation_; std::ostringstream readable_name_stream_; Array scalars_; diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 8aae9a3c5f14..fa6aee4864ec 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -63,9 +63,5 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@tvm.target.generic_func -def schedule_add(outs): - return schedule_injective(outs) - schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 010daaedf2bc..ae9ca1a90142 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -20,6 +20,7 @@ import tvm from tvm import te +from tvm import autotvm import topi from tvm.relay.op import op as reg @@ -63,6 +64,157 @@ def clip_strategy_vta(attrs, inputs, out_type, target): reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta") + +@autotvm.register_topi_compute("add.vta") +def add_packed(cfg, lhs, rhs): + ret = topi.add(lhs, rhs) + return ret + + +@autotvm.register_topi_compute("multiply.vta") +def 
multiply_packed(cfg, lhs, rhs): + return topi.multiply(lhs, rhs) + + +@autotvm.register_topi_compute("copy.vta") +def copy_packed(cfg, i): + return topi.identify(i) + + +def schedule_alu_packed(cfg, outs): + assert len(outs) == 1 + + def is_cast_op(op): + # return op.same_as(Op.op.get("cast")) + # FIXME(zhanghao): find a better way to do compare + return op.name == 'T_cast' + + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + output = outs[0] + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) + # s[output].fuse(s[output].op.axis) + + # only put the int-related ops to vta + if "int" in output.dtype and len(output.shape) == 6: + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + for _, t in ewise_inputs: + if t.dtype == 'float32': + return s + + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + + cfg.define_split('tile_co', x_co, num_outputs=2) + cfg.define_split('tile_h', x_i, num_outputs=2) + cfg.define_split('tile_w', x_j, num_outputs=2) + + x_co_max = topi.util.get_const_int(x_bo.dom.extent) + x_i_max = topi.util.get_const_int(x_i.dom.extent) + x_j_max = topi.util.get_const_int(x_j.dom.extent) + + x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) + x_i0, x_i1 = cfg['tile_h'].apply(s, output, x_i) + x_j0, x_j1 = cfg['tile_w'].apply(s, output, x_j) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + + env = get_env() + for eo in ewise_ops: + s[eo].set_scope(env.acc_scope) + s[eo].pragma(s[eo].op.axis[0], env.alu) + s[eo].compute_at(s[output], store_pt) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + for tensor in cache_read_ewise: + if s[tensor].op.axis: + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + s[tensor].compute_at(s[output], store_pt) + + for op in const_ops: + s[op].compute_inline() + + s[output].pragma(x_co1, env.dma_copy) + + return s + + +@autotvm.register_topi_schedule("add.vta") +def schedule_add_packed(cfg, outs): + return schedule_alu_packed(cfg, outs) + + +@autotvm.register_topi_schedule("multiply.vta") +def schedule_multiply_packed(cfg, outs): + return schedule_alu_packed(cfg, outs) + + +@autotvm.register_topi_schedule("copy.vta") +def schedule_copy_packed(cfg, outs): + return schedule_alu_packed(cfg, outs) + + +def add_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_topi_compute(add_packed), + _strategy.wrap_topi_schedule(schedule_add_packed), + name="add.vta") + return strategy + + +def multiply_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_topi_compute(multiply_packed), + _strategy.wrap_topi_schedule(schedule_multiply_packed), + name="multiply.vta") + return strategy + + +def 
copy_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_topi_compute(copy_packed), + _strategy.wrap_topi_schedule(schedule_copy_packed), + name="copy.vta") + return strategy + + +reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") +reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") +reg.get("copy").get_attr("FTVMStrategy").register(copy_strategy_vta, "vta") + @_strategy.conv2d_strategy.register("vta") def conv2d_strategy_vta(attrs, inputs, out_type, target): """conv2d vta strategy""" diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 40e2530ef63c..525d60ae383d 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -190,102 +190,4 @@ def _traverse(op): return s -# FIXME(zhanghao): move this code to a proper location -@topi.generic.schedule_add.register(["vta"]) -def _schedule_add(outs): - assert len(outs) == 1 - - def is_cast_op(op): - # return op.same_as(Op.op.get("cast")) - # FIXME(zhanghao): find a better way to do compare - return op.name == 'T_cast' - outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - output = outs[0] - s = te.create_schedule([x.op for x in outs]) - te.schedule.AutoInlineInjective(s) - # s[output].fuse(s[output].op.axis) - - # only put the int-related ops to vta - if "int" in output.dtype: - ewise_inputs = [] - ewise_ops = [] - const_ops = [] - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - elif not is_cast_op(op): - ewise_ops.append(op) - - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - elif is_cast_op(tensor.op) and not op.same_as(output.op): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - for tensor in op.input_tensors: - if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ - and (not is_cast_op(tensor.op)): - _traverse(tensor.op) - - op = output.op - _traverse(op) - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - - x_co_max = topi.util.get_const_int(x_bo.dom.extent) - x_i_max = topi.util.get_const_int(x_i.dom.extent) - x_j_max = topi.util.get_const_int(x_j.dom.extent) - - # TODO(zhanghao): auto-tune - x_co0, x_co1 = s[output].split(x_co, factor=1) - - from functools import reduce - def factors(n): - return sorted(set(reduce(list.__add__, - ([i, n//i] for i in range(1, int(n**0.5) + 1) if n % i == 0)))) - - # FIXME(zhanghao): use auto-tune - i_factors = factors(x_i_max) - i_factor = i_factors[-1] - while i_factor > 28: - del i_factors[-1] - i_factor = i_factors[-1] - - j_factors = factors(x_j_max) - j_factor = j_factors[-1] - while j_factor > 14: - del j_factors[-1] - j_factor = j_factors[-1] - - x_i0, x_i1 = s[output].split(x_i, factor=i_factor) - x_j0, x_j1 = s[output].split(x_j, factor=j_factor) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - - env = get_env() - for eo in ewise_ops: - s[eo].set_scope(env.acc_scope) - s[eo].pragma(s[eo].op.axis[0], env.alu) - s[eo].compute_at(s[output], store_pt) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - cache_read_ewise.append( - s.cache_read(tensor, env.acc_scope, [consumer])) - - for tensor in cache_read_ewise: - s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - s[tensor].compute_at(s[output], store_pt) - - for op in const_ops: - 
s[op].compute_inline() - - s[output].pragma(x_co1, env.dma_copy) - - return s diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py new file mode 100644 index 000000000000..8a9a09c76856 --- /dev/null +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -0,0 +1,341 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-tuning a ALU fused op on VTA +""" + +import os +from mxnet.gluon.model_zoo import vision +import numpy as np +from PIL import Image + +import topi +import tvm +from tvm import te +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.autotvm import record + +import vta +from vta.testing import simulator +from vta.top import graph_pack +import copy + +from tvm.contrib.util import eprint + +################################################################# +# Compile network +# --------------- +# Perform vta-specific compilation with Relay from a Gluon model +def compile_network(env, target, model, start_pack, stop_pack, device_annot=False): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): + mod = relay.quantize.quantize(mod, params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack(mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=start_pack, + stop_name=stop_pack, + device_annot=device_annot) + + return relay_prog, params + + +########################################### +# Set Tuning Options +# ------------------ +# Before tuning, we should apply some configurations. +# Here we use an Pynq-Z1 board as an example. + +# Tracker host and port can be set by your environment +tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0') +tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190)) + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# This target is used for cross compilation. 
You can query it by :code:`gcc -v` on your device. +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu + +# Name of Gluon model to compile +# The ``start_pack`` and ``stop_pack`` labels indicate where +# to start and end the graph packing relay pass: in other words +# where to start and finish offloading to VTA. +network = "resnet50_v2" +start_pack = "nn.max_pool2d" +stop_pack = "nn.global_avg_pool2d" + +# Tuning option +log_file = "%s.%s.log" % (device, network) +tuning_option = { + 'log_filename': log_file, + + 'tuner': 'random', + 'n_trial': 1000, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(), + runner=autotvm.RPCRunner(env.TARGET, + host=tracker_host, + port=tracker_port, + number=5, + timeout=60, + check_correctness=True), + ), +} + + +def log_to_file(file_out, protocol='json'): + """Log the tuning records into file. + The rows of the log are stored in the format of autotvm.record.encode. + for lhs == rhs, we add an extra rhs = [] record + + Parameters + ---------- + file_out : str + The file to log to. + protocol: str, optional + The log protocol. Can be 'json' or 'pickle' + + Returns + ------- + callback : callable + Callback function to do the logging. + """ + def _callback(_, inputs, results): + with open(file_out, "a") as f: + for inp, result in zip(inputs, results): + eprint("inp = {}, result = {}".format(inp, result)) + f.write(record.encode(inp, result, protocol) + "\n") + + # we only consider task with same lhs and rhs + if inp.task.args[0] == inp.task.args[1]: + args = list(inp.task.args) + args[1] = (args[0][0], (), args[0][2]) + inp_copy = copy.deepcopy(inp) + inp_copy.task.args = tuple(args) + f.write(record.encode(inp_copy, result, protocol) + "\n") + + return _callback + + +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=10, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True): + + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'xgb_knob': + tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + tsk_trial = min(n_trial, len(tsk.config_space)) + tuner_obj.tune(n_trial=tsk_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(tsk_trial, prefix=prefix), + log_to_file(tmp_log_file) + ]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + + +######################################################################## +# Register VTA-specific tuning tasks +def register_vta_tuning_tasks(): + from tvm.autotvm.task import TaskExtractEnv + + @tvm.te.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's 
current clip, put min and max into two stages.""" + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.register_customized_task("add.vta") + def _topi_add(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + A, B = args[:2] + + with tvm.target.vta(): + res = vta.top.op.add_packed(*args, **kwargs) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.Target.current().device_name == 'vta': + s = vta.top.op.schedule_add_packed([res]) + else: + s = te.create_schedule([res.op]) + return s, [A, B, res] + + @autotvm.register_customized_task("multiply.vta") + def _topi_multiply(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + A, B = args[:2] + + with tvm.target.vta(): + res = vta.top.op.multiply_packed(*args, **kwargs) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.Target.current().device_name == 'vta': + s = vta.top.op.schedule_multiply_packed([res]) + else: + s = te.create_schedule([res.op]) + return s, [A, B, res] + + @autotvm.register_customized_task("copy.vta") + def _topi_identity(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + A = args[0] + + with tvm.target.vta(): + res = vta.top.op.copy_packed(*args, **kwargs) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.Target.current().device_name == 'vta': + s = vta.top.op.schedule_copy_packed([res]) + else: + s = te.create_schedule([res.op]) + return s, [A, res] + + +######################################################################## +# Finally, we launch tuning jobs and evaluate the end-to-end performance. +def tune_and_evaluate(tuning_opt): + + if env.TARGET != "sim": + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, + tracker_host, + tracker_port, + timeout=10000) + # Reconfigure the JIT runtime and FPGA. + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream) + else: + # In simulation mode, host the RPC server locally. 
+ remote = rpc.LocalSession() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Perform task extraction on Relay program + print("Extract tasks...") + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) + mod = tvm.IRModule.from_expr(relay_prog) + tasks = autotvm.task.extract_from_program(mod, + params=params, + ops=(relay.op.get("add"), relay.op.get("multiply"),), + target=target, + target_host=env.target_host) + + # filter out non-packed alu task + tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) + # filter out float alu task + tasks = list(filter(lambda t: t.args[0][2] != "float32", tasks)) + # filter const rhs, which will be fused with conv2d + # tasks = list(filter(lambda t: len(t.args[1][1]) < 1, tasks)) + + # We should have extracted 10 convolution tasks + tasks_set = {} + print("Extracted {} alu tasks:".format(len(tasks))) + for tsk in tasks: + print("tsk = ", tsk) + + if len(tsk.args[1][1]) == 0: + args = list(tsk.args) + args[1] = args[0] + tsk.args = tuple(args) + + if (tsk.name, tsk.args) in tasks_set: + print("task {} already exists".format(tsk)) + tasks_set[(tsk.name, tsk.args)] = tsk + + tasks = list(tasks_set.values()) + print("After merged, final #tasks={}, tasks = {}".format(len(tasks), tasks)) + + # run tuning tasks + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + +# Run the tuning and evaluate the results +tune_and_evaluate(tuning_option) From a7526389f104d9f92c4b2e510bc44040cf2a20f5 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Fri, 22 May 2020 16:01:01 +0800 Subject: [PATCH 22/44] bugfix: make get_workload consistent with master_op selection --- python/tvm/autotvm/task/topi_integration.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index d7fa69d571b1..25d1156e2af8 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -233,13 +233,15 @@ def traverse(tensors): """traverse all ops to find attached workload""" for t in tensors: op = t.op + wkl = traverse(op.input_tensors) + if wkl: + return wkl + if 'workload' in op.attrs: ret = args_to_workload(op.attrs['workload']) if ret[0] == task_name: return ret - wkl = traverse(op.input_tensors) - if wkl: - return wkl return None + outs = [outs] if isinstance(outs, tensor.Tensor) else outs return traverse(outs) From 1b7aa5889d2dafcdd0b4195b350dcc2bd719688a Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 10 Jun 2020 18:47:53 +0800 Subject: [PATCH 23/44] some fixes after rebase with master --- cmake/modules/VTA.cmake | 3 +- include/tvm/relay/transform.h | 1 + python/tvm/contrib/util.py | 5 + src/relay/quantize/realize.cc | 4 +- src/relay/transforms/device_annotation.cc | 7 +- vta/config/vta_cost.py | 102 ---- vta/python/vta/transform.py | 12 + vta/src/intelfocl/AOCLUtils/aocl_utils.h | 32 - vta/src/intelfocl/AOCLUtils/opencl.cpp | 555 ------------------ vta/src/intelfocl/AOCLUtils/opencl.h | 122 ---- vta/src/intelfocl/AOCLUtils/options.cpp | 105 ---- vta/src/intelfocl/AOCLUtils/options.h | 137 ----- vta/src/intelfocl/AOCLUtils/scoped_ptrs.h | 165 ------ vta/src/intelfocl/intelfocl_device.cc | 181 ------ vta/src/intelfocl/intelfocl_device.h | 53 -- vta/src/intelfocl/intelfocl_driver.cc | 74 --- vta/src/pynq/pynq_driver.cc | 167 ------ .../frontend/deploy_classification.py | 8 +- 18 files changed, 28 insertions(+), 1705 deletions(-) delete mode 100644 vta/config/vta_cost.py delete mode 
100644 vta/src/intelfocl/AOCLUtils/aocl_utils.h delete mode 100644 vta/src/intelfocl/AOCLUtils/opencl.cpp delete mode 100644 vta/src/intelfocl/AOCLUtils/opencl.h delete mode 100644 vta/src/intelfocl/AOCLUtils/options.cpp delete mode 100644 vta/src/intelfocl/AOCLUtils/options.h delete mode 100644 vta/src/intelfocl/AOCLUtils/scoped_ptrs.h delete mode 100644 vta/src/intelfocl/intelfocl_device.cc delete mode 100644 vta/src/intelfocl/intelfocl_device.h delete mode 100644 vta/src/intelfocl/intelfocl_driver.cc delete mode 100644 vta/src/pynq/pynq_driver.cc diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 33fe0016fe4a..371bd27fa80e 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -103,7 +103,7 @@ elseif(PYTHON) file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cpp) list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) - list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc vta/src/vmem/virtual_memory.h) + list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h) endif() # Target lib: vta add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) @@ -124,6 +124,7 @@ elseif(PYTHON) "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") + target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() endif() diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 61eb6dd50ce2..a7f5fea98ea2 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -384,6 +384,7 @@ TVM_DLL Expr Bind(const Expr& expr, const tvm::Map& binds); * \note this function mutates mod and is not thread-safe. */ TVM_DLL Function InferType(const Function& f, const IRModule& mod, const GlobalVar& var); +TVM_DLL Expr InferType(const Expr& expr, const IRModule& mod); /*! * \brief Apply rewrite rules to rewrite the expr in post DFS order. 
This diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 474741fc1e35..20854ab3fb27 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -29,6 +29,11 @@ fcntl = None +def eprint(*args, **kwargs): + # return + print(*args, file=sys.stderr, flush=True, **kwargs) + + class DirectoryCreatedPastAtExit(Exception): """Raised when a TempDirectory is created after the atexit hook runs.""" diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 07e61de82958..b71249c8c755 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -245,11 +245,11 @@ Expr Conv2dTransposeRealize(const Call& ref_call, DataType out_dtype = cfg->dtype_activation; attrs->out_dtype = out_dtype; - Expr ret = CallNode::make(ref_call->op, + Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); Expr dom_scale = FoldConstantOpt(mul); - return QRealizeIntExprNode::make(ret, dom_scale, out_dtype); + return QRealizeIntExpr(ret, dom_scale, out_dtype); } RELAY_REGISTER_OP("nn.conv2d_transpose") diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 4862a999b85c..3609ee0bacc4 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -540,7 +540,7 @@ class AddDeviceCopy : public ExprMutator { attrs->src_dev_type = src_dev_type; attrs->dst_dev_type = dst_dev_type; static const Op& op = Op::Get("device_copy"); - Call device_copy = CallNode::make(op, {this->Mutate(arg)}, Attrs(attrs), {}); + Call device_copy = Call(op, {this->Mutate(arg)}, Attrs(attrs), {}); device_copy->checked_type_ = arg->checked_type_; call_args.push_back(device_copy); } else { @@ -548,7 +548,7 @@ class AddDeviceCopy : public ExprMutator { } } - auto ret = CallNode::make(call_node->op, call_args, call_node->attrs, call_node->type_args); + auto ret = Call(call_node->op, call_args, call_node->attrs, call_node->type_args); // manually add the checked_type_ // alternatively, can call InferType Pass after this ret->checked_type_ = call_node->checked_type_; @@ -641,8 +641,7 @@ Pass AddDeviceCopyOps() { [=](Function f, IRModule m, PassContext pc) { return Downcast(AddDeviceCopyOps(f)); }; - return CreateFunctionPass(pass_func, 1, "AddDeviceCopyOps", - {tir::StringImmNode::make("InferType")}); + return CreateFunctionPass(pass_func, 1, "AddDeviceCopyOps", {"InferType"}); } TVM_REGISTER_GLOBAL("relay._transform.AddDeviceCopy") diff --git a/vta/config/vta_cost.py b/vta/config/vta_cost.py deleted file mode 100644 index 9e1d7389b8c3..000000000000 --- a/vta/config/vta_cost.py +++ /dev/null @@ -1,102 +0,0 @@ -# cost function for intelfocl 32*32 gemm version -def cal_cost(insn): - """ - Cal the runtime cost statically - - Parameters - ------------ - insn: the insn (json) - - Returns - ------------ - the cost in s - """ - factor = 1000000.0 - def alu_imm_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = x + 46 - return cycles / factor - - def alu_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = 2 * x + 46 - return cycles / factor - - def gemm_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = x + 80 - return cycles / factor - - def load_acc_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = x + 150 - return cycles / factor - - 
def load_acc8_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 1.2 * x + 150 - return cycles / factor - - def load_inp_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = (x_size + x_pad_0 + x_pad_1) * (y_size + y_pad_0 + y_pad_1) - cycles = 1.1 * x + 150 - return cycles / factor - - def load_uop_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 1.1 * x + 150 - return cycles / factor - - def load_wgt_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 17 * x + 150 - return cycles / factor - - def store_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = x + 150 - return cycles / factor - - def nop_cost(name): - if name == "NOP-COMPUTE-STAGE": - return 38 / factor - elif name == "NOP-MEMORY-STAGE": - return 50 / factor - elif name == "NOP-STORE-STAGE": - return 39 / factor - else: - print("Unknown nop op {}".format(name)) - return 0 - - if insn['type'] == "ALU": - return alu_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['type'] == "ALU IMM": - return alu_imm_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['type'] == "GEMM": - return gemm_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['name'] == "LOAD INP": - return load_inp_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD WGT": - return load_wgt_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD UOP": - return load_uop_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD ACC": - return load_acc_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD ACC 8": - return load_acc8_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "STORE": - return store_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['type'] == "NOP": - return nop_cost(insn['name']) - else: - print("Unknown op type: {}".format(insn['type'])) - return 0 diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index 207f784b5885..3b13c1769103 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -548,10 +548,22 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): allow_fold = True _check_compact(dst) + + # for int8 -> int32 cast/load + orig_dtype = src.dtype + if src.dtype != data_type: + assert(data_type == "int%d" % env.ACC_WIDTH and \ + src.dtype == "int%d" % env.INP_WIDTH) + src.dtype = data_type + x_size, y_size, x_stride, offset = _get_2d_pattern( src, elem_width, elem_bytes, data_type, dst.scope, allow_fold=allow_fold) + if orig_dtype != src.dtype: + src.dtype = orig_dtype + mem_type = env.dev.MEM_ID_ACC_8 + irb = tvm.tir.ir_builder.create() irb.scope_attr(env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(task_qid)) diff --git a/vta/src/intelfocl/AOCLUtils/aocl_utils.h b/vta/src/intelfocl/AOCLUtils/aocl_utils.h deleted file mode 100644 index 70e0fc6bcc0a..000000000000 --- a/vta/src/intelfocl/AOCLUtils/aocl_utils.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, 
California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -// Main include file for AOCLUtils. Includes all other utility header files. - -#ifndef AOCL_UTILS_H -#define AOCL_UTILS_H - -#include "opencl.h" -#include "scoped_ptrs.h" -#include "options.h" - -#endif - diff --git a/vta/src/intelfocl/AOCLUtils/opencl.cpp b/vta/src/intelfocl/AOCLUtils/opencl.cpp deleted file mode 100644 index 04d989d7c9ea..000000000000 --- a/vta/src/intelfocl/AOCLUtils/opencl.cpp +++ /dev/null @@ -1,555 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -#include "aocl_utils.h" -#include -#include - -#ifdef _WIN32 // Windows -#include -#else // Linux -#include -#include // readlink, chdir -#endif - -namespace aocl_utils { - -static const char *const VERSION_STR = "161"; - -////////////////////////////////////////// -// Host allocation functions for alignment -////////////////////////////////////////// - -// This is the minimum alignment requirement to ensure DMA can be used. 
-const unsigned AOCL_ALIGNMENT = 64; - -#ifdef _WIN32 // Windows -void *alignedMalloc(size_t size) { - return _aligned_malloc (size, AOCL_ALIGNMENT); -} - -void alignedFree(void * ptr) { - _aligned_free(ptr); -} -#else // Linux -void *alignedMalloc(size_t size) { - void *result = NULL; - int rc; - rc = posix_memalign (&result, AOCL_ALIGNMENT, size); - (void) rc; - return result; -} - -void alignedFree(void * ptr) { - free (ptr); -} -#endif - -/////////////////////////////// -// Error functions -/////////////////////////////// - -// Print the error associciated with an error code -void printError(cl_int error) { - // Print error message - switch(error) - { - case -1: - printf("CL_DEVICE_NOT_FOUND "); - break; - case -2: - printf("CL_DEVICE_NOT_AVAILABLE "); - break; - case -3: - printf("CL_COMPILER_NOT_AVAILABLE "); - break; - case -4: - printf("CL_MEM_OBJECT_ALLOCATION_FAILURE "); - break; - case -5: - printf("CL_OUT_OF_RESOURCES "); - break; - case -6: - printf("CL_OUT_OF_HOST_MEMORY "); - break; - case -7: - printf("CL_PROFILING_INFO_NOT_AVAILABLE "); - break; - case -8: - printf("CL_MEM_COPY_OVERLAP "); - break; - case -9: - printf("CL_IMAGE_FORMAT_MISMATCH "); - break; - case -10: - printf("CL_IMAGE_FORMAT_NOT_SUPPORTED "); - break; - case -11: - printf("CL_BUILD_PROGRAM_FAILURE "); - break; - case -12: - printf("CL_MAP_FAILURE "); - break; - case -13: - printf("CL_MISALIGNED_SUB_BUFFER_OFFSET "); - break; - case -14: - printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST "); - break; - - case -30: - printf("CL_INVALID_VALUE "); - break; - case -31: - printf("CL_INVALID_DEVICE_TYPE "); - break; - case -32: - printf("CL_INVALID_PLATFORM "); - break; - case -33: - printf("CL_INVALID_DEVICE "); - break; - case -34: - printf("CL_INVALID_CONTEXT "); - break; - case -35: - printf("CL_INVALID_QUEUE_PROPERTIES "); - break; - case -36: - printf("CL_INVALID_COMMAND_QUEUE "); - break; - case -37: - printf("CL_INVALID_HOST_PTR "); - break; - case -38: - printf("CL_INVALID_MEM_OBJECT "); - break; - case -39: - printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR "); - break; - case -40: - printf("CL_INVALID_IMAGE_SIZE "); - break; - case -41: - printf("CL_INVALID_SAMPLER "); - break; - case -42: - printf("CL_INVALID_BINARY "); - break; - case -43: - printf("CL_INVALID_BUILD_OPTIONS "); - break; - case -44: - printf("CL_INVALID_PROGRAM "); - break; - case -45: - printf("CL_INVALID_PROGRAM_EXECUTABLE "); - break; - case -46: - printf("CL_INVALID_KERNEL_NAME "); - break; - case -47: - printf("CL_INVALID_KERNEL_DEFINITION "); - break; - case -48: - printf("CL_INVALID_KERNEL "); - break; - case -49: - printf("CL_INVALID_ARG_INDEX "); - break; - case -50: - printf("CL_INVALID_ARG_VALUE "); - break; - case -51: - printf("CL_INVALID_ARG_SIZE "); - break; - case -52: - printf("CL_INVALID_KERNEL_ARGS "); - break; - case -53: - printf("CL_INVALID_WORK_DIMENSION "); - break; - case -54: - printf("CL_INVALID_WORK_GROUP_SIZE "); - break; - case -55: - printf("CL_INVALID_WORK_ITEM_SIZE "); - break; - case -56: - printf("CL_INVALID_GLOBAL_OFFSET "); - break; - case -57: - printf("CL_INVALID_EVENT_WAIT_LIST "); - break; - case -58: - printf("CL_INVALID_EVENT "); - break; - case -59: - printf("CL_INVALID_OPERATION "); - break; - case -60: - printf("CL_INVALID_GL_OBJECT "); - break; - case -61: - printf("CL_INVALID_BUFFER_SIZE "); - break; - case -62: - printf("CL_INVALID_MIP_LEVEL "); - break; - case -63: - printf("CL_INVALID_GLOBAL_WORK_SIZE "); - break; - default: - printf("UNRECOGNIZED ERROR CODE (%d)", error); - } -} - 
-// Print line, file name, and error code if there is an error. Exits the -// application upon error. -void _checkError(int line, - const char *file, - cl_int error, - const char *msg, - ...) { - // If not successful - if(error != CL_SUCCESS) { - // Print line and file - printf("ERROR: "); - printError(error); - printf("\nLocation: %s:%d\n", file, line); - - // Print custom message. - va_list vl; - va_start(vl, msg); - vprintf(msg, vl); - printf("\n"); - va_end(vl); - - // Cleanup and bail. - cleanup(); - exit(error); - } -} - -// Sets the current working directory to be the same as the directory -// containing the running executable. -bool setCwdToExeDir() { -#ifdef _WIN32 // Windows - HMODULE hMod = GetModuleHandle(NULL); - char path[MAX_PATH]; - GetModuleFileNameA(hMod, path, MAX_PATH); - -#else // Linux - // Get path of executable. - char path[300]; - ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1); - if(n == -1) { - return false; - } - path[n] = 0; -#endif - - // Find the last '\' or '/' and terminate the path there; it is now - // the directory containing the executable. - size_t i; - for(i = strlen(path) - 1; i > 0 && path[i] != '/' && path[i] != '\\'; --i); - path[i] = '\0'; - - // Change the current directory. -#ifdef _WIN32 // Windows - SetCurrentDirectoryA(path); -#else // Linux - int rc; - rc = chdir(path); - (void) rc; -#endif - - return true; -} - -// Searches all platforms for the first platform whose name -// contains the search string (case-insensitive). -cl_platform_id findPlatform(const char *platform_name_search) { - cl_int status; - - std::string search = platform_name_search; - std::transform(search.begin(), search.end(), search.begin(), tolower); - - // Get number of platforms. - cl_uint num_platforms; - status = clGetPlatformIDs(0, NULL, &num_platforms); - checkError(status, "Query for number of platforms failed"); - - // Get a list of all platform ids. - scoped_array pids(num_platforms); - status = clGetPlatformIDs(num_platforms, pids, NULL); - checkError(status, "Query for all platform ids failed"); - - // For each platform, get name and compare against the search string. - for(unsigned i = 0; i < num_platforms; ++i) { - std::string name = getPlatformName(pids[i]); - - // Convert to lower case. - std::transform(name.begin(), name.end(), name.begin(), tolower); - - if(name.find(search) != std::string::npos) { - // Found! - return pids[i]; - } - } - - // No platform found. - return NULL; -} - -// Returns the platform name. -std::string getPlatformName(cl_platform_id pid) { - cl_int status; - - size_t sz; - status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz); - checkError(status, "Query for platform name size failed"); - - scoped_array name(sz); - status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL); - checkError(status, "Query for platform name failed"); - - return name.get(); -} - -// Returns the device name. -std::string getDeviceName(cl_device_id did) { - cl_int status; - - size_t sz; - status = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &sz); - checkError(status, "Failed to get device name size"); - - scoped_array name(sz); - status = clGetDeviceInfo(did, CL_DEVICE_NAME, sz, name, NULL); - checkError(status, "Failed to get device name"); - - return name.get(); -} - -// Returns the list of all devices. 
-cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) { - cl_int status; - - status = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices); - checkError(status, "Query for number of devices failed"); - - cl_device_id *dids = new cl_device_id[*num_devices]; - status = clGetDeviceIDs(pid, dev_type, *num_devices, dids, NULL); - checkError(status, "Query for device ids"); - - return dids; -} - -// Create a program for all devices associated with the context. -cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) { - // Early exit for potentially the most common way to fail: AOCX does not exist. - if(!fileExists(binary_file_name)) { - printf("AOCX file '%s' does not exist.\n", binary_file_name); - checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); - } - - // Load the binary. - size_t binary_size; - scoped_array binary(loadBinaryFile(binary_file_name, &binary_size)); - if(binary == NULL) { - checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); - } - - scoped_array binary_lengths(num_devices); - scoped_array binaries(num_devices); - for(unsigned i = 0; i < num_devices; ++i) { - binary_lengths[i] = binary_size; - binaries[i] = binary; - } - - cl_int status; - scoped_array binary_status(num_devices); - - cl_program program = clCreateProgramWithBinary(context, num_devices, devices, binary_lengths, - (const unsigned char **) binaries.get(), binary_status, &status); - checkError(status, "Failed to create program with binary"); - for(unsigned i = 0; i < num_devices; ++i) { - checkError(binary_status[i], "Failed to load binary for device"); - } - - return program; -} - -// Loads a file in binary form. -unsigned char *loadBinaryFile(const char *file_name, size_t *size) { - // Open the File - FILE* fp; -#ifdef _WIN32 - if(fopen_s(&fp, file_name, "rb") != 0) { - return NULL; - } -#else - fp = fopen(file_name, "rb"); - if(fp == 0) { - return NULL; - } -#endif - - // Get the size of the file - fseek(fp, 0, SEEK_END); - *size = ftell(fp); - - // Allocate space for the binary - unsigned char *binary = new unsigned char[*size]; - - // Go back to the file start - rewind(fp); - - // Read the file into the binary - if(fread((void*)binary, *size, 1, fp) == 0) { - delete[] binary; - fclose(fp); - return NULL; - } - - return binary; -} - -bool fileExists(const char *file_name) { -#ifdef _WIN32 // Windows - DWORD attrib = GetFileAttributesA(file_name); - return (attrib != INVALID_FILE_ATTRIBUTES && !(attrib & FILE_ATTRIBUTE_DIRECTORY)); -#else // Linux - return access(file_name, R_OK) != -1; -#endif -} - -std::string getBoardBinaryFile(const char *prefix, cl_device_id device) { - // First check if .aocx exists. Use it if it does. - std::string file_name = std::string(prefix) + ".aocx"; - if(fileExists(file_name.c_str())) { - return file_name; - } - - // Now get the name of the board. For Intel(R) FPGA SDK for OpenCL(TM) boards, - // the name of the device is presented as: - // : ... - std::string device_name = getDeviceName(device); - - // Now search for the " :" in the device name. - size_t end = device_name.find(" :"); - if(end != std::string::npos) { - std::string board_name(device_name, 0, end); - - // Look for a AOCX with the name __.aocx. - file_name = std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx"; - if(fileExists(file_name.c_str())) { - return file_name; - } - } - - // At this point just use .aocx. 
This file doesn't exist - // and this should trigger an error later. - return std::string(prefix) + ".aocx"; -} - -// High-resolution timer. -double getCurrentTimestamp() { -#ifdef _WIN32 // Windows - // Use the high-resolution performance counter. - - static LARGE_INTEGER ticks_per_second = {}; - if(ticks_per_second.QuadPart == 0) { - // First call - get the frequency. - QueryPerformanceFrequency(&ticks_per_second); - } - - LARGE_INTEGER counter; - QueryPerformanceCounter(&counter); - - double seconds = double(counter.QuadPart) / double(ticks_per_second.QuadPart); - return seconds; -#else // Linux - timespec a; - clock_gettime(CLOCK_MONOTONIC, &a); - return (double(a.tv_nsec) * 1.0e-9) + double(a.tv_sec); -#endif -} - -cl_ulong getStartEndTime(cl_event event) { - cl_int status; - - cl_ulong start, end; - status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); - checkError(status, "Failed to query event start time"); - status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); - checkError(status, "Failed to query event end time"); - - return end - start; -} - -cl_ulong getStartEndTime(cl_event *events, unsigned num_events) { - cl_int status; - - cl_ulong min_start = 0; - cl_ulong max_end = 0; - for(unsigned i = 0; i < num_events; ++i) { - cl_ulong start, end; - status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); - checkError(status, "Failed to query event start time"); - status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); - checkError(status, "Failed to query event end time"); - - if(i == 0) { - min_start = start; - max_end = end; - } - else { - if(start < min_start) { - min_start = start; - } - if(end > max_end) { - max_end = end; - } - } - } - - return max_end - min_start; -} - -void waitMilliseconds(unsigned ms) { -#ifdef _WIN32 // Windows - Sleep(ms); -#else // Linux - timespec sleeptime = {0, 0}; - sleeptime.tv_sec = ms / 1000; - sleeptime.tv_nsec = long(ms % 1000) * 1000000L; // convert to nanoseconds - nanosleep(&sleeptime, NULL); -#endif -} - -void oclContextCallback(const char *errinfo, const void *, size_t, void *) { - printf("Context callback: %s\n", errinfo); -} - -} // ns aocl_utils - diff --git a/vta/src/intelfocl/AOCLUtils/opencl.h b/vta/src/intelfocl/AOCLUtils/opencl.h deleted file mode 100644 index 4aa5348b67b1..000000000000 --- a/vta/src/intelfocl/AOCLUtils/opencl.h +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -// OpenCL utility functions. - -#ifndef AOCL_UTILS_OPENCL_H -#define AOCL_UTILS_OPENCL_H - -#include -#include -#include -#include - -#include "CL/opencl.h" - -// This is assumed to be externally provided by the application. -extern void cleanup(); - -namespace aocl_utils { - -// Host allocation functions -void *alignedMalloc(size_t size); -void alignedFree(void *ptr); - -// Error functions -void printError(cl_int error); -void _checkError(int line, - const char *file, - cl_int error, - const char *msg, - ...); // does not return -#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__) - -// Sets the current working directory to the same directory that contains -// this executable. Returns true on success. -bool setCwdToExeDir(); - -// Find a platform that contains the search string in its name (case-insensitive match). -// Returns NULL if no match is found. -cl_platform_id findPlatform(const char *platform_name_search); - -// Returns the name of the platform. -std::string getPlatformName(cl_platform_id pid); - -// Returns the name of the device. -std::string getDeviceName(cl_device_id did); - -// Returns an array of device ids for the given platform and the -// device type. -// Return value must be freed with delete[]. -cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices); - -// Create a OpenCL program from a binary file. -// The program is created for all given devices associated with the context. The same -// binary is used for all devices. -cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices); - -// Load binary file. -// Return value must be freed with delete[]. -unsigned char *loadBinaryFile(const char *file_name, size_t *size); - -// Checks if a file exists. -bool fileExists(const char *file_name); - -// Returns the path to the AOCX file to use for the given device. -// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM). -// It uses the device name to get the board name and then looks for a -// corresponding AOCX file. Specifically, it gets the device name and -// extracts the board name assuming the device name has the following format: -// : ... -// -// Then the AOCX file is __.aocx. If this -// file does not exist, then the file name defaults to .aocx. -std::string getBoardBinaryFile(const char *prefix, cl_device_id device); - -// Returns the time from a high-resolution timer in seconds. This value -// can be used with a value returned previously to measure a high-resolution -// time difference. -double getCurrentTimestamp(); - -// Returns the difference between the CL_PROFILING_COMMAND_END and -// CL_PROFILING_COMMAND_START values of a cl_event object. -// This requires that the command queue associated with the event be created -// with the CL_QUEUE_PROFILING_ENABLE property. -// -// The return value is in nanoseconds. -cl_ulong getStartEndTime(cl_event event); - -// Returns the maximum time span for the given set of events. -// The time span starts at the earliest event start time. 
-// The time span ends at the latest event end time. -cl_ulong getStartEndTime(cl_event *events, unsigned num_events); - -// Wait for the specified number of milliseconds. -void waitMilliseconds(unsigned ms); - -// OpenCL context callback function that simply prints the error information -// to stdout (via printf). -void oclContextCallback(const char *errinfo, const void *, size_t, void *); - -} // ns aocl_utils - -#endif - diff --git a/vta/src/intelfocl/AOCLUtils/options.cpp b/vta/src/intelfocl/AOCLUtils/options.cpp deleted file mode 100644 index 05d025b43faf..000000000000 --- a/vta/src/intelfocl/AOCLUtils/options.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -#include "aocl_utils.h" -#include -#include -#include -#include - -namespace aocl_utils { - -Options::Options() { -} - -Options::Options(int num, char *argv[]) { - addFromCommandLine(num, argv); -} - -bool Options::has(const std::string &name) const { - return m_options.find(name) != m_options.end(); -} - -std::string &Options::get(const std::string &name) { - return m_options[name]; -} - -const std::string &Options::get(const std::string &name) const { - OptionMap::const_iterator it = m_options.find(name); - if(it == m_options.end()) { - errorNonExistent(name); - std::cerr << "Option '" << name << "' does not exist.\n"; - exit(1); - } - return it->second; -} - -void Options::addFromCommandLine(int num, char *argv[]) { - for(int i = 1; i < num; ++i) { - const std::string arg = argv[i]; - - // Look for the first '-'. - if(arg.size() > 1 && arg[0] == '-') { - size_t eq = arg.find('='); - size_t name_start = 1; - - // Check if there's a second '-'. - if(arg.size() > 2 && arg[1] == '-') { - name_start = 2; - } - - if(eq == std::string::npos) { - // No '='; treat as a boolean option. - set(arg.substr(name_start), true); - } - else if(eq == name_start) { - // No name?! - errorNameless(); - } - else { - set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1)); - } - } - else { - // Not an option. 
- m_nonoptions.push_back(arg); - } - } -} - -void Options::errorNameless() const { - std::cerr << "No name provided for option.\n"; - exit(1); -} - -void Options::errorNonExistent(const std::string &name) const { - std::cerr << "Option '" << name << "' does not exist.\n"; - exit(1); -} - -void Options::errorWrongType(const std::string &name) const { - std::cerr << "Value for option '" << name << "' is not of the right type (value = '" - << get(name) << "').\n"; - exit(1); -} - -} // ns aocl_utils - diff --git a/vta/src/intelfocl/AOCLUtils/options.h b/vta/src/intelfocl/AOCLUtils/options.h deleted file mode 100644 index 78d34605e60e..000000000000 --- a/vta/src/intelfocl/AOCLUtils/options.h +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -// Declares a utility class used to parse command-line options. - -#ifndef AOCL_UTILS_OPTIONS_H -#define AOCL_UTILS_OPTIONS_H - -#include -#include -#include -#include - -namespace aocl_utils { - -class Options { -public: - typedef std::vector StringVec; - - Options(); - Options(int num, char *argv[]); - - bool has(const std::string &name) const; - std::string &get(const std::string &name); // will create an empty option if it does not exist - const std::string &get(const std::string &name) const; // error if option does not exist - - void set(const std::string &name, const std::string &value) { get(name) = value; } - - // Command line options must be of the following form: - // [-]-name (indicates option exists) - // [-]-name=value - // - // This function assumes that the values are from main(int, char *). - // This means that the argv[0] is skipped. - void addFromCommandLine(int num, char *argv[]); - - // This templated function converts the option value to the given type. - // An assert is raised if the conversion fails. - template - T get(const std::string &name) const; - - template - void set(const std::string &name, const T &value); - - // Non-options are arguments processed in addFromCommandLine - // that were not recognized as options. 
- const StringVec &getNonOptions() const { return m_nonoptions; } - size_t getNonOptionCount() const { return m_nonoptions.size(); } - const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; } - -private: - typedef std::map OptionMap; - - // Displays an error message indicating that a nameless option - // was provided. - void errorNameless() const; - - // Displays an error message indicating that the given option - // has the wrong type and then exits with an error code. - void errorWrongType(const std::string &name) const; - - // Displays an error message indicating that the given option - // does not exist and then exits with an error code. - void errorNonExistent(const std::string &name) const; - - OptionMap m_options; - StringVec m_nonoptions; - - Options(const Options &); // not implemented - void operator =(const Options &); // not implemented -}; - -template -T Options::get(const std::string &name) const { - std::stringstream ss; - ss << get(name); - - T v; - ss >> v; - if(ss.fail() || !ss.eof()) { - // Failed to parse or did not consume the whole string value. - errorWrongType(name); - } - return v; -} - -// Specialization for bool. -template<> -inline bool Options::get(const std::string &name) const { - if(has(name)) { - const std::string &v = get(name); - if(v == "1") { - return true; - } - } - return false; -} - -// Specialization for std::string. Simply returns the option string. -// Requires specialization because using stringstream to read the string -// will stop at the first whitespace character (which is wrong). -template<> -inline std::string Options::get(const std::string &name) const { - return get(name); -} - -// This assumes the type T can be serialized to a string and back (when get -// is called). -template -void Options::set(const std::string &name, const T &value) { - std::stringstream ss; - ss << value; - set(name, ss.str()); -} - -} // ns aocl_utils - -#endif - diff --git a/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h b/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h deleted file mode 100644 index b11085c5226e..000000000000 --- a/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -// Scoped pointer definitions. 
- -#ifndef AOCL_UTILS_SCOPED_PTRS_H -#define AOCL_UTILS_SCOPED_PTRS_H - -namespace aocl_utils { - -// Interface is essentially the combination of std::auto_ptr and boost's smart pointers, -// along with some small extensions (auto conversion to T*). - -// scoped_ptr: assumes pointer was allocated with operator new; destroys with operator delete -template -class scoped_ptr { -public: - typedef scoped_ptr this_type; - - scoped_ptr() : m_ptr(NULL) {} - scoped_ptr(T *ptr) : m_ptr(ptr) {} - ~scoped_ptr() { reset(); } - - T *get() const { return m_ptr; } - operator T *() const { return m_ptr; } - T *operator ->() const { return m_ptr; } - T &operator *() const { return *m_ptr; } - - this_type &operator =(T *ptr) { reset(ptr); return *this; } - - void reset(T *ptr = NULL) { delete m_ptr; m_ptr = ptr; } - T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } - -private: - T *m_ptr; - - // noncopyable - scoped_ptr(const this_type &); - this_type &operator =(const this_type &); -}; - -// scoped_array: assumes pointer was allocated with operator new[]; destroys with operator delete[] -// Also supports allocation/reset with a number, which is the number of -// elements of type T. -template -class scoped_array { -public: - typedef scoped_array this_type; - - scoped_array() : m_ptr(NULL) {} - scoped_array(T *ptr) : m_ptr(NULL) { reset(ptr); } - explicit scoped_array(size_t n) : m_ptr(NULL) { reset(n); } - ~scoped_array() { reset(); } - - T *get() const { return m_ptr; } - operator T *() const { return m_ptr; } - T *operator ->() const { return m_ptr; } - T &operator *() const { return *m_ptr; } - T &operator [](int index) const { return m_ptr[index]; } - - this_type &operator =(T *ptr) { reset(ptr); return *this; } - - void reset(T *ptr = NULL) { delete[] m_ptr; m_ptr = ptr; } - void reset(size_t n) { reset(new T[n]); } - T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } - -private: - T *m_ptr; - - // noncopyable - scoped_array(const this_type &); - this_type &operator =(const this_type &); -}; - -// scoped_aligned_ptr: assumes pointer was allocated with alignedMalloc; destroys with alignedFree -// Also supports allocation/reset with a number, which is the number of -// elements of type T -template -class scoped_aligned_ptr { -public: - typedef scoped_aligned_ptr this_type; - - scoped_aligned_ptr() : m_ptr(NULL) {} - scoped_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); } - explicit scoped_aligned_ptr(size_t n) : m_ptr(NULL) { reset(n); } - ~scoped_aligned_ptr() { reset(); } - - T *get() const { return m_ptr; } - operator T *() const { return m_ptr; } - T *operator ->() const { return m_ptr; } - T &operator *() const { return *m_ptr; } - T &operator [](int index) const { return m_ptr[index]; } - - this_type &operator =(T *ptr) { reset(ptr); return *this; } - - void reset(T *ptr = NULL) { if(m_ptr) alignedFree(m_ptr); m_ptr = ptr; } - void reset(size_t n) { reset((T*) alignedMalloc(sizeof(T) * n)); } - T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } - -private: - T *m_ptr; - - // noncopyable - scoped_aligned_ptr(const this_type &); - this_type &operator =(const this_type &); -}; - -#if USE_SVM_API == 1 -// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree -// Also supports allocation/reset with a number, which is the number of -// elements of type T -template -class scoped_SVM_aligned_ptr { -public: - typedef scoped_SVM_aligned_ptr this_type; - - scoped_SVM_aligned_ptr() : m_ptr(NULL) {} - scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL) 
{ reset(ptr); } - explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); } - ~scoped_SVM_aligned_ptr() { reset(); } - - T *get() const { return m_ptr; } - operator T *() const { return m_ptr; } - T *operator ->() const { return m_ptr; } - T &operator *() const { return *m_ptr; } - T &operator [](int index) const { return m_ptr[index]; } - - this_type &operator =(T *ptr) { reset(ptr); return *this; } - - void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; } - void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; } - T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } - -private: - T *m_ptr; - cl_context m_ctx; - - // noncopyable - scoped_SVM_aligned_ptr(const this_type &); - this_type &operator =(const this_type &); -}; -#endif /* USE_SVM_API == 1 */ - -} // ns aocl_utils - -#endif - diff --git a/vta/src/intelfocl/intelfocl_device.cc b/vta/src/intelfocl/intelfocl_device.cc deleted file mode 100644 index 5eb1519b1124..000000000000 --- a/vta/src/intelfocl/intelfocl_device.cc +++ /dev/null @@ -1,181 +0,0 @@ -#include -#include -#include "intelfocl_device.h" -#include "AOCLUtils/aocl_utils.h" - -#define MEM_ALIGNMENT (1024) - -#define CL_STATUS_SUCCESS(x) ((x) == CL_SUCCESS) - -void cleanup() {} - -int IntelFOCLDevice::init(size_t mem_size, std::string aocx_file) -{ - cl_int status; - cl_device_id device; - cl_platform_id platform; - unsigned int argi; - bool focl_device_avail; - unsigned int num_devices; - aocl_utils::scoped_array devices; - - platform = aocl_utils::findPlatform("Intel(R) FPGA SDK for OpenCL(TM)"); - CHECK(platform) << "Unable to find Intel(R) FPGA OpenCL platform"; - - devices.reset(aocl_utils::getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices)); - focl_device_avail = false; - for ( unsigned int i = 0; i < num_devices; i ++ ) - { - device = devices[i]; - _context = clCreateContext(NULL, 1, &device, &aocl_utils::oclContextCallback, NULL, &status); - if ( CL_STATUS_SUCCESS(status) ) - { - focl_device_avail = true; - LOG(INFO) << "Using device: " << aocl_utils::getDeviceName(device); - break; - } - } - CHECK(focl_device_avail) << "No FPGA device available"; - num_devices = 1; - - LOG(INFO) << "Using AOCX: " << aocx_file; - _program = aocl_utils::createProgramFromBinary(_context, aocx_file.c_str(), &device, num_devices); - status = clBuildProgram(_program, 0, NULL, "", NULL, NULL); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to build program"; - - for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) - { - _kernels[i] = clCreateKernel(_program, kernel_names[i].c_str(), &status); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create kernel"; - _queues[i] = clCreateCommandQueue(_context, device, 0, &status); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create command queue"; - } - - _mem = clCreateBuffer(_context, CL_MEM_READ_WRITE, mem_size, NULL, &status); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create buffer mem"; - mem_chunk_t init_chunk = {.offset = 0, .size = mem_size, .occupied = false}; - _mem_chunks.push_back(init_chunk); - - argi = 1; - status = clSetKernelArg(_kernels[KERNEL_FETCH], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - argi = 0; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - 
CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - - return 0; -} - -ifocl_mem_off_t IntelFOCLDevice::alloc(size_t size) -{ - auto iter = _mem_chunks.begin(); - size_t aligned_size = ((size + MEM_ALIGNMENT - 1) / MEM_ALIGNMENT) * MEM_ALIGNMENT; - - while ( iter != _mem_chunks.end() && (iter->occupied || (iter->size < aligned_size)) ) - { - iter++; - } - - if ( iter == _mem_chunks.end() ) return IFOCL_MEM_OFF_ERR; - - iter->occupied = true; - if ( iter->size != aligned_size ) - { - mem_chunk_t rem = {iter->offset + aligned_size, iter->size - aligned_size, false}; - iter->size = aligned_size; - _mem_chunks.insert(std::next(iter), rem); - } - - return iter->offset; -} - -void IntelFOCLDevice::free(ifocl_mem_off_t offset) -{ - auto iter = _mem_chunks.begin(); - while ( iter != _mem_chunks.end() && iter->offset < offset ) iter++; - - if ( iter == _mem_chunks.end() || iter->offset != offset || !iter->occupied ) - { - return; - } - - iter->occupied = false; - if ( iter != _mem_chunks.begin() && !std::prev(iter)->occupied ) iter--; - - while ( std::next(iter) != _mem_chunks.end() && !std::next(iter)->occupied ) - { - iter->size += std::next(iter)->size; - _mem_chunks.erase(std::next(iter)); - } -} - - -void IntelFOCLDevice::write_mem(ifocl_mem_off_t offset, const void *buf, size_t nbyte) -{ - cl_int status = clEnqueueWriteBuffer(_queues[0], _mem, CL_TRUE, offset, nbyte, buf, 0, NULL, NULL); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue write buffer"; -} - -void IntelFOCLDevice::read_mem(ifocl_mem_off_t offset, void *buf, size_t nbyte) -{ - cl_int status = clEnqueueReadBuffer(_queues[0], _mem, CL_TRUE, offset, nbyte, buf, 0, NULL, NULL); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue read buffer"; -}; - -int IntelFOCLDevice::execute_instructions(ifocl_mem_off_t offset, size_t count) -{ - cl_int status; - unsigned int argi; - unsigned int insn_offset = offset / VTA_INS_ELEM_BYTES; - unsigned int insn_count = count; - const size_t global_work_size = 1; - - argi = 0; - status = clSetKernelArg(_kernels[KERNEL_FETCH], argi, sizeof(unsigned int), &insn_count); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - argi = 2; - status = clSetKernelArg(_kernels[KERNEL_FETCH], argi, sizeof(unsigned int), &insn_offset); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - - for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) - { - status = clEnqueueNDRangeKernel(_queues[i], _kernels[i], 1, NULL, &global_work_size, NULL, 0, NULL, NULL); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue kernel"; - } - - for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) - { - status = clFinish(_queues[i]); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to clFinish"; - } - - return 0; -}; - -void IntelFOCLDevice::deinit() -{ - for ( unsigned int i = 0; i < NUM_OCL_KERNELS; i++ ) - { - clReleaseKernel(_kernels[i]); - clReleaseCommandQueue(_queues[i]); - } - - clReleaseMemObject(_mem); - - clReleaseProgram(_program); - - clReleaseContext(_context); -} - 
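For context, the device methods above are what the VTAMem*/VTADeviceRun shims in intelfocl_driver.cc further down in this diff call into. The sketch below is illustrative only and not part of the patch: it uses just the IntelFOCLDevice members declared in intelfocl_device.h (alloc, write_mem, execute_instructions, free); the helper name run_instruction_stream and the insns buffer are hypothetical, and VTA_INS_ELEM_BYTES is assumed to come from the VTA hardware spec header.

// Illustrative sketch only: exercises the device API above with minimal error handling.
#include <cstdint>
#include <vector>
#include <vta/hw_spec.h>        // assumed source of VTA_INS_ELEM_BYTES
#include "intelfocl_device.h"

int run_instruction_stream(IntelFOCLDevice &dev, const std::vector<uint8_t> &insns) {
  ifocl_mem_off_t off = dev.alloc(insns.size());    // first-fit, 1 KiB-aligned chunk
  if (off == IFOCL_MEM_OFF_ERR) return -1;
  dev.write_mem(off, insns.data(), insns.size());   // host -> shared cl_mem buffer
  int rc = dev.execute_instructions(off, insns.size() / VTA_INS_ELEM_BYTES);
  dev.free(off);                                    // freed chunk is coalesced with idle neighbors
  return rc;
}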
-IntelFOCLDevice::~IntelFOCLDevice() -{ - deinit(); -} diff --git a/vta/src/intelfocl/intelfocl_device.h b/vta/src/intelfocl/intelfocl_device.h deleted file mode 100644 index 6c53a4d47323..000000000000 --- a/vta/src/intelfocl/intelfocl_device.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef VTA_INTEL_FOCL_DEVICE_H_ -#define VTA_INTEL_FOCL_DEVICE_H_ - -#include -#include - -#include "CL/opencl.h" - -#define NUM_OCL_KERNELS 3 -enum kernel_index {KERNEL_FETCH, KERNEL_COMPUTE, KERNEL_PROFILE}; -static std::string kernel_names[3] = {"fetch", "compute", "profile"}; - -typedef size_t ifocl_mem_off_t; -#define IFOCL_MEM_OFF_ERR (SIZE_MAX) - -typedef struct -{ - ifocl_mem_off_t offset; - size_t size; - bool occupied; -} mem_chunk_t; - -class IntelFOCLDevice { - private: - cl_context _context; - cl_program _program; - cl_mem _mem; - cl_kernel _kernels[NUM_OCL_KERNELS]; - cl_command_queue _queues[NUM_OCL_KERNELS]; - std::list _mem_chunks; - - public: - IntelFOCLDevice() { init(4*1024*1024*1024ULL, "vta_opencl.aocx"); } - - int init(size_t mem_size, std::string aocx_file); - - ifocl_mem_off_t alloc(size_t size); - - void free(ifocl_mem_off_t offset); - - void write_mem(ifocl_mem_off_t offset, const void *buf, size_t nbyte); - - void read_mem(ifocl_mem_off_t offset, void *buf, size_t nbyte); - - int execute_instructions(ifocl_mem_off_t offset, size_t count); - - void deinit(); - - ~IntelFOCLDevice(); -}; - -#endif // VTA_INTEL_FOCL_DEVICE_H_ - diff --git a/vta/src/intelfocl/intelfocl_driver.cc b/vta/src/intelfocl/intelfocl_driver.cc deleted file mode 100644 index a8db9cd0e394..000000000000 --- a/vta/src/intelfocl/intelfocl_driver.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include -#include -#include "intelfocl_device.h" - -#define MEM_ADDR_IDENTIFIER (0x18000000) - -static IntelFOCLDevice focl_device; - -static inline void* mem_get_addr(ifocl_mem_off_t offset) -{ - void *ret = (void *) (offset + MEM_ADDR_IDENTIFIER); - return ret; -} - -static inline ifocl_mem_off_t mem_get_offset(const void *addr) -{ - ifocl_mem_off_t ret = (ifocl_mem_off_t) addr - MEM_ADDR_IDENTIFIER; - return ret; -} - -void* VTAMemAlloc(size_t size, int cached) { - (void) cached; - ifocl_mem_off_t offset = focl_device.alloc(size); - if ( offset == IFOCL_MEM_OFF_ERR ) return NULL; - void *addr = mem_get_addr(offset); - return addr; -} - -void VTAMemFree(void *buf) { - ifocl_mem_off_t offset = mem_get_offset(buf); - focl_device.free(offset); -} - -vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { - ifocl_mem_off_t offset = mem_get_offset(buf); - return (vta_phy_addr_t) offset; -} - -void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { - ifocl_mem_off_t dst_offset = mem_get_offset(dst); - focl_device.write_mem(dst_offset, src, size); -} - -void VTAMemCopyToHost(void* dst, const void* src, size_t size) { - ifocl_mem_off_t src_offset = mem_get_offset(src); - focl_device.read_mem(src_offset, dst, size); -} - -void VTAFlushCache(void * offset, vta_phy_addr_t buf, int size) { - std::cout << "VTAFlushCache not implemented for Intel OpenCL for FPGA devices" << std::endl; -} - -void VTAInvalidateCache(void * offset, vta_phy_addr_t buf, int size) { - std::cout << "VTAInvalidateCache not implemented for Intel OpenCL for FPGA devices" << std::endl; -} - -VTADeviceHandle VTADeviceAlloc() { - return (VTADeviceHandle) &focl_device; -} - -void VTADeviceFree(VTADeviceHandle handle) { - (void) handle; -} - -int VTADeviceRun(VTADeviceHandle handle, - vta_phy_addr_t insn_phy_addr, - uint32_t insn_count, - uint32_t wait_cycles) -{ - (void) 
wait_cycles; - ifocl_mem_off_t offset = (ifocl_mem_off_t) insn_phy_addr; - return focl_device.execute_instructions(offset, insn_count); -} diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc deleted file mode 100644 index 518b6c368926..000000000000 --- a/vta/src/pynq/pynq_driver.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * - * \file pynq_driver.c - * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io). - */ - -#include -#include -#include -#include "pynq_driver.h" - - -void* VTAMemAlloc(size_t size, int cached) { - assert(size <= VTA_MAX_XFER); - // Rely on the pynq-specific cma library - return cma_alloc(size, cached); -} - -void VTAMemFree(void* buf) { - // Rely on the pynq-specific cma library - cma_free(buf); -} - -vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { - return cma_get_phy_addr(buf); -} - -void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { - // For SoC-based FPGAs that used shared memory with the CPU, use memcopy() - memcpy(dst, src, size); -} - -void VTAMemCopyToHost(void* dst, const void* src, size_t size) { - // For SoC-based FPGAs that used shared memory with the CPU, use memcopy() - memcpy(dst, src, size); -} - -void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { - // Call the cma_flush_cache on the CMA buffer - // so that the FPGA can read the buffer data. - cma_flush_cache(vir_addr, phy_addr, size); -} - -void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { - // Call the cma_invalidate_cache on the CMA buffer - // so that the host needs to read the buffer data. 
- cma_invalidate_cache(vir_addr, phy_addr, size); -} - -void *VTAMapRegister(uint32_t addr) { - // Align the base address with the pages - uint32_t virt_base = addr & ~(getpagesize() - 1); - // Calculate base address offset w.r.t the base address - uint32_t virt_offset = addr - virt_base; - // Open file and mmap - uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC); - return mmap(NULL, - (VTA_IP_REG_MAP_RANGE + virt_offset), - PROT_READ|PROT_WRITE, - MAP_SHARED, - mmap_file, - virt_base); -} - -void VTAUnmapRegister(void *vta) { - // Unmap memory - int status = munmap(vta, VTA_IP_REG_MAP_RANGE); - assert(status == 0); -} - -void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) { - *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)) = val; -} - -uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { - return *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)); -} - -class VTADevice { - public: - VTADevice() { - // VTA stage handles - vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR); - vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR); - vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR); - vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR); - } - - ~VTADevice() { - // Close VTA stage handle - VTAUnmapRegister(vta_fetch_handle_); - VTAUnmapRegister(vta_load_handle_); - VTAUnmapRegister(vta_compute_handle_); - VTAUnmapRegister(vta_store_handle_); - } - - int Run(vta_phy_addr_t insn_phy_addr, - uint32_t insn_count, - uint32_t wait_cycles) { - VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count); - VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr); - VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0); - VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0); - VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0); - VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0); - VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0); - - // VTA start - VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); - VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART); - VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART); - VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART); - - // Allow device to respond - struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 }; - nanosleep(&ts, &ts); - - // Loop until the VTA is done - unsigned t, flag = 0; - for (t = 0; t < wait_cycles; ++t) { - flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET); - if (flag == VTA_DONE) break; - std::this_thread::yield(); - } - // Report error if timeout - return t < wait_cycles ? 
0 : 1; - } - - private: - // VTA handles (register maps) - void* vta_fetch_handle_{nullptr}; - void* vta_load_handle_{nullptr}; - void* vta_compute_handle_{nullptr}; - void* vta_store_handle_{nullptr}; -}; - -VTADeviceHandle VTADeviceAlloc() { - return new VTADevice(); -} - -void VTADeviceFree(VTADeviceHandle handle) { - delete static_cast(handle); -} - -int VTADeviceRun(VTADeviceHandle handle, - vta_phy_addr_t insn_phy_addr, - uint32_t insn_count, - uint32_t wait_cycles) { - return static_cast(handle)->Run( - insn_phy_addr, insn_count, wait_cycles); -} diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 30fe7f2b0b06..b61f594872c1 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -96,7 +96,7 @@ # The ``start_pack`` and ``stop_pack`` labels indicate where # to start and end the graph packing relay pass: in other words # where to start and finish offloading to VTA. -model = "resnet18_v1" +model = "resnet50_v2" assert model in pack_dict ###################################################################### @@ -162,7 +162,8 @@ # Load pre-configured AutoTVM schedules log_file = "%s.%s.log-manual-formatv0_2" % (device, model) -with autotvm.tophub.context(target, extra_files=[log_file]): +alu_log_file = "%s.alu.%s.log" % (device, model) +with autotvm.tophub.context(target, extra_files=[log_file, alu_log_file]): # Populate the shape and data type dictionary for ImageNet classifier input dtype_dict = {"data": 'float32'} @@ -176,7 +177,6 @@ # Start front end compilation mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - eprint("from_mxnet mod = ", mod) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -189,7 +189,6 @@ with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): mod = relay.quantize.quantize(mod, params=params) - eprint("done quantize", mod) # Perform graph packing and constant folding for VTA target assert env.BLOCK_IN == env.BLOCK_OUT relay_prog = graph_pack( @@ -199,7 +198,6 @@ env.WGT_WIDTH, start_name=pack_dict[model][0], stop_name=pack_dict[model][1]) - eprint("done graphpack ", relay_prog) else: relay_prog = mod["main"] From 127ae4a72ce794495242a064e1d32b33d9265b38 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 10 Jun 2020 23:31:04 +0800 Subject: [PATCH 24/44] update vta-hw commit --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index db65157208ec..7d9629d58945 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit db65157208ec8fabb7b548c94596211b9db04190 +Subproject commit 7d9629d58945f0f042fb1690847d09f2e3e7781c From a6cd975e11ba53e754ae6438145d193600d74de1 Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Fri, 12 Jun 2020 02:06:13 +0800 Subject: [PATCH 25/44] Rename VTA_MEM_ID_ACC_8 to VTA_MEM_ID_ACC_8BIT --- 3rdparty/vta-hw | 2 +- vta/runtime/runtime.cc | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index 7d9629d58945..410049f9340a 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit 7d9629d58945f0f042fb1690847d09f2e3e7781c +Subproject commit 410049f9340a0ab1552655f8c8bfc1a833851e89 diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 92d5ab06cc8f..d3cfce3e2b66 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -796,7 +796,7 @@ class InsnQueue : public 
BaseQueue { return "LOAD INP"; } else if (c.mem.memory_type == VTA_MEM_ID_ACC) { return "LOAD ACC"; - } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8) { + } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8BIT) { return "LOAD ACC 8"; } else { return "LOAD"; @@ -1123,7 +1123,7 @@ class InsnQueue : public BaseQueue { } // Get stage of the memory static PipelineStage GetMemPipelineStage(int memory_type) { - if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8) return kComputeStage; + if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage; if (memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -1133,7 +1133,7 @@ class InsnQueue : public BaseQueue { if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage; if (insn->opcode == VTA_OPCODE_LOAD) { if (insn->x_size == 0) return kNoneStage; - if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8) return kComputeStage; + if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage; if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -1218,7 +1218,7 @@ class CommandQueue { case VTA_MEM_ID_OUT: elem_bytes = VTA_OUT_ELEM_BYTES; break; - case VTA_MEM_ID_ACC_8: + case VTA_MEM_ID_ACC_8BIT: elem_bytes = VTA_ACC_ELEM_BYTES / 4; break; default: From 06af08b9a3f09a1aadcc151d0878674759dcdeba Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Fri, 12 Jun 2020 12:15:34 +0800 Subject: [PATCH 26/44] back-compatible other vta hardware impl --- vta/python/vta/testing/simulator.py | 2 +- vta/python/vta/top/op.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index bf89107f9f79..5ac8c80fed8d 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -25,7 +25,7 @@ def _load_sw(): """Load hardware library for simulator.""" env = get_env() - lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" if env.TARGET == "sim" else "libvta" + lib_driver_name = "libvta_tsim" if env.TARGET == "" else "libvta" if env.TARGET == "intelfocl" else "libvta_fsim" # Load driver library lib_driver = find_libvta(lib_driver_name, optional=True) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index ae9ca1a90142..69eee2aad94c 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -95,6 +95,11 @@ def is_cast_op(op): te.schedule.AutoInlineInjective(s) # s[output].fuse(s[output].op.axis) + env = get_env() + # other target does not support alu-only ops + if not (env.TARGET in ["sim", "tsim", "intelfocl"]): + return s + # only put the int-related ops to vta if "int" in output.dtype and len(output.shape) == 6: ewise_inputs = [] @@ -144,7 +149,6 @@ def _traverse(op): s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) store_pt = x_j0 - env = get_env() for eo in ewise_ops: s[eo].set_scope(env.acc_scope) s[eo].pragma(s[eo].op.axis[0], env.alu) From 0855a4a2d6f0f0b69bff6ff000838d78fca8ac5e Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Fri, 12 Jun 2020 15:54:12 +0800 Subject: [PATCH 27/44] update vta-hw commit --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index 410049f9340a..5f0a28671be9 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit 410049f9340a0ab1552655f8c8bfc1a833851e89 +Subproject commit 
5f0a28671be9f2c621253c7a33c2dcb678a20ae2 From 63977922c63ba157bf73cb6b8f810f411b1aa7ec Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Fri, 12 Jun 2020 16:22:17 +0800 Subject: [PATCH 28/44] update vta-hw commit --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index 5f0a28671be9..f0347e202966 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit 5f0a28671be9f2c621253c7a33c2dcb678a20ae2 +Subproject commit f0347e202966322fe6a961eab2f4ff963bced2d5 From e43981f284198f2f570ec80e86df3e4ba48c9419 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Sun, 14 Jun 2020 22:34:37 +0800 Subject: [PATCH 29/44] remove unneeded code --- cmake/modules/VTA.cmake | 1 + include/tvm/relay/transform.h | 29 ------ python/tvm/autotvm/measure/measure_methods.py | 10 -- python/tvm/autotvm/tuner/tuner.py | 13 ++- python/tvm/contrib/util.py | 5 - python/tvm/relay/op/op.py | 1 - python/tvm/relay/op/strategy/generic.py | 2 +- python/tvm/relay/quantize/_partition.py | 3 +- python/tvm/relay/transform/transform.py | 4 - src/relay/backend/build_module.cc | 8 -- src/relay/backend/compile_engine.cc | 4 +- src/relay/backend/graph_plan_memory.cc | 64 +++++++------ src/relay/quantize/realize.cc | 11 +-- src/relay/transforms/device_annotation.cc | 96 +------------------ src/tir/transforms/lower_tvm_builtin.cc | 2 + vta/python/vta/environment.py | 2 +- vta/python/vta/top/graphpack.py | 9 +- vta/python/vta/top/op.py | 25 +---- vta/python/vta/top/vta_conv2d.py | 7 -- vta/python/vta/transform.py | 3 +- vta/runtime/runtime.cc | 38 +------- .../integration/test_benchmark_topi_conv2d.py | 4 +- vta/tutorials/autotvm/tune_alu_vta.py | 2 - vta/tutorials/autotvm/tune_relay_vta.py | 17 ++-- .../frontend/deploy_classification.py | 5 +- vta/tutorials/frontend/deploy_dcgan.py | 2 - vta/tutorials/frontend/deploy_mobilenet.py | 1 - 27 files changed, 75 insertions(+), 293 deletions(-) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 371bd27fa80e..4193fbaf657f 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -125,6 +125,7 @@ elseif(PYTHON) elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() endif() diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index a7f5fea98ea2..b287c053e8a9 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -116,16 +116,6 @@ TVM_DLL Pass FuseOps(int fuse_opt_level = -1); */ TVM_DLL Pass RewriteAnnotatedOps(int fallback_device); -/*! - * \brief add device_copy if two adjacent nodes are on different devices - * - * \param expr The expression. - * - * \return The updated program. - */ -TVM_DLL Pass AddDeviceCopyOps(); - - /*! * \brief turn a dataflow graph into Administrative Normal Form, or A-Normal Form (ANF). * @@ -384,7 +374,6 @@ TVM_DLL Expr Bind(const Expr& expr, const tvm::Map& binds); * \note this function mutates mod and is not thread-safe. */ TVM_DLL Function InferType(const Function& f, const IRModule& mod, const GlobalVar& var); -TVM_DLL Expr InferType(const Expr& expr, const IRModule& mod); /*! * \brief Apply rewrite rules to rewrite the expr in post DFS order. 
This @@ -429,24 +418,6 @@ TVM_DLL Expr ForwardRewrite(const Expr& expr, const FForwardRewrite& rewrite_fun */ TVM_DLL Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device); -/*! - * \brief add device_copy if two adjacent nodes are on different devices - * - * \param expr The expression. - * - * \return The updated program. - */ -TVM_DLL Expr AddDeviceCopyOps(const Expr& expr); - -/*! - * \brief Fuse operations into expr into seperate functions. - * - * \param fuse_opt_level Optimization level. If it is -1 it will be inferred from pass context. - * - * \return The pass. - */ -TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level, const IRModule& module); - /*! * \brief Turn an expression into continuation passing style(CPS). * diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 666d307247c1..d6b5defb710c 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -50,8 +50,6 @@ from .measure import MeasureResult, MeasureErrorNo, Builder, Runner from .local_executor import LocalExecutor -from tvm.contrib.util import eprint - logger = logging.getLogger('autotvm') class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))): @@ -489,14 +487,6 @@ def run_through_rpc(measure_input, build_result, try: # upload built module remote = request_remote(*remote_args) - # Program the FPGA every single time when targeting VTA - if hasattr(measure_input.target, 'device_name') and \ - measure_input.target.device_name == 'vta': - # pylint: disable=import-outside-toplevel - from vta import program_fpga, reconfig_runtime - # FIXME(zhanghao): remove this - # program_fpga(remote, None) - # reconfig_runtime(remote) remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 4f984aae701f..2441a4ae642f 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -161,13 +161,12 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr logger.debug("Early stopped. Best iter: %d.", self.best_iter) break - # NOTE(zhanghao): comment out as it will raise too many logs - # if error_ct > 150: - # logging.basicConfig() - # logger.warning("Too many errors happen in the tuning. Now is in debug mode") - # logger.setLevel(logging.DEBUG) - # else: - # logger.setLevel(old_level) + if error_ct > 150: + logging.basicConfig() + logger.warning("Too many errors happen in the tuning. 
Now is in debug mode") + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(old_level) GLOBAL_SCOPE.in_tuning = False del measure_batch diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 20854ab3fb27..474741fc1e35 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -29,11 +29,6 @@ fcntl = None -def eprint(*args, **kwargs): - # return - print(*args, file=sys.stderr, flush=True, **kwargs) - - class DirectoryCreatedPastAtExit(Exception): """Raised when a TempDirectory is created after the atexit hook runs.""" diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 8ef51cf595fc..7fad9a258f2b 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -394,7 +394,6 @@ def register_external_compiler(op_name, fexternal=None, level=10): return tvm.ir.register_op_attr(op_name, "FTVMExternalCompiler", fexternal, level) - @tvm._ffi.register_func("relay.op.compiler._lower") def _lower(name, schedule, inputs, outputs): return lower(schedule, list(inputs) + list(outputs), name=name) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 025d67630cf9..63ad1127bbc0 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -35,7 +35,7 @@ def wrapper(attrs, outs, target): def wrap_topi_compute(topi_compute): - """Wrap TOPI schedule which doesn't use attrs""" + """Wrap TOPI compute which doesn't use attrs""" def wrapper(attrs, inputs, out_type): return [topi_compute(*inputs)] return wrapper diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index f26e88301894..c0234594a38f 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -21,7 +21,6 @@ from .. import analysis as _analysis from . import _quantize from .quantize import _forward_op -from tvm.contrib.util import eprint def register_partition_function(op_name, frewrite=None, level=10): return tvm.ir.register_op_attr(op_name, "FQPartitionRewrite", frewrite, level) @@ -55,7 +54,7 @@ def conv2d_partition_function(ref_call, new_args, ctx): @register_partition_function("nn.conv2d_transpose") def conv2d_partition_function(ref_call, new_args, ctx): - """Rewrite function for conv2d for partition""" + """Rewrite function for conv2d_transpose for partition""" data_cond, data = partition_expr_check(new_args[0]) kernel_cond, kernel = partition_expr_check(new_args[1]) diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index d1a93fd5f9b8..8f4ec1046500 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -441,10 +441,6 @@ def RewriteAnnotatedOps(fallback_device): return _ffi_api.RewriteDeviceAnnotation(fallback_device) -def AddDeviceCopy(): - return _transform.AddDeviceCopy() - - def ToANormalForm(): """Turn Graph Normal Form expression into A Normal Form Expression. The scope of the root expression is the global scope. diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index cbe4ae2d4256..f9ce24d410b7 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -419,14 +419,6 @@ class RelayBuildModule : public runtime::ModuleNode { // Get the updated function. 
auto func = Downcast(relay_module->Lookup("main")); - // do extra pass to check to insert device_copy if necessary - if (targets_.size() > 1) { - func = Downcast(relay::AddDeviceCopyOps(func)); - // we have to do fuseops again as we may add new device_copy ops - func = Downcast(relay::FuseOps(func, -1, relay_module)); - func = Downcast(relay::InferType(func, relay_module)); - } - // Generate code for the updated function. graph_codegen_ = std::unique_ptr(new GraphCodegen()); graph_codegen_->Init(nullptr, targets_); diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 37fb0108f111..8079fdbe76d8 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -123,7 +123,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> readable_name_stream_ << "fused"; cache_node->outputs = this->VisitExpr(prim_func->body); auto candidate_name = readable_name_stream_.str(); - constexpr static size_t kMaxFuncNameLength = 800; + constexpr static size_t kMaxFuncNameLength = 80; if (candidate_name.size() > kMaxFuncNameLength) { std::stringstream truncated_name; truncated_name << candidate_name.substr(0, kMaxFuncNameLength); @@ -343,7 +343,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> auto cache_node = make_object(); cache_node->outputs = VisitExpr(prim_func->body); auto candidate_name = readable_name_stream_.str(); - constexpr static size_t kMaxFuncNameLength = 800; + constexpr static size_t kMaxFuncNameLength = 80; if (candidate_name.size() > kMaxFuncNameLength) { std::stringstream truncated_name; truncated_name << candidate_name.substr(0, kMaxFuncNameLength); diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 8ebf9847c3a7..66de20dcf4c0 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -309,35 +309,41 @@ class StorageAllocator : public StorageAllocaBaseVisitor { if (match_range_ == 0) { return this->Alloc(prototype, size); } - // TODO(zhanghao): to avoid overwrite shared storage when we copy all the instructions in a single batch - // auto begin = free_.lower_bound(size / match_range_); - // auto mid = free_.lower_bound(size); - // auto end = free_.upper_bound(size * match_range_); - // // search for memory blocks larger than requested - // for (auto it = mid; it != end; ++it) { - // StorageToken *tok = it->second; - // if (tok->device_type != prototype->device_type) continue; - // CHECK_EQ(tok->ref_counter, 0); - // // Use exect matching strategy - // tok->max_bytes = std::max(size, tok->max_bytes); - // tok->ref_counter = prototype->ref_counter; - // // find a exact match, erase from map and return - // free_.erase(it); - // return tok; - // } - // // then search for memory blocks smaller than requested space - // for (auto it = mid; it != begin;) { - // --it; - // StorageToken *tok = it->second; - // if (tok->device_type != prototype->device_type) continue; - // CHECK_EQ(tok->ref_counter, 0); - // // Use exect matching strategy - // tok->max_bytes = std::max(size, tok->max_bytes); - // tok->ref_counter = prototype->ref_counter; - // // erase from map and return - // free_.erase(it); - // return tok; - // } + // quickfix(zhanghao): we copy all the instructions in a single batch + // to avoid overwrite shared storage, we do not re-use allocation + const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + if (sync_once) { + return this->Alloc(prototype, size); + } + + auto begin = free_.lower_bound(size / 
match_range_); + auto mid = free_.lower_bound(size); + auto end = free_.upper_bound(size * match_range_); + // search for memory blocks larger than requested + for (auto it = mid; it != end; ++it) { + StorageToken* tok = it->second; + if (tok->device_type != prototype->device_type) continue; + CHECK_EQ(tok->ref_counter, 0); + // Use exect matching strategy + tok->max_bytes = std::max(size, tok->max_bytes); + tok->ref_counter = prototype->ref_counter; + // find a exact match, erase from map and return + free_.erase(it); + return tok; + } + // then search for memory blocks smaller than requested space + for (auto it = mid; it != begin;) { + --it; + StorageToken* tok = it->second; + if (tok->device_type != prototype->device_type) continue; + CHECK_EQ(tok->ref_counter, 0); + // Use exect matching strategy + tok->max_bytes = std::max(size, tok->max_bytes); + tok->ref_counter = prototype->ref_counter; + // erase from map and return + free_.erase(it); + return tok; + } // cannot find anything return a new one. return this->Alloc(prototype, size); } diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index b71249c8c755..9dbc27d2c5a3 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -360,11 +360,7 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args if (nptrs[i]->dtype != dtype) { auto new_arg = Cast(ret[i], dtype); - // NOTE(zhanghao) - // if you want to let cpu to do all the cast, use the following code - // ret.Set(i, StopFusion(new_arg)); - - // do not fuse float32 cast + // FIXME(zhanghao): do not fuse float32 cast if (nptrs[i]->dtype == DataType::Float(32)) { ret.Set(i, StopFusion(new_arg)); } else { @@ -374,11 +370,6 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args ref_arg->attrs.as()->kind == kQInput) { auto new_arg = Cast(ret[i], cfg->dtype_input); new_arg = StopFusion(new_arg); - - // NOTE(zhanghao) - // if you want to let cpu to do all the cast, use the following code - // ret.Set(i, StopFusion(Cast(new_arg, dtype))); - ret.Set(i, Cast(new_arg, dtype)); } } diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 3609ee0bacc4..fe3cfebf7fe3 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -60,10 +60,8 @@ bool IsDeviceCopyNode(const ExprNode* node) { class ValidateAnnotation : private ExprVisitor { public: - ValidateAnnotation(int fallback_device): fallback_device_(fallback_device) {} - - static std::unordered_map Validate(const Expr& expr, int fallback_device) { - ValidateAnnotation valid(fallback_device); + static std::unordered_map Validate(const Expr& expr) { + ValidateAnnotation valid; valid(expr); return valid.annotation_map_; } @@ -82,15 +80,12 @@ class ValidateAnnotation : private ExprVisitor { CHECK_EQ(call_node->args.size(), 1U); const auto* node = call_node->args[0].operator->(); - // LOG(WARNING) << "annotated node, device_type = " << device_type << " : " << GetRef(node).as()->op; if (annotation_map_.count(node)) { CHECK_EQ(annotation_map_.at(node), device_type) << "An expression node can only be annotated to one device."; } else { annotation_map_.insert({node, GetDeviceId(call_node)}); } - - if (device_type != fallback_device_) extra_device_ = device_type; } } @@ -114,8 +109,6 @@ class ValidateAnnotation : private ExprVisitor { } std::unordered_map annotation_map_; - int fallback_device_ = 0; - int extra_device_ = 0; }; // Replace the use of an expression with the output of a `copy_device` 
operator @@ -129,7 +122,7 @@ class RewriteAnnotation : public ExprMutator { public: Expr Rewrite(const Expr& expr, int fallback_device) { fallback_device_ = fallback_device; - annotation_map_ = ValidateAnnotation::Validate(expr, fallback_device); + annotation_map_ = ValidateAnnotation::Validate(expr); return this->VisitExpr(expr); } @@ -236,7 +229,6 @@ class RewriteAnnotation : public ExprMutator { CHECK(dit != annotation_map_.end()) << "Device copy op is not required when both src and dst ops are not " "annotated."; - // LOG(WARNING) << "Create device copy " << fallback_device_ << " to " << dit->second << ": " << src.as()->op; return CreateDeviceCopy(src, fallback_device_, dit->second); } else { const auto dit = annotation_map_.find(dst); @@ -255,8 +247,7 @@ class RewriteAnnotation : public ExprMutator { return src_dev_type != fallback_device_; } } else { - // if annotation value < 0, it means this is for "copy from" only - if (annotation_map_.count(dst) && annotation_map_.at(dst) > 0) { + if (annotation_map_.count(dst)) { // Though data copy op could be inserted whenever the `src` and `dst` // ops are annotated to different devices, it leads to high overhead. // @@ -496,68 +487,6 @@ class DeviceInfo { Map device_map_; }; - -// TODO(zhanghao): consider to remove this as I think it is not necessary for now -class AddDeviceCopy : public ExprMutator { - public: - Expr Rewrite(const Expr& expr) { - device_map_ = DeviceInfo::GetDeviceMap(expr); - return this->Mutate(expr); - } - - private: - // add device copy if two nodes not on the same device - Expr VisitExpr_(const CallNode* call_node) override { - auto func_node = call_node->op.as(); - bool src_is_copy_node = false; - if (func_node && IsDeviceCopyNode(func_node->body.as())) { - // LOG(WARNING) << "DeviceCopy skip device_copy node"; - src_is_copy_node = true; - } - - tvm::Array call_args; - auto call_expr = GetRef(call_node); - CHECK(device_map_.count(call_expr)); - - for (auto& arg: call_node->args) { - CHECK(device_map_.count(arg)); - bool dst_is_copy_node = false; - if (auto arg_node = arg.as()) { - auto func_node = arg_node->op.as(); - if (func_node && IsDeviceCopyNode(func_node->body.as())) { - // LOG(WARNING) << "DeviceCopy skip dst device_copy node"; - dst_is_copy_node = true; - } - } - - int src_dev_type = device_map_.count(arg) ? device_map_[arg]->value : 1; - int dst_dev_type = device_map_.count(call_expr) ? device_map_[call_expr]->value : 1; - if (!src_is_copy_node && !dst_is_copy_node && src_dev_type != dst_dev_type) { - // auto arg_call = arg.as(); - // LOG(WARNING) << "Not consistent device type, src = " << src_dev_type << ":" << (arg_call ? 
arg_call->op : arg); - // LOG(WARNING) << "Not consistent device type, dst = " << dst_dev_type << ":" << call_node->op; - auto attrs = make_object(); - attrs->src_dev_type = src_dev_type; - attrs->dst_dev_type = dst_dev_type; - static const Op& op = Op::Get("device_copy"); - Call device_copy = Call(op, {this->Mutate(arg)}, Attrs(attrs), {}); - device_copy->checked_type_ = arg->checked_type_; - call_args.push_back(device_copy); - } else { - call_args.push_back(this->Mutate(arg)); - } - } - - auto ret = Call(call_node->op, call_args, call_node->attrs, call_node->type_args); - // manually add the checked_type_ - // alternatively, can call InferType Pass after this - ret->checked_type_ = call_node->checked_type_; - return ret; - } - - Map device_map_; -}; - Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { RewriteAnnotation rewrote = RewriteAnnotation(); Expr new_expr = rewrote.Rewrite(expr, fallback_device); @@ -605,12 +534,6 @@ Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { } } -Expr AddDeviceCopyOps(const Expr& expr) { - auto rewrote = AddDeviceCopy(); - Expr new_expr = rewrote.Rewrite(expr); - return new_expr; -} - Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } @@ -636,17 +559,6 @@ Pass RewriteAnnotatedOps(int fallback_device) { TVM_REGISTER_GLOBAL("relay._transform.RewriteDeviceAnnotation").set_body_typed(RewriteAnnotatedOps); -Pass AddDeviceCopyOps() { - runtime::TypedPackedFunc pass_func = - [=](Function f, IRModule m, PassContext pc) { - return Downcast(AddDeviceCopyOps(f)); - }; - return CreateFunctionPass(pass_func, 1, "AddDeviceCopyOps", {"InferType"}); -} - -TVM_REGISTER_GLOBAL("relay._transform.AddDeviceCopy") -.set_body_typed(AddDeviceCopyOps); - } // namespace transform } // namespace relay diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 386e9885807b..3d54d45015c6 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -87,6 +87,8 @@ class BuiltinLower : public StmtExprMutator { // Get constant allocation bound. 
int64_t nbytes = GetVectorBytes(op->dtype); // FIXME(zhanghao): remove special handling for kDLCPU + // otherwise, may cause LLVM parameters match error + // if in heterogenous targets // if (device_type_.defined()) { // if (arith::GetConst(device_type_, &dev_type)) { // if (dev_type == kDLCPU) { diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 548dc03aae78..cfed3f77def1 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -62,7 +62,7 @@ class DevContext(object): MEM_ID_INP = 2 MEM_ID_ACC = 3 MEM_ID_OUT = 4 - MEM_ID_ACC_8 = 5 + MEM_ID_ACC_8BIT = 5 # VTA ALU Opcodes ALU_OPCODE_MIN = 0 ALU_OPCODE_MAX = 1 diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index ac4c8aac4539..0934ed15d8b9 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -21,7 +21,6 @@ from tvm import relay from tvm.relay import op, transform from tvm.relay import ExprMutator -from tvm.contrib.util import eprint def run_opt_pass(expr, opt_pass): """Exectue a relay pass.""" @@ -416,11 +415,6 @@ def visit_call(self, call): elif self.start_pack and call.op == op.op.get('cast') and \ input_types[0].dtype == 'int32': cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) - # zhanghao: force separate cast and copy (to let copy do on cpu) - # cast = relay.Call(op.op.get('annotation.stop_fusion'), [cast]) - - # zhanghao: remove the redudant copy - # return relay.Call(op.op.get('copy'), [cast]) return cast elif call.op == self.pad: pad_width = call.attrs.pad_width @@ -516,7 +510,7 @@ def graph_pack(expr, stop_name="nn.global_avg_pool2d", start_name_idx=None, stop_name_idx=None, - count_meta=False, device_annot=True): + count_meta=False, device_annot=False): """Pack the graph into batch&channel packed format. Parameters @@ -574,6 +568,7 @@ def graph_pack(expr, expr_locator = ExprLocater() expr_locator.visit(expr) + # FIXME(zhanghao): generalize this part # from the first int conv2d to the last int stop_fusion, all will run on vta conv2d = op.op.get("nn.conv2d") conv2d_transpose = op.op.get("nn.conv2d_transpose") diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 69eee2aad94c..617be4b56d19 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -35,10 +35,6 @@ from ..environment import get_env -# override to force partition at copy -# TODO(zhanghao): remove all copy -# reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) - # add clip vta strategy def compute_clip_vta(attrs, inputs, output_type): """ Clip operator. 
""" @@ -76,11 +72,6 @@ def multiply_packed(cfg, lhs, rhs): return topi.multiply(lhs, rhs) -@autotvm.register_topi_compute("copy.vta") -def copy_packed(cfg, i): - return topi.identify(i) - - def schedule_alu_packed(cfg, outs): assert len(outs) == 1 @@ -183,11 +174,6 @@ def schedule_multiply_packed(cfg, outs): return schedule_alu_packed(cfg, outs) -@autotvm.register_topi_schedule("copy.vta") -def schedule_copy_packed(cfg, outs): - return schedule_alu_packed(cfg, outs) - - def add_strategy_vta(attrs, inputs, out_type, target): strategy = OpStrategy() strategy.add_implementation( @@ -206,18 +192,9 @@ def multiply_strategy_vta(attrs, inputs, out_type, target): return strategy -def copy_strategy_vta(attrs, inputs, out_type, target): - strategy = OpStrategy() - strategy.add_implementation( - _strategy.wrap_topi_compute(copy_packed), - _strategy.wrap_topi_schedule(schedule_copy_packed), - name="copy.vta") - return strategy - - reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") -reg.get("copy").get_attr("FTVMStrategy").register(copy_strategy_vta, "vta") + @_strategy.conv2d_strategy.register("vta") def conv2d_strategy_vta(attrs, inputs, out_type, target): diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 525d60ae383d..5b23ddeba1c1 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -25,9 +25,6 @@ from .util import is_packed_layout from ..environment import get_env -from tvm.relay import op as Op -from tvm.contrib.util import eprint - @autotvm.register_topi_compute("conv2d_packed.vta") def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): @@ -66,7 +63,6 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty return res - @autotvm.register_topi_schedule("conv2d_packed.vta") def schedule_conv2d_packed(cfg, outs): """Schedule packed conv2d""" @@ -188,6 +184,3 @@ def _traverse(op): s[output].pragma(x_co1, env.dma_copy) return s - - - diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index 3b13c1769103..a8ecb1099a89 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -549,6 +549,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _check_compact(dst) + # FIXME(zhanghao): optimize # for int8 -> int32 cast/load orig_dtype = src.dtype if src.dtype != data_type: @@ -562,7 +563,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): if orig_dtype != src.dtype: src.dtype = orig_dtype - mem_type = env.dev.MEM_ID_ACC_8 + mem_type = env.dev.MEM_ID_ACC_8BIT irb = tvm.tir.ir_builder.create() irb.scope_attr(env.dev.vta_axis, "coproc_scope", diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index d3cfce3e2b66..cf70f7e19361 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -186,12 +186,7 @@ struct DataBuffer { * Bytes. */ void MemCopyFromHost(void* dst, const void* src, size_t size) { - // struct timespec start, stop; - // clock_gettime(CLOCK_REALTIME, &start); VTAMemCopyFromHost(dst, src, size); - // clock_gettime(CLOCK_REALTIME, &stop); - // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; - // LOG(WARNING) << "DataBuffer VTAMemCopyFromHost: " << elapsed << " us"; } /*! * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. 
@@ -549,16 +544,6 @@ class UopQueue : public BaseQueue { buff_size += cache_[i]->size() * kElemBytes; } CHECK(buff_size <= kMaxBytes); - // Move kernel contents to FPGA readable buffer - // uint32_t offset = 0; - // for (uint32_t i = 0; i < cache_.size(); ++i) { - // uint32_t ksize = cache_[i]->size() * kElemBytes; - // VTAMemCopyFromHost(static_cast(fpga_buff_) + offset, - // cache_[i]->data(), - // ksize); - // // Update offset - // offset += ksize; - // } // merge all the cache entries and do CopyFromHost once uint32_t total_size = 0; @@ -797,7 +782,7 @@ class InsnQueue : public BaseQueue { } else if (c.mem.memory_type == VTA_MEM_ID_ACC) { return "LOAD ACC"; } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8BIT) { - return "LOAD ACC 8"; + return "LOAD ACC 8BIT"; } else { return "LOAD"; } @@ -860,6 +845,7 @@ class InsnQueue : public BaseQueue { // Iterate over all instructions int insn_count = count(); const VTAGenericInsn* insn = data(); + // FIXME(zhanghao): rapidjson dep rapidjson::StringBuffer s; rapidjson::Writer writer(s); @@ -1335,18 +1321,9 @@ class CommandQueue { // Check if there are no instruction to execute at all if (insn_queue_.count() == 0) return; // Synchronization for the queues - // struct timespec start, stop; - // clock_gettime(CLOCK_REALTIME, &start); uop_queue_.AutoReadBarrier(); - // clock_gettime(CLOCK_REALTIME, &stop); - // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; - // LOG(WARNING) << "UopQueue VTAMemCopyFromHost: " << elapsed << " us"; - // clock_gettime(CLOCK_REALTIME, &start); insn_queue_.AutoReadBarrier(); - // clock_gettime(CLOCK_REALTIME, &stop); - // elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; - // LOG(WARNING) << "InsnQueue VTAMemCopyFromHost: " << elapsed << " us"; // Dump instructions if debug enabled if (debug_flag_ & VTA_DEBUG_DUMP_INSN) { insn_queue_.DumpInsn(); @@ -1505,6 +1482,7 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: // one pending store, one pending load, and one uop + // FIXME(zhanghao): check why there are 5 insns if ((insn_queue_.count() + 5) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { this->AutoSync(); } @@ -1547,13 +1525,8 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (from_buffer) { // This is an FPGA to host mem transfer // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly - // struct timespec start, stop; - // clock_gettime(CLOCK_REALTIME, &start); const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); if (sync_once) VTASynchronize(VTATLSCommandHandle(), 1<<31, false); - // clock_gettime(CLOCK_REALTIME, &stop); - // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; - // LOG(WARNING) << "Final Synchronize: " << elapsed << " us"; from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, static_cast(from) + from_offset, size); @@ -1573,8 +1546,6 @@ void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) { static_cast(cmd)->SetDebugFlag(debug_flag); } -// TODO(zhanghao): now we do the check here -// it would be better to do the check in ir_pass before adding the "VTABufferCPUPtr" void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) { auto data_buf = vta::DataBuffer::FromHandle(buffer); if (data_buf) { @@ -1645,5 +1616,4 @@ int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) { 
} void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip) { - static_cast(cmd)-> - Synchronize(wait_cycles, skip); } + static_cast(cmd)->Synchronize(wait_cycles, skip); } diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index ea6b9cf1e9da..1d940c2ac9be 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -246,12 +246,12 @@ def _run(env, remote): reconfig_runtime(remote) elif device == "arm_cpu": target = env.target_vta_cpu - with autotvm.tophub.context(target, extra_files = ['vta.resnet18_v1.log-manual-formatv0_2']): # load pre-tuned schedule parameters + with autotvm.tophub.context(target): # load pre-tuned schedule parameters for _, wl in resnet_wkls: print(wl) run_conv2d(env, remote, wl, target) vta.testing.run(_run) if __name__ == "__main__": - # test_conv2d(device="arm_cpu") + test_conv2d(device="arm_cpu") test_conv2d(device="vta") diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index 8a9a09c76856..a5f03cdc22c7 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -37,7 +37,6 @@ from vta.top import graph_pack import copy -from tvm.contrib.util import eprint ################################################################# # Compile network @@ -145,7 +144,6 @@ def log_to_file(file_out, protocol='json'): def _callback(_, inputs, results): with open(file_out, "a") as f: for inp, result in zip(inputs, results): - eprint("inp = {}, result = {}".format(inp, result)) f.write(record.encode(inp, result, protocol) + "\n") # we only consider task with same lhs and rhs diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 7e537fae9128..3f62f15b6490 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -76,7 +76,7 @@ # Perform vta-specific compilation with Relay from a Gluon model -def compile_network(env, target, model, start_pack, stop_pack, device_annot=False): +def compile_network(env, target, model, start_pack, stop_pack): # Populate the shape and data type dictionary dtype_dict = {"data": 'float32'} @@ -104,8 +104,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals env.BLOCK_OUT, env.WGT_WIDTH, start_name=start_pack, - stop_name=stop_pack, - device_annot=device_annot) + stop_name=stop_pack) return relay_prog, params @@ -195,7 +194,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals # The ``start_pack`` and ``stop_pack`` labels indicate where # to start and end the graph packing relay pass: in other words # where to start and finish offloading to VTA. -network = "resnet50_v2" +network = "resnet18_v1" start_pack = "nn.max_pool2d" stop_pack = "nn.global_avg_pool2d" @@ -368,7 +367,7 @@ def tune_and_evaluate(tuning_opt): tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) # We should have extracted 10 convolution tasks - # assert len(tasks) == 10 + assert len(tasks) == 10 print("Extracted {} conv2d tasks:".format(len(tasks))) for tsk in tasks: inp = tsk.args[0][1] @@ -386,7 +385,7 @@ def tune_and_evaluate(tuning_opt): # We do not run the tuning in our webpage server since it takes too long. # Comment the following line to run it by yourself. 
- # return + return # run tuning tasks print("Tuning...") @@ -402,9 +401,9 @@ def tune_and_evaluate(tuning_opt): if target.device_name != "vta": with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build(relay_prog, - target=target, - params=params, - target_host=env.target_host) + target=target, + params=params, + target_host=env.target_host) else: targets = { "cpu": env.target_vta_cpu, diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index b61f594872c1..a168d30c7498 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -59,7 +59,6 @@ import vta from vta.testing import simulator from vta.top import graph_pack -from tvm.contrib.util import eprint # Make sure that TVM was compiled with RPC=1 assert tvm.runtime.enabled("rpc") @@ -96,7 +95,7 @@ # The ``start_pack`` and ``stop_pack`` labels indicate where # to start and end the graph packing relay pass: in other words # where to start and finish offloading to VTA. -model = "resnet50_v2" +model = "resnet18_v1" assert model in pack_dict ###################################################################### @@ -197,7 +196,7 @@ env.BLOCK_OUT, env.WGT_WIDTH, start_name=pack_dict[model][0], - stop_name=pack_dict[model][1]) + stop_name=pack_dict[model][1], device_annot=True) else: relay_prog = mod["main"] diff --git a/vta/tutorials/frontend/deploy_dcgan.py b/vta/tutorials/frontend/deploy_dcgan.py index 95a3731f98f9..6aaff4301258 100644 --- a/vta/tutorials/frontend/deploy_dcgan.py +++ b/vta/tutorials/frontend/deploy_dcgan.py @@ -20,7 +20,6 @@ import vta from vta.testing import simulator from vta.top import graph_pack -from tvm.contrib.util import eprint # Make sure that TVM was compiled with RPC=1 assert tvm.runtime.enabled("rpc") @@ -145,7 +144,6 @@ m = graph_runtime.create(graph, lib, ctxes) image = np.zeros((1, 100), dtype=np.float32) -eprint("image", image.dtype, image) image = np.repeat(image, env.BATCH, axis=0) # Set the network parameters and inputs diff --git a/vta/tutorials/frontend/deploy_mobilenet.py b/vta/tutorials/frontend/deploy_mobilenet.py index 8a94a588741e..9cf9dd98b09c 100644 --- a/vta/tutorials/frontend/deploy_mobilenet.py +++ b/vta/tutorials/frontend/deploy_mobilenet.py @@ -20,7 +20,6 @@ import vta from vta.testing import simulator from vta.top import graph_pack -from tvm.contrib.util import eprint # Make sure that TVM was compiled with RPC=1 assert tvm.runtime.enabled("rpc") From d6993845ea2891fdefb782de5ac0dc997427fd20 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 15 Jun 2020 00:27:32 +0800 Subject: [PATCH 30/44] refine graphpack and deploy exp --- vta/python/vta/top/graphpack.py | 28 ++++++++++------- .../frontend/deploy_classification.py | 31 ++++++++++--------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 0934ed15d8b9..ea2a20dd8797 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -510,7 +510,10 @@ def graph_pack(expr, stop_name="nn.global_avg_pool2d", start_name_idx=None, stop_name_idx=None, - count_meta=False, device_annot=False): + count_meta=False, + device_annot=False, + annot_start_name="nn.conv2d", + annot_end_name="annotation.stop_fusion"): """Pack the graph into batch&channel packed format. Parameters @@ -547,6 +550,15 @@ def graph_pack(expr, 'expr.astext(show_meta_data=False)'. 
When count_meta is True, the operator increase logic would count the meta. + device_annot: boolean, optional + if we want to annoate the device_type + + annot_start_name: str, optional + device annotation start node, from which we mark the nodes as `ext_dev` + + annot_end_name: str, optional + device annotation end node, after which we mark the nodes as 'cpu' + Returns ------- expr : Expr @@ -568,18 +580,12 @@ def graph_pack(expr, expr_locator = ExprLocater() expr_locator.visit(expr) - # FIXME(zhanghao): generalize this part - # from the first int conv2d to the last int stop_fusion, all will run on vta - conv2d = op.op.get("nn.conv2d") - conv2d_transpose = op.op.get("nn.conv2d_transpose") - stop_fusion = op.op.get("annotation.stop_fusion") - if (conv2d, "int32") in expr_locator.op2nodes: - start = expr_locator.op2nodes[(conv2d, "int32")][0] - else: - start = expr_locator.op2nodes[(conv2d_transpose, "int32")][0] + annot_start = op.op.get(annot_start_name) + start = expr_locator.op2nodes[(annot_start, "int32")][0] + annot_end = op.op.get(annot_end_name) # we mark the next op to the last stop_fusion on cpu device - end = expr_locator.op2nodes[(stop_fusion, "int8")][-1] + 1 + end = expr_locator.op2nodes[(annot_end, "int8")][-1] + 1 device_annot = ExprDeviceAnnot(start=start, end=end) expr = device_annot.visit(expr) diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index a168d30c7498..fe5b62890922 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -75,11 +75,6 @@ # or ``device=vta`` to run inference on the FPGA. device = "vta" target = env.target if device == "vta" else env.target_vta_cpu -# multiple targets to run both on cpu and vta -targets = { - "cpu": env.target_vta_cpu, - "ext_dev": env.target -} # Dictionary lookup for when to start/end bit packing pack_dict = { @@ -140,8 +135,7 @@ remote = rpc.LocalSession() # Get execution context from remote -# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) -ctxes = [remote.ext_dev(0), remote.cpu(0)] +ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) ###################################################################### # Build the inference graph runtime @@ -160,9 +154,7 @@ # # Load pre-configured AutoTVM schedules -log_file = "%s.%s.log-manual-formatv0_2" % (device, model) -alu_log_file = "%s.alu.%s.log" % (device, model) -with autotvm.tophub.context(target, extra_files=[log_file, alu_log_file]): +with autotvm.tophub.context(target): # Populate the shape and data type dictionary for ImageNet classifier input dtype_dict = {"data": 'float32'} @@ -207,9 +199,15 @@ relay_prog, target=target, params=params, target_host=env.target_host) else: + if env.TARGET == "intelfocl": + # multiple targets to run both on cpu and vta + target = { + "cpu": env.target_vta_cpu, + "ext_dev": target + } with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( - relay_prog, target=targets, + relay_prog, target=target, params=params, target_host=env.target_host) # Measure Relay build time @@ -222,8 +220,13 @@ remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") - # Graph runtime - m = graph_runtime.create(graph, lib, ctxes) + + if env.TARGET == "intelfocl": + ctxes = [remote.ext_dev(0), remote.cpu(0)] + m = graph_runtime.create(graph, lib, ctxes) + else: + # Graph runtime + m = graph_runtime.create(graph, lib, ctx) 
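Taken together, the hunks above wire up the heterogeneous intelfocl flow: graph_pack can annotate the packed conv2d region for the FPGA, relay.build accepts a per-device target dict, and the graph runtime is created with one context per device. The following is a minimal sketch of how those pieces fit together, using only the APIs exercised in this patch (graph_pack, vta.build_config, relay.build, graph_runtime.create); `mod`, `params`, and `remote` are assumed to come from the quantization and RPC setup earlier in the tutorial, and the start/stop names correspond to the resnet18_v1 entry of pack_dict.

import vta
from tvm import relay
from tvm.contrib import graph_runtime
from vta.top import graph_pack

env = vta.get_env()

# Pack the quantized module and, for intelfocl, annotate the packed
# region so it is placed on ext_dev while the rest stays on the CPU.
relay_prog = graph_pack(
    mod["main"], env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH,
    start_name="nn.max_pool2d", stop_name="nn.global_avg_pool2d",
    device_annot=(env.TARGET == "intelfocl"))

# Heterogeneous build: one target per device type.
targets = {"cpu": env.target_vta_cpu, "ext_dev": env.target}
with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
    graph, lib, params = relay.build(
        relay_prog, target=targets, params=params,
        target_host=env.target_host)

# One context per device type; the runtime dispatches each node to the
# context that matches its device annotation.
ctxes = [remote.ext_dev(0), remote.cpu(0)]
m = graph_runtime.create(graph, lib, ctxes)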
###################################################################### # Perform image classification inference @@ -261,7 +264,7 @@ # More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator num = 4 # number of times we run module for a single measurement rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) +timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() From 4dbcdf58c76084e08737d700036a765924d8d7e3 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 15 Jun 2020 12:19:17 +0800 Subject: [PATCH 31/44] some bugfix --- src/arith/detect_linear_equation.cc | 2 +- vta/tutorials/frontend/deploy_classification.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc index 18d28b53a431..c9704e3fff4b 100644 --- a/src/arith/detect_linear_equation.cc +++ b/src/arith/detect_linear_equation.cc @@ -152,7 +152,7 @@ class LinearEqDetector : public ExprFunctor Date: Mon, 15 Jun 2020 12:50:59 +0800 Subject: [PATCH 32/44] remove dcgan and mobilenet tutorial --- python/tvm/relay/testing/mobilenet.py | 50 ++--- vta/python/vta/top/op.py | 2 - vta/python/vta/top/vta_conv2d_transpose.py | 12 +- vta/tutorials/frontend/deploy_dcgan.py | 184 ----------------- vta/tutorials/frontend/deploy_mobilenet.py | 225 --------------------- 5 files changed, 20 insertions(+), 453 deletions(-) delete mode 100644 vta/tutorials/frontend/deploy_dcgan.py delete mode 100644 vta/tutorials/frontend/deploy_mobilenet.py diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py index e83336525ea7..d5a4d5f1e08f 100644 --- a/python/tvm/relay/testing/mobilenet.py +++ b/python/tvm/relay/testing/mobilenet.py @@ -44,22 +44,20 @@ def conv_block(data, name, channels, kernel_size=(3, 3), strides=(1, 1), def separable_conv_block(data, name, depthwise_channels, pointwise_channels, kernel_size=(3, 3), downsample=False, padding=(1, 1), - epsilon=1e-5, layout='NCHW', dtype="float32", depthwise_group_factor=1): + epsilon=1e-5, layout='NCHW', dtype="float32"): """Helper function to get a separable conv block""" if downsample: strides = (2, 2) else: strides = (1, 1) # depthwise convolution + bn + relu - wshape = (depthwise_channels, depthwise_group_factor) + kernel_size + wshape = (depthwise_channels, 1) + kernel_size weight = relay.var(name + "_weight", shape=wshape, dtype=dtype) - depthwise_group_factor = min(depthwise_group_factor, depthwise_channels) - groups = int(depthwise_channels/depthwise_group_factor) conv1 = layers.conv2d( data=data, weight=weight, channels=depthwise_channels, - groups=groups, + groups=depthwise_channels, kernel_size=kernel_size, strides=strides, padding=padding, @@ -84,59 +82,47 @@ def separable_conv_block(data, name, depthwise_channels, pointwise_channels, def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), - dtype='float32', alpha=1.0, is_shallow=False, layout='NCHW', - depthwise_group_factor=1): + dtype='float32', alpha=1.0, is_shallow=False, layout='NCHW'): """Function to construct a MobileNet""" data = relay.var("data", shape=data_shape, dtype=dtype) body = conv_block(data, 'conv_block_1', int(32*alpha), strides=(2, 2), layout=layout) body = separable_conv_block(body, 'separable_conv_block_1', int(32*alpha), int(64*alpha), layout=layout, - dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + 
dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_2', int(64*alpha), int(128*alpha), downsample=True, - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_3', int(128*alpha), int(128*alpha), layout=layout, - dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_4', int(128*alpha), int(256*alpha), downsample=True, - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_5', int(256*alpha), int(256*alpha), layout=layout, - dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_6', int(256*alpha), int(512*alpha), downsample=True, - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + layout=layout, dtype=dtype) if is_shallow: body = separable_conv_block(body, 'separable_conv_block_7', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + downsample=True, layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_8', int(1024*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + downsample=True, layout=layout, dtype=dtype) else: for i in range(7, 12): body = separable_conv_block(body, 'separable_conv_block_%d' % i, int(512*alpha), int(512*alpha), - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_12', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + downsample=True, layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_13', - int(1024*alpha), int(1024*alpha), - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + int(1024*alpha), int(1024*alpha), + layout=layout, dtype=dtype) pool = relay.nn.global_avg_pool2d(data=body, layout=layout) flatten = relay.nn.batch_flatten(data=pool) weight = relay.var('fc_weight') @@ -148,7 +134,7 @@ def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), - dtype='float32', layout='NCHW', depthwise_group_factor=1): + dtype='float32', layout='NCHW'): """Get benchmark workload for mobilenet Parameters @@ -180,5 +166,5 @@ def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), data_shape = tuple([batch_size] + list(image_shape)) net = mobile_net(num_classes=num_classes, data_shape=data_shape, dtype=dtype, alpha=1.0, is_shallow=False, - layout=layout, depthwise_group_factor=depthwise_group_factor) + layout=layout) return create_workload(net) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 617be4b56d19..938fefa1e1cc 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -76,8 +76,6 @@ def schedule_alu_packed(cfg, outs): assert len(outs) == 1 def is_cast_op(op): - # return op.same_as(Op.op.get("cast")) - # FIXME(zhanghao): find a better way to do compare return op.name == 'T_cast' outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs diff --git a/vta/python/vta/top/vta_conv2d_transpose.py 
b/vta/python/vta/top/vta_conv2d_transpose.py index 15383e557c3b..91434e62c79f 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -120,16 +120,8 @@ def _traverse(op): data, kernel = conv2d_stage.op.input_tensors if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: temp = data.op.input_tensors[0] - # FIXME(zhanghao): force merge pad(dilate(xx)) to one load op - # this may cause results in-correct - # disable for now - if False and isinstance(temp.op, tvm.te.ComputeOp) and ("pad" in temp.op.tag or temp.op.name == "DilatedInput"): - pad_data = data - data = temp.op.input_tensors[0] - s[temp.op].compute_inline() - else: - pad_data = data - data = temp + pad_data = data + data = temp else: pad_data = None diff --git a/vta/tutorials/frontend/deploy_dcgan.py b/vta/tutorials/frontend/deploy_dcgan.py deleted file mode 100644 index 6aaff4301258..000000000000 --- a/vta/tutorials/frontend/deploy_dcgan.py +++ /dev/null @@ -1,184 +0,0 @@ -from __future__ import absolute_import, print_function - -import argparse, json, os, requests, sys, time -from io import BytesIO -from os.path import join, isfile -from PIL import Image - -from mxnet.gluon.model_zoo import vision -import numpy as np -from matplotlib import pyplot as plt - -import tvm -from tvm import te -from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, util, download -from tvm.contrib.debugger import debug_runtime -from tvm.relay import transform -import tvm.relay.testing - -import vta -from vta.testing import simulator -from vta.top import graph_pack - -# Make sure that TVM was compiled with RPC=1 -assert tvm.runtime.enabled("rpc") - -###################################################################### -# Define the platform and model targets -# ------------------------------------- -# Execute on CPU vs. VTA, and define the model. - -# Load VTA parameters from the vta/config/vta_config.json file -env = vta.get_env() - -# Set ``device=arm_cpu`` to run inference on the CPU -# or ``device=vta`` to run inference on the FPGA. -device = "vta" -target = env.target if device == "vta" else env.target_vta_cpu -# multiple targets to run both on cpu and vta -targets = { - "cpu": env.target_vta_cpu, - "ext_dev": env.target -} - -model = "DCGAN" - -###################################################################### -# Obtain an execution remote -# -------------------------- -# When target is 'pynq', reconfigure FPGA and runtime. -# Otherwise, if target is 'sim', execute locally. - -if env.TARGET not in ["sim", "tsim", "intelfocl"]: - - # Get remote from tracker node if environment variable is set. - # To set up the tracker, you'll need to follow the "Auto-tuning - # a convolutional network for VTA" tutorial. - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - # Otherwise if you have a device you want to program directly from - # the host, make sure you've set the variables below to the IP of - # your board. - device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") - device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") - if not tracker_host or not tracker_port: - remote = rpc.connect(device_host, int(device_port)) - else: - remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) - - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. 
- reconfig_start = time.time() - vta.reconfig_runtime(remote) - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.program_fpga(remote, bitstream) - - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - -# In simulation mode, host the RPC server locally. -else: - remote = rpc.LocalSession() - -# Get execution context from remote -# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) -ctxes = [remote.ext_dev(0), remote.cpu(0)] - -# Load pre-configured AutoTVM schedules -with autotvm.tophub.context(target): - - # Populate the shape and data type dictionary for ImageNet classifier input - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 100)} - - # get the mobilenet model - mod, params = relay.testing.dcgan.get_workload(batch_size=1, dtype="float32", oshape=(3, 64, 64)) - - # Measure build start time - build_start = time.time() - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - if target.device_name == "vta": - # Perform quantization in Relay - # Note: We set opt_level to 3 in order to fold batch norm - with relay.build_config(opt_level=3): - with relay.quantize.qconfig(global_scale=8.0, - skip_conv_layers=[3]): - mod = relay.quantize.quantize(mod, params=params) - # Perform graph packing and constant folding for VTA target - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - mod["main"], - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name="cast", - stop_name="cast", stop_name_idx=52, device_annot=True) - else: - relay_prog = mod["main"] - - # Compile Relay program with AlterOpLayout disabled - with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): - if target.device_name != "vta": - graph, lib, params = relay.build( - relay_prog, target=target, - params=params, target_host=env.target_host) - else: - with vta.build_config(debug_flag=38): - graph, lib, params = relay.build( - relay_prog, target=targets, - params=params, target_host=env.target_host) - - # Measure Relay build time - build_time = time.time() - build_start - print(model + " inference graph built in {0:.2f}s!".format(build_time)) - - # Graph runtime - m = graph_runtime.create(graph, lib, ctxes) - -image = np.zeros((1, 100), dtype=np.float32) -image = np.repeat(image, env.BATCH, axis=0) - -# Set the network parameters and inputs -m.set_input(**params) -m.set_input('data', image) - -# Perform inference and gather execution statistics -# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator -num = 3 # number of times we run module for a single measurement -rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) - -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - # timer() - m['run']() - - sim_stats = simulator.stats() - print("\nExecution statistics:") - for k, v in sim_stats.items(): - # Since we execute the workload many times, we need to normalize stats - # Note that there is always one warm up run - # Therefore we divide the overall stats by (num * rep + 1) - print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) -else: - m['run']() - print("Run done") - # tcost = timer() - # std = np.std(tcost.results) * 1000 - # mean = tcost.mean * 1000 - # print("\nPerformed inference in %.2fms 
(std = %.2f) for %d samples" % (mean, std, env.BATCH)) - # print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) - -# Get classification results -tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 3, 64, 64), "float32", remote.cpu(0))) -output = tvm_output.asnumpy() -for b in range(env.BATCH): - print(tvm_output.asnumpy()[b]) diff --git a/vta/tutorials/frontend/deploy_mobilenet.py b/vta/tutorials/frontend/deploy_mobilenet.py deleted file mode 100644 index 9cf9dd98b09c..000000000000 --- a/vta/tutorials/frontend/deploy_mobilenet.py +++ /dev/null @@ -1,225 +0,0 @@ -from __future__ import absolute_import, print_function - -import argparse, json, os, requests, sys, time -from io import BytesIO -from os.path import join, isfile -from PIL import Image - -from mxnet.gluon.model_zoo import vision -import numpy as np -from matplotlib import pyplot as plt - -import tvm -from tvm import te -from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, util, download -from tvm.contrib.debugger import debug_runtime -from tvm.relay import transform -import tvm.relay.testing - -import vta -from vta.testing import simulator -from vta.top import graph_pack - -# Make sure that TVM was compiled with RPC=1 -assert tvm.runtime.enabled("rpc") - -###################################################################### -# Define the platform and model targets -# ------------------------------------- -# Execute on CPU vs. VTA, and define the model. - -# Load VTA parameters from the vta/config/vta_config.json file -env = vta.get_env() - -# Set ``device=arm_cpu`` to run inference on the CPU -# or ``device=vta`` to run inference on the FPGA. -device = "vta" -target = env.target if device == "vta" else env.target_vta_cpu -# multiple targets to run both on cpu and vta -targets = { - "cpu": env.target_vta_cpu, - "ext_dev": env.target -} - -model = "mobilenetG" - -###################################################################### -# Obtain an execution remote -# -------------------------- -# When target is 'pynq', reconfigure FPGA and runtime. -# Otherwise, if target is 'sim', execute locally. - -if env.TARGET not in ["sim", "tsim", "intelfocl"]: - - # Get remote from tracker node if environment variable is set. - # To set up the tracker, you'll need to follow the "Auto-tuning - # a convolutional network for VTA" tutorial. - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - # Otherwise if you have a device you want to program directly from - # the host, make sure you've set the variables below to the IP of - # your board. - device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") - device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") - if not tracker_host or not tracker_port: - remote = rpc.connect(device_host, int(device_port)) - else: - remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) - - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - reconfig_start = time.time() - vta.reconfig_runtime(remote) - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.program_fpga(remote, bitstream) - - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - -# In simulation mode, host the RPC server locally. 
-else: - remote = rpc.LocalSession() - -# Get execution context from remote -# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) -ctxes = [remote.ext_dev(0), remote.cpu(0)] - -# Load pre-configured AutoTVM schedules -with autotvm.tophub.context(target): - - # Populate the shape and data type dictionary for ImageNet classifier input - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 3, 224, 224)} - - # get the mobilenet model - mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32", - depthwise_group_factor=env.BLOCK_IN) - - # Measure build start time - build_start = time.time() - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - if target.device_name == "vta": - # Perform quantization in Relay - # Note: We set opt_level to 3 in order to fold batch norm - with relay.build_config(opt_level=3): - with relay.quantize.qconfig(global_scale=8.0, - skip_conv_layers=[0]): - mod = relay.quantize.quantize(mod, params=params) - # Perform graph packing and constant folding for VTA target - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - mod["main"], - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name="nn.conv2d", - stop_name="nn.global_avg_pool2d") - else: - relay_prog = mod["main"] - - # Compile Relay program with AlterOpLayout disabled - with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): - if target.device_name != "vta": - graph, lib, params = relay.build( - relay_prog, target=target, - params=params, target_host=env.target_host) - else: - with vta.build_config(debug_flag=32): - graph, lib, params = relay.build( - relay_prog, target=targets, - params=params, target_host=env.target_host) - - # Measure Relay build time - build_time = time.time() - build_start - print(model + " inference graph built in {0:.2f}s!".format(build_time)) - - # Graph runtime - m = graph_runtime.create(graph, lib, ctxes) - -###################################################################### -# Perform image classification inference -# -------------------------------------- -# We run classification on an image sample from ImageNet -# We just need to download the categories files, `synset.txt` -# and an input test image. 
- -# Download ImageNet categories -categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" -categ_fn = "synset.txt" -download.download(join(categ_url, categ_fn), categ_fn) -synset = eval(open(categ_fn).read()) - -# Download test image -image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' -image_fn = 'cat.png' -download.download(image_url, image_fn) - -# Prepare test image for inference -image = Image.open(image_fn).resize((224, 224)) -plt.imshow(image) -plt.show() -image = np.array(image) - np.array([123., 117., 104.]) -image /= np.array([58.395, 57.12, 57.375]) -image = image.transpose((2, 0, 1)) -image = image[np.newaxis, :] -image = np.repeat(image, env.BATCH, axis=0) - -# Set the network parameters and inputs -m.set_input(**params) -m.set_input('data', image) - -# Perform inference and gather execution statistics -# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator -num = 3 # number of times we run module for a single measurement -rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) - -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - timer() - - sim_stats = simulator.stats() - print("\nExecution statistics:") - for k, v in sim_stats.items(): - # Since we execute the workload many times, we need to normalize stats - # Note that there is always one warm up run - # Therefore we divide the overall stats by (num * rep + 1) - print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) -else: - tcost = timer() - std = np.std(tcost.results) * 1000 - mean = tcost.mean * 1000 - print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) - print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) - -# Get classification results -tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) -output = tvm_output.asnumpy() -for b in range(env.BATCH): - top_categories = np.argsort(tvm_output.asnumpy()[b]) - # print("top_categories = ", top_categories) - # Report top-5 classification results - print("\n{} prediction for sample {}".format(model, b)) - print("\t#1:", synset[top_categories[-1]], output[b][top_categories[-1]]) - print("\t#2:", synset[top_categories[-2]], output[b][top_categories[-2]]) - print("\t#3:", synset[top_categories[-3]], output[b][top_categories[-3]]) - print("\t#4:", synset[top_categories[-4]], output[b][top_categories[-4]]) - print("\t#5:", synset[top_categories[-5]], output[b][top_categories[-5]]) - # This just checks that one of the 5 top categories - # is one variety of cat; this is by no means an accurate - # assessment of how quantization affects classification - # accuracy but is meant to catch changes to the - # quantization pass that would accuracy in the CI. 
- cat_detected = False - for k in top_categories[-5:]: - if "cat" in synset[k]: - cat_detected = True - assert(cat_detected) From 75f7272552c8f8ff3f76c754b0b5008860f1ca05 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 15 Jun 2020 20:10:12 +0800 Subject: [PATCH 33/44] some bugfix and code optimize --- src/relay/quantize/realize.cc | 32 +++++++++++++------------------- vta/python/vta/top/op.py | 8 +++++--- vta/python/vta/transform.py | 20 +++++++------------- vta/runtime/runtime.cc | 9 ++++----- 4 files changed, 29 insertions(+), 40 deletions(-) diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 9dbc27d2c5a3..74bef7d1e4ed 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -330,7 +330,7 @@ float ChooseDomScale(const std::vector& nptrs) { /* \brief Unify the dom scale of arguments */ Array UnifyDTypeScale(const Array& ref_args, const Array& args, - DataType* dtype_ptr, Expr* scale_ptr) { + DataType* dtype_ptr, Expr* scale_ptr, DataType dtype = DataType::Void()) { static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); @@ -345,27 +345,19 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args // unify the data type CHECK_EQ(ref_args.size(), args.size()); - DataType dtype; - // FIXME(zhanghao): force to use add(int32, int32) in order to put in VTA ALU - // but this may be not necessary for other devices - // if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { - // dtype = cfg->dtype_input; - // } else { - // dtype = cfg->dtype_activation; - // } - dtype = cfg->dtype_activation; + if (dtype.is_void()) { + if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { + dtype = cfg->dtype_input; + } else { + dtype = cfg->dtype_activation; + } + } + for (size_t i = 0; i < ret.size(); ++i) { auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { - auto new_arg = Cast(ret[i], dtype); - - // FIXME(zhanghao): do not fuse float32 cast - if (nptrs[i]->dtype == DataType::Float(32)) { - ret.Set(i, StopFusion(new_arg)); - } else { - ret.Set(i, new_arg); - } + ret.Set(i, Cast(ret[i], dtype)); } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) && ref_arg->attrs.as()->kind == kQInput) { auto new_arg = Cast(ret[i], cfg->dtype_input); @@ -392,7 +384,9 @@ Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectR if (new_args[0].as() && new_args[1].as()) { DataType dtype; Expr dom_scale; - Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale); + // execute the operation with activation data type. 
+ const QConfig& cfg = QConfig::Current(); + Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation); Expr ret = ForwardOp(ref_call, ret_args); return QRealizeIntExpr(ret, dom_scale, dtype); } diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 938fefa1e1cc..20a7af2c5c1b 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -82,7 +82,6 @@ def is_cast_op(op): output = outs[0] s = te.create_schedule([x.op for x in outs]) te.schedule.AutoInlineInjective(s) - # s[output].fuse(s[output].op.axis) env = get_env() # other target does not support alu-only ops @@ -190,8 +189,11 @@ def multiply_strategy_vta(attrs, inputs, out_type, target): return strategy -reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") -reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") +env = get_env() +# other target does not support alu-only ops +if env.TARGET in ["sim", "tsim", "intelfocl"]: + reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") + reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") @_strategy.conv2d_strategy.register("vta") diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index a8ecb1099a89..abb152d32314 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -381,9 +381,10 @@ def _fold_buffer_dim(buf, scope, elem_block): def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold): elem_block = elem_bytes * 8 // elem_width - if buf.dtype != dtype: - raise RuntimeError("Expect buffer type to be %s instead of %s" % - (dtype, buf.dtype)) + # remove the checking as we have load_int8 insn + # if buf.dtype != dtype: + # raise RuntimeError("Expect buffer type to be %s instead of %s" % + # (dtype, buf.dtype)) shape, strides = buf.shape, buf.strides if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0): raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block)) @@ -549,20 +550,13 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _check_compact(dst) - # FIXME(zhanghao): optimize - # for int8 -> int32 cast/load - orig_dtype = src.dtype - if src.dtype != data_type: - assert(data_type == "int%d" % env.ACC_WIDTH and \ - src.dtype == "int%d" % env.INP_WIDTH) - src.dtype = data_type - x_size, y_size, x_stride, offset = _get_2d_pattern( src, elem_width, elem_bytes, data_type, dst.scope, allow_fold=allow_fold) - if orig_dtype != src.dtype: - src.dtype = orig_dtype + if data_type != src.dtype: + assert(data_type == "int%d" % env.ACC_WIDTH and \ + src.dtype == "int%d" % env.INP_WIDTH) mem_type = env.dev.MEM_ID_ACC_8BIT irb = tvm.tir.ir_builder.create() diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index cf70f7e19361..67f055a04538 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -1078,6 +1078,7 @@ class InsnQueue : public BaseQueue { CHECK(fpga_buff_ != nullptr); CHECK(fpga_buff_phy_); uint32_t buff_size = dram_buffer_.size() * elem_bytes_; + CHECK(buff_size <= kMaxBytes); // Copy contents of DRAM buffer to FPGA buff VTAMemCopyFromHost(fpga_buff_, dram_buffer_.data(), buff_size); @@ -1322,7 +1323,6 @@ class CommandQueue { if (insn_queue_.count() == 0) return; // Synchronization for the queues uop_queue_.AutoReadBarrier(); - insn_queue_.AutoReadBarrier(); // Dump instructions if debug enabled if (debug_flag_ & VTA_DEBUG_DUMP_INSN) { @@ -1333,7 +1333,7 @@ class CommandQueue { VTA_OPCODE_FINISH); // Make sure 
that we don't exceed contiguous physical memory limits - CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER); + CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) <= VTA_MAX_XFER); int timeout = VTADeviceRun(device_, insn_queue_.dram_phy_addr(), insn_queue_.count(), wait_cycles); CHECK_EQ(timeout, 0); @@ -1481,9 +1481,8 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: - // one pending store, one pending load, and one uop - // FIXME(zhanghao): check why there are 5 insns - if ((insn_queue_.count() + 5) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { + // at most: 2 NOP-COMPUTE-STAGE -> 2 NOP-MEMORY-STAGE -> 1 NOP-COMPUTE-STAGE -> 1 FINISH + if ((insn_queue_.count() + 6) * sizeof(VTAGenericInsn) > VTA_MAX_XFER) { this->AutoSync(); } } From c8a357424d9d6e5e77a8ea169e10488d49e73a93 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Tue, 16 Jun 2020 11:29:07 +0800 Subject: [PATCH 34/44] some minor fix and code refine --- python/tvm/autotvm/measure/measure_methods.py | 14 ++++++-------- src/relay/backend/graph_plan_memory.cc | 6 ++++-- src/relay/quantize/realize.cc | 6 ++++++ src/tir/transforms/lower_tvm_builtin.cc | 2 +- vta/runtime/runtime.cc | 14 ++++++++------ vta/tutorials/autotvm/tune_alu_vta.py | 2 +- vta/tutorials/autotvm/tune_relay_vta.py | 4 ++-- vta/tutorials/frontend/deploy_classification.py | 6 +++--- 8 files changed, 31 insertions(+), 23 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index d6b5defb710c..26e13f85c964 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -83,8 +83,6 @@ class LocalBuilder(Builder): If is callable, use it as custom build function, expect lib_format field. """ def __init__(self, timeout=10, n_parallel=None, build_func='default'): - # FIXME(zhanghao): quickfix - use single thread. 
otherwise may cause seg fault - n_parallel = 1 super(LocalBuilder, self).__init__(timeout, n_parallel) if isinstance(build_func, str): @@ -191,7 +189,7 @@ def __init__(self, timeout=10, n_parallel=None, number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1, check_correctness=False): - static_tune = os.getenv("TVM_STATIC_TUNE") + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") if static_tune: if n_parallel is None or n_parallel > 1: print("static tune only allows n_parallel == 1") @@ -385,7 +383,7 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti # pylint: disable=import-outside-toplevel import vta - static_tune = os.getenv("TVM_STATIC_TUNE") + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") if static_tune: debug_flag = 1 << 6 else: @@ -483,7 +481,7 @@ def run_through_rpc(measure_input, build_result, tic = time.time() errno = MeasureErrorNo.NO_ERROR - static_tune = os.getenv("TVM_STATIC_TUNE") + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") try: # upload built module remote = request_remote(*remote_args) @@ -513,8 +511,8 @@ def run_through_rpc(measure_input, build_result, else: func(*args) cost = 0 - insn_dump = os.getenv('TVM_INSN_DUMP', "insn.dump") - insn_cost_file = os.getenv('TVM_INSN_COST', "cost.py") + insn_dump = os.getenv('TVM_INSN_DUMP_FILE', "insn.json") + insn_cost_file = os.getenv('TVM_INSN_COST_FILE', "cost.py") path, filename = os.path.split(insn_cost_file) sys.path.append(path) module_path = filename[:-3] # remove the .py suffix @@ -577,7 +575,7 @@ def request_remote(device_key, host=None, port=None, priority=1, timeout=60): ------ session: RPCSession """ - static_tune = os.getenv("TVM_STATIC_TUNE") + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") if static_tune: return _rpc.LocalSession() diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 66de20dcf4c0..9dfc54212f2e 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -309,9 +309,11 @@ class StorageAllocator : public StorageAllocaBaseVisitor { if (match_range_ == 0) { return this->Alloc(prototype, size); } - // quickfix(zhanghao): we copy all the instructions in a single batch + + // TODO(zhanghao): find a better way to do this + // we copy all the instructions in a single batch // to avoid overwrite shared storage, we do not re-use allocation - const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once) { return this->Alloc(prototype, size); } diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 74bef7d1e4ed..cafae6c2146c 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -387,6 +387,12 @@ Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectR // execute the operation with activation data type. 
const QConfig& cfg = QConfig::Current(); Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation); + for (size_t i = 0; i < ret_args.size(); ++i) { + // do not fuse float32 arg + if (new_args[i].as()->dtype == DataType::Float(32)) { + ret_args.Set(i, StopFusion(ret_args[i])); + } + } Expr ret = ForwardOp(ref_call, ret_args); return QRealizeIntExpr(ret, dom_scale, dtype); } diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 3d54d45015c6..628de0604042 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -86,7 +86,7 @@ class BuiltinLower : public StmtExprMutator { op = stmt.as(); // Get constant allocation bound. int64_t nbytes = GetVectorBytes(op->dtype); - // FIXME(zhanghao): remove special handling for kDLCPU + // NOTE(zhanghao): remove special handling for kDLCPU // otherwise, may cause LLVM parameters match error // if in heterogenous targets // if (device_type_.defined()) { diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 67f055a04538..835f65a7947e 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -1278,10 +1278,10 @@ class CommandQueue { } void Synchronize(uint32_t wait_cycles, bool skip=true) { - if (debug_flag_ & VTA_DEBUG_AUTO_TUNE) { - const char* insn_file = std::getenv("TVM_INSN_DUMP"); + if (debug_flag_ & VTA_DEBUG_LOG_INSN) { + const char* insn_file = std::getenv("TVM_INSN_DUMP_FILE"); if (insn_file == nullptr) { - insn_file = "insn.dump"; + insn_file = "insn.json"; } FILE* out = fopen(insn_file, "w+"); if (out) { @@ -1294,8 +1294,10 @@ class CommandQueue { } // FIXME(zhanghao): It is required to use force_serial - // by using skip and sync at the final layer, we can avoid do DeviceCopy every time - const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + // by using skip and sync at the final layer. 
+ // By doing this, we can avoid do DeviceCopy every time + // consider to make it as a flag when mature + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once && skip) { if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { LOG(ERROR) << @@ -1524,7 +1526,7 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (from_buffer) { // This is an FPGA to host mem transfer // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly - const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once) VTASynchronize(VTATLSCommandHandle(), 1<<31, false); from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index a5f03cdc22c7..68ea96ec4b64 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -113,7 +113,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals 'early_stopping': None, 'measure_option': autotvm.measure_option( - builder=autotvm.LocalBuilder(), + builder=autotvm.LocalBuilder(n_parallel=1), runner=autotvm.RPCRunner(env.TARGET, host=tracker_host, port=tracker_port, diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 3f62f15b6490..8e2da559c6c2 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -208,7 +208,7 @@ def compile_network(env, target, model, start_pack, stop_pack): 'early_stopping': None, 'measure_option': autotvm.measure_option( - builder=autotvm.LocalBuilder(), + builder=autotvm.LocalBuilder(n_parallel=1), runner=autotvm.RPCRunner(env.TARGET, host=tracker_host, port=tracker_port, @@ -395,7 +395,7 @@ def tune_and_evaluate(tuning_opt): with autotvm.tophub.context(target, extra_files=[log_file]): # recompile the programs with device annotations print("Recompile") - relay_prog, params = compile_network(env, target, network, start_pack, stop_pack, device_annot=True) + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) # Compile network print("Compile...") if target.device_name != "vta": diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 73f13b3bf792..63907b996734 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -188,7 +188,7 @@ env.BLOCK_OUT, env.WGT_WIDTH, start_name=pack_dict[model][0], - stop_name=pack_dict[model][1], device_annot=env.TARGET == "intelfocl") + stop_name=pack_dict[model][1], device_annot=(env.TARGET == "intelfocl" or env.TARGET == "sim")) else: relay_prog = mod["main"] @@ -199,7 +199,7 @@ relay_prog, target=target, params=params, target_host=env.target_host) else: - if env.TARGET == "intelfocl": + if env.TARGET == "intelfocl" or env.TARGET == "sim": # multiple targets to run both on cpu and vta target = { "cpu": env.target_vta_cpu, @@ -221,7 +221,7 @@ lib = remote.load_module("graphlib.o") - if env.TARGET == "intelfocl": + if env.TARGET == "intelfocl" or env.TARGET == "sim": ctxes = [remote.ext_dev(0), remote.cpu(0)] m = graph_runtime.create(graph, lib, ctxes) else: From 7ca6f4098505172e7354174343d1e3b82140bbae Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Tue, 16 Jun 2020 17:55:07 +0800 Subject: [PATCH 35/44] remove rapidjson 
dep (use picojson) --- cmake/modules/VTA.cmake | 6 +- vta/runtime/runtime.cc | 168 +++++++++++++++++----------------------- vta/runtime/runtime.h | 2 +- 3 files changed, 74 insertions(+), 102 deletions(-) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 4193fbaf657f..b586800efe2d 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -27,6 +27,9 @@ endif() message(STATUS "VTA build with VTA_HW_PATH=" ${VTA_HW_PATH}) +# enable picojson int type support +add_definitions(-DPICOJSON_USE_INT64) + if(MSVC) message(STATUS "VTA build is skipped in Windows..") elseif(PYTHON) @@ -108,13 +111,13 @@ elseif(PYTHON) # Target lib: vta add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) target_include_directories(vta PUBLIC vta/runtime) + target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta PUBLIC ${__strip_def}) endforeach() if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96") - target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) target_link_libraries(vta ${__cma_lib}) elseif(${VTA_TARGET} STREQUAL "de10nano") # DE10-Nano rules #target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21) @@ -124,7 +127,6 @@ elseif(PYTHON) "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") - target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 835f65a7947e..628f702de7fc 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -40,9 +40,7 @@ #include #include -#include -#include -#include +#include #include #include @@ -845,35 +843,23 @@ class InsnQueue : public BaseQueue { // Iterate over all instructions int insn_count = count(); const VTAGenericInsn* insn = data(); - // FIXME(zhanghao): rapidjson dep - rapidjson::StringBuffer s; - rapidjson::Writer writer(s); + picojson::array jarr; if (!json) { fprintf(out, "There are %u instructions\n", insn_count); - } else { - writer.StartArray(); } for (int i = 0; i < insn_count; ++i) { // Fetch instruction and decode opcode c.generic = insn[i]; + picojson::object kv; if (json) { - writer.StartObject(); - writer.Key("name"); - writer.String(GetOpName(c).c_str()); - - writer.Key("type"); - writer.String(GetOpcodeName(c).c_str()); - - writer.Key("pop_prev"); - writer.Int(c.mem.pop_prev_dep); - writer.Key("pop_next"); - writer.Int(c.mem.pop_next_dep); - writer.Key("push_prev"); - writer.Int(c.mem.push_prev_dep); - writer.Key("push_next"); - writer.Int(c.mem.push_next_dep); + kv["name"] = picojson::value(GetOpName(c).c_str()); + kv["type"] = picojson::value(GetOpcodeName(c).c_str()); + kv["pop_prev"] = picojson::value(static_cast(c.mem.pop_prev_dep)); + kv["pop_next"] = picojson::value(static_cast(c.mem.pop_next_dep)); + kv["push_prev"] = picojson::value(static_cast(c.mem.push_prev_dep)); + kv["push_next"] = picojson::value(static_cast(c.mem.push_next_dep)); } else { fprintf(out, "INSTRUCTION %u: ", i); fprintf(out, "%s\n", GetOpName(c).c_str()); @@ -887,25 +873,21 @@ class InsnQueue : public BaseQueue { if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { if 
(json) { - writer.Key("dram"); - writer.Uint64(c.mem.dram_base); - writer.Key("sram"); - writer.Uint64(c.mem.sram_base); - - writer.Key("y"); - writer.StartArray(); - writer.Uint64(c.mem.y_size); - writer.Uint64(c.mem.y_pad_0); - writer.Uint64(c.mem.y_pad_1); - writer.EndArray(); - - writer.Key("x"); - writer.StartArray(); - writer.Uint64(c.mem.x_size); - writer.Uint64(c.mem.x_pad_0); - writer.Uint64(c.mem.x_pad_1); - writer.Uint64(c.mem.x_stride); - writer.EndArray(); + kv["dram"] = picojson::value(static_cast(c.mem.dram_base)); + kv["sram"] = picojson::value(static_cast(c.mem.sram_base)); + + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.mem.y_size))); + arr.push_back(picojson::value(static_cast(c.mem.y_pad_0))); + arr.push_back(picojson::value(static_cast(c.mem.y_pad_1))); + kv["y"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.mem.x_size))); + arr.push_back(picojson::value(static_cast(c.mem.x_pad_0))); + arr.push_back(picojson::value(static_cast(c.mem.x_pad_1))); + arr.push_back(picojson::value(static_cast(c.mem.x_stride))); + kv["x"] = picojson::value(arr); } else { fprintf(out, "\tDRAM: 0x%08x, SRAM:0x%04x\n", static_cast(c.mem.dram_base), @@ -922,29 +904,26 @@ class InsnQueue : public BaseQueue { } } else if (c.mem.opcode == VTA_OPCODE_GEMM) { if (json) { - writer.Key("reset_out"); - writer.Int(c.gemm.reset_reg); - writer.Key("range"); - writer.StartArray(); - writer.Int(c.gemm.uop_bgn); - writer.Int(c.gemm.uop_end); - writer.EndArray(); - - writer.Key("outer_loop"); - writer.StartArray(); - writer.Int(c.gemm.iter_out); - writer.Int(c.gemm.wgt_factor_out), - writer.Int(c.gemm.src_factor_out), - writer.Int(c.gemm.dst_factor_out); - writer.EndArray(); - - writer.Key("inner_loop"); - writer.StartArray(); - writer.Int(c.gemm.iter_in); - writer.Int(c.gemm.wgt_factor_in), - writer.Int(c.gemm.src_factor_in), - writer.Int(c.gemm.dst_factor_in); - writer.EndArray(); + kv["reset_out"] = picojson::value(static_cast(c.gemm.reset_reg)); + + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.gemm.uop_bgn))); + arr.push_back(picojson::value(static_cast(c.gemm.uop_end))); + kv["range"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.gemm.iter_out))); + arr.push_back(picojson::value(static_cast(c.gemm.wgt_factor_out))); + arr.push_back(picojson::value(static_cast(c.gemm.src_factor_out))); + arr.push_back(picojson::value(static_cast(c.gemm.dst_factor_out))); + kv["outer_loop"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.gemm.iter_in))); + arr.push_back(picojson::value(static_cast(c.gemm.wgt_factor_in))); + arr.push_back(picojson::value(static_cast(c.gemm.src_factor_in))); + arr.push_back(picojson::value(static_cast(c.gemm.dst_factor_in))); + kv["inner_loop"] = picojson::value(arr); } else { fprintf(out, "\treset_out: %d\n", static_cast(c.gemm.reset_reg)); fprintf(out, "\trange (%d, %d)\n", @@ -963,27 +942,23 @@ class InsnQueue : public BaseQueue { } } else if (c.mem.opcode == VTA_OPCODE_ALU) { if (json) { - writer.Key("reset_out"); - writer.Int(c.alu.reset_reg); - writer.Key("range"); - writer.StartArray(); - writer.Int(c.alu.uop_bgn); - writer.Int(c.alu.uop_end); - writer.EndArray(); - - writer.Key("outer_loop"); - writer.StartArray(); - writer.Int(c.alu.iter_out); - writer.Int(c.alu.dst_factor_out), - writer.Int(c.alu.src_factor_out), - writer.EndArray(); - - writer.Key("inner_loop"); - writer.StartArray(); - writer.Int(c.alu.iter_in); - 
writer.Int(c.alu.dst_factor_in); - writer.Int(c.alu.src_factor_in), - writer.EndArray(); + kv["reset_out"] = picojson::value(static_cast(c.alu.reset_reg)); + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.alu.uop_bgn))); + arr.push_back(picojson::value(static_cast(c.alu.uop_end))); + kv["range"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.alu.iter_out))); + arr.push_back(picojson::value(static_cast(c.alu.dst_factor_out))); + arr.push_back(picojson::value(static_cast(c.alu.src_factor_out))); + kv["outer_loop"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.alu.iter_in))); + arr.push_back(picojson::value(static_cast(c.alu.dst_factor_in))); + arr.push_back(picojson::value(static_cast(c.alu.src_factor_in))); + kv["inner_loop"] = picojson::value(arr); } else { fprintf(out, "\treset_out: %d\n", static_cast(c.alu.reset_reg)); fprintf(out, "\trange (%d, %d)\n", @@ -1027,16 +1002,12 @@ class InsnQueue : public BaseQueue { if (c.gemm.push_next_dep) g2s_queue++; } if (json) { - writer.Key("l2g_queue"); - writer.Int(l2g_queue); - writer.Key("g2l_queue"); - writer.Int(g2l_queue); - writer.Key("s2g_queue"); - writer.Int(s2g_queue); - writer.Key("g2s_queue"); - writer.Int(g2s_queue); - - writer.EndObject(); + kv["l2g_queue"] = picojson::value(static_cast(l2g_queue)); + kv["g2l_queue"] = picojson::value(static_cast(g2l_queue)); + kv["s2g_queue"] = picojson::value(static_cast(s2g_queue)); + kv["g2s_queue"] = picojson::value(static_cast(g2s_queue)); + + jarr.push_back(picojson::value(kv)); } else { fprintf(out, "\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); fprintf(out, "\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); @@ -1044,9 +1015,8 @@ class InsnQueue : public BaseQueue { } if (json) { - writer.EndArray(); - auto str = s.GetString(); - fwrite(str, 1, s.GetSize(), out); + auto str = picojson::value(jarr).serialize(); + fwrite(str.c_str(), 1, str.size(), out); } } // Commit all pending pop of corresponding stage diff --git a/vta/runtime/runtime.h b/vta/runtime/runtime.h index 22cf15a91503..a61906e98ff6 100644 --- a/vta/runtime/runtime.h +++ b/vta/runtime/runtime.h @@ -41,7 +41,7 @@ extern "C" { #define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3) #define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4) #define VTA_DEBUG_FORCE_SERIAL (1 << 5) -#define VTA_DEBUG_AUTO_TUNE (1 << 6) +#define VTA_DEBUG_LOG_INSN (1 << 6) /*! * \brief Allocate data buffer. 
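
Note on the picojson pattern used in the DumpInsn rewrite above: each instruction becomes a picojson::object, the objects are collected into a picojson::array, and the whole array is serialized once at the end instead of streaming through a rapidjson writer. A minimal standalone sketch of that pattern follows; it assumes picojson.h is on the include path and PICOJSON_USE_INT64 is defined (as the CMake change in this patch does), and the field names and values are illustrative only, not the exact VTA instruction fields:

    // Sketch of the array-of-objects pattern used by DumpInsn when json == true.
    // Build (path is a placeholder): g++ -DPICOJSON_USE_INT64 -I<picojson dir> sketch.cc -o sketch
    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <picojson.h>

    int main() {
      picojson::array jarr;                                   // one entry per instruction
      for (int i = 0; i < 2; ++i) {
        picojson::object kv;                                  // one JSON object per instruction
        kv["name"] = picojson::value("LOAD");                 // strings are wrapped directly
        kv["pop_prev"] = picojson::value(static_cast<int64_t>(i));  // integer fields must be widened to int64_t

        picojson::array range;                                // nested arrays are built, then wrapped as a value
        range.push_back(picojson::value(static_cast<int64_t>(0)));
        range.push_back(picojson::value(static_cast<int64_t>(16)));
        kv["range"] = picojson::value(range);

        jarr.push_back(picojson::value(kv));                  // append the finished object
      }
      std::string str = picojson::value(jarr).serialize();    // serialize the whole array once
      fwrite(str.c_str(), 1, str.size(), stdout);             // same fwrite-based output as DumpInsn
      return 0;
    }
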
From 12554d51e45cb7b31e2d51cb91d85f7d2e153de9 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Tue, 16 Jun 2020 18:14:06 +0800 Subject: [PATCH 36/44] bugfix for tune alu vta --- vta/tutorials/autotvm/tune_alu_vta.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index 68ea96ec4b64..2998b1c57fc9 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -223,7 +223,7 @@ def my_clip(x, a_min, a_max): # init autotvm env to register VTA operator TaskExtractEnv() - @autotvm.register_customized_task("add.vta") + @autotvm.template("add.vta") def _topi_add(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" A, B = args[:2] @@ -239,7 +239,7 @@ def _topi_add(*args, **kwargs): s = te.create_schedule([res.op]) return s, [A, B, res] - @autotvm.register_customized_task("multiply.vta") + @autotvm.template("multiply.vta") def _topi_multiply(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" A, B = args[:2] @@ -255,22 +255,6 @@ def _topi_multiply(*args, **kwargs): s = te.create_schedule([res.op]) return s, [A, B, res] - @autotvm.register_customized_task("copy.vta") - def _topi_identity(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - A = args[0] - - with tvm.target.vta(): - res = vta.top.op.copy_packed(*args, **kwargs) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - if tvm.target.Target.current().device_name == 'vta': - s = vta.top.op.schedule_copy_packed([res]) - else: - s = te.create_schedule([res.op]) - return s, [A, res] - ######################################################################## # Finally, we launch tuning jobs and evaluate the end-to-end performance. From b8d842ed5c9baa6c5a27c419895cf40da21f2c02 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 18 Jun 2020 16:02:20 +0800 Subject: [PATCH 37/44] cleanup --- python/tvm/relay/quantize/_annotate.py | 2 - src/relay/backend/graph_plan_memory.cc | 4 +- src/relay/transforms/device_annotation.cc | 4 +- src/tir/transforms/inject_copy_intrin.cc | 14 +---- vta.resnet18_v1.log-manual-formatv0_2 | 10 ---- vta/python/vta/top/op.py | 6 ++- vta/python/vta/transform.py | 1 - vta/runtime/runtime.cc | 54 +++++++++---------- vta/tutorials/autotvm/tune_alu_vta.py | 9 ++-- vta/tutorials/autotvm/tune_relay_vta.py | 21 ++------ .../frontend/deploy_classification.py | 9 +--- 11 files changed, 45 insertions(+), 89 deletions(-) delete mode 100644 vta.resnet18_v1.log-manual-formatv0_2 diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 08930527b443..f902a0abf80e 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -195,8 +195,6 @@ def conv2d_transpose_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) -# TODO(tmoreau89,ziheng) need to include an option to turn off dense quant -# @register_annotate_function("nn.dense") @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. 
Lhs of dense will be quantized to input field, and rhs of diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 9dfc54212f2e..4a1bfd874b5c 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -311,8 +311,8 @@ class StorageAllocator : public StorageAllocaBaseVisitor { } // TODO(zhanghao): find a better way to do this - // we copy all the instructions in a single batch - // to avoid overwrite shared storage, we do not re-use allocation + // We copy all the instructions of all layers in a single batch. + // To avoid overwrite shared storage, we do not re-use allocation const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once) { return this->Alloc(prototype, size); diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index fe3cfebf7fe3..319f9ba59064 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -538,9 +538,7 @@ Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } -Map CollectDeviceAnnotationOps(const Expr& expr) { - return AnnotatationVisitor::GetAnnotations(expr); -} +Map CollectDeviceAnnotationOps(const Expr& expr) { return AnnotatationVisitor::GetAnnotations(expr); } TVM_REGISTER_GLOBAL("relay.analysis.CollectDeviceInfo").set_body_typed(CollectDeviceInfo); diff --git a/src/tir/transforms/inject_copy_intrin.cc b/src/tir/transforms/inject_copy_intrin.cc index 279274632648..b27459f4bd45 100644 --- a/src/tir/transforms/inject_copy_intrin.cc +++ b/src/tir/transforms/inject_copy_intrin.cc @@ -80,19 +80,7 @@ class CopyIntrinInjector : public StmtMutator { } // for now only support true condition matching if (has_cond) { - auto true_val = sel_true_value.Eval(); - - // TODO(zhanghao): we do cond unfold one more further - // this is used to lift the pad(dilate) to one load op - // However, ignoring false condition may cause incorrect results - PVar sel_cond_extra, sel_true_value_extra, sel_false_value_extra; - bool has_cond_extra = if_then_else(sel_cond_extra, sel_true_value_extra, sel_false_value_extra).Match(true_val) || - select(sel_cond_extra, sel_true_value_extra, sel_false_value_extra).Match(true_val); - if (has_cond_extra) { - load = sel_true_value_extra.Eval().as(); - } else { - load = true_val.as(); - } + load = sel_true_value.Eval().as(); } // cast can be part of the pattern if (cast != nullptr) { diff --git a/vta.resnet18_v1.log-manual-formatv0_2 b/vta.resnet18_v1.log-manual-formatv0_2 deleted file mode 100644 index 7b3c9d61a318..000000000000 --- a/vta.resnet18_v1.log-manual-formatv0_2 +++ /dev/null @@ -1,10 +0,0 @@ -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [16, 8, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 131, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 8]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.0014505], 0, 1.328160047531128, 1578987870.726089], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 16, 7, 7, 1, 32], "int8"], ["TENSOR", [16, 16, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", 
"int32"], {}], "config": {"index": 163, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 8]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.002734464], 0, 1.7085223197937012, 1578988000.5012062], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [8, 4, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 302, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.0008805], 0, 1.2376818656921387, 1578988097.9650147], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [8, 8, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 143, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 14]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.001309522], 0, 1.3671045303344727, 1578988174.358436], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [4, 2, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 177, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.00079938], 0, 1.1500802040100098, 1578988361.3194962], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [4, 4, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 681, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 14]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 1]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 2]]}, "result": [[0.001198882], 0, 1.2445652484893799, 1578988503.2178001], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [2, 2, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 570, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 4]], ["tile_w", "sp", [-1, 56]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 2]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 2]]}, "result": [[0.001230756], 0, 1.4033727645874023, 1578988610.0491438], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], 
["TENSOR", [4, 2, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 176, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.000339938], 0, 1.025542974472046, 1578988875.3407557], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [8, 4, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 299, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.000387532], 0, 1.095754861831665, 1578988972.0000997], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [16, 8, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 67, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 16]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.000294566], 0, 0.9454472064971924, 1578989137.6281488], "version": 0.2, "tvm_version": "0.7.dev0"} diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 20a7af2c5c1b..dca42de95ffc 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -35,6 +35,9 @@ from ..environment import get_env +# override to force partition at copy +reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) + # add clip vta strategy def compute_clip_vta(attrs, inputs, output_type): """ Clip operator. 
""" @@ -63,8 +66,7 @@ def clip_strategy_vta(attrs, inputs, out_type, target): @autotvm.register_topi_compute("add.vta") def add_packed(cfg, lhs, rhs): - ret = topi.add(lhs, rhs) - return ret + return topi.add(lhs, rhs) @autotvm.register_topi_compute("multiply.vta") diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index abb152d32314..9a340c6d0406 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -549,7 +549,6 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): allow_fold = True _check_compact(dst) - x_size, y_size, x_stride, offset = _get_2d_pattern( src, elem_width, elem_bytes, data_type, dst.scope, allow_fold=allow_fold) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 628f702de7fc..df20a8e87ed7 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -733,30 +733,30 @@ class InsnQueue : public BaseQueue { } // Helper function: Get Opcode string std::string getOpcodeString(int opcode, bool use_imm, int64_t imm) { - // The string name - if (opcode == VTA_ALU_OPCODE_MIN) { - if (use_imm) { - return std::string("min imm ") + std::to_string(imm); - } else { - return "min"; - } - } else if (opcode == VTA_ALU_OPCODE_MAX) { - if (use_imm) { - return (std::string("max imm ") + std::to_string(imm)); - } else { - return "max"; - } - } else if (opcode == VTA_ALU_OPCODE_ADD) { - if (use_imm) { - return (std::string("add imm ") + std::to_string(imm)); - } else { - return "add"; - } - } else if (opcode == VTA_ALU_OPCODE_SHR) { - return (std::string("shr ") + std::to_string(imm)); - } else if (opcode == VTA_ALU_OPCODE_MUL) { - return "mul"; - } + // The string name + if (opcode == VTA_ALU_OPCODE_MIN) { + if (use_imm) { + return std::string("min imm ") + std::to_string(imm); + } else { + return "min"; + } + } else if (opcode == VTA_ALU_OPCODE_MAX) { + if (use_imm) { + return (std::string("max imm ") + std::to_string(imm)); + } else { + return "max"; + } + } else if (opcode == VTA_ALU_OPCODE_ADD) { + if (use_imm) { + return (std::string("add imm ") + std::to_string(imm)); + } else { + return "add"; + } + } else if (opcode == VTA_ALU_OPCODE_SHR) { + return (std::string("shr ") + std::to_string(imm)); + } else if (opcode == VTA_ALU_OPCODE_MUL) { + return "mul"; + } return "unknown op"; } @@ -832,7 +832,7 @@ class InsnQueue : public BaseQueue { } // Dump instructions in the queue - void DumpInsn(FILE* out = stderr, bool json=false) { + void DumpInsn(FILE* out = stderr, bool json = false) { // Keep tabs on dependence queues int l2g_queue = 0; int g2l_queue = 0; @@ -1265,8 +1265,8 @@ class CommandQueue { // FIXME(zhanghao): It is required to use force_serial // by using skip and sync at the final layer. - // By doing this, we can avoid do DeviceCopy every time - // consider to make it as a flag when mature + // By doing this, we can avoid do DeviceCopy every time. 
+ // TODO: Consider to make it as a flag when mature const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once && skip) { if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index 2998b1c57fc9..f1638ba49432 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -104,7 +104,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals stop_pack = "nn.global_avg_pool2d" # Tuning option -log_file = "%s.%s.log" % (device, network) +log_file = "%s.alu.%s.log" % (device, network) tuning_option = { 'log_filename': log_file, @@ -267,11 +267,8 @@ def tune_and_evaluate(tuning_opt): tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 8e2da559c6c2..e9d4c48e55e7 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -341,11 +341,8 @@ def tune_and_evaluate(tuning_opt): tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() @@ -393,9 +390,6 @@ def tune_and_evaluate(tuning_opt): # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[log_file]): - # recompile the programs with device annotations - print("Recompile") - relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) # Compile network print("Compile...") if target.device_name != "vta": @@ -405,14 +399,10 @@ def tune_and_evaluate(tuning_opt): params=params, target_host=env.target_host) else: - targets = { - "cpu": env.target_vta_cpu, - "ext_dev": env.target - } - with vta.build_config(opt_level=3, debug_flag=32, disabled_pass={"AlterOpLayout"}): + with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, - target=targets, + target=target, params=params, target_host=env.target_host) @@ -425,8 +415,7 @@ def tune_and_evaluate(tuning_opt): # Generate the graph runtime ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - ctxes = [ctx, remote.cpu(0)] - m = graph_runtime.create(graph, lib, ctxes) + m = graph_runtime.create(graph, lib, ctx) # upload parameters to device image = tvm.nd.array( diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 63907b996734..33f59bd0e701 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -121,12 +121,7 @@ # by passing the path to the bitstream file instead of None. 
reconfig_start = time.time() vta.reconfig_runtime(remote) - # vta.program_fpga(remote, bitstream=None) - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.program_fpga(remote, bitstream) - + vta.program_fpga(remote, bitstream=None) reconfig_time = time.time() - reconfig_start print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) @@ -261,7 +256,7 @@ m.set_input('data', image) # Perform inference and gather execution statistics -# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator +# More on: :py:method:`tvm.runtime.Module.time_evaluator` num = 4 # number of times we run module for a single measurement rep = 3 # number of measurements (we derive std dev from this) timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) From 02b3ea0cb187dde99b342d64b40bfbf047c67464 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 18 Jun 2020 17:42:55 +0800 Subject: [PATCH 38/44] coding style --- cmake/modules/VTA.cmake | 2 +- src/relay/quantize/realize.cc | 6 ++++-- src/relay/transforms/device_annotation.cc | 4 +++- tests/lint/check_file_type.py | 2 ++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index b586800efe2d..cf21ca7c0495 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -104,7 +104,7 @@ elseif(PYTHON) file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc) elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) - file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cpp) + file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cc) list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h) endif() diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index cafae6c2146c..dcf58f12ea56 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -330,7 +330,8 @@ float ChooseDomScale(const std::vector& nptrs) { /* \brief Unify the dom scale of arguments */ Array UnifyDTypeScale(const Array& ref_args, const Array& args, - DataType* dtype_ptr, Expr* scale_ptr, DataType dtype = DataType::Void()) { + DataType* dtype_ptr, Expr* scale_ptr, + DataType dtype = DataType::Void()) { static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); @@ -386,7 +387,8 @@ Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectR Expr dom_scale; // execute the operation with activation data type. 
const QConfig& cfg = QConfig::Current(); - Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation); + Array ret_args = UnifyDTypeScale(ref_call->args, new_args, + &dtype, &dom_scale, cfg->dtype_activation); for (size_t i = 0; i < ret_args.size(); ++i) { // do not fuse float32 arg if (new_args[i].as()->dtype == DataType::Float(32)) { diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 319f9ba59064..fe3cfebf7fe3 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -538,7 +538,9 @@ Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } -Map CollectDeviceAnnotationOps(const Expr& expr) { return AnnotatationVisitor::GetAnnotations(expr); } +Map CollectDeviceAnnotationOps(const Expr& expr) { + return AnnotatationVisitor::GetAnnotations(expr); +} TVM_REGISTER_GLOBAL("relay.analysis.CollectDeviceInfo").set_body_typed(CollectDeviceInfo); diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index da3a456dafb6..36bc66ec1784 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -77,6 +77,8 @@ "tokens", # interface definition "idl", + # opencl file + "cl", } # List of file names allowed From a1cd048bea8ffe17f41dcd0595b97cb9d4e15253 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 18 Jun 2020 23:05:15 +0800 Subject: [PATCH 39/44] update vta-hw commit --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index f0347e202966..ed466d70d01c 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit f0347e202966322fe6a961eab2f4ff963bced2d5 +Subproject commit ed466d70d01c57cde4fde602c8c593b6a8acc531 From 6960c6a38933704d520ace51751a1e866fb8d494 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 18 Jun 2020 23:52:08 +0800 Subject: [PATCH 40/44] lint --- vta/tutorials/autotvm/tune_alu_vta.py | 2 +- vta/tutorials/autotvm/tune_relay_vta.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index f1638ba49432..3e2c1c57b41b 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -268,7 +268,7 @@ def tune_and_evaluate(tuning_opt): timeout=10000) # Reconfigure the JIT runtime and FPGA. vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream) + vta.program_fpga(remote, bitstream=None) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index e9d4c48e55e7..9ae54cba0992 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -341,8 +341,8 @@ def tune_and_evaluate(tuning_opt): tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) else: # In simulation mode, host the RPC server locally. 
remote = rpc.LocalSession() From 14020b7ac8319b1766e82950c99eea9a60d49e6c Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Sat, 20 Jun 2020 23:27:49 +0800 Subject: [PATCH 41/44] clean up unneeded code --- vta/tutorials/autotvm/tune_alu_vta.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index 3e2c1c57b41b..cf4922450ce5 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -290,8 +290,6 @@ def tune_and_evaluate(tuning_opt): tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) # filter out float alu task tasks = list(filter(lambda t: t.args[0][2] != "float32", tasks)) - # filter const rhs, which will be fused with conv2d - # tasks = list(filter(lambda t: len(t.args[1][1]) < 1, tasks)) # We should have extracted 10 convolution tasks tasks_set = {} From b6c1763ffcc04682fca2a62e3d873561185df68c Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Thu, 16 Jul 2020 01:45:27 +0800 Subject: [PATCH 42/44] Move AOCLUtils from Intel FPGA into 3rdparty directory --- 3rdparty/aoclutils/aocl_utils.h | 32 ++ 3rdparty/aoclutils/opencl.cc | 555 +++++++++++++++++++++++++++++++ 3rdparty/aoclutils/opencl.h | 122 +++++++ 3rdparty/aoclutils/options.cc | 105 ++++++ 3rdparty/aoclutils/options.h | 137 ++++++++ 3rdparty/aoclutils/scoped_ptrs.h | 165 +++++++++ 3rdparty/vta-hw | 2 +- cmake/modules/VTA.cmake | 3 +- 8 files changed, 1119 insertions(+), 2 deletions(-) create mode 100644 3rdparty/aoclutils/aocl_utils.h create mode 100644 3rdparty/aoclutils/opencl.cc create mode 100644 3rdparty/aoclutils/opencl.h create mode 100644 3rdparty/aoclutils/options.cc create mode 100644 3rdparty/aoclutils/options.h create mode 100644 3rdparty/aoclutils/scoped_ptrs.h diff --git a/3rdparty/aoclutils/aocl_utils.h b/3rdparty/aoclutils/aocl_utils.h new file mode 100644 index 000000000000..70e0fc6bcc0a --- /dev/null +++ b/3rdparty/aoclutils/aocl_utils.h @@ -0,0 +1,32 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Main include file for AOCLUtils. Includes all other utility header files. 
+ +#ifndef AOCL_UTILS_H +#define AOCL_UTILS_H + +#include "opencl.h" +#include "scoped_ptrs.h" +#include "options.h" + +#endif + diff --git a/3rdparty/aoclutils/opencl.cc b/3rdparty/aoclutils/opencl.cc new file mode 100644 index 000000000000..04d989d7c9ea --- /dev/null +++ b/3rdparty/aoclutils/opencl.cc @@ -0,0 +1,555 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include + +#ifdef _WIN32 // Windows +#include +#else // Linux +#include +#include // readlink, chdir +#endif + +namespace aocl_utils { + +static const char *const VERSION_STR = "161"; + +////////////////////////////////////////// +// Host allocation functions for alignment +////////////////////////////////////////// + +// This is the minimum alignment requirement to ensure DMA can be used. 
+const unsigned AOCL_ALIGNMENT = 64; + +#ifdef _WIN32 // Windows +void *alignedMalloc(size_t size) { + return _aligned_malloc (size, AOCL_ALIGNMENT); +} + +void alignedFree(void * ptr) { + _aligned_free(ptr); +} +#else // Linux +void *alignedMalloc(size_t size) { + void *result = NULL; + int rc; + rc = posix_memalign (&result, AOCL_ALIGNMENT, size); + (void) rc; + return result; +} + +void alignedFree(void * ptr) { + free (ptr); +} +#endif + +/////////////////////////////// +// Error functions +/////////////////////////////// + +// Print the error associciated with an error code +void printError(cl_int error) { + // Print error message + switch(error) + { + case -1: + printf("CL_DEVICE_NOT_FOUND "); + break; + case -2: + printf("CL_DEVICE_NOT_AVAILABLE "); + break; + case -3: + printf("CL_COMPILER_NOT_AVAILABLE "); + break; + case -4: + printf("CL_MEM_OBJECT_ALLOCATION_FAILURE "); + break; + case -5: + printf("CL_OUT_OF_RESOURCES "); + break; + case -6: + printf("CL_OUT_OF_HOST_MEMORY "); + break; + case -7: + printf("CL_PROFILING_INFO_NOT_AVAILABLE "); + break; + case -8: + printf("CL_MEM_COPY_OVERLAP "); + break; + case -9: + printf("CL_IMAGE_FORMAT_MISMATCH "); + break; + case -10: + printf("CL_IMAGE_FORMAT_NOT_SUPPORTED "); + break; + case -11: + printf("CL_BUILD_PROGRAM_FAILURE "); + break; + case -12: + printf("CL_MAP_FAILURE "); + break; + case -13: + printf("CL_MISALIGNED_SUB_BUFFER_OFFSET "); + break; + case -14: + printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST "); + break; + + case -30: + printf("CL_INVALID_VALUE "); + break; + case -31: + printf("CL_INVALID_DEVICE_TYPE "); + break; + case -32: + printf("CL_INVALID_PLATFORM "); + break; + case -33: + printf("CL_INVALID_DEVICE "); + break; + case -34: + printf("CL_INVALID_CONTEXT "); + break; + case -35: + printf("CL_INVALID_QUEUE_PROPERTIES "); + break; + case -36: + printf("CL_INVALID_COMMAND_QUEUE "); + break; + case -37: + printf("CL_INVALID_HOST_PTR "); + break; + case -38: + printf("CL_INVALID_MEM_OBJECT "); + break; + case -39: + printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR "); + break; + case -40: + printf("CL_INVALID_IMAGE_SIZE "); + break; + case -41: + printf("CL_INVALID_SAMPLER "); + break; + case -42: + printf("CL_INVALID_BINARY "); + break; + case -43: + printf("CL_INVALID_BUILD_OPTIONS "); + break; + case -44: + printf("CL_INVALID_PROGRAM "); + break; + case -45: + printf("CL_INVALID_PROGRAM_EXECUTABLE "); + break; + case -46: + printf("CL_INVALID_KERNEL_NAME "); + break; + case -47: + printf("CL_INVALID_KERNEL_DEFINITION "); + break; + case -48: + printf("CL_INVALID_KERNEL "); + break; + case -49: + printf("CL_INVALID_ARG_INDEX "); + break; + case -50: + printf("CL_INVALID_ARG_VALUE "); + break; + case -51: + printf("CL_INVALID_ARG_SIZE "); + break; + case -52: + printf("CL_INVALID_KERNEL_ARGS "); + break; + case -53: + printf("CL_INVALID_WORK_DIMENSION "); + break; + case -54: + printf("CL_INVALID_WORK_GROUP_SIZE "); + break; + case -55: + printf("CL_INVALID_WORK_ITEM_SIZE "); + break; + case -56: + printf("CL_INVALID_GLOBAL_OFFSET "); + break; + case -57: + printf("CL_INVALID_EVENT_WAIT_LIST "); + break; + case -58: + printf("CL_INVALID_EVENT "); + break; + case -59: + printf("CL_INVALID_OPERATION "); + break; + case -60: + printf("CL_INVALID_GL_OBJECT "); + break; + case -61: + printf("CL_INVALID_BUFFER_SIZE "); + break; + case -62: + printf("CL_INVALID_MIP_LEVEL "); + break; + case -63: + printf("CL_INVALID_GLOBAL_WORK_SIZE "); + break; + default: + printf("UNRECOGNIZED ERROR CODE (%d)", error); + } +} + 
+// Print line, file name, and error code if there is an error. Exits the +// application upon error. +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...) { + // If not successful + if(error != CL_SUCCESS) { + // Print line and file + printf("ERROR: "); + printError(error); + printf("\nLocation: %s:%d\n", file, line); + + // Print custom message. + va_list vl; + va_start(vl, msg); + vprintf(msg, vl); + printf("\n"); + va_end(vl); + + // Cleanup and bail. + cleanup(); + exit(error); + } +} + +// Sets the current working directory to be the same as the directory +// containing the running executable. +bool setCwdToExeDir() { +#ifdef _WIN32 // Windows + HMODULE hMod = GetModuleHandle(NULL); + char path[MAX_PATH]; + GetModuleFileNameA(hMod, path, MAX_PATH); + +#else // Linux + // Get path of executable. + char path[300]; + ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1); + if(n == -1) { + return false; + } + path[n] = 0; +#endif + + // Find the last '\' or '/' and terminate the path there; it is now + // the directory containing the executable. + size_t i; + for(i = strlen(path) - 1; i > 0 && path[i] != '/' && path[i] != '\\'; --i); + path[i] = '\0'; + + // Change the current directory. +#ifdef _WIN32 // Windows + SetCurrentDirectoryA(path); +#else // Linux + int rc; + rc = chdir(path); + (void) rc; +#endif + + return true; +} + +// Searches all platforms for the first platform whose name +// contains the search string (case-insensitive). +cl_platform_id findPlatform(const char *platform_name_search) { + cl_int status; + + std::string search = platform_name_search; + std::transform(search.begin(), search.end(), search.begin(), tolower); + + // Get number of platforms. + cl_uint num_platforms; + status = clGetPlatformIDs(0, NULL, &num_platforms); + checkError(status, "Query for number of platforms failed"); + + // Get a list of all platform ids. + scoped_array pids(num_platforms); + status = clGetPlatformIDs(num_platforms, pids, NULL); + checkError(status, "Query for all platform ids failed"); + + // For each platform, get name and compare against the search string. + for(unsigned i = 0; i < num_platforms; ++i) { + std::string name = getPlatformName(pids[i]); + + // Convert to lower case. + std::transform(name.begin(), name.end(), name.begin(), tolower); + + if(name.find(search) != std::string::npos) { + // Found! + return pids[i]; + } + } + + // No platform found. + return NULL; +} + +// Returns the platform name. +std::string getPlatformName(cl_platform_id pid) { + cl_int status; + + size_t sz; + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz); + checkError(status, "Query for platform name size failed"); + + scoped_array name(sz); + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL); + checkError(status, "Query for platform name failed"); + + return name.get(); +} + +// Returns the device name. +std::string getDeviceName(cl_device_id did) { + cl_int status; + + size_t sz; + status = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &sz); + checkError(status, "Failed to get device name size"); + + scoped_array name(sz); + status = clGetDeviceInfo(did, CL_DEVICE_NAME, sz, name, NULL); + checkError(status, "Failed to get device name"); + + return name.get(); +} + +// Returns the list of all devices. 
+cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) { + cl_int status; + + status = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices); + checkError(status, "Query for number of devices failed"); + + cl_device_id *dids = new cl_device_id[*num_devices]; + status = clGetDeviceIDs(pid, dev_type, *num_devices, dids, NULL); + checkError(status, "Query for device ids"); + + return dids; +} + +// Create a program for all devices associated with the context. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) { + // Early exit for potentially the most common way to fail: AOCX does not exist. + if(!fileExists(binary_file_name)) { + printf("AOCX file '%s' does not exist.\n", binary_file_name); + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + // Load the binary. + size_t binary_size; + scoped_array binary(loadBinaryFile(binary_file_name, &binary_size)); + if(binary == NULL) { + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + scoped_array binary_lengths(num_devices); + scoped_array binaries(num_devices); + for(unsigned i = 0; i < num_devices; ++i) { + binary_lengths[i] = binary_size; + binaries[i] = binary; + } + + cl_int status; + scoped_array binary_status(num_devices); + + cl_program program = clCreateProgramWithBinary(context, num_devices, devices, binary_lengths, + (const unsigned char **) binaries.get(), binary_status, &status); + checkError(status, "Failed to create program with binary"); + for(unsigned i = 0; i < num_devices; ++i) { + checkError(binary_status[i], "Failed to load binary for device"); + } + + return program; +} + +// Loads a file in binary form. +unsigned char *loadBinaryFile(const char *file_name, size_t *size) { + // Open the File + FILE* fp; +#ifdef _WIN32 + if(fopen_s(&fp, file_name, "rb") != 0) { + return NULL; + } +#else + fp = fopen(file_name, "rb"); + if(fp == 0) { + return NULL; + } +#endif + + // Get the size of the file + fseek(fp, 0, SEEK_END); + *size = ftell(fp); + + // Allocate space for the binary + unsigned char *binary = new unsigned char[*size]; + + // Go back to the file start + rewind(fp); + + // Read the file into the binary + if(fread((void*)binary, *size, 1, fp) == 0) { + delete[] binary; + fclose(fp); + return NULL; + } + + return binary; +} + +bool fileExists(const char *file_name) { +#ifdef _WIN32 // Windows + DWORD attrib = GetFileAttributesA(file_name); + return (attrib != INVALID_FILE_ATTRIBUTES && !(attrib & FILE_ATTRIBUTE_DIRECTORY)); +#else // Linux + return access(file_name, R_OK) != -1; +#endif +} + +std::string getBoardBinaryFile(const char *prefix, cl_device_id device) { + // First check if .aocx exists. Use it if it does. + std::string file_name = std::string(prefix) + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + + // Now get the name of the board. For Intel(R) FPGA SDK for OpenCL(TM) boards, + // the name of the device is presented as: + // : ... + std::string device_name = getDeviceName(device); + + // Now search for the " :" in the device name. + size_t end = device_name.find(" :"); + if(end != std::string::npos) { + std::string board_name(device_name, 0, end); + + // Look for a AOCX with the name __.aocx. + file_name = std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + } + + // At this point just use .aocx. 
This file doesn't exist + // and this should trigger an error later. + return std::string(prefix) + ".aocx"; +} + +// High-resolution timer. +double getCurrentTimestamp() { +#ifdef _WIN32 // Windows + // Use the high-resolution performance counter. + + static LARGE_INTEGER ticks_per_second = {}; + if(ticks_per_second.QuadPart == 0) { + // First call - get the frequency. + QueryPerformanceFrequency(&ticks_per_second); + } + + LARGE_INTEGER counter; + QueryPerformanceCounter(&counter); + + double seconds = double(counter.QuadPart) / double(ticks_per_second.QuadPart); + return seconds; +#else // Linux + timespec a; + clock_gettime(CLOCK_MONOTONIC, &a); + return (double(a.tv_nsec) * 1.0e-9) + double(a.tv_sec); +#endif +} + +cl_ulong getStartEndTime(cl_event event) { + cl_int status; + + cl_ulong start, end; + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + return end - start; +} + +cl_ulong getStartEndTime(cl_event *events, unsigned num_events) { + cl_int status; + + cl_ulong min_start = 0; + cl_ulong max_end = 0; + for(unsigned i = 0; i < num_events; ++i) { + cl_ulong start, end; + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + if(i == 0) { + min_start = start; + max_end = end; + } + else { + if(start < min_start) { + min_start = start; + } + if(end > max_end) { + max_end = end; + } + } + } + + return max_end - min_start; +} + +void waitMilliseconds(unsigned ms) { +#ifdef _WIN32 // Windows + Sleep(ms); +#else // Linux + timespec sleeptime = {0, 0}; + sleeptime.tv_sec = ms / 1000; + sleeptime.tv_nsec = long(ms % 1000) * 1000000L; // convert to nanoseconds + nanosleep(&sleeptime, NULL); +#endif +} + +void oclContextCallback(const char *errinfo, const void *, size_t, void *) { + printf("Context callback: %s\n", errinfo); +} + +} // ns aocl_utils + diff --git a/3rdparty/aoclutils/opencl.h b/3rdparty/aoclutils/opencl.h new file mode 100644 index 000000000000..4aa5348b67b1 --- /dev/null +++ b/3rdparty/aoclutils/opencl.h @@ -0,0 +1,122 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// OpenCL utility functions. + +#ifndef AOCL_UTILS_OPENCL_H +#define AOCL_UTILS_OPENCL_H + +#include +#include +#include +#include + +#include "CL/opencl.h" + +// This is assumed to be externally provided by the application. +extern void cleanup(); + +namespace aocl_utils { + +// Host allocation functions +void *alignedMalloc(size_t size); +void alignedFree(void *ptr); + +// Error functions +void printError(cl_int error); +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...); // does not return +#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__) + +// Sets the current working directory to the same directory that contains +// this executable. Returns true on success. +bool setCwdToExeDir(); + +// Find a platform that contains the search string in its name (case-insensitive match). +// Returns NULL if no match is found. +cl_platform_id findPlatform(const char *platform_name_search); + +// Returns the name of the platform. +std::string getPlatformName(cl_platform_id pid); + +// Returns the name of the device. +std::string getDeviceName(cl_device_id did); + +// Returns an array of device ids for the given platform and the +// device type. +// Return value must be freed with delete[]. +cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices); + +// Create a OpenCL program from a binary file. +// The program is created for all given devices associated with the context. The same +// binary is used for all devices. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices); + +// Load binary file. +// Return value must be freed with delete[]. +unsigned char *loadBinaryFile(const char *file_name, size_t *size); + +// Checks if a file exists. +bool fileExists(const char *file_name); + +// Returns the path to the AOCX file to use for the given device. +// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM). +// It uses the device name to get the board name and then looks for a +// corresponding AOCX file. Specifically, it gets the device name and +// extracts the board name assuming the device name has the following format: +// : ... +// +// Then the AOCX file is __.aocx. If this +// file does not exist, then the file name defaults to .aocx. +std::string getBoardBinaryFile(const char *prefix, cl_device_id device); + +// Returns the time from a high-resolution timer in seconds. This value +// can be used with a value returned previously to measure a high-resolution +// time difference. +double getCurrentTimestamp(); + +// Returns the difference between the CL_PROFILING_COMMAND_END and +// CL_PROFILING_COMMAND_START values of a cl_event object. +// This requires that the command queue associated with the event be created +// with the CL_QUEUE_PROFILING_ENABLE property. +// +// The return value is in nanoseconds. +cl_ulong getStartEndTime(cl_event event); + +// Returns the maximum time span for the given set of events. +// The time span starts at the earliest event start time. 
+// The time span ends at the latest event end time. +cl_ulong getStartEndTime(cl_event *events, unsigned num_events); + +// Wait for the specified number of milliseconds. +void waitMilliseconds(unsigned ms); + +// OpenCL context callback function that simply prints the error information +// to stdout (via printf). +void oclContextCallback(const char *errinfo, const void *, size_t, void *); + +} // ns aocl_utils + +#endif + diff --git a/3rdparty/aoclutils/options.cc b/3rdparty/aoclutils/options.cc new file mode 100644 index 000000000000..05d025b43faf --- /dev/null +++ b/3rdparty/aoclutils/options.cc @@ -0,0 +1,105 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include +#include +#include + +namespace aocl_utils { + +Options::Options() { +} + +Options::Options(int num, char *argv[]) { + addFromCommandLine(num, argv); +} + +bool Options::has(const std::string &name) const { + return m_options.find(name) != m_options.end(); +} + +std::string &Options::get(const std::string &name) { + return m_options[name]; +} + +const std::string &Options::get(const std::string &name) const { + OptionMap::const_iterator it = m_options.find(name); + if(it == m_options.end()) { + errorNonExistent(name); + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); + } + return it->second; +} + +void Options::addFromCommandLine(int num, char *argv[]) { + for(int i = 1; i < num; ++i) { + const std::string arg = argv[i]; + + // Look for the first '-'. + if(arg.size() > 1 && arg[0] == '-') { + size_t eq = arg.find('='); + size_t name_start = 1; + + // Check if there's a second '-'. + if(arg.size() > 2 && arg[1] == '-') { + name_start = 2; + } + + if(eq == std::string::npos) { + // No '='; treat as a boolean option. + set(arg.substr(name_start), true); + } + else if(eq == name_start) { + // No name?! + errorNameless(); + } + else { + set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1)); + } + } + else { + // Not an option. 
+ m_nonoptions.push_back(arg); + } + } +} + +void Options::errorNameless() const { + std::cerr << "No name provided for option.\n"; + exit(1); +} + +void Options::errorNonExistent(const std::string &name) const { + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); +} + +void Options::errorWrongType(const std::string &name) const { + std::cerr << "Value for option '" << name << "' is not of the right type (value = '" + << get(name) << "').\n"; + exit(1); +} + +} // ns aocl_utils + diff --git a/3rdparty/aoclutils/options.h b/3rdparty/aoclutils/options.h new file mode 100644 index 000000000000..78d34605e60e --- /dev/null +++ b/3rdparty/aoclutils/options.h @@ -0,0 +1,137 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Declares a utility class used to parse command-line options. + +#ifndef AOCL_UTILS_OPTIONS_H +#define AOCL_UTILS_OPTIONS_H + +#include +#include +#include +#include + +namespace aocl_utils { + +class Options { +public: + typedef std::vector StringVec; + + Options(); + Options(int num, char *argv[]); + + bool has(const std::string &name) const; + std::string &get(const std::string &name); // will create an empty option if it does not exist + const std::string &get(const std::string &name) const; // error if option does not exist + + void set(const std::string &name, const std::string &value) { get(name) = value; } + + // Command line options must be of the following form: + // [-]-name (indicates option exists) + // [-]-name=value + // + // This function assumes that the values are from main(int, char *). + // This means that the argv[0] is skipped. + void addFromCommandLine(int num, char *argv[]); + + // This templated function converts the option value to the given type. + // An assert is raised if the conversion fails. + template + T get(const std::string &name) const; + + template + void set(const std::string &name, const T &value); + + // Non-options are arguments processed in addFromCommandLine + // that were not recognized as options. 
+  const StringVec &getNonOptions() const { return m_nonoptions; }
+  size_t getNonOptionCount() const { return m_nonoptions.size(); }
+  const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; }
+
+private:
+  typedef std::map<std::string, std::string> OptionMap;
+
+  // Displays an error message indicating that a nameless option
+  // was provided.
+  void errorNameless() const;
+
+  // Displays an error message indicating that the given option
+  // has the wrong type and then exits with an error code.
+  void errorWrongType(const std::string &name) const;
+
+  // Displays an error message indicating that the given option
+  // does not exist and then exits with an error code.
+  void errorNonExistent(const std::string &name) const;
+
+  OptionMap m_options;
+  StringVec m_nonoptions;
+
+  Options(const Options &); // not implemented
+  void operator =(const Options &); // not implemented
+};
+
+template<typename T>
+T Options::get(const std::string &name) const {
+  std::stringstream ss;
+  ss << get(name);
+
+  T v;
+  ss >> v;
+  if(ss.fail() || !ss.eof()) {
+    // Failed to parse or did not consume the whole string value.
+    errorWrongType(name);
+  }
+  return v;
+}
+
+// Specialization for bool.
+template<>
+inline bool Options::get<bool>(const std::string &name) const {
+  if(has(name)) {
+    const std::string &v = get(name);
+    if(v == "1") {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Specialization for std::string. Simply returns the option string.
+// Requires specialization because using stringstream to read the string
+// will stop at the first whitespace character (which is wrong).
+template<>
+inline std::string Options::get<std::string>(const std::string &name) const {
+  return get(name);
+}
+
+// This assumes the type T can be serialized to a string and back (when get
+// is called).
+template<typename T>
+void Options::set(const std::string &name, const T &value) {
+  std::stringstream ss;
+  ss << value;
+  set(name, ss.str());
+}
+
+} // ns aocl_utils
+
+#endif
+
diff --git a/3rdparty/aoclutils/scoped_ptrs.h b/3rdparty/aoclutils/scoped_ptrs.h
new file mode 100644
index 000000000000..b11085c5226e
--- /dev/null
+++ b/3rdparty/aoclutils/scoped_ptrs.h
@@ -0,0 +1,165 @@
+// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+// Scoped pointer definitions.
+
+#ifndef AOCL_UTILS_SCOPED_PTRS_H
+#define AOCL_UTILS_SCOPED_PTRS_H
+
+namespace aocl_utils {
+
+// Interface is essentially the combination of std::auto_ptr and boost's smart pointers,
+// along with some small extensions (auto conversion to T*).
+
+// scoped_ptr: assumes pointer was allocated with operator new; destroys with operator delete
+template<class T>
+class scoped_ptr {
+public:
+  typedef scoped_ptr<T> this_type;
+
+  scoped_ptr() : m_ptr(NULL) {}
+  scoped_ptr(T *ptr) : m_ptr(ptr) {}
+  ~scoped_ptr() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { delete m_ptr; m_ptr = ptr; }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_ptr(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+// scoped_array: assumes pointer was allocated with operator new[]; destroys with operator delete[]
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T.
+template<class T>
+class scoped_array {
+public:
+  typedef scoped_array<T> this_type;
+
+  scoped_array() : m_ptr(NULL) {}
+  scoped_array(T *ptr) : m_ptr(NULL) { reset(ptr); }
+  explicit scoped_array(size_t n) : m_ptr(NULL) { reset(n); }
+  ~scoped_array() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+  T &operator [](int index) const { return m_ptr[index]; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { delete[] m_ptr; m_ptr = ptr; }
+  void reset(size_t n) { reset(new T[n]); }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_array(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+// scoped_aligned_ptr: assumes pointer was allocated with alignedMalloc; destroys with alignedFree
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T
+template<class T>
+class scoped_aligned_ptr {
+public:
+  typedef scoped_aligned_ptr<T> this_type;
+
+  scoped_aligned_ptr() : m_ptr(NULL) {}
+  scoped_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); }
+  explicit scoped_aligned_ptr(size_t n) : m_ptr(NULL) { reset(n); }
+  ~scoped_aligned_ptr() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+  T &operator [](int index) const { return m_ptr[index]; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { if(m_ptr) alignedFree(m_ptr); m_ptr = ptr; }
+  void reset(size_t n) { reset((T*) alignedMalloc(sizeof(T) * n)); }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_aligned_ptr(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+#if USE_SVM_API == 1
+// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T
+template<class T>
+class scoped_SVM_aligned_ptr {
+public:
+  typedef scoped_SVM_aligned_ptr<T> this_type;
+
+  scoped_SVM_aligned_ptr() : m_ptr(NULL) {}
+  scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL)
{ reset(ptr); } + explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); } + ~scoped_SVM_aligned_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; } + void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + cl_context m_ctx; + + // noncopyable + scoped_SVM_aligned_ptr(const this_type &); + this_type &operator =(const this_type &); +}; +#endif /* USE_SVM_API == 1 */ + +} // ns aocl_utils + +#endif + diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index ed466d70d01c..98860a2a31ec 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit ed466d70d01c57cde4fde602c8c593b6a8acc531 +Subproject commit 98860a2a31ecc4aaf7c3346daa750d26193847e4 diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index cf21ca7c0495..6c35b8df07d0 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -104,7 +104,7 @@ elseif(PYTHON) file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc) elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) - file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cc) + file(GLOB AOCLUTIL_SRC 3rdparty/aoclutils/*.cc) list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h) endif() @@ -126,6 +126,7 @@ elseif(PYTHON) target_include_directories(vta PUBLIC "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules + target_include_directories(vta PUBLIC 3rdparty) target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) From c0f918ccbbd20e3141ac3f3e2c4b3fab2f41e581 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Sat, 18 Jul 2020 23:04:06 +0800 Subject: [PATCH 43/44] remove unnecessary comment --- vta/python/vta/transform.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index f53a8ae7923e..c6ba1b95a8cb 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -382,10 +382,6 @@ def _fold_buffer_dim(buf, scope, elem_block): def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold): elem_block = elem_bytes * 8 // elem_width - # remove the checking as we have load_int8 insn - # if buf.dtype != dtype: - # raise RuntimeError("Expect buffer type to be %s instead of %s" % - # (dtype, buf.dtype)) shape, strides = buf.shape, buf.strides if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0): raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block)) From 348fb91b6ba34b2df771d73d588370e612b5cd1e Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 20 Jul 2020 15:25:15 +0800 Subject: [PATCH 44/44] api to program intelfocl aocx --- 
vta/python/vta/program_bitstream.py | 10 +++++++++- vta/python/vta/rpc_client.py | 14 ++++++++++---- vta/tutorials/frontend/deploy_classification.py | 4 ++++ 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/vta/python/vta/program_bitstream.py b/vta/python/vta/program_bitstream.py index 62cb5f21d02a..9a48ba75378e 100644 --- a/vta/python/vta/program_bitstream.py +++ b/vta/python/vta/program_bitstream.py @@ -54,7 +54,13 @@ def de10nano_bitstream_program(bitstream_path): program = get_global_func("vta.de10nano.program") program(bitstream_path) -def bitstream_program(target, bitstream): +def intelfocl_bitstream_program(bitstream_path, mem_size=4*1024*1024*1024): + # pylint: disable=import-outside-toplevel + from tvm import get_global_func + program = get_global_func("vta.intelfocl.program") + program(bitstream_path, mem_size) + +def bitstream_program(target, bitstream, *args): if target in ['pynq', 'ultra96']: pynq_bitstream_program(bitstream) elif target in ['de10nano']: @@ -62,6 +68,8 @@ def bitstream_program(target, bitstream): elif target in ['sim', 'tsim']: # In simulation, bit stream programming is a no-op return + elif target in ['intelfocl']: + intelfocl_bitstream_program(bitstream, *args) else: raise RuntimeError("Unknown target {}".format(target)) diff --git a/vta/python/vta/rpc_client.py b/vta/python/vta/rpc_client.py index 097ea8e4a5cc..c76a8c77cb67 100644 --- a/vta/python/vta/rpc_client.py +++ b/vta/python/vta/rpc_client.py @@ -19,6 +19,8 @@ from .environment import get_env from .bitstream import download_bitstream, get_bitstream_path +from tvm import rpc +from vta import program_bitstream def reconfig_runtime(remote): """Reconfigure remote runtime based on current hardware spec. @@ -44,16 +46,20 @@ def program_fpga(remote, bitstream=None): bitstream : str, optional Path to a local bistream file. If unset, tries to download from cache server. """ + env = get_env() + if bitstream: assert os.path.isfile(bitstream) else: bitstream = get_bitstream_path() if not os.path.isfile(bitstream): - env = get_env() if env.TARGET == 'de10nano': return download_bitstream() - fprogram = remote.get_function("tvm.contrib.vta.init") - remote.upload(bitstream) - fprogram(os.path.basename(bitstream)) + if isinstance(remote, rpc.LocalSession): + program_bitstream.bitstream_program(env.TARGET, bitstream) + else: + fprogram = remote.get_function("tvm.contrib.vta.init") + remote.upload(bitstream) + fprogram(os.path.basename(bitstream)) diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 33f59bd0e701..a9676a0096e8 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -129,6 +129,10 @@ else: remote = rpc.LocalSession() + if env.TARGET in ["intelfocl"]: + # program intelfocl aocx + vta.program_fpga(remote, bitstream="vta_opencl.aocx") + # Get execution context from remote ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
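
For reference, a minimal sketch of how the programming API added in this patch can be exercised from Python. It is not part of the patch: the bitstream file name simply follows the tutorial hunk above, the explicit 2 GiB memory size is only an illustrative override of the 4 GiB default, and the snippet assumes a local VTA build whose config selects the "intelfocl" target.

    import vta
    from tvm import rpc
    from vta import program_bitstream

    env = vta.get_env()
    assert env.TARGET == "intelfocl"   # target comes from the VTA config

    # intelfocl boards are programmed from the local host, so a LocalSession is
    # used; program_fpga() detects it and calls bitstream_program() directly
    # instead of uploading the bitstream over RPC.
    remote = rpc.LocalSession()
    vta.program_fpga(remote, bitstream="vta_opencl.aocx")

    # Equivalent low-level call; the extra positional argument is forwarded to
    # intelfocl_bitstream_program() as mem_size (default: 4 GiB).
    program_bitstream.bitstream_program("intelfocl", "vta_opencl.aocx",
                                        2 * 1024 * 1024 * 1024)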