diff --git a/.gitignore b/.gitignore
index 506e54d93067..d61b23ee5248 100644
--- a/.gitignore
+++ b/.gitignore
@@ -234,3 +234,6 @@ conda/pkg
 # antlr files
 *.tokens
 *.interp
+
+*log*
+*.txt
diff --git a/3rdparty/aoclutils/aocl_utils.h b/3rdparty/aoclutils/aocl_utils.h
new file mode 100644
index 000000000000..70e0fc6bcc0a
--- /dev/null
+++ b/3rdparty/aoclutils/aocl_utils.h
@@ -0,0 +1,32 @@
+// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+// 
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+// Main include file for AOCLUtils. Includes all other utility header files.
+
+#ifndef AOCL_UTILS_H
+#define AOCL_UTILS_H
+
+#include "opencl.h"
+#include "scoped_ptrs.h"
+#include "options.h"
+
+#endif
+
diff --git a/3rdparty/aoclutils/opencl.cc b/3rdparty/aoclutils/opencl.cc
new file mode 100644
index 000000000000..04d989d7c9ea
--- /dev/null
+++ b/3rdparty/aoclutils/opencl.cc
@@ -0,0 +1,555 @@
+// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+// 
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+#include "aocl_utils.h"
+#include <algorithm>
+#include <stdarg.h>
+
+#ifdef _WIN32 // Windows
+#include <windows.h>
+#else         // Linux
+#include <stdio.h> 
+#include <unistd.h> // readlink, chdir
+#endif
+
+namespace aocl_utils {
+
+static const char *const VERSION_STR = "161";
+
+//////////////////////////////////////////
+// Host allocation functions for alignment
+//////////////////////////////////////////
+
+// This is the minimum alignment requirement to ensure DMA can be used.
+const unsigned AOCL_ALIGNMENT = 64;
+
+#ifdef _WIN32 // Windows
+void *alignedMalloc(size_t size) {
+  return _aligned_malloc (size, AOCL_ALIGNMENT);
+}
+
+void alignedFree(void * ptr) {
+  _aligned_free(ptr);
+}
+#else          // Linux
+void *alignedMalloc(size_t size) {
+  void *result = NULL;
+  int rc;
+  rc = posix_memalign (&result, AOCL_ALIGNMENT, size);
+  (void) rc;
+  return result;
+}
+
+void alignedFree(void * ptr) {
+  free (ptr);
+}
+#endif
+
+///////////////////////////////
+// Error functions
+///////////////////////////////
+
+// Print the error associciated with an error code
+void printError(cl_int error) {
+  // Print error message
+  switch(error)
+  {
+    case -1:
+      printf("CL_DEVICE_NOT_FOUND ");
+      break;
+    case -2:
+      printf("CL_DEVICE_NOT_AVAILABLE ");
+      break;
+    case -3:
+      printf("CL_COMPILER_NOT_AVAILABLE ");
+      break;
+    case -4:
+      printf("CL_MEM_OBJECT_ALLOCATION_FAILURE ");
+      break;
+    case -5:
+      printf("CL_OUT_OF_RESOURCES ");
+      break;
+    case -6:
+      printf("CL_OUT_OF_HOST_MEMORY ");
+      break;
+    case -7:
+      printf("CL_PROFILING_INFO_NOT_AVAILABLE ");
+      break;
+    case -8:
+      printf("CL_MEM_COPY_OVERLAP ");
+      break;
+    case -9:
+      printf("CL_IMAGE_FORMAT_MISMATCH ");
+      break;
+    case -10:
+      printf("CL_IMAGE_FORMAT_NOT_SUPPORTED ");
+      break;
+    case -11:
+      printf("CL_BUILD_PROGRAM_FAILURE ");
+      break;
+    case -12:
+      printf("CL_MAP_FAILURE ");
+      break;
+    case -13:
+      printf("CL_MISALIGNED_SUB_BUFFER_OFFSET ");
+      break;
+    case -14:
+      printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ");
+      break;
+
+    case -30:
+      printf("CL_INVALID_VALUE ");
+      break;
+    case -31:
+      printf("CL_INVALID_DEVICE_TYPE ");
+      break;
+    case -32:
+      printf("CL_INVALID_PLATFORM ");
+      break;
+    case -33:
+      printf("CL_INVALID_DEVICE ");
+      break;
+    case -34:
+      printf("CL_INVALID_CONTEXT ");
+      break;
+    case -35:
+      printf("CL_INVALID_QUEUE_PROPERTIES ");
+      break;
+    case -36:
+      printf("CL_INVALID_COMMAND_QUEUE ");
+      break;
+    case -37:
+      printf("CL_INVALID_HOST_PTR ");
+      break;
+    case -38:
+      printf("CL_INVALID_MEM_OBJECT ");
+      break;
+    case -39:
+      printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR ");
+      break;
+    case -40:
+      printf("CL_INVALID_IMAGE_SIZE ");
+      break;
+    case -41:
+      printf("CL_INVALID_SAMPLER ");
+      break;
+    case -42:
+      printf("CL_INVALID_BINARY ");
+      break;
+    case -43:
+      printf("CL_INVALID_BUILD_OPTIONS ");
+      break;
+    case -44:
+      printf("CL_INVALID_PROGRAM ");
+      break;
+    case -45:
+      printf("CL_INVALID_PROGRAM_EXECUTABLE ");
+      break;
+    case -46:
+      printf("CL_INVALID_KERNEL_NAME ");
+      break;
+    case -47:
+      printf("CL_INVALID_KERNEL_DEFINITION ");
+      break;
+    case -48:
+      printf("CL_INVALID_KERNEL ");
+      break;
+    case -49:
+      printf("CL_INVALID_ARG_INDEX ");
+      break;
+    case -50:
+      printf("CL_INVALID_ARG_VALUE ");
+      break;
+    case -51:
+      printf("CL_INVALID_ARG_SIZE ");
+      break;
+    case -52:
+      printf("CL_INVALID_KERNEL_ARGS ");
+      break;
+    case -53:
+      printf("CL_INVALID_WORK_DIMENSION ");
+      break;
+    case -54:
+      printf("CL_INVALID_WORK_GROUP_SIZE ");
+      break;
+    case -55:
+      printf("CL_INVALID_WORK_ITEM_SIZE ");
+      break;
+    case -56:
+      printf("CL_INVALID_GLOBAL_OFFSET ");
+      break;
+    case -57:
+      printf("CL_INVALID_EVENT_WAIT_LIST ");
+      break;
+    case -58:
+      printf("CL_INVALID_EVENT ");
+      break;
+    case -59:
+      printf("CL_INVALID_OPERATION ");
+      break;
+    case -60:
+      printf("CL_INVALID_GL_OBJECT ");
+      break;
+    case -61:
+      printf("CL_INVALID_BUFFER_SIZE ");
+      break;
+    case -62:
+      printf("CL_INVALID_MIP_LEVEL ");
+      break;
+    case -63:
+      printf("CL_INVALID_GLOBAL_WORK_SIZE ");
+      break;
+    default:
+      printf("UNRECOGNIZED ERROR CODE (%d)", error);
+  }
+}
+
+// Print line, file name, and error code if there is an error. Exits the
+// application upon error.
+void _checkError(int line,
+                 const char *file,
+                 cl_int error,
+                 const char *msg,
+                 ...) {
+  // If not successful
+  if(error != CL_SUCCESS) {
+    // Print line and file
+    printf("ERROR: ");
+    printError(error);
+    printf("\nLocation: %s:%d\n", file, line);
+
+    // Print custom message.
+    va_list vl;
+    va_start(vl, msg);
+    vprintf(msg, vl);
+    printf("\n");
+    va_end(vl);
+
+    // Cleanup and bail.
+    cleanup();
+    exit(error);
+  }
+}
+
+// Sets the current working directory to be the same as the directory
+// containing the running executable.
+bool setCwdToExeDir() {
+#ifdef _WIN32 // Windows
+  HMODULE hMod = GetModuleHandle(NULL);
+  char path[MAX_PATH];
+  GetModuleFileNameA(hMod, path, MAX_PATH);
+
+#else         // Linux
+  // Get path of executable.
+  char path[300];
+  ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1);
+  if(n == -1) {
+    return false;
+  }
+  path[n] = 0;
+#endif
+
+  // Find the last '\' or '/' and terminate the path there; it is now
+  // the directory containing the executable.
+  size_t i;
+  for(i = strlen(path) - 1; i > 0 && path[i] != '/' && path[i] != '\\'; --i);
+  path[i] = '\0';
+
+  // Change the current directory.
+#ifdef _WIN32 // Windows
+  SetCurrentDirectoryA(path);
+#else         // Linux
+  int rc;
+  rc = chdir(path);
+  (void) rc;
+#endif
+
+  return true;
+}
+
+// Searches all platforms for the first platform whose name
+// contains the search string (case-insensitive).
+cl_platform_id findPlatform(const char *platform_name_search) {
+  cl_int status;
+
+  std::string search = platform_name_search;
+  std::transform(search.begin(), search.end(), search.begin(), tolower);
+
+  // Get number of platforms.
+  cl_uint num_platforms;
+  status = clGetPlatformIDs(0, NULL, &num_platforms);
+  checkError(status, "Query for number of platforms failed");
+
+  // Get a list of all platform ids.
+  scoped_array<cl_platform_id> pids(num_platforms);
+  status = clGetPlatformIDs(num_platforms, pids, NULL);
+  checkError(status, "Query for all platform ids failed");
+
+  // For each platform, get name and compare against the search string.
+  for(unsigned i = 0; i < num_platforms; ++i) {
+    std::string name = getPlatformName(pids[i]);
+
+    // Convert to lower case.
+    std::transform(name.begin(), name.end(), name.begin(), tolower);
+
+    if(name.find(search) != std::string::npos) {
+      // Found!
+      return pids[i];
+    }
+  }
+
+  // No platform found.
+  return NULL;
+}
+
+// Returns the platform name.
+std::string getPlatformName(cl_platform_id pid) {
+  cl_int status;
+
+  size_t sz;
+  status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz);
+  checkError(status, "Query for platform name size failed");
+
+  scoped_array<char> name(sz);
+  status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL);
+  checkError(status, "Query for platform name failed");
+
+  return name.get();
+}
+
+// Returns the device name.
+std::string getDeviceName(cl_device_id did) {
+  cl_int status;
+
+  size_t sz;
+  status = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &sz);
+  checkError(status, "Failed to get device name size");
+
+  scoped_array<char> name(sz);
+  status = clGetDeviceInfo(did, CL_DEVICE_NAME, sz, name, NULL);
+  checkError(status, "Failed to get device name");
+
+  return name.get();
+}
+
+// Returns the list of all devices.
+cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) {
+  cl_int status;
+
+  status = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices);
+  checkError(status, "Query for number of devices failed");
+
+  cl_device_id *dids = new cl_device_id[*num_devices];
+  status = clGetDeviceIDs(pid, dev_type, *num_devices, dids, NULL);
+  checkError(status, "Query for device ids");
+
+  return dids;
+}
+
+// Create a program for all devices associated with the context.
+cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) {
+  // Early exit for potentially the most common way to fail: AOCX does not exist.
+  if(!fileExists(binary_file_name)) {
+    printf("AOCX file '%s' does not exist.\n", binary_file_name);
+    checkError(CL_INVALID_PROGRAM, "Failed to load binary file");
+  }
+
+  // Load the binary.
+  size_t binary_size;
+  scoped_array<unsigned char> binary(loadBinaryFile(binary_file_name, &binary_size));
+  if(binary == NULL) {
+    checkError(CL_INVALID_PROGRAM, "Failed to load binary file");
+  }
+
+  scoped_array<size_t> binary_lengths(num_devices);
+  scoped_array<unsigned char *> binaries(num_devices);
+  for(unsigned i = 0; i < num_devices; ++i) {
+    binary_lengths[i] = binary_size;
+    binaries[i] = binary;
+  }
+
+  cl_int status;
+  scoped_array<cl_int> binary_status(num_devices);
+
+  cl_program program = clCreateProgramWithBinary(context, num_devices, devices, binary_lengths,
+      (const unsigned char **) binaries.get(), binary_status, &status);
+  checkError(status, "Failed to create program with binary");
+  for(unsigned i = 0; i < num_devices; ++i) {
+    checkError(binary_status[i], "Failed to load binary for device");
+  }
+
+  return program;
+}
+
+// Loads a file in binary form.
+unsigned char *loadBinaryFile(const char *file_name, size_t *size) {
+  // Open the File
+  FILE* fp;
+#ifdef _WIN32
+  if(fopen_s(&fp, file_name, "rb") != 0) {
+    return NULL;
+  }
+#else
+  fp = fopen(file_name, "rb");
+  if(fp == 0) {
+    return NULL;
+  }
+#endif
+
+  // Get the size of the file
+  fseek(fp, 0, SEEK_END);
+  *size = ftell(fp);
+
+  // Allocate space for the binary
+  unsigned char *binary = new unsigned char[*size];
+
+  // Go back to the file start
+  rewind(fp);
+
+  // Read the file into the binary
+  if(fread((void*)binary, *size, 1, fp) == 0) {
+    delete[] binary;
+    fclose(fp);
+    return NULL;
+  }
+
+  return binary;
+}
+
+bool fileExists(const char *file_name) {
+#ifdef _WIN32 // Windows
+  DWORD attrib = GetFileAttributesA(file_name);
+  return (attrib != INVALID_FILE_ATTRIBUTES && !(attrib & FILE_ATTRIBUTE_DIRECTORY));
+#else         // Linux
+  return access(file_name, R_OK) != -1;
+#endif
+}
+
+std::string getBoardBinaryFile(const char *prefix, cl_device_id device) {
+  // First check if <prefix>.aocx exists. Use it if it does.
+  std::string file_name = std::string(prefix) + ".aocx";
+  if(fileExists(file_name.c_str())) {
+    return file_name;
+  }
+
+  // Now get the name of the board. For Intel(R) FPGA SDK for OpenCL(TM) boards,
+  // the name of the device is presented as:
+  //  <board name> : ...
+  std::string device_name = getDeviceName(device);
+
+  // Now search for the " :" in the device name.
+  size_t end = device_name.find(" :");
+  if(end != std::string::npos) {
+    std::string board_name(device_name, 0, end);
+
+    // Look for a AOCX with the name <prefix>_<board_name>_<version>.aocx.
+    file_name = std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx";
+    if(fileExists(file_name.c_str())) {
+      return file_name;
+    }
+  }
+
+  // At this point just use <prefix>.aocx. This file doesn't exist
+  // and this should trigger an error later.
+  return std::string(prefix) + ".aocx";
+}
+
+// High-resolution timer.
+double getCurrentTimestamp() {
+#ifdef _WIN32 // Windows
+  // Use the high-resolution performance counter.
+
+  static LARGE_INTEGER ticks_per_second = {};
+  if(ticks_per_second.QuadPart == 0) {
+    // First call - get the frequency.
+    QueryPerformanceFrequency(&ticks_per_second);
+  }
+
+  LARGE_INTEGER counter;
+  QueryPerformanceCounter(&counter);
+
+  double seconds = double(counter.QuadPart) / double(ticks_per_second.QuadPart);
+  return seconds;
+#else         // Linux
+  timespec a;
+  clock_gettime(CLOCK_MONOTONIC, &a);
+  return (double(a.tv_nsec) * 1.0e-9) + double(a.tv_sec);
+#endif
+}
+
+cl_ulong getStartEndTime(cl_event event) {
+  cl_int status;
+
+  cl_ulong start, end;
+  status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
+  checkError(status, "Failed to query event start time");
+  status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
+  checkError(status, "Failed to query event end time");
+
+  return end - start;
+}
+
+cl_ulong getStartEndTime(cl_event *events, unsigned num_events) {
+  cl_int status;
+
+  cl_ulong min_start = 0;
+  cl_ulong max_end = 0;
+  for(unsigned i = 0; i < num_events; ++i) {
+    cl_ulong start, end;
+    status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
+    checkError(status, "Failed to query event start time");
+    status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
+    checkError(status, "Failed to query event end time");
+
+    if(i == 0) {
+      min_start = start;
+      max_end = end;
+    }
+    else {
+      if(start < min_start) {
+        min_start = start;
+      }
+      if(end > max_end) {
+        max_end = end;
+      }
+    }
+  }
+
+  return max_end - min_start;
+}
+
+void waitMilliseconds(unsigned ms) {
+#ifdef _WIN32 // Windows
+  Sleep(ms);
+#else         // Linux
+  timespec sleeptime = {0, 0};
+  sleeptime.tv_sec = ms / 1000;
+  sleeptime.tv_nsec = long(ms % 1000) * 1000000L;  // convert to nanoseconds
+  nanosleep(&sleeptime, NULL);
+#endif
+}
+
+void oclContextCallback(const char *errinfo, const void *, size_t, void *) {
+  printf("Context callback: %s\n", errinfo);
+}
+
+} // ns aocl_utils
+
diff --git a/3rdparty/aoclutils/opencl.h b/3rdparty/aoclutils/opencl.h
new file mode 100644
index 000000000000..4aa5348b67b1
--- /dev/null
+++ b/3rdparty/aoclutils/opencl.h
@@ -0,0 +1,122 @@
+// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+// 
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+// OpenCL utility functions.
+
+#ifndef AOCL_UTILS_OPENCL_H
+#define AOCL_UTILS_OPENCL_H
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+
+#include "CL/opencl.h"
+
+// This is assumed to be externally provided by the application.
+extern void cleanup();
+
+namespace aocl_utils {
+
+// Host allocation functions
+void *alignedMalloc(size_t size);
+void alignedFree(void *ptr);
+
+// Error functions
+void printError(cl_int error);
+void _checkError(int line,
+								 const char *file,
+								 cl_int error,
+                 const char *msg,
+                 ...); // does not return
+#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__)
+
+// Sets the current working directory to the same directory that contains
+// this executable. Returns true on success.
+bool setCwdToExeDir();
+
+// Find a platform that contains the search string in its name (case-insensitive match).
+// Returns NULL if no match is found.
+cl_platform_id findPlatform(const char *platform_name_search);
+
+// Returns the name of the platform.
+std::string getPlatformName(cl_platform_id pid);
+
+// Returns the name of the device.
+std::string getDeviceName(cl_device_id did);
+
+// Returns an array of device ids for the given platform and the
+// device type.
+// Return value must be freed with delete[].
+cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices);
+
+// Create a OpenCL program from a binary file.
+// The program is created for all given devices associated with the context. The same
+// binary is used for all devices.
+cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices);
+
+// Load binary file.
+// Return value must be freed with delete[].
+unsigned char *loadBinaryFile(const char *file_name, size_t *size);
+
+// Checks if a file exists.
+bool fileExists(const char *file_name);
+
+// Returns the path to the AOCX file to use for the given device.
+// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM).
+// It uses the device name to get the board name and then looks for a
+// corresponding AOCX file. Specifically, it gets the device name and
+// extracts the board name assuming the device name has the following format:
+//  <board> : ...
+//
+// Then the AOCX file is <prefix>_<version>_<board>.aocx. If this
+// file does not exist, then the file name defaults to <prefix>.aocx.
+std::string getBoardBinaryFile(const char *prefix, cl_device_id device);
+
+// Returns the time from a high-resolution timer in seconds. This value
+// can be used with a value returned previously to measure a high-resolution
+// time difference.
+double getCurrentTimestamp();
+
+// Returns the difference between the CL_PROFILING_COMMAND_END and
+// CL_PROFILING_COMMAND_START values of a cl_event object.
+// This requires that the command queue associated with the event be created
+// with the CL_QUEUE_PROFILING_ENABLE property.
+//
+// The return value is in nanoseconds.
+cl_ulong getStartEndTime(cl_event event);
+
+// Returns the maximum time span for the given set of events.
+// The time span starts at the earliest event start time.
+// The time span ends at the latest event end time.
+cl_ulong getStartEndTime(cl_event *events, unsigned num_events);
+
+// Wait for the specified number of milliseconds.
+void waitMilliseconds(unsigned ms);
+
+// OpenCL context callback function that simply prints the error information
+// to stdout (via printf).
+void oclContextCallback(const char *errinfo, const void *, size_t, void *);
+
+} // ns aocl_utils
+
+#endif
+
diff --git a/3rdparty/aoclutils/options.cc b/3rdparty/aoclutils/options.cc
new file mode 100644
index 000000000000..05d025b43faf
--- /dev/null
+++ b/3rdparty/aoclutils/options.cc
@@ -0,0 +1,105 @@
+// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+// 
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+#include "aocl_utils.h"
+#include <algorithm>
+#include <iostream>
+#include <stdlib.h>
+#include <vector>
+
+namespace aocl_utils {
+
+Options::Options() {
+}
+
+Options::Options(int num, char *argv[]) {
+  addFromCommandLine(num, argv);
+}
+
+bool Options::has(const std::string &name) const {
+  return m_options.find(name) != m_options.end();
+}
+
+std::string &Options::get(const std::string &name) {
+  return m_options[name];
+}
+
+const std::string &Options::get(const std::string &name) const {
+  OptionMap::const_iterator it = m_options.find(name);
+  if(it == m_options.end()) {
+    errorNonExistent(name);
+    std::cerr << "Option '" << name << "' does not exist.\n";
+    exit(1);
+  }
+  return it->second;
+}
+
+void Options::addFromCommandLine(int num, char *argv[]) {
+  for(int i = 1; i < num; ++i) {
+    const std::string arg = argv[i];
+
+    // Look for the first '-'.
+    if(arg.size() > 1 && arg[0] == '-') {
+      size_t eq = arg.find('=');
+      size_t name_start = 1;
+
+      // Check if there's a second '-'.
+      if(arg.size() > 2 && arg[1] == '-') {
+        name_start = 2;
+      }
+
+      if(eq == std::string::npos) {
+        // No '='; treat as a boolean option.
+        set(arg.substr(name_start), true);
+      }
+      else if(eq == name_start) {
+        // No name?!
+        errorNameless();
+      }
+      else {
+        set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1));
+      }
+    }
+    else {
+      // Not an option.
+      m_nonoptions.push_back(arg);
+    }
+  }
+}
+
+void Options::errorNameless() const {
+  std::cerr << "No name provided for option.\n";
+  exit(1);
+}
+
+void Options::errorNonExistent(const std::string &name) const {
+  std::cerr << "Option '" << name << "' does not exist.\n";
+  exit(1);
+}
+
+void Options::errorWrongType(const std::string &name) const {
+  std::cerr << "Value for option '" << name << "' is not of the right type (value = '"
+            << get(name) << "').\n";
+  exit(1);
+}
+
+} // ns aocl_utils
+
diff --git a/3rdparty/aoclutils/options.h b/3rdparty/aoclutils/options.h
new file mode 100644
index 000000000000..78d34605e60e
--- /dev/null
+++ b/3rdparty/aoclutils/options.h
@@ -0,0 +1,137 @@
+// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+// 
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+// Declares a utility class used to parse command-line options.
+
+#ifndef AOCL_UTILS_OPTIONS_H
+#define AOCL_UTILS_OPTIONS_H
+
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace aocl_utils {
+
+class Options {
+public:
+  typedef std::vector<std::string> StringVec;
+
+  Options();
+  Options(int num, char *argv[]);
+
+  bool has(const std::string &name) const;
+  std::string &get(const std::string &name); // will create an empty option if it does not exist
+  const std::string &get(const std::string &name) const; // error if option does not exist
+
+  void set(const std::string &name, const std::string &value) { get(name) = value; }
+
+  // Command line options must be of the following form:
+  //  [-]-name (indicates option exists)
+  //  [-]-name=value
+  //
+  // This function assumes that the values are from main(int, char *).
+  // This means that the argv[0] is skipped.
+  void addFromCommandLine(int num, char *argv[]);
+
+  // This templated function converts the option value to the given type.
+  // An assert is raised if the conversion fails.
+  template<typename T>
+  T get(const std::string &name) const;
+
+  template<typename T>
+  void set(const std::string &name, const T &value);
+
+  // Non-options are arguments processed in addFromCommandLine
+  // that were not recognized as options.
+  const StringVec &getNonOptions() const { return m_nonoptions; }
+  size_t getNonOptionCount() const { return m_nonoptions.size(); }
+  const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; }
+
+private:
+  typedef std::map<std::string, std::string> OptionMap;
+
+  // Displays an error message indicating that a nameless option
+  // was provided.
+  void errorNameless() const;
+
+  // Displays an error message indicating that the given option
+  // has the wrong type and then exits with an error code.
+  void errorWrongType(const std::string &name) const;
+
+  // Displays an error message indicating that the given option
+  // does not exist and then exits with an error code.
+  void errorNonExistent(const std::string &name) const;
+
+  OptionMap m_options;
+  StringVec m_nonoptions;
+
+  Options(const Options &); // not implemented
+  void operator =(const Options &); // not implemented
+};
+
+template<typename T>
+T Options::get(const std::string &name) const {
+  std::stringstream ss;
+  ss << get(name);
+
+  T v;
+  ss >> v;
+  if(ss.fail() || !ss.eof()) {
+    // Failed to parse or did not consume the whole string value.
+    errorWrongType(name);
+  }
+  return v;
+}
+
+// Specialization for bool. 
+template<>
+inline bool Options::get<bool>(const std::string &name) const {
+  if(has(name)) {
+    const std::string &v = get(name);
+    if(v == "1") {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Specialization for std::string. Simply returns the option string.
+// Requires specialization because using stringstream to read the string
+// will stop at the first whitespace character (which is wrong).
+template<>
+inline std::string Options::get<std::string>(const std::string &name) const {
+  return get(name);
+}
+
+// This assumes the type T can be serialized to a string and back (when get
+// is called).
+template<typename T>
+void Options::set(const std::string &name, const T &value) {
+  std::stringstream ss;
+  ss << value;
+  set(name, ss.str());
+}
+
+} // ns aocl_utils
+
+#endif
+
diff --git a/3rdparty/aoclutils/scoped_ptrs.h b/3rdparty/aoclutils/scoped_ptrs.h
new file mode 100644
index 000000000000..b11085c5226e
--- /dev/null
+++ b/3rdparty/aoclutils/scoped_ptrs.h
@@ -0,0 +1,165 @@
+// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+// 
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+// Scoped pointer definitions.
+
+#ifndef AOCL_UTILS_SCOPED_PTRS_H
+#define AOCL_UTILS_SCOPED_PTRS_H
+
+namespace aocl_utils {
+
+// Interface is essentially the combination of std::auto_ptr and boost's smart pointers,
+// along with some small extensions (auto conversion to T*).
+
+// scoped_ptr: assumes pointer was allocated with operator new; destroys with operator delete
+template<typename T>
+class scoped_ptr {
+public:
+  typedef scoped_ptr<T> this_type;
+
+  scoped_ptr() : m_ptr(NULL) {}
+  scoped_ptr(T *ptr) : m_ptr(ptr) {}
+  ~scoped_ptr() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { delete m_ptr; m_ptr = ptr; }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_ptr(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+// scoped_array: assumes pointer was allocated with operator new[]; destroys with operator delete[]
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T.
+template<typename T>
+class scoped_array {
+public:
+  typedef scoped_array<T> this_type;
+
+  scoped_array() : m_ptr(NULL) {}
+  scoped_array(T *ptr) : m_ptr(NULL) { reset(ptr); }
+  explicit scoped_array(size_t n) : m_ptr(NULL) { reset(n); }
+  ~scoped_array() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+  T &operator [](int index) const { return m_ptr[index]; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { delete[] m_ptr; m_ptr = ptr; }
+  void reset(size_t n) { reset(new T[n]); }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_array(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+// scoped_aligned_ptr: assumes pointer was allocated with alignedMalloc; destroys with alignedFree
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T
+template<typename T>
+class scoped_aligned_ptr {
+public:
+  typedef scoped_aligned_ptr<T> this_type;
+
+  scoped_aligned_ptr() : m_ptr(NULL) {}
+  scoped_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); }
+  explicit scoped_aligned_ptr(size_t n) : m_ptr(NULL) { reset(n); }
+  ~scoped_aligned_ptr() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+  T &operator [](int index) const { return m_ptr[index]; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { if(m_ptr) alignedFree(m_ptr); m_ptr = ptr; }
+  void reset(size_t n) { reset((T*) alignedMalloc(sizeof(T) * n)); }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_aligned_ptr(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+#if USE_SVM_API == 1
+// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T
+template<typename T>
+class scoped_SVM_aligned_ptr {
+public:
+	typedef scoped_SVM_aligned_ptr<T> this_type;
+
+	scoped_SVM_aligned_ptr() : m_ptr(NULL) {}
+	scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); }
+	explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); }
+	~scoped_SVM_aligned_ptr() { reset(); }
+
+	T *get() const { return m_ptr; }
+	operator T *() const { return m_ptr; }
+	T *operator ->() const { return m_ptr; }
+	T &operator *() const { return *m_ptr; }
+	T &operator [](int index) const { return m_ptr[index]; }
+
+	this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+	void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; }
+	void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; }
+	T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+	T *m_ptr;
+	cl_context m_ctx;
+
+	// noncopyable
+	scoped_SVM_aligned_ptr(const this_type &);
+	this_type &operator =(const this_type &);
+};
+#endif /* USE_SVM_API == 1 */
+
+} // ns aocl_utils
+
+#endif
+
diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index db65157208ec..98860a2a31ec 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit db65157208ec8fabb7b548c94596211b9db04190
+Subproject commit 98860a2a31ecc4aaf7c3346daa750d26193847e4
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index 0117857783db..6c35b8df07d0 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -27,6 +27,9 @@ endif()
 
 message(STATUS "VTA build with VTA_HW_PATH=" ${VTA_HW_PATH})
 
+# enable picojson int type support
+add_definitions(-DPICOJSON_USE_INT64)
+
 if(MSVC)
   message(STATUS "VTA build is skipped in Windows..")
 elseif(PYTHON)
@@ -99,6 +102,11 @@ elseif(PYTHON)
       find_library(__cma_lib NAMES cma PATH /usr/lib)
     elseif(${VTA_TARGET} STREQUAL "de10nano")  # DE10-Nano rules
       file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc)
+    elseif(${VTA_TARGET} STREQUAL "intelfocl")  # Intel OpenCL for FPGA rules
+      file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc)
+      file(GLOB AOCLUTIL_SRC 3rdparty/aoclutils/*.cc)
+      list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC})
+      list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h)
     endif()
     # Target lib: vta
     add_library(vta SHARED ${FPGA_RUNTIME_SRCS})
@@ -117,6 +125,11 @@ elseif(PYTHON)
       target_include_directories(vta PUBLIC 3rdparty)
       target_include_directories(vta PUBLIC
         "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include")
+    elseif(${VTA_TARGET} STREQUAL "intelfocl")  # Intel OpenCL for FPGA rules
+      target_include_directories(vta PUBLIC 3rdparty)
+      target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include")
+      set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+      target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL)
     endif()
   endif()
 
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 9cef674d3db1..7b71b17f9886 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -32,6 +32,9 @@
 import tempfile
 
 import numpy as np
+import json
+import sys
+from importlib import import_module
 
 import tvm._ffi
 import tvm.ir.transform
@@ -192,6 +195,16 @@ def __init__(self,
                  timeout=10, n_parallel=None,
                  number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1,
                  check_correctness=False, enable_cpu_cache_flush=False):
+        static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL")
+        if static_tune:
+            if n_parallel is None or n_parallel > 1:
+                print("static tune only allows n_parallel == 1")
+                n_parallel = 1
+
+            if check_correctness == True:
+                print("static tune does not support check_correctness")
+                check_correctness = False
+
         super(RPCRunner, self).__init__(timeout, n_parallel)
 
         self.key = key
@@ -383,7 +396,15 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti
             measure_input.target.device_name == 'vta':
             # pylint: disable=import-outside-toplevel
             import vta
-            func = vta.build(s, args, target_host=task.target_host)
+
+            static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL")
+            if static_tune:
+                debug_flag = 1 << 6
+            else:
+                debug_flag = 0
+
+            with vta.build_config(debug_flag=debug_flag):
+                func = vta.build(s, args, target_host=task.target_host)
         else:
             with tvm.ir.transform.PassContext(config=opts):
                 func = build(s, args, target_host=task.target_host)
@@ -483,16 +504,10 @@ def run_through_rpc(measure_input, build_result,
 
     tic = time.time()
     errno = MeasureErrorNo.NO_ERROR
+    static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL")
     try:
         # upload built module
         remote = request_remote(*remote_args)
-        # Program the FPGA every single time when targeting VTA
-        if hasattr(measure_input.target, 'device_name') and \
-            measure_input.target.device_name == 'vta':
-            # pylint: disable=import-outside-toplevel
-            from vta import program_fpga, reconfig_runtime
-            program_fpga(remote, None)
-            reconfig_runtime(remote)
         remote.upload(build_result.filename)
         func = remote.load_module(os.path.split(build_result.filename)[1])
         ctx = remote.context(str(measure_input.target), 0)
@@ -517,12 +532,31 @@ def run_through_rpc(measure_input, build_result,
             args = [nd.array(x, ctx=ctx) for x in args]
             ctx.sync()
 
-        costs = time_f(*args).results
+        if static_tune is None:
+            time_f = func.time_evaluator(
+                func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms)
+            costs = time_f(*args).results
 
-        # clean up remote files
-        remote.remove(build_result.filename)
-        remote.remove(os.path.splitext(build_result.filename)[0] + '.so')
-        remote.remove('')
+            # clean up remote files
+            remote.remove(build_result.filename)
+            remote.remove(os.path.splitext(build_result.filename)[0] + '.so')
+            remote.remove('')
+        else:
+            func(*args)
+            cost = 0
+            insn_dump = os.getenv('TVM_INSN_DUMP_FILE', "insn.json")
+            insn_cost_file = os.getenv('TVM_INSN_COST_FILE', "cost.py")
+            path, filename = os.path.split(insn_cost_file)
+            sys.path.append(path)
+            module_path = filename[:-3]  # remove the .py suffix
+            module = import_module(module_path)
+            cal_cost = getattr(module, "cal_cost")
+            with open(insn_dump) as json_file:
+                insns = json.load(json_file)
+                for insn in insns:
+                    cost += cal_cost(insn)
+
+            costs = [cost] * repeat
 
         if len(costs) > 2:  # remove largest and smallest value to reduce variance
             costs = list(costs)
@@ -543,6 +577,10 @@ def run_through_rpc(measure_input, build_result,
             msg = msg[:msg.index("CUDA Source")]
         costs = (RuntimeError(msg[:1024]),)
         errno = MeasureErrorNo.RUNTIME_DEVICE
+    except Exception as exc:
+        costs = (exc,)
+        errno = MeasureErrorNo.UNKNOWN_ERROR
+
     tstamp = time.time()
     time.sleep(cooldown_interval)
     return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
@@ -570,6 +608,10 @@ def request_remote(device_key, host=None, port=None, priority=1, timeout=60):
     ------
     session: RPCSession
     """
+    static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL")
+    if static_tune:
+        return _rpc.LocalSession()
+
     # connect to the tracker
     host = host or os.environ['TVM_TRACKER_HOST']
     port = port or int(os.environ['TVM_TRACKER_PORT'])
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index fbf474fc4df7..53ed78a7570d 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -779,7 +779,7 @@ def _add_new_transform(self, space_class, name, axes, policy, **kwargs):
         return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))]
 
     def __len__(self):
-        if self._length is None:
+        if self._length is None or self._length <= 1:
             self._length = int(np.prod([len(x) for x in self.space_map.values()]))
         return self._length
 
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index 59e77f7d0098..25d1156e2af8 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -215,7 +215,7 @@ def _decorate(topi_schedule):
         @_register_task_schedule(task_name)
         def wrapper(outs, *args, **kwargs):
             """wrapper function for topi schedule"""
-            workload = get_workload(outs)
+            workload = get_workload(outs, task_name)
             if workload is None:
                 raise RuntimeError("Cannot find workload in attribute of this schedule")
             tgt = _target.Target.current()
@@ -227,17 +227,21 @@ def wrapper(outs, *args, **kwargs):
     return _decorate
 
 
-def get_workload(outs):
+def get_workload(outs, task_name=None):
     """Retrieve the workload from outputs"""
     def traverse(tensors):
         """traverse all ops to find attached workload"""
         for t in tensors:
             op = t.op
-            if 'workload' in op.attrs:
-                return args_to_workload(op.attrs['workload'])
             wkl = traverse(op.input_tensors)
             if wkl:
                 return wkl
+
+            if 'workload' in op.attrs:
+                ret = args_to_workload(op.attrs['workload'])
+                if ret[0] == task_name:
+                    return ret
         return None
+
     outs = [outs] if isinstance(outs, tensor.Tensor) else outs
     return traverse(outs)
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index cfc1b2c38f85..6c53be582b40 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -137,7 +137,7 @@ def __del__(self):
     format_si_prefix(0, si_prefix)
 
     if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
-        sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
+        sys.stdout.write('\r%s Current/Best: %7.4f/%7.4f GFLOPS | Progress: (%d/%d) '
                          '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic))
         sys.stdout.flush()
 
@@ -153,7 +153,7 @@ def _callback(tuner, inputs, results):
             ctx.cur_flops = flops
             ctx.best_flops = tuner.best_flops
 
-            sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f %sFLOPS | Progress: (%d/%d) '
+            sys.stdout.write('\r%s Current/Best: %7.4f/%7.4f %sFLOPS | Progress: (%d/%d) '
                              '| %.2f s' %
                              (prefix, format_si_prefix(ctx.cur_flops, si_prefix),
                               format_si_prefix(ctx.best_flops, si_prefix), si_prefix,
diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py
index 8f6dfc7f28ec..474741fc1e35 100644
--- a/python/tvm/contrib/util.py
+++ b/python/tvm/contrib/util.py
@@ -19,6 +19,7 @@
 import contextlib
 import datetime
 import os
+import sys
 import tempfile
 import threading
 import shutil
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
index feeec1fa89ec..a43d9d7913d5 100644
--- a/python/tvm/relay/op/_tensor.py
+++ b/python/tvm/relay/op/_tensor.py
@@ -87,6 +87,9 @@
 register_broadcast_schedule("fast_exp")
 register_broadcast_schedule("fast_tanh")
 register_broadcast_schedule("fast_erf")
+# a fake on_device schedule.
+# this will not be used in actual computation as on_device will be removed during DeviceAnnotation pass
+register_injective_schedule("on_device")
 
 
 # zeros
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 62c2948b51e2..eae4d5785211 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -33,6 +33,14 @@ def wrapper(attrs, outs, target):
             return topi_schedule(outs)
     return wrapper
 
+
+def wrap_topi_compute(topi_compute):
+    """Wrap TOPI compute which doesn't use attrs"""
+    def wrapper(attrs, inputs, out_type):
+        return [topi_compute(*inputs)]
+    return wrapper
+
+
 def get_conv2d_in_channels(data_shape, data_layout):
     """Get conv2d input channels"""
     data_shape = get_const_tuple(data_shape)
diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py
index 952a86466300..f902a0abf80e 100644
--- a/python/tvm/relay/quantize/_annotate.py
+++ b/python/tvm/relay/quantize/_annotate.py
@@ -173,6 +173,28 @@ def conv2d_rewrite(ref_call, new_args, ctx):
     return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
 
 
+@register_annotate_function("nn.conv2d_transpose")
+def conv2d_transpose_rewrite(ref_call, new_args, ctx):
+    """Rewrite function for conv2d_transpose. Lhs of conv will be quantized to
+    input field, and rhs of conv will be quantized to weight field.
+    Output would be in activation field"""
+    if quantize_context().check_to_skip(ref_call):
+        return None
+
+    lhs_expr, lhs_kind = _get_expr_kind(new_args[0])
+    rhs_expr, rhs_kind = _get_expr_kind(new_args[1])
+
+    if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION:
+        lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT)
+
+    assert rhs_kind is None
+    rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT)
+
+    expr = _forward_op(ref_call, [lhs_expr, rhs_expr])
+
+    return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION)
+
+
 @register_annotate_function("nn.dense")
 def dense_rewrite(ref_call, new_args, ctx):
     """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of
@@ -281,6 +303,7 @@ def identity_rewrite(ref_call, new_args, ctx):
     return QAnnotateExpr(ret_expr, x_kind)
 
 
+register_annotate_function("reshape", identity_rewrite)
 register_annotate_function("clip", identity_rewrite)
 register_annotate_function("nn.relu", identity_rewrite)
 register_annotate_function("strided_slice", identity_rewrite)
diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py
index b72f51c4add5..c16672628cea 100644
--- a/python/tvm/relay/quantize/_partition.py
+++ b/python/tvm/relay/quantize/_partition.py
@@ -52,6 +52,19 @@ def conv2d_partition_function(ref_call, new_args, ctx):
     return QPartitionExpr(ret)
 
 
+@register_partition_function("nn.conv2d_transpose")
+def conv2d_partition_function(ref_call, new_args, ctx):
+    """Rewrite function for conv2d_transpose for partition"""
+    data_cond, data = partition_expr_check(new_args[0])
+    kernel_cond, kernel = partition_expr_check(new_args[1])
+
+    assert not kernel_cond
+    if data_cond:
+        data = new_args[0].realize()
+    ret = _forward_op(ref_call, [data, kernel])
+    return QPartitionExpr(ret)
+
+
 def identity_partition_function(ref_call, new_args, ctx):
     cond, expr = partition_expr_check(new_args[0])
     if cond:
@@ -81,7 +94,7 @@ def add_partition_generic(ref_call, new_args, ctx):
         #     ...
         lhs = new_args[0].realize()
         rhs = new_args[1].realize()
-        return _forward_op(ref_call, [lhs, rhs])
+        return QPartitionExpr(_forward_op(ref_call, [lhs, rhs]))
     if not lhs_cond and rhs_cond:
         # - introduced by residual connection in ResNet
         #     ...
@@ -128,6 +141,7 @@ def mul_partition_generic(ref_call, new_args, ctx):
 
     if lhs_cond:
         # introduced by bn: multiply(out, scale)
+        lhs = new_args[0].realize()
         return QPartitionExpr(_forward_op(ref_call, [lhs, rhs]))
 
     if not lhs_cond and not rhs_cond:
@@ -153,3 +167,16 @@ def add_partition_function(ref_call, new_args, ctx):
 def multiply_partition_function(ref_call, new_args, ctx):
     """Rewrite function for ewise multiply for partition"""
     return mul_partition_generic(ref_call, new_args, ctx)
+
+
+# add cast after the relu op to make it run on vta
+@register_partition_function("nn.global_avg_pool2d")
+def global_avg_pool2d_partition_function(ref_call, new_args, ctx):
+    cond, expr = partition_expr_check(new_args[0])
+    if cond:
+        expr = new_args[0].realize()
+        return _forward_op(ref_call, [expr])
+    else:
+        expr = QPartitionExpr(new_args[0]).realize()
+        return _forward_op(ref_call, [expr])
+    return None
diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py
index 28ebf7f3032b..b7371a3c3068 100644
--- a/python/tvm/relay/quantize/quantize.py
+++ b/python/tvm/relay/quantize/quantize.py
@@ -209,10 +209,18 @@ def check_to_skip(self, ref_call):
             # check skip conv layers
             skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers]
             if self._conv2d_counter in skipped_indices:
-                if ref_call.op.name == 'nn.conv2d':
+                if ref_call.op.name == 'nn.conv2d' or ref_call.op.name == 'nn.conv2d_transpose':
                     self._conv2d_counter += 1
-                return True
-            if ref_call.op.name == 'nn.conv2d':
+                    return True
+                else:
+                    # counter is 0 before visiting the first conv2d
+                    # if the first conv2d is skipped, all ops before it will also be skipped
+                    # otherwise, we don't skip until the counter become +1
+                    if self._conv2d_counter == 0:
+                        return True
+                    else:
+                        return False
+            if ref_call.op.name == 'nn.conv2d' or ref_call.op.name == 'nn.conv2d_transpose':
                 self._conv2d_counter += 1
 
         return False
diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc
index f0634feac083..c9704e3fff4b 100644
--- a/src/arith/detect_linear_equation.cc
+++ b/src/arith/detect_linear_equation.cc
@@ -71,6 +71,16 @@ class LinearEqDetector : public ExprFunctor<LinearEqEntry(const PrimExpr&, const
     return ret;
   }
 
+  LinearEqEntry VisitExpr_(const FloorDivNode* op, const PrimExpr& e) final {
+    if (fail_) return LinearEqEntry();
+    LinearEqEntry a = VisitExpr(op->a, op->a);
+    LinearEqEntry b = VisitExpr(op->b, op->b);
+    LinearEqEntry ret;
+    ret.base = FloorDivCombine(a.base, b.base);
+    ret.coeff = FloorDivCombine(a.coeff, b.coeff);
+    return ret;
+  }
+
   LinearEqEntry VisitExpr_(const SubNode* op, const PrimExpr& e) final {
     if (fail_) return LinearEqEntry();
     LinearEqEntry a = VisitExpr(op->a, op->a);
@@ -138,6 +148,12 @@ class LinearEqDetector : public ExprFunctor<LinearEqEntry(const PrimExpr&, const
     if (!b.defined()) return b;
     return a * b;
   }
+  // Combine by div
+  PrimExpr FloorDivCombine(PrimExpr a, PrimExpr b) {
+    if (!a.defined()) return b;
+    if (!b.defined()) return a;
+    return FloorDiv(a, b);
+  }
 };
 
 Array<PrimExpr> DetectLinearEquation(const PrimExpr& e, const Array<Var>& vars) {
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index 2aae8546248f..367849112543 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -230,7 +230,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator<Array<te::Tensor>>
           << "Two complicated op in a primitive function "
           << " master=" << master_op_ << " current=" << op;
     }
-    if (op_pattern >= master_op_pattern_) {
+    if (op_pattern > master_op_pattern_) {
       master_op_ = op;
       master_attrs_ = call_node->attrs;
       master_op_pattern_ = op_pattern;
@@ -288,7 +288,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator<Array<te::Tensor>>
   tvm::Target target_;
   Op master_op_;
   Attrs master_attrs_;
-  int master_op_pattern_{0};
+  int master_op_pattern_{-1};
   OpImplementation master_implementation_;
   std::ostringstream readable_name_stream_;
   Array<te::Operation> scalars_;
diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
index 820e17f8a498..4a1bfd874b5c 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -309,6 +309,15 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
     if (match_range_ == 0) {
       return this->Alloc(prototype, size);
     }
+
+    // TODO(zhanghao): find a better way to do this
+    // We copy all the instructions of all layers in a single batch.
+    // To avoid overwrite shared storage, we do not re-use allocation
+    const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL");
+    if (sync_once) {
+      return this->Alloc(prototype, size);
+    }
+
     auto begin = free_.lower_bound(size / match_range_);
     auto mid = free_.lower_bound(size);
     auto end = free_.upper_bound(size * match_range_);
diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc
index 6be9b0d4a3d5..4db3f930d3b5 100644
--- a/src/relay/op/annotation/annotation.cc
+++ b/src/relay/op/annotation/annotation.cc
@@ -54,7 +54,13 @@ RELAY_REGISTER_OP("on_device")
     .add_type_rel("Identity", IdentityRel)
     .set_attr<TOpPattern>("TOpPattern", kOpaque)
     .set_attr<TOpIsStateful>("TOpIsStateful", false)
-    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout);
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+    .set_attr<FTVMCompute>("FTVMCompute",
+                           [] (const Attrs& attrs,
+                               const Array<te::Tensor>& inputs,
+                               const Type& out_type) -> Array<te::Tensor> {
+                           return {topi::identity(inputs[0])};
+                           });
 
 Expr StopFusion(Expr data) {
   static const Op& op = Op::Get("annotation.stop_fusion");
diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc
index ace2c2473173..228413cdc8ff 100644
--- a/src/relay/quantize/realize.cc
+++ b/src/relay/quantize/realize.cc
@@ -234,6 +234,41 @@ Expr Conv2dRealize(const Call& ref_call, const Array<Expr>& new_args, const Obje
 
 RELAY_REGISTER_OP("nn.conv2d").set_attr<FForwardRewrite>("FQRealizeRewrite", Conv2dRealize);
 
+Expr Conv2dTransposeRealize(const Call& ref_call,
+                   const Array<Expr>& new_args,
+                   const ObjectRef& ctx) {
+  const QConfig& cfg = QConfig::Current();
+  CHECK_EQ(new_args.size(), 2);
+  if (!new_args[0]->IsInstance<TempExprNode>() || !new_args[1]->IsInstance<TempExprNode>()) {
+    return Expr(nullptr);
+  }
+  const auto* lhs = new_args[0].as<QRealizeIntExprNode>();
+  CHECK(lhs);
+  const auto* rhs = new_args[1].as<QRealizeIntExprNode>();
+  CHECK(rhs);
+
+  Expr ldata = lhs->data;
+  if (lhs->dtype != cfg->dtype_input) {
+    ldata = Cast(ldata, cfg->dtype_input);
+  }
+  Expr rdata = Cast(rhs->data, cfg->dtype_weight);
+
+  const auto ref_attrs = ref_call->attrs.as<Conv2DTransposeAttrs>();
+  auto attrs = make_object<Conv2DTransposeAttrs>();
+  *attrs = *ref_attrs;
+  DataType out_dtype = cfg->dtype_activation;
+  attrs->out_dtype = out_dtype;
+
+  Expr ret = Call(ref_call->op,
+    {ldata, rdata}, Attrs(attrs), ref_call->type_args);
+  Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale);
+  Expr dom_scale = FoldConstantOpt(mul);
+  return QRealizeIntExpr(ret, dom_scale, out_dtype);
+}
+
+RELAY_REGISTER_OP("nn.conv2d_transpose")
+.set_attr<FForwardRewrite>("FQRealizeRewrite", Conv2dTransposeRealize);
+
 Expr DenseRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectRef& ctx) {
   const QConfig& cfg = QConfig::Current();
   CHECK_EQ(new_args.size(), 2);
@@ -309,7 +344,8 @@ float ChooseDomScale(const std::vector<const QRealizeIntExprNode*>& nptrs) {
 
 /* \brief Unify the dom scale of arguments */
 Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args, const Array<Expr>& args,
-                            DataType* dtype_ptr, Expr* scale_ptr) {
+                            DataType* dtype_ptr, Expr* scale_ptr,
+                            DataType dtype = DataType::Void()) {
   static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize");
   const QConfig& cfg = QConfig::Current();
 
@@ -324,13 +360,15 @@ Array<Expr> UnifyDTypeScale(const Array<Expr>& ref_args, const Array<Expr>& args
 
   // unify the data type
   CHECK_EQ(ref_args.size(), args.size());
-  DataType dtype;
 
-  if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) {
-    dtype = cfg->dtype_input;
-  } else {
-    dtype = cfg->dtype_activation;
+  if (dtype.is_void()) {
+    if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) {
+      dtype = cfg->dtype_input;
+    } else {
+      dtype = cfg->dtype_activation;
+    }
   }
+
   for (size_t i = 0; i < ret.size(); ++i) {
     auto ref_arg = ref_args[i].as<CallNode>();
     if (nptrs[i]->dtype != dtype) {
@@ -361,7 +399,16 @@ Expr AddRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectR
   if (new_args[0].as<QRealizeIntExprNode>() && new_args[1].as<QRealizeIntExprNode>()) {
     DataType dtype;
     Expr dom_scale;
-    Array<Expr> ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale);
+    // execute the operation with activation data type.
+    const QConfig& cfg = QConfig::Current();
+    Array<Expr> ret_args = UnifyDTypeScale(ref_call->args, new_args,
+                                           &dtype, &dom_scale, cfg->dtype_activation);
+    for (size_t i = 0; i < ret_args.size(); ++i) {
+      // do not fuse float32 arg
+      if (new_args[i].as<QRealizeIntExprNode>()->dtype == DataType::Float(32)) {
+        ret_args.Set(i, StopFusion(ret_args[i]));
+      }
+    }
     Expr ret = ForwardOp(ref_call, ret_args);
     return QRealizeIntExpr(ret, dom_scale, dtype);
   }
@@ -430,6 +477,8 @@ Expr IdentityRealize(const Call& ref_call, const Array<Expr>& new_args, const Ob
 
 RELAY_REGISTER_OP("nn.relu").set_attr<FForwardRewrite>("FQRealizeRewrite", IdentityRealize);
 
+RELAY_REGISTER_OP("reshape").set_attr<FForwardRewrite>("FQRealizeRewrite", IdentityRealize);
+
 RELAY_REGISTER_OP("strided_slice").set_attr<FForwardRewrite>("FQRealizeRewrite", IdentityRealize);
 
 RELAY_REGISTER_OP("nn.batch_flatten")
diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc
index 39cf563f730a..fe3cfebf7fe3 100644
--- a/src/relay/transforms/device_annotation.cc
+++ b/src/relay/transforms/device_annotation.cc
@@ -386,22 +386,38 @@ class DeviceInfo {
     }
 
     void VisitExpr_(const ConstantNode* cn) final {
-      post_dfs_order_.push_back(std::make_pair(cn, has_copy_));
+      device_tag_[cn] = dev_type_;
     }
 
     void VisitExpr_(const CallNode* call) final {
       // Skip annotation nodes.
       if (!IsOnDeviceNode(call)) {
-        if (GetDeviceCopyNode(call)) {
+        if (const auto* node = GetDeviceCopyNode(call)) {
+          CHECK(node->IsInstance<CallNode>());
+          const auto* call_node = static_cast<const CallNode*>(node);
+          auto attrs = call_node->attrs.as<DeviceCopyAttrs>();
+
           num_device_copy_ops_++;
           bool has_copy_prev = has_copy_;
           has_copy_ = true;
-          ExprVisitor::VisitExpr_(call);
-          post_dfs_order_.push_back(std::make_pair(call, has_copy_));
+          dev_type_ = attrs->src_dev_type;
+          for (auto& arg : call->args) {
+            Visit(arg);
+            // restore the type for remaining arguments
+            dev_type_ = attrs->src_dev_type;
+          }
+          device_tag_[call] = attrs->dst_dev_type;
+          // update the out_dev_type_, which should be the dst_dev_type of last copy
+          out_dev_type_ = attrs->dst_dev_type;
           has_copy_ = has_copy_prev;
         } else {
-          ExprVisitor::VisitExpr_(call);
-          post_dfs_order_.push_back(std::make_pair(call, has_copy_));
+          for (auto& arg : call->args) {
+            int cur_dev_type = dev_type_;
+            Visit(arg);
+            // restore the type for remaining arguments
+            dev_type_ = cur_dev_type;
+          }
+          device_tag_[call] = dev_type_;
         }
       }
     }
@@ -414,22 +430,24 @@ class DeviceInfo {
     void VisitExpr_(const TupleGetItemNode* op) final { ExprVisitor::VisitExpr_(op); }
 
     void VisitExpr_(const VarNode* vn) final {
-      post_dfs_order_.push_back(std::make_pair(vn, has_copy_));
+      device_tag_[vn] = dev_type_;
     }
 
     void VisitExpr_(const LetNode* ln) final {
       ExprVisitor::VisitExpr_(ln);
-      post_dfs_order_.push_back(std::make_pair(ln, has_copy_));
+      device_tag_[ln] = dev_type_;
     }
 
     void VisitExpr_(const IfNode* in) final {
       ExprVisitor::VisitExpr_(in);
-      post_dfs_order_.push_back(std::make_pair(in, has_copy_));
+      device_tag_[in] = dev_type_;
     }
 
     int num_device_copy_ops_{0};
     bool has_copy_ = false;
-    std::vector<std::pair<const ExprNode*, bool>> post_dfs_order_;
+    int dev_type_ = -1;
+    int out_dev_type_ = -1;
+    std::unordered_map<const ExprNode*, int> device_tag_;
     friend DeviceInfo;
   };
 
@@ -455,39 +473,14 @@ class DeviceInfo {
   }
 
   void PropagateDeviceId() {
-    // Bottom-up propagation.
-    int out_dev_type = BottomUpPropagation();
-    // propagation for remained nodes.
-    FillPropagation(out_dev_type);
-  }
-
-  int BottomUpPropagation() {
-    const CallNode* last_copy_node = nullptr;
-    int cur_dev_type = -1;
-    int out_dev_type = -1;
-    for (auto it = post_visitor_.post_dfs_order_.crbegin();
-         it != post_visitor_.post_dfs_order_.crend(); ++it) {
-      if (const auto* node = GetDeviceCopyNode(it->first)) {
-        CHECK(node->IsInstance<CallNode>());
-        last_copy_node = static_cast<const CallNode*>(node);
-        const auto* attrs = last_copy_node->attrs.as<DeviceCopyAttrs>();
-        cur_dev_type = attrs->src_dev_type;
-        if (out_dev_type == -1) out_dev_type = attrs->dst_dev_type;
-        if (it->second) device_map_.Set(GetRef<Expr>(it->first), attrs->dst_dev_type);
-      } else if (last_copy_node) {
-        Expr expr = GetRef<Expr>(it->first);
-        CHECK_EQ(device_map_.count(expr), 0U);
-        if (it->second) device_map_.Set(expr, cur_dev_type);
+    int out_dev_type = post_visitor_.out_dev_type_;
+    for (auto& it : post_visitor_.device_tag_) {
+      if (it.second != -1) {
+        device_map_.Set(GetRef<Expr>(it.first), it.second);
+      } else {
+        device_map_.Set(GetRef<Expr>(it.first), out_dev_type);
       }
     }
-    return out_dev_type;
-  }
-
-  void FillPropagation(int out_dev_type) {
-    for (const auto& it : post_visitor_.post_dfs_order_) {
-      Expr expr = GetRef<Expr>(it.first);
-      if (!it.second) device_map_.Set(expr, out_dev_type);
-    }
   }
 
   PostDfsOrderVisitor post_visitor_;
@@ -541,7 +534,9 @@ Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) {
   }
 }
 
-Map<Expr, Integer> CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); }
+Map<Expr, Integer> CollectDeviceInfo(const Expr& expr) {
+  return DeviceInfo::GetDeviceMap(expr);
+}
 
 Map<Expr, Integer> CollectDeviceAnnotationOps(const Expr& expr) {
   return AnnotatationVisitor::GetAnnotations(expr);
diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc
index f07170489298..2991a0842b7a 100644
--- a/src/tir/transforms/lower_tvm_builtin.cc
+++ b/src/tir/transforms/lower_tvm_builtin.cc
@@ -87,16 +87,19 @@ class BuiltinLower : public StmtExprMutator {
     op = stmt.as<AllocateNode>();
     // Get constant allocation bound.
     int64_t nbytes = GetVectorBytes(op->dtype);
-    if (device_type_.defined()) {
-      if (const auto* dev_type = device_type_.as<IntImmNode>()) {
-        if (dev_type->value == kDLCPU) {
-          int32_t constant_size = op->constant_allocation_size();
-          if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) {
-            return stmt;
-          }
-        }
-      }
-    }
+    // NOTE(zhanghao): remove special handling for kDLCPU
+    // otherwise, may cause LLVM parameters match error
+    // if in heterogenous targets
+    // if (device_type_.defined()) {
+    //   if (arith::GetConst(device_type_, &dev_type)) {
+    //     if (dev_type == kDLCPU) {
+    //       int32_t constant_size = op->constant_allocation_size();
+    //       if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) {
+    //         return stmt;
+    //       }
+    //     }
+    //   }
+    // }
     PrimExpr total_bytes = make_const(op->extents[0].dtype(), nbytes);
     for (size_t i = 0; i < op->extents.size(); ++i) {
       total_bytes = total_bytes * op->extents[i];
diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py
index 08baaf7e29ad..1379a506d0b7 100644
--- a/tests/lint/check_file_type.py
+++ b/tests/lint/check_file_type.py
@@ -77,6 +77,8 @@
     "tokens",
     # interface definition
     "idl",
+    # opencl file
+    "cl",
     }
 
 # List of file names allowed
diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py
index 3a18cf74bc1d..ebd44194f659 100644
--- a/vta/python/vta/environment.py
+++ b/vta/python/vta/environment.py
@@ -62,11 +62,13 @@ class DevContext(object):
     MEM_ID_INP = 2
     MEM_ID_ACC = 3
     MEM_ID_OUT = 4
+    MEM_ID_ACC_8BIT = 5
     # VTA ALU Opcodes
     ALU_OPCODE_MIN = 0
     ALU_OPCODE_MAX = 1
     ALU_OPCODE_ADD = 2
     ALU_OPCODE_SHR = 3
+    ALU_OPCODE_MUL = 4
     # Task queue id (pipeline stage)
     QID_LOAD_INP = 1
     QID_LOAD_WGT = 1
@@ -240,7 +242,7 @@ def target_host(self):
             return "llvm -mtriple=armv7-none-linux-gnueabihf"
         if self.TARGET == "ultra96":
             return "llvm -mtriple=aarch64-linux-gnu"
-        if self.TARGET in ["sim", "tsim"]:
+        if self.TARGET in ["sim", "tsim", "intelfocl"]:
             return "llvm"
         raise ValueError("Unknown target %s" % self.TARGET)
 
@@ -294,7 +296,7 @@ def coproc_sync(op):
     return tvm.tir.call_extern(
         "int32", "VTASynchronize",
         get_env().dev.command_handle,
-        tvm.runtime.const(1<<31, dtype="uint32"))
+        tvm.runtime.const(1<<31, dtype="uint32"), True)
 
 
 
diff --git a/vta/python/vta/program_bitstream.py b/vta/python/vta/program_bitstream.py
index 62cb5f21d02a..9a48ba75378e 100644
--- a/vta/python/vta/program_bitstream.py
+++ b/vta/python/vta/program_bitstream.py
@@ -54,7 +54,13 @@ def de10nano_bitstream_program(bitstream_path):
     program = get_global_func("vta.de10nano.program")
     program(bitstream_path)
 
-def bitstream_program(target, bitstream):
+def intelfocl_bitstream_program(bitstream_path, mem_size=4*1024*1024*1024):
+    # pylint: disable=import-outside-toplevel
+    from tvm import get_global_func
+    program = get_global_func("vta.intelfocl.program")
+    program(bitstream_path, mem_size)
+
+def bitstream_program(target, bitstream, *args):
     if target in ['pynq', 'ultra96']:
         pynq_bitstream_program(bitstream)
     elif target in ['de10nano']:
@@ -62,6 +68,8 @@ def bitstream_program(target, bitstream):
     elif target in ['sim', 'tsim']:
         # In simulation, bit stream programming is a no-op
         return
+    elif target in ['intelfocl']:
+        intelfocl_bitstream_program(bitstream, *args)
     else:
         raise RuntimeError("Unknown target {}".format(target))
 
diff --git a/vta/python/vta/rpc_client.py b/vta/python/vta/rpc_client.py
index 097ea8e4a5cc..c76a8c77cb67 100644
--- a/vta/python/vta/rpc_client.py
+++ b/vta/python/vta/rpc_client.py
@@ -19,6 +19,8 @@
 
 from .environment import get_env
 from .bitstream import download_bitstream, get_bitstream_path
+from tvm import rpc
+from vta import program_bitstream
 
 def reconfig_runtime(remote):
     """Reconfigure remote runtime based on current hardware spec.
@@ -44,16 +46,20 @@ def program_fpga(remote, bitstream=None):
     bitstream : str, optional
         Path to a local bistream file. If unset, tries to download from cache server.
     """
+    env = get_env()
+
     if bitstream:
         assert os.path.isfile(bitstream)
     else:
         bitstream = get_bitstream_path()
         if not os.path.isfile(bitstream):
-            env = get_env()
             if env.TARGET == 'de10nano':
                 return
             download_bitstream()
 
-    fprogram = remote.get_function("tvm.contrib.vta.init")
-    remote.upload(bitstream)
-    fprogram(os.path.basename(bitstream))
+    if isinstance(remote, rpc.LocalSession):
+        program_bitstream.bitstream_program(env.TARGET, bitstream)
+    else:
+        fprogram = remote.get_function("tvm.contrib.vta.init")
+        remote.upload(bitstream)
+        fprogram(os.path.basename(bitstream))
diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py
index 16827c4ab079..5ac8c80fed8d 100644
--- a/vta/python/vta/testing/simulator.py
+++ b/vta/python/vta/testing/simulator.py
@@ -25,7 +25,7 @@ def _load_sw():
     """Load hardware library for simulator."""
 
     env = get_env()
-    lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim"
+    lib_driver_name = "libvta_tsim" if env.TARGET == "" else "libvta" if env.TARGET == "intelfocl" else "libvta_fsim"
 
     # Load driver library
     lib_driver = find_libvta(lib_driver_name, optional=True)
diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py
index afbf00ddac8c..83da2e157164 100644
--- a/vta/python/vta/testing/util.py
+++ b/vta/python/vta/testing/util.py
@@ -32,7 +32,7 @@ def run(run_func):
     """
     env = get_env()
 
-    if env.TARGET in ["sim", "tsim"]:
+    if env.TARGET in ["sim", "tsim", "intelfocl"]:
         # Talk to local RPC if necessary to debug RPC server.
         # Compile vta on your host with make at the root.
         # Make sure TARGET is set to "sim" in the config.json file.
diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py
index 633ef3f60c9b..255724afc809 100644
--- a/vta/python/vta/top/graphpack.py
+++ b/vta/python/vta/top/graphpack.py
@@ -93,7 +93,7 @@ def _weight_shape_match_transpose(data, dshape, channels, cfactor_out):
     if pad_width != 0:
         pad_width = cfactor_out - pad_width
         data = op.nn.pad(data, [[0, 0], [0, pad_width], [0, 0], [0, 0]])
-        dshape = tuple(dshape[0], [dshape[1] + pad_width, dshape[2], dshape[3]])
+        dshape = tuple([dshape[0]] + [dshape[1] + pad_width, dshape[2], dshape[3]])
 
     if channels_pad != 0:
         channels = channels + (cfactor_out - channels_pad)
@@ -174,6 +174,104 @@ def _operator_idx_inc(expr, count_meta, operator_current_idx):
         operator_current_idx = operator_current_idx + 1
     return operator_current_idx
 
+
+class ExprDeviceAnnot(ExprMutator):
+    """Visitor to perform graph annotation on an AST.
+
+    Parameters
+    ----------
+    start: int
+        the start location to mark run on vta (inclusive)
+    end: int
+        the end location to mark run on vta (exclusive)
+
+    Returns
+    ---------
+    None
+    """
+    def __init__(self, start=-1, end=-1):
+        self.ext_ctx = tvm.context("ext_dev")
+        self.cpu_ctx = tvm.context("cpu")
+        self.cast = op.op.get("cast")
+        self.counter = -1
+        self.start = start
+        self.end = end
+        super().__init__()
+
+    def visit_call(self, call):
+        """ Visit the children. """
+        # First visit the children.
+        oshape = _get_tensor_shape(call)
+        odtype = _get_tensor_type(call)
+        input_types = [arg.checked_type for arg in call.args]
+        args = [self.visit(arg) for arg in call.args]
+
+        self.counter += 1
+        if self.counter == self.start:
+            ret = relay.Call(call.op, args, call.attrs)
+            ret = relay.annotation.on_device(ret, self.ext_ctx)
+            return ret
+        elif self.counter == self.end:
+            ret = relay.Call(call.op, args, call.attrs)
+            ret = relay.annotation.on_device(ret, self.cpu_ctx)
+            return ret
+        elif self.counter > self.start and self.counter < self.end:
+            ret = relay.Call(call.op, args, call.attrs)
+
+            # skip the float op, i.e., float->int cast
+            if self.is_float_op(call):
+                return ret
+
+            return relay.annotation.on_device(ret, self.ext_ctx)
+
+        return relay.Call(self.visit(call.op), args, call.attrs)
+
+    def is_float_op(self, call):
+        """check if this op belongs to a float op
+        in general, float op's odtype is float;
+        a special case is float->int cast, which follow this op sequence:
+        multiply(float) -> round(float) -> clip(float) -> cast(int);
+        """
+        args = call.args
+        odtype = _get_tensor_type(call)
+        op = call.op
+
+        if odtype == "float32":
+            return True
+        elif op == self.cast:
+            idtype = _get_tensor_type(args[0])
+            if idtype == "float32":
+                return True
+
+        return False
+
+
+class ExprLocater(ExprMutator):
+    """Visitor to locate op on an AST.
+    """
+    def __init__(self):
+        self.counter = -1
+        self.op2nodes = {}
+        super().__init__()
+
+    def visit_call(self, call):
+        """ Visit the children. """
+        # First visit the children.
+        args = [self.visit(arg) for arg in call.args]
+
+        odtype = _get_tensor_type(call)
+        self.counter += 1
+        if (call.op, odtype) in self.op2nodes:
+            self.op2nodes[(call.op, odtype)].append(self.counter)
+        else:
+            self.op2nodes[(call.op, odtype)] = [self.counter]
+
+        return relay.Call(
+            self.visit(call.op),
+            args,
+            call.attrs)
+
+
 class ExprPack(ExprMutator):
     """Visitor to perform graph packing on an AST.
     """
@@ -317,7 +415,7 @@ def visit_call(self, call):
             elif self.start_pack and call.op == op.op.get('cast') and \
                     input_types[0].dtype == 'int32':
                 cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs)
-                return relay.Call(op.op.get('copy'), [cast])
+                return cast
             elif call.op == self.pad:
                 pad_width = call.attrs.pad_width
                 if len(pad_width) == 6:
@@ -412,7 +510,10 @@ def graph_pack(expr,
                stop_name="nn.global_avg_pool2d",
                start_name_idx=None,
                stop_name_idx=None,
-               count_meta=False):
+               count_meta=False,
+               device_annot=False,
+               annot_start_name="nn.conv2d",
+               annot_end_name="annotation.stop_fusion"):
     """Pack the graph into batch&channel packed format.
 
     Parameters
@@ -449,13 +550,23 @@ def graph_pack(expr,
         'expr.astext(show_meta_data=False)'. When count_meta is True, the operator increase
         logic would count the meta.
 
+    device_annot: boolean, optional
+        if we want to annoate the device_type
+
+    annot_start_name: str, optional
+        device annotation start node, from which we mark the nodes as `ext_dev`
+
+    annot_end_name: str, optional
+        device annotation end node, after which we mark the nodes as 'cpu'
+
     Returns
     -------
     expr : Expr
         The transformed expression.
     """
     assert isinstance(expr, relay.Function)
-    assert ((start_name != stop_name) or (start_name_idx < stop_name_idx))
+    assert ((start_name != stop_name) or (start_name_idx is None != stop_name_idx is None) or \
+            (not (start_name_idx is None and stop_name_idx is None)) or (start_name_idx < stop_name_idx))
     expr = get_subgraph(expr, start_name, stop_name, start_name_idx, stop_name_idx, count_meta)
     expr = run_opt_pass(expr, transform.InferType())
     packer = ExprPack(
@@ -463,4 +574,23 @@ def graph_pack(expr,
         weight_bits)
     expr = packer.visit(expr)
     assert not packer.start_pack
-    return run_opt_pass(expr, transform.InferType())
+    expr = run_opt_pass(expr, transform.InferType())
+
+    if device_annot:
+        expr_locator = ExprLocater()
+        expr_locator.visit(expr)
+
+        annot_start = op.op.get(annot_start_name)
+        start = expr_locator.op2nodes[(annot_start, "int32")][0]
+
+        annot_end = op.op.get(annot_end_name)
+        # we mark the next op to the last stop_fusion on cpu device
+        end = expr_locator.op2nodes[(annot_end, "int8")][-1] + 1
+
+        device_annot = ExprDeviceAnnot(start=start, end=end)
+        expr = device_annot.visit(expr)
+        ret = run_opt_pass(expr, transform.InferType())
+
+        return ret
+    else:
+        return expr
diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py
index 2198ed4c191f..dca42de95ffc 100644
--- a/vta/python/vta/top/op.py
+++ b/vta/python/vta/top/op.py
@@ -20,6 +20,7 @@
 
 import tvm
 from tvm import te
+from tvm import autotvm
 import topi
 
 from tvm.relay.op import op as reg
@@ -62,6 +63,141 @@ def clip_strategy_vta(attrs, inputs, out_type, target):
 
 reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta")
 
+
+@autotvm.register_topi_compute("add.vta")
+def add_packed(cfg, lhs, rhs):
+    return topi.add(lhs, rhs)
+
+
+@autotvm.register_topi_compute("multiply.vta")
+def multiply_packed(cfg, lhs, rhs):
+    return topi.multiply(lhs, rhs)
+
+
+def schedule_alu_packed(cfg, outs):
+    assert len(outs) == 1
+
+    def is_cast_op(op):
+        return op.name == 'T_cast'
+
+    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+    output = outs[0]
+    s = te.create_schedule([x.op for x in outs])
+    te.schedule.AutoInlineInjective(s)
+
+    env = get_env()
+    # other target does not support alu-only ops
+    if not (env.TARGET in ["sim", "tsim", "intelfocl"]):
+        return s
+
+    # only put the int-related ops to vta
+    if "int" in output.dtype and len(output.shape) == 6:
+        ewise_inputs = []
+        ewise_ops = []
+        const_ops = []
+
+        def _traverse(op):
+            if topi.tag.is_broadcast(op.tag):
+                if not op.same_as(output.op):
+                    if not op.axis:
+                        const_ops.append(op)
+                    elif not is_cast_op(op):
+                        ewise_ops.append(op)
+
+                for tensor in op.input_tensors:
+                    if isinstance(tensor.op, tvm.te.PlaceholderOp):
+                        ewise_inputs.append((op, tensor))
+                    elif is_cast_op(tensor.op) and not op.same_as(output.op):
+                        ewise_inputs.append((op, tensor))
+                    else:
+                        _traverse(tensor.op)
+            else:
+                for tensor in op.input_tensors:
+                    if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \
+                            and (not is_cast_op(tensor.op)):
+                        _traverse(tensor.op)
+
+        op = output.op
+        _traverse(op)
+        for _, t in ewise_inputs:
+            if t.dtype == 'float32':
+                return s
+
+        x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis
+
+        cfg.define_split('tile_co', x_co, num_outputs=2)
+        cfg.define_split('tile_h', x_i, num_outputs=2)
+        cfg.define_split('tile_w', x_j, num_outputs=2)
+
+        x_co_max = topi.util.get_const_int(x_bo.dom.extent)
+        x_i_max = topi.util.get_const_int(x_i.dom.extent)
+        x_j_max = topi.util.get_const_int(x_j.dom.extent)
+
+        x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co)
+        x_i0, x_i1 = cfg['tile_h'].apply(s, output, x_i)
+        x_j0, x_j1 = cfg['tile_w'].apply(s, output, x_j)
+        s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci)
+        store_pt = x_j0
+
+        for eo in ewise_ops:
+            s[eo].set_scope(env.acc_scope)
+            s[eo].pragma(s[eo].op.axis[0], env.alu)
+            s[eo].compute_at(s[output], store_pt)
+
+        # cache read input
+        cache_read_ewise = []
+        for consumer, tensor in ewise_inputs:
+            cache_read_ewise.append(
+                s.cache_read(tensor, env.acc_scope, [consumer]))
+
+        for tensor in cache_read_ewise:
+            if s[tensor].op.axis:
+                s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy)
+            s[tensor].compute_at(s[output], store_pt)
+
+        for op in const_ops:
+            s[op].compute_inline()
+
+        s[output].pragma(x_co1, env.dma_copy)
+
+    return s
+
+
+@autotvm.register_topi_schedule("add.vta")
+def schedule_add_packed(cfg, outs):
+    return schedule_alu_packed(cfg, outs)
+
+
+@autotvm.register_topi_schedule("multiply.vta")
+def schedule_multiply_packed(cfg, outs):
+    return schedule_alu_packed(cfg, outs)
+
+
+def add_strategy_vta(attrs, inputs, out_type, target):
+    strategy = OpStrategy()
+    strategy.add_implementation(
+        _strategy.wrap_topi_compute(add_packed),
+        _strategy.wrap_topi_schedule(schedule_add_packed),
+        name="add.vta")
+    return strategy
+
+
+def multiply_strategy_vta(attrs, inputs, out_type, target):
+    strategy = OpStrategy()
+    strategy.add_implementation(
+        _strategy.wrap_topi_compute(multiply_packed),
+        _strategy.wrap_topi_schedule(schedule_multiply_packed),
+        name="multiply.vta")
+    return strategy
+
+
+env = get_env()
+# other target does not support alu-only ops
+if env.TARGET in ["sim", "tsim", "intelfocl"]:
+    reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta")
+    reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta")
+
+
 @_strategy.conv2d_strategy.register("vta")
 def conv2d_strategy_vta(attrs, inputs, out_type, target):
     """conv2d vta strategy"""
diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py
index ddfebc2cc8c1..2153531c72d0 100644
--- a/vta/python/vta/top/vta_conv2d_transpose.py
+++ b/vta/python/vta/top/vta_conv2d_transpose.py
@@ -81,6 +81,7 @@ def schedule_conv2d_transpose_packed(cfg, outs):
     """Schedule packed conv2d_transpose"""
     assert len(outs) == 1
     output = outs[0]
+    const_ops = []
     ewise_inputs = []
     ewise_ops = []
     conv2d_res = []
@@ -90,7 +91,10 @@ def schedule_conv2d_transpose_packed(cfg, outs):
     def _traverse(op):
         if topi.tag.is_broadcast(op.tag):
             if not op.same_as(output.op):
-                ewise_ops.append(op)
+                if not op.axis:
+                    const_ops.append(op)
+                else:
+                    ewise_ops.append(op)
             for tensor in op.input_tensors:
                 if isinstance(tensor.op, tvm.te.PlaceholderOp):
                     ewise_inputs.append((op, tensor))
@@ -146,6 +150,9 @@ def _traverse(op):
         s[op].set_scope(env.acc_scope)
         s[op].pragma(s[op].op.axis[0], env.alu)
 
+    for op in const_ops:
+        s[op].compute_inline()
+
     # tile
     x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis
     x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co)
diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py
index d9f47f1f71ec..c6ba1b95a8cb 100644
--- a/vta/python/vta/transform.py
+++ b/vta/python/vta/transform.py
@@ -382,9 +382,6 @@ def _fold_buffer_dim(buf, scope, elem_block):
 
     def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold):
         elem_block = elem_bytes * 8 // elem_width
-        if buf.dtype != dtype:
-            raise RuntimeError("Expect buffer type to be %s instead of %s" %
-                               (dtype, buf.dtype))
         shape, strides = buf.shape, buf.strides
         if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0):
             raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block))
@@ -553,6 +550,11 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value):
                 src, elem_width, elem_bytes, data_type,
                 dst.scope, allow_fold=allow_fold)
 
+            if data_type != src.dtype:
+                assert(data_type == "int%d" % env.ACC_WIDTH and \
+                       src.dtype == "int%d" % env.INP_WIDTH)
+                mem_type = env.dev.MEM_ID_ACC_8BIT
+
             irb = tvm.tir.ir_builder.create()
             irb.scope_attr(env.dev.vta_axis, "coproc_scope",
                            env.dev.get_task_qid(task_qid))
diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc
index 49fe9c557336..df20a8e87ed7 100644
--- a/vta/runtime/runtime.cc
+++ b/vta/runtime/runtime.cc
@@ -36,6 +36,13 @@
 #include <cstring>
 #include <memory>
 #include <vector>
+#include <set>
+#include <stdlib.h>
+#include <malloc.h>
+
+#include <picojson.h>
+#include <thread>
+#include <mutex>
 
 namespace vta {
 
@@ -47,10 +54,105 @@ static const bool kBufferCoherent = VTA_COHERENT_ACCESSES;
 /*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */
 static const bool kAlwaysCache = true;
 
+template <typename T, std::size_t N = 64>
+class AlignmentAllocator {
+public:
+  typedef T value_type;
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+
+  typedef T * pointer;
+  typedef const T * const_pointer;
+
+  typedef T & reference;
+  typedef const T & const_reference;
+
+  public:
+  inline AlignmentAllocator () throw () { }
+
+  template <typename T2>
+  inline AlignmentAllocator (const AlignmentAllocator<T2, N> &) throw () { }
+
+  inline ~AlignmentAllocator () throw () { }
+
+  inline pointer adress (reference r) {
+    return &r;
+  }
+
+  inline const_pointer adress (const_reference r) const {
+    return &r;
+  }
+
+  inline pointer allocate (size_type n) {
+     return (pointer)memalign(N, n*sizeof(value_type));
+  }
+
+  inline void deallocate (pointer p, size_type) {
+    free(p);
+  }
+
+  inline void construct (pointer p, const value_type & wert) {
+     new (p) value_type (wert);
+  }
+
+  inline void destroy (pointer p) {
+    p->~value_type ();
+  }
+
+  inline size_type max_size () const throw () {
+    return size_type (-1) / sizeof (value_type);
+  }
+
+  template <typename T2>
+  struct rebind {
+    typedef AlignmentAllocator<T2, N> other;
+  };
+
+  bool operator!=(const AlignmentAllocator<T,N>& other) const  {
+    return !(*this == other);
+  }
+
+  // Returns true if and only if storage allocated from *this
+  // can be deallocated from other, and vice versa.
+  // Always returns true for stateless allocators.
+  bool operator==(const AlignmentAllocator<T,N>& other) const {
+    return true;
+  }
+};
+
+class DeviceAllocStat {
+ public:
+  void AddAlloc(const void* ptr) {
+    std::lock_guard<std::mutex> lock(mtx_);
+    allocated_.insert(ptr);
+  }
+
+  bool CheckAlloc(const void* ptr) {
+    std::lock_guard<std::mutex> lock(mtx_);
+    return allocated_.count(ptr);
+  }
+
+  void DelAlloc(const void* ptr) {
+    std::lock_guard<std::mutex> lock(mtx_);
+    allocated_.erase(ptr);
+  }
+
+ private:
+  std::set<const void*> allocated_;
+  std::mutex mtx_;
+};
+
+// here we use a global variable to memorize the allocation stats
+static std::shared_ptr<DeviceAllocStat> alloc_stat(new DeviceAllocStat());
+
 /*!
  * \brief Data buffer represents data on CMA.
  */
 struct DataBuffer {
+  DataBuffer() {
+    alloc_stat_ = alloc_stat;
+  }
+
   /*! \return Virtual address of the data. */
   void* virt_addr() const { return data_; }
   /*! \return Physical address of the data. */
@@ -101,6 +203,8 @@ struct DataBuffer {
     DataBuffer* buffer = new DataBuffer();
     buffer->data_ = data;
     buffer->phy_addr_ = VTAMemGetPhyAddr(data);
+
+    alloc_stat->AddAlloc(buffer);
     return buffer;
   }
   /*!
@@ -108,6 +212,7 @@ struct DataBuffer {
    * \param buffer The buffer to be freed.
    */
   static void Free(DataBuffer* buffer) {
+    alloc_stat->DelAlloc(buffer);
     VTAMemFree(buffer->data_);
     delete buffer;
   }
@@ -117,7 +222,12 @@ struct DataBuffer {
    * \return The corresponding data buffer header.
    */
   static DataBuffer* FromHandle(const void* buffer) {
-    return const_cast<DataBuffer*>(reinterpret_cast<const DataBuffer*>(buffer));
+    if (alloc_stat->CheckAlloc(buffer)) {
+      return const_cast<DataBuffer*>(
+          reinterpret_cast<const DataBuffer*>(buffer));
+    } else {
+      return nullptr;
+    }
   }
 
  private:
@@ -125,6 +235,10 @@ struct DataBuffer {
   void* data_;
   /*! \brief The physical address of the buffer, excluding header. */
   vta_phy_addr_t phy_addr_;
+
+  // a copy of global shared_ptr instance
+  // to avoid the global instance is destructed before there are still some pending DataBuffers not destructed
+  std::shared_ptr<DeviceAllocStat> alloc_stat_;
 };
 
 /*!
@@ -329,7 +443,7 @@ class BaseQueue {
   // End location of current SRAM write in FIFO mode
   uint32_t sram_end_{0};
   // The buffer in DRAM
-  std::vector<T> dram_buffer_;
+  std::vector<T, AlignmentAllocator<T, 64>> dram_buffer_;
   // FPGA accessible buffer
   void* fpga_buff_{NULL};
   // Physical address of the FPGA buffer
@@ -413,7 +527,6 @@ class UopQueue : public BaseQueue<VTAUop> {
       kernel->sram_begin_ = 0;
       kernel->sram_end_ = 0;
     }
-
     cache_.clear();
     cache_idx_ = 0;
     BaseQueue<VTAUop>::Reset();
@@ -429,14 +542,24 @@ class UopQueue : public BaseQueue<VTAUop> {
       buff_size += cache_[i]->size() * kElemBytes;
     }
     CHECK(buff_size <= kMaxBytes);
-    // Move kernel contents to FPGA readable buffer
+
+    // merge all the cache entries and do CopyFromHost once
+    uint32_t total_size = 0;
+    for (uint32_t i = 0; i < cache_.size(); ++i) {
+      uint32_t ksize = cache_[i]->size() * kElemBytes;
+      total_size += ksize;
+    }
+
+    char *lbuf = (char*)memalign(64, total_size);
     uint32_t offset = 0;
     for (uint32_t i = 0; i < cache_.size(); ++i) {
       uint32_t ksize = cache_[i]->size() * kElemBytes;
-      VTAMemCopyFromHost(static_cast<char*>(fpga_buff_) + offset, cache_[i]->data(), ksize);
-      // Update offset
+      memcpy(lbuf + offset, cache_[i]->data(), ksize);
       offset += ksize;
     }
+    VTAMemCopyFromHost(static_cast<char*>(fpga_buff_), lbuf, total_size);
+    free(lbuf);
+
     // Flush if we're using a shared memory system
     // and if interface is non-coherent
     if (!coherent_ && always_cache_) {
@@ -609,34 +732,107 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
     CommitPendingPop(kComputeStage);
   }
   // Helper function: Get Opcode string
-  const char* getOpcodeString(int opcode, bool use_imm) {
+  std::string getOpcodeString(int opcode, bool use_imm, int64_t imm) {
     // The string name
     if (opcode == VTA_ALU_OPCODE_MIN) {
-      if (use_imm) {
-        return "min imm";
-      } else {
-        return "min";
-      }
+        if (use_imm) {
+            return std::string("min imm ") + std::to_string(imm);
+        } else {
+            return "min";
+        }
     } else if (opcode == VTA_ALU_OPCODE_MAX) {
-      if (use_imm) {
-        return "max imm";
-      } else {
-        return "max";
-      }
+        if (use_imm) {
+            return (std::string("max imm ") + std::to_string(imm));
+        } else {
+            return "max";
+        }
     } else if (opcode == VTA_ALU_OPCODE_ADD) {
-      if (use_imm) {
-        return "add imm";
-      } else {
-        return "add";
-      }
+        if (use_imm) {
+            return (std::string("add imm ") + std::to_string(imm));
+        } else {
+            return "add";
+        }
     } else if (opcode == VTA_ALU_OPCODE_SHR) {
-      return "shr";
+        return (std::string("shr ") + std::to_string(imm));
+    } else if (opcode == VTA_ALU_OPCODE_MUL) {
+      return "mul";
     }
 
     return "unknown op";
   }
+
+  std::string GetOpName(const union VTAInsn& c) {
+    switch (c.mem.opcode) {
+      case VTA_OPCODE_LOAD:
+        if (c.mem.x_size == 0) {
+          if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
+            return "NOP-COMPUTE-STAGE";
+          } else {
+            return "NOP-MEMORY-STAGE";
+          }
+        } else {
+          if (c.mem.memory_type == VTA_MEM_ID_UOP) {
+            return "LOAD UOP";
+          } else if (c.mem.memory_type == VTA_MEM_ID_WGT) {
+            return "LOAD WGT";
+          } else if (c.mem.memory_type == VTA_MEM_ID_INP) {
+            return "LOAD INP";
+          } else if (c.mem.memory_type == VTA_MEM_ID_ACC) {
+            return "LOAD ACC";
+          } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8BIT) {
+            return "LOAD ACC 8BIT";
+          } else {
+            return "LOAD";
+          }
+        }
+      case VTA_OPCODE_STORE:
+        if (c.mem.x_size == 0) {
+          return "NOP-STORE-STAGE";
+        } else {
+          return "STORE";
+        }
+      case VTA_OPCODE_GEMM:
+        return "GEMM";
+      case VTA_OPCODE_ALU:
+        return "ALU - " + getOpcodeString(c.alu.alu_opcode, c.alu.use_imm, c.alu.imm);
+      case VTA_OPCODE_FINISH:
+        return "FINISH";
+      default:
+        return "Not recogonized";
+    }
+  }
+
+  std::string GetOpcodeName(const union VTAInsn& c) {
+    switch (c.mem.opcode) {
+      case VTA_OPCODE_LOAD:
+        if (c.mem.x_size == 0) {
+          return "NOP";
+        } else {
+          return "LOAD";
+        }
+      case VTA_OPCODE_STORE:
+        if (c.mem.x_size == 0) {
+          return "NOP";
+        } else {
+          return "STORE";
+        }
+      case VTA_OPCODE_GEMM:
+        return "GEMM";
+      case VTA_OPCODE_ALU:
+        if (c.alu.use_imm) {
+          return "ALU IMM";
+        } else {
+          return "ALU";
+        }
+      case VTA_OPCODE_FINISH:
+        return "NOP";
+      default:
+        return "Unknown";
+    }
+  }
+
   // Dump instructions in the queue
-  void DumpInsn() {
+  void DumpInsn(FILE* out = stderr, bool json = false) {
     // Keep tabs on dependence queues
     int l2g_queue = 0;
     int g2l_queue = 0;
@@ -647,97 +843,136 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
     // Iterate over all instructions
     int insn_count = count();
     const VTAGenericInsn* insn = data();
-    printf("There are %u instructions\n", insn_count);
+    picojson::array jarr;
+
+    if (!json) {
+      fprintf(out, "There are %u instructions\n", insn_count);
+    }
+
     for (int i = 0; i < insn_count; ++i) {
       // Fetch instruction and decode opcode
       c.generic = insn[i];
-      printf("INSTRUCTION %u: ", i);
+      picojson::object kv;
+      if (json) {
+        kv["name"] = picojson::value(GetOpName(c).c_str());
+        kv["type"] = picojson::value(GetOpcodeName(c).c_str());
+        kv["pop_prev"] = picojson::value(static_cast<int64_t>(c.mem.pop_prev_dep));
+        kv["pop_next"] = picojson::value(static_cast<int64_t>(c.mem.pop_next_dep));
+        kv["push_prev"] = picojson::value(static_cast<int64_t>(c.mem.push_prev_dep));
+        kv["push_next"] = picojson::value(static_cast<int64_t>(c.mem.push_next_dep));
+      } else {
+        fprintf(out, "INSTRUCTION %u: ", i);
+        fprintf(out, "%s\n", GetOpName(c).c_str());
+
+        fprintf(out, "\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
+               static_cast<int>(c.mem.pop_prev_dep),
+               static_cast<int>(c.mem.pop_next_dep),
+               static_cast<int>(c.mem.push_prev_dep),
+               static_cast<int>(c.mem.push_next_dep));
+      }
+
       if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
-        if (c.mem.x_size == 0) {
-          if (c.mem.opcode == VTA_OPCODE_STORE) {
-            printf("NOP-STORE-STAGE\n");
-          } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
-            printf("NOP-COMPUTE-STAGE\n");
-          } else {
-            printf("NOP-MEMORY-STAGE\n");
-          }
-          printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-                 static_cast<int>(c.mem.pop_prev_dep), static_cast<int>(c.mem.pop_next_dep),
-                 static_cast<int>(c.mem.push_prev_dep), static_cast<int>(c.mem.push_next_dep));
-          // Count status in queues
-          if (c.mem.opcode == VTA_OPCODE_STORE) {
-            CHECK(c.mem.pop_next_dep == false);
-            CHECK(c.mem.push_next_dep == false);
-            if (c.mem.pop_prev_dep) g2s_queue--;
-            if (c.mem.push_prev_dep) s2g_queue++;
-          } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
-                     (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) {
-            CHECK(c.mem.pop_prev_dep == false);
-            CHECK(c.mem.push_prev_dep == false);
-            if (c.mem.pop_next_dep) g2l_queue--;
-            if (c.mem.push_next_dep) l2g_queue++;
-          } else {
-            if (c.mem.pop_prev_dep) l2g_queue--;
-            if (c.mem.push_prev_dep) g2l_queue++;
-            if (c.mem.pop_next_dep) s2g_queue--;
-            if (c.mem.push_next_dep) g2s_queue++;
-          }
-          printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
-          printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
-          continue;
-        }
-        // Print instruction field information
-        if (c.mem.opcode == VTA_OPCODE_LOAD) {
-          printf("LOAD ");
-          if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
-          if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
-          if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
-          if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
-        }
-        if (c.mem.opcode == VTA_OPCODE_STORE) {
-          printf("STORE:\n");
+        if (json) {
+          kv["dram"] = picojson::value(static_cast<int64_t>(c.mem.dram_base));
+          kv["sram"] = picojson::value(static_cast<int64_t>(c.mem.sram_base));
+
+          picojson::array arr;
+          arr.push_back(picojson::value(static_cast<int64_t>(c.mem.y_size)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.mem.y_pad_0)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.mem.y_pad_1)));
+          kv["y"] = picojson::value(arr);
+
+          arr.clear();
+          arr.push_back(picojson::value(static_cast<int64_t>(c.mem.x_size)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.mem.x_pad_0)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.mem.x_pad_1)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.mem.x_stride)));
+          kv["x"] = picojson::value(arr);
+        } else {
+          fprintf(out, "\tDRAM: 0x%08x, SRAM:0x%04x\n",
+                 static_cast<int>(c.mem.dram_base),
+                 static_cast<int>(c.mem.sram_base));
+          fprintf(out, "\ty: size=%d, pad=[%d, %d]\n",
+                 static_cast<int>(c.mem.y_size),
+                 static_cast<int>(c.mem.y_pad_0),
+                 static_cast<int>(c.mem.y_pad_1));
+          fprintf(out, "\tx: size=%d, stride=%d, pad=[%d, %d]\n",
+                 static_cast<int>(c.mem.x_size),
+                 static_cast<int>(c.mem.x_stride),
+                 static_cast<int>(c.mem.x_pad_0),
+                 static_cast<int>(c.mem.x_pad_1));
         }
-        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-               static_cast<int>(c.mem.pop_prev_dep), static_cast<int>(c.mem.pop_next_dep),
-               static_cast<int>(c.mem.push_prev_dep), static_cast<int>(c.mem.push_next_dep));
-        printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", static_cast<int>(c.mem.dram_base),
-               static_cast<int>(c.mem.sram_base));
-        printf("\ty: size=%d, pad=[%d, %d]\n", static_cast<int>(c.mem.y_size),
-               static_cast<int>(c.mem.y_pad_0), static_cast<int>(c.mem.y_pad_1));
-        printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", static_cast<int>(c.mem.x_size),
-               static_cast<int>(c.mem.x_stride), static_cast<int>(c.mem.x_pad_0),
-               static_cast<int>(c.mem.x_pad_1));
       } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
-        // Print instruction field information
-        printf("GEMM\n");
-
-        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-               static_cast<int>(c.mem.pop_prev_dep), static_cast<int>(c.mem.pop_next_dep),
-               static_cast<int>(c.mem.push_prev_dep), static_cast<int>(c.mem.push_next_dep));
-        printf("\treset_out: %d\n", static_cast<int>(c.gemm.reset_reg));
-        printf("\trange (%d, %d)\n", static_cast<int>(c.gemm.uop_bgn),
-               static_cast<int>(c.gemm.uop_end));
-        printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
-               static_cast<int>(c.gemm.iter_out), static_cast<int>(c.gemm.wgt_factor_out),
-               static_cast<int>(c.gemm.src_factor_out), static_cast<int>(c.gemm.dst_factor_out));
-        printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
-               static_cast<int>(c.gemm.iter_in), static_cast<int>(c.gemm.wgt_factor_in),
-               static_cast<int>(c.gemm.src_factor_in), static_cast<int>(c.gemm.dst_factor_in));
+        if (json) {
+          kv["reset_out"] = picojson::value(static_cast<int64_t>(c.gemm.reset_reg));
+
+          picojson::array arr;
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.uop_bgn)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.uop_end)));
+          kv["range"] = picojson::value(arr);
+
+          arr.clear();
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.iter_out)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.wgt_factor_out)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.src_factor_out)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.dst_factor_out)));
+          kv["outer_loop"] = picojson::value(arr);
+
+          arr.clear();
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.iter_in)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.wgt_factor_in)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.src_factor_in)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.gemm.dst_factor_in)));
+          kv["inner_loop"] = picojson::value(arr);
+        } else {
+          fprintf(out, "\treset_out: %d\n", static_cast<int>(c.gemm.reset_reg));
+          fprintf(out, "\trange (%d, %d)\n",
+                 static_cast<int>(c.gemm.uop_bgn),
+                 static_cast<int>(c.gemm.uop_end));
+          fprintf(out, "\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
+                 static_cast<int>(c.gemm.iter_out),
+                 static_cast<int>(c.gemm.wgt_factor_out),
+                 static_cast<int>(c.gemm.src_factor_out),
+                 static_cast<int>(c.gemm.dst_factor_out));
+          fprintf(out, "\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n",
+                 static_cast<int>(c.gemm.iter_in),
+                 static_cast<int>(c.gemm.wgt_factor_in),
+                 static_cast<int>(c.gemm.src_factor_in),
+                 static_cast<int>(c.gemm.dst_factor_in));
+        }
       } else if (c.mem.opcode == VTA_OPCODE_ALU) {
-        // Print instruction field information
-        printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
-        printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-               static_cast<int>(c.mem.pop_prev_dep), static_cast<int>(c.mem.pop_next_dep),
-               static_cast<int>(c.mem.push_prev_dep), static_cast<int>(c.mem.push_next_dep));
-        printf("\treset_out: %d\n", static_cast<int>(c.alu.reset_reg));
-        printf("\trange (%d, %d)\n", static_cast<int>(c.alu.uop_bgn),
-               static_cast<int>(c.alu.uop_end));
-        printf("\touter loop - iter: %d, dst: %d, src: %d\n", static_cast<int>(c.alu.iter_out),
-               static_cast<int>(c.alu.dst_factor_out), static_cast<int>(c.alu.src_factor_out));
-        printf("\tinner loop - iter: %d, dst: %d, src: %d\n", static_cast<int>(c.alu.iter_in),
-               static_cast<int>(c.alu.dst_factor_in), static_cast<int>(c.alu.src_factor_in));
-      } else if (c.mem.opcode == VTA_OPCODE_FINISH) {
-        printf("FINISH\n");
+        if (json) {
+          kv["reset_out"] = picojson::value(static_cast<int64_t>(c.alu.reset_reg));
+          picojson::array arr;
+          arr.push_back(picojson::value(static_cast<int64_t>(c.alu.uop_bgn)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.alu.uop_end)));
+          kv["range"] = picojson::value(arr);
+
+          arr.clear();
+          arr.push_back(picojson::value(static_cast<int64_t>(c.alu.iter_out)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.alu.dst_factor_out)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.alu.src_factor_out)));
+          kv["outer_loop"] = picojson::value(arr);
+
+          arr.clear();
+          arr.push_back(picojson::value(static_cast<int64_t>(c.alu.iter_in)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.alu.dst_factor_in)));
+          arr.push_back(picojson::value(static_cast<int64_t>(c.alu.src_factor_in)));
+          kv["inner_loop"] = picojson::value(arr);
+        } else {
+          fprintf(out, "\treset_out: %d\n", static_cast<int>(c.alu.reset_reg));
+          fprintf(out, "\trange (%d, %d)\n",
+                 static_cast<int>(c.alu.uop_bgn),
+                 static_cast<int>(c.alu.uop_end));
+          fprintf(out, "\touter loop - iter: %d, dst: %d, src: %d\n",
+                 static_cast<int>(c.alu.iter_out),
+                 static_cast<int>(c.alu.dst_factor_out),
+                 static_cast<int>(c.alu.src_factor_out));
+          fprintf(out, "\tinner loop - iter: %d, dst: %d, src: %d\n",
+                 static_cast<int>(c.alu.iter_in),
+                 static_cast<int>(c.alu.dst_factor_in),
+                 static_cast<int>(c.alu.src_factor_in));
+        }
       }
 
       // Count status in queues
@@ -766,8 +1001,22 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
         if (c.gemm.pop_next_dep) s2g_queue--;
         if (c.gemm.push_next_dep) g2s_queue++;
       }
-      printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
-      printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
+      if (json) {
+        kv["l2g_queue"] = picojson::value(static_cast<int64_t>(l2g_queue));
+        kv["g2l_queue"] = picojson::value(static_cast<int64_t>(g2l_queue));
+        kv["s2g_queue"] = picojson::value(static_cast<int64_t>(s2g_queue));
+        kv["g2s_queue"] = picojson::value(static_cast<int64_t>(g2s_queue));
+
+        jarr.push_back(picojson::value(kv));
+      } else {
+        fprintf(out, "\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
+        fprintf(out, "\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
+      }
+    }
+
+    if (json) {
+      auto str = picojson::value(jarr).serialize();
+      fwrite(str.c_str(), 1, str.size(), out);
     }
   }
   // Commit all pending pop of corresponding stage
@@ -799,6 +1048,7 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
     CHECK(fpga_buff_ != nullptr);
     CHECK(fpga_buff_phy_);
     uint32_t buff_size = dram_buffer_.size() * elem_bytes_;
+
     CHECK(buff_size <= kMaxBytes);
     // Copy contents of DRAM buffer to FPGA buff
     VTAMemCopyFromHost(fpga_buff_, dram_buffer_.data(), buff_size);
@@ -830,7 +1080,7 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
   }
   // Get stage of the memory
   static PipelineStage GetMemPipelineStage(int memory_type) {
-    if (memory_type == VTA_MEM_ID_ACC) return kComputeStage;
+    if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage;
     if (memory_type == VTA_MEM_ID_UOP) return kComputeStage;
     return kLoadStage;
   }
@@ -840,7 +1090,7 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
     if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage;
     if (insn->opcode == VTA_OPCODE_LOAD) {
       if (insn->x_size == 0) return kNoneStage;
-      if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage;
+      if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage;
       if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage;
       return kLoadStage;
     }
@@ -888,6 +1138,8 @@ class InsnQueue : public BaseQueue<VTAGenericInsn> {
   int pending_pop_next_[4];
   static constexpr int kElemBytes = sizeof(VTAGenericInsn);
   static constexpr int kMaxElems = kMaxBytes / kElemBytes;
+
+  friend class CommandQueue;
 };
 
 /*!
@@ -923,6 +1175,9 @@ class CommandQueue {
       case VTA_MEM_ID_OUT:
         elem_bytes = VTA_OUT_ELEM_BYTES;
         break;
+      case VTA_MEM_ID_ACC_8BIT:
+        elem_bytes = VTA_ACC_ELEM_BYTES / 4;
+        break;
       default:
         LOG(FATAL) << "Memory id not recognized:" << memory_id;
         break;
@@ -992,7 +1247,35 @@ class CommandQueue {
     }
   }
 
-  void Synchronize(uint32_t wait_cycles) {
+  void Synchronize(uint32_t wait_cycles, bool skip=true) {
+    if (debug_flag_ & VTA_DEBUG_LOG_INSN) {
+      const char* insn_file = std::getenv("TVM_INSN_DUMP_FILE");
+      if (insn_file == nullptr) {
+        insn_file = "insn.json";
+      }
+      FILE* out = fopen(insn_file, "w+");
+      if (out) {
+        insn_queue_.DumpInsn(out, true);
+        fclose(out);
+      } else {
+        LOG(ERROR) << insn_file << " open failed";
+      }
+      return;
+    }
+
+    // FIXME(zhanghao): It is required to use force_serial
+    // by using skip and sync at the final layer.
+    // By doing this, we can avoid do DeviceCopy every time.
+    // TODO: Consider to make it as a flag when mature
+    const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL");
+    if (sync_once && skip) {
+      if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) {
+        LOG(ERROR) <<
+            "Synchronizing all in one round requires to use force_serial to make things right";
+      }
+      return;
+    }
+
     // Insert dependences to force serialization
     if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) {
       insn_queue_.RewriteForceSerial();
@@ -1022,7 +1305,7 @@ class CommandQueue {
           VTA_OPCODE_FINISH);
 
     // Make sure that we don't exceed contiguous physical memory limits
-    CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER);
+    CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) <= VTA_MAX_XFER);
     int timeout =
         VTADeviceRun(device_, insn_queue_.dram_phy_addr(), insn_queue_.count(), wait_cycles);
     CHECK_EQ(timeout, 0);
@@ -1170,13 +1453,13 @@ class CommandQueue {
 
   void CheckInsnOverFlow() {
     // At each API call, we can at most commit:
-    // one pending store, one pending load, and one uop
-    if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) {
+    // at most: 2 NOP-COMPUTE-STAGE -> 2 NOP-MEMORY-STAGE -> 1 NOP-COMPUTE-STAGE -> 1 FINISH
+    if ((insn_queue_.count() + 6) * sizeof(VTAGenericInsn) > VTA_MAX_XFER) {
       this->AutoSync();
     }
   }
   // Auto sync when instruction overflow
-  void AutoSync() { this->Synchronize(1 << 31); }
+  void AutoSync() { this->Synchronize(1 << 31, false); }
 
   // Internal debug flag
   int debug_flag_{0};
@@ -1212,6 +1495,9 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off
 
   if (from_buffer) {
     // This is an FPGA to host mem transfer
+    // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly
+    const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL");
+    if (sync_once) VTASynchronize(VTATLSCommandHandle(), 1<<31, false);
     from_buffer->InvalidateCache(from_offset, size);
     from_buffer->MemCopyToHost(static_cast<char*>(to) + to_offset,
                                static_cast<const char*>(from) + from_offset, size);
@@ -1232,7 +1518,12 @@ void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) {
 }
 
 void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) {
-  return vta::DataBuffer::FromHandle(buffer)->virt_addr();
+  auto data_buf = vta::DataBuffer::FromHandle(buffer);
+  if (data_buf) {
+    return data_buf->virt_addr();
+  } else {  // it is a raw ptr allocated by CPU
+    return buffer;
+  }
 }
 
 void VTAWriteBarrier(VTACommandHandle cmd, void* buffer, uint32_t elem_bits, uint32_t start,
@@ -1295,6 +1586,5 @@ int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) {
   return 0;
 }
 
-void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) {
-  static_cast<vta::CommandQueue*>(cmd)->Synchronize(wait_cycles);
-}
+void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip) {
+  static_cast<vta::CommandQueue*>(cmd)->Synchronize(wait_cycles, skip); }
diff --git a/vta/runtime/runtime.h b/vta/runtime/runtime.h
index 24ebb8e1247b..a61906e98ff6 100644
--- a/vta/runtime/runtime.h
+++ b/vta/runtime/runtime.h
@@ -41,6 +41,7 @@ extern "C" {
 #define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3)
 #define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4)
 #define VTA_DEBUG_FORCE_SERIAL (1 << 5)
+#define VTA_DEBUG_LOG_INSN (1 << 6)
 
 /*!
  * \brief Allocate data buffer.
@@ -251,7 +252,7 @@ TVM_DLL int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
  * \param wait_cycles The limit of poll cycles.
  *
  */
-TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
+TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip=true);
 
 #ifdef __cplusplus
 }
diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
index b3c36e85d56b..1d940c2ac9be 100644
--- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
@@ -240,7 +240,7 @@ def test_conv2d(device):
     def _run(env, remote):
         if device == "vta":
             target = env.target
-            if env.TARGET not in ["sim", "tsim"]:
+            if env.TARGET not in ["sim", "tsim", "intelfocl"]:
                 assert tvm.runtime.enabled("rpc")
                 program_fpga(remote, bitstream=None)
                 reconfig_runtime(remote)
diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py
new file mode 100644
index 000000000000..cf4922450ce5
--- /dev/null
+++ b/vta/tutorials/autotvm/tune_alu_vta.py
@@ -0,0 +1,318 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-tuning a ALU fused op on VTA
+"""
+
+import os
+from mxnet.gluon.model_zoo import vision
+import numpy as np
+from PIL import Image
+
+import topi
+import tvm
+from tvm import te
+from tvm import rpc, autotvm, relay
+from tvm.contrib import graph_runtime, util, download
+from tvm.autotvm.measure.measure_methods import request_remote
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.autotvm import record
+
+import vta
+from vta.testing import simulator
+from vta.top import graph_pack
+import copy
+
+
+#################################################################
+# Compile network
+# ---------------
+# Perform vta-specific compilation with Relay from a Gluon model
+def compile_network(env, target, model, start_pack, stop_pack, device_annot=False):
+
+    # Populate the shape and data type dictionary
+    dtype_dict = {"data": 'float32'}
+    shape_dict = {"data": (env.BATCH, 3, 224, 224)}
+
+    # Get off the shelf gluon model, and convert to relay
+    gluon_model = vision.get_model(model, pretrained=True)
+    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)
+
+    # Update shape and type dictionary
+    shape_dict.update({k: v.shape for k, v in params.items()})
+    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
+
+    # Perform quantization in Relay
+    # Note: We set opt_level to 3 in order to fold batch norm
+    with relay.build_config(opt_level=3):
+        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
+            mod = relay.quantize.quantize(mod, params=params)
+
+    # Perform graph packing and constant folding for VTA target
+    if target.device_name == "vta":
+        assert env.BLOCK_IN == env.BLOCK_OUT
+        relay_prog = graph_pack(mod["main"],
+                                env.BATCH,
+                                env.BLOCK_OUT,
+                                env.WGT_WIDTH,
+                                start_name=start_pack,
+                                stop_name=stop_pack,
+                                device_annot=device_annot)
+
+    return relay_prog, params
+
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations.
+# Here we use an Pynq-Z1 board as an example.
+
+# Tracker host and port can be set by your environment
+tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0')
+tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
+
+# Load VTA parameters from the vta/config/vta_config.json file
+env = vta.get_env()
+
+# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+# Set ``device=arm_cpu`` to run inference on the CPU
+# or ``device=vta`` to run inference on the FPGA.
+device = "vta"
+target = env.target if device == "vta" else env.target_vta_cpu
+
+# Name of Gluon model to compile
+# The ``start_pack`` and ``stop_pack`` labels indicate where
+# to start and end the graph packing relay pass: in other words
+# where to start and finish offloading to VTA.
+network = "resnet50_v2"
+start_pack = "nn.max_pool2d"
+stop_pack = "nn.global_avg_pool2d"
+
+# Tuning option
+log_file = "%s.alu.%s.log" % (device, network)
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'random',
+    'n_trial': 1000,
+    'early_stopping': None,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(n_parallel=1),
+        runner=autotvm.RPCRunner(env.TARGET,
+                                 host=tracker_host,
+                                 port=tracker_port,
+                                 number=5,
+                                 timeout=60,
+                                 check_correctness=True),
+    ),
+}
+
+
+def log_to_file(file_out, protocol='json'):
+    """Log the tuning records into file.
+    The rows of the log are stored in the format of autotvm.record.encode.
+    for lhs == rhs, we add an extra rhs = [] record
+
+    Parameters
+    ----------
+    file_out : str
+        The file to log to.
+    protocol: str, optional
+        The log protocol. Can be 'json' or 'pickle'
+
+    Returns
+    -------
+    callback : callable
+        Callback function to do the logging.
+    """
+    def _callback(_, inputs, results):
+        with open(file_out, "a") as f:
+            for inp, result in zip(inputs, results):
+                f.write(record.encode(inp, result, protocol) + "\n")
+
+                # we only consider task with same lhs and rhs
+                if inp.task.args[0] == inp.task.args[1]:
+                    args = list(inp.task.args)
+                    args[1] = (args[0][0], (), args[0][2])
+                    inp_copy = copy.deepcopy(inp)
+                    inp_copy.task.args = tuple(args)
+                    f.write(record.encode(inp_copy, result, protocol) + "\n")
+
+    return _callback
+
+
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=10,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True):
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'xgb_knob':
+            tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        tsk_trial = min(n_trial, len(tsk.config_space))
+        tuner_obj.tune(n_trial=tsk_trial,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
+                           log_to_file(tmp_log_file)
+                       ])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Register VTA-specific tuning tasks
+def register_vta_tuning_tasks():
+    from tvm.autotvm.task import TaskExtractEnv
+
+    @tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
+    def my_clip(x, a_min, a_max):
+        """Unlike topi's current clip, put min and max into two stages."""
+        const_min = tvm.tir.const(a_min, x.dtype)
+        const_max = tvm.tir.const(a_max, x.dtype)
+        x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
+        x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
+        return x
+
+    # init autotvm env to register VTA operator
+    TaskExtractEnv()
+
+    @autotvm.template("add.vta")
+    def _topi_add(*args, **kwargs):
+        assert not kwargs, "Do not support kwargs in template function call"
+        A, B = args[:2]
+
+        with tvm.target.vta():
+            res = vta.top.op.add_packed(*args, **kwargs)
+            res = my_clip(res, 0, 127)
+            res = topi.cast(res, "int8")
+
+        if tvm.target.Target.current().device_name == 'vta':
+            s = vta.top.op.schedule_add_packed([res])
+        else:
+            s = te.create_schedule([res.op])
+        return s, [A, B, res]
+
+    @autotvm.template("multiply.vta")
+    def _topi_multiply(*args, **kwargs):
+        assert not kwargs, "Do not support kwargs in template function call"
+        A, B = args[:2]
+
+        with tvm.target.vta():
+            res = vta.top.op.multiply_packed(*args, **kwargs)
+            res = my_clip(res, 0, 127)
+            res = topi.cast(res, "int8")
+
+        if tvm.target.Target.current().device_name == 'vta':
+            s = vta.top.op.schedule_multiply_packed([res])
+        else:
+            s = te.create_schedule([res.op])
+        return s, [A, B, res]
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+def tune_and_evaluate(tuning_opt):
+
+    if env.TARGET != "sim":
+        # Get remote from fleet node
+        remote = autotvm.measure.request_remote(env.TARGET,
+                                                tracker_host,
+                                                tracker_port,
+                                                timeout=10000)
+        # Reconfigure the JIT runtime and FPGA.
+        vta.reconfig_runtime(remote)
+        vta.program_fpga(remote, bitstream=None)
+    else:
+        # In simulation mode, host the RPC server locally.
+        remote = rpc.LocalSession()
+
+    # Register VTA tuning tasks
+    register_vta_tuning_tasks()
+
+    # Perform task extraction on Relay program
+    print("Extract tasks...")
+    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
+    mod = tvm.IRModule.from_expr(relay_prog)
+    tasks = autotvm.task.extract_from_program(mod,
+                                              params=params,
+                                              ops=(relay.op.get("add"), relay.op.get("multiply"),),
+                                              target=target,
+                                              target_host=env.target_host)
+
+    # filter out non-packed alu task
+    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))
+    # filter out float alu task
+    tasks = list(filter(lambda t: t.args[0][2] != "float32", tasks))
+
+    # We should have extracted 10 convolution tasks
+    tasks_set = {}
+    print("Extracted {} alu tasks:".format(len(tasks)))
+    for tsk in tasks:
+        print("tsk = ", tsk)
+
+        if len(tsk.args[1][1]) == 0:
+            args = list(tsk.args)
+            args[1] = args[0]
+            tsk.args = tuple(args)
+
+        if (tsk.name, tsk.args) in tasks_set:
+            print("task {} already exists".format(tsk))
+        tasks_set[(tsk.name, tsk.args)] = tsk
+
+    tasks = list(tasks_set.values())
+    print("After merged, final #tasks={}, tasks = {}".format(len(tasks), tasks))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+
+# Run the tuning and evaluate the results
+tune_and_evaluate(tuning_option)
diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py
index a92b1ee5d90b..9ae54cba0992 100644
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -208,7 +208,7 @@ def compile_network(env, target, model, start_pack, stop_pack):
     'early_stopping': None,
 
     'measure_option': autotvm.measure_option(
-        builder=autotvm.LocalBuilder(),
+        builder=autotvm.LocalBuilder(n_parallel=1),
         runner=autotvm.RPCRunner(env.TARGET,
                                  host=tracker_host,
                                  port=tracker_port,
diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py
index 3a367851ed25..a9676a0096e8 100644
--- a/vta/tutorials/frontend/deploy_classification.py
+++ b/vta/tutorials/frontend/deploy_classification.py
@@ -99,7 +99,7 @@
 # When target is 'pynq', reconfigure FPGA and runtime.
 # Otherwise, if target is 'sim', execute locally.
 
-if env.TARGET not in ["sim", "tsim"]:
+if env.TARGET not in ["sim", "tsim", "intelfocl"]:
 
     # Get remote from tracker node if environment variable is set.
     # To set up the tracker, you'll need to follow the "Auto-tuning
@@ -129,6 +129,10 @@
 else:
     remote = rpc.LocalSession()
 
+    if env.TARGET in ["intelfocl"]:
+        # program intelfocl aocx
+        vta.program_fpga(remote, bitstream="vta_opencl.aocx")
+
 # Get execution context from remote
 ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
 
@@ -183,7 +187,7 @@
                 env.BLOCK_OUT,
                 env.WGT_WIDTH,
                 start_name=pack_dict[model][0],
-                stop_name=pack_dict[model][1])
+                stop_name=pack_dict[model][1], device_annot=(env.TARGET == "intelfocl" or env.TARGET == "sim"))
     else:
         relay_prog = mod["main"]
 
@@ -194,6 +198,12 @@
                 relay_prog, target=target,
                 params=params, target_host=env.target_host)
     else:
+        if env.TARGET == "intelfocl" or env.TARGET == "sim":
+            # multiple targets to run both on cpu and vta
+            target = {
+                "cpu": env.target_vta_cpu,
+                "ext_dev": target
+            }
         with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
             graph, lib, params = relay.build(
                 relay_prog, target=target,
@@ -209,8 +219,13 @@
     remote.upload(temp.relpath("graphlib.o"))
     lib = remote.load_module("graphlib.o")
 
-    # Graph runtime
-    m = graph_runtime.create(graph, lib, ctx)
+
+    if env.TARGET == "intelfocl" or env.TARGET == "sim":
+        ctxes = [remote.ext_dev(0), remote.cpu(0)]
+        m = graph_runtime.create(graph, lib, ctxes)
+    else:
+        # Graph runtime
+        m = graph_runtime.create(graph, lib, ctx)
 
 ######################################################################
 # Perform image classification inference
diff --git a/vta/tutorials/vta_get_started.py b/vta/tutorials/vta_get_started.py
index ab416874b71b..8ac7307f5a05 100644
--- a/vta/tutorials/vta_get_started.py
+++ b/vta/tutorials/vta_get_started.py
@@ -91,7 +91,7 @@
     vta.program_fpga(remote, bitstream=None)
 
 # In simulation mode, host the RPC server locally.
-elif env.TARGET == "sim":
+elif env.TARGET in ("sim", "tsim", "intelfocl"):
     remote = rpc.LocalSession()
 
 ######################################################################