From 004b759f179a2049c7778f57f349c7f8eff50a96 Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Wed, 4 Mar 2020 16:51:02 +0800 Subject: [PATCH 01/44] Added support of Intel OpenCL for FPGA devices --- cmake/modules/VTA.cmake | 8 + vta/python/vta/environment.py | 2 +- vta/python/vta/testing/simulator.py | 2 +- vta/runtime/runtime.cc | 1 - vta/src/intelfocl/AOCLUtils/aocl_utils.h | 32 ++ vta/src/intelfocl/AOCLUtils/opencl.cpp | 555 ++++++++++++++++++++++ vta/src/intelfocl/AOCLUtils/opencl.h | 122 +++++ vta/src/intelfocl/AOCLUtils/options.cpp | 105 ++++ vta/src/intelfocl/AOCLUtils/options.h | 137 ++++++ vta/src/intelfocl/AOCLUtils/scoped_ptrs.h | 165 +++++++ vta/src/intelfocl/intelfocl_device.cc | 181 +++++++ vta/src/intelfocl/intelfocl_device.h | 53 +++ vta/src/intelfocl/intelfocl_driver.cc | 74 +++ vta/src/pynq/pynq_driver.cc | 167 +++++++ 14 files changed, 1601 insertions(+), 3 deletions(-) create mode 100644 vta/src/intelfocl/AOCLUtils/aocl_utils.h create mode 100644 vta/src/intelfocl/AOCLUtils/opencl.cpp create mode 100644 vta/src/intelfocl/AOCLUtils/opencl.h create mode 100644 vta/src/intelfocl/AOCLUtils/options.cpp create mode 100644 vta/src/intelfocl/AOCLUtils/options.h create mode 100644 vta/src/intelfocl/AOCLUtils/scoped_ptrs.h create mode 100644 vta/src/intelfocl/intelfocl_device.cc create mode 100644 vta/src/intelfocl/intelfocl_device.h create mode 100644 vta/src/intelfocl/intelfocl_driver.cc create mode 100644 vta/src/pynq/pynq_driver.cc diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index d9508470c0a2..33fe0016fe4a 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -99,6 +99,11 @@ elseif(PYTHON) find_library(__cma_lib NAMES cma PATH /usr/lib) elseif(${VTA_TARGET} STREQUAL "de10nano") # DE10-Nano rules file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc) + elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules + file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) + file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cpp) + list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) + list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc vta/src/vmem/virtual_memory.h) endif() # Target lib: vta add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) @@ -117,6 +122,9 @@ elseif(PYTHON) target_include_directories(vta PUBLIC 3rdparty) target_include_directories(vta PUBLIC "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") + elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules + target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") + target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() endif() diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index e68f098ba53f..c556352e4539 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -241,7 +241,7 @@ def target_host(self): return "llvm -target=armv7-none-linux-gnueabihf" if self.TARGET == "ultra96": return "llvm -target=aarch64-linux-gnu" - if self.TARGET in ["sim", "tsim"]: + if self.TARGET in ["sim", "tsim", "intelfocl"]: return "llvm" raise ValueError("Unknown target %s" % self.TARGET) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index 16827c4ab079..bf89107f9f79 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -25,7 +25,7 @@ def _load_sw(): """Load hardware library for 
simulator.""" env = get_env() - lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" + lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" if env.TARGET == "sim" else "libvta" # Load driver library lib_driver = find_libvta(lib_driver_name, optional=True) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 49fe9c557336..b1d3ad424d6e 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -413,7 +413,6 @@ class UopQueue : public BaseQueue { kernel->sram_begin_ = 0; kernel->sram_end_ = 0; } - cache_.clear(); cache_idx_ = 0; BaseQueue::Reset(); diff --git a/vta/src/intelfocl/AOCLUtils/aocl_utils.h b/vta/src/intelfocl/AOCLUtils/aocl_utils.h new file mode 100644 index 000000000000..70e0fc6bcc0a --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/aocl_utils.h @@ -0,0 +1,32 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Main include file for AOCLUtils. Includes all other utility header files. + +#ifndef AOCL_UTILS_H +#define AOCL_UTILS_H + +#include "opencl.h" +#include "scoped_ptrs.h" +#include "options.h" + +#endif + diff --git a/vta/src/intelfocl/AOCLUtils/opencl.cpp b/vta/src/intelfocl/AOCLUtils/opencl.cpp new file mode 100644 index 000000000000..04d989d7c9ea --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/opencl.cpp @@ -0,0 +1,555 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include + +#ifdef _WIN32 // Windows +#include +#else // Linux +#include +#include // readlink, chdir +#endif + +namespace aocl_utils { + +static const char *const VERSION_STR = "161"; + +////////////////////////////////////////// +// Host allocation functions for alignment +////////////////////////////////////////// + +// This is the minimum alignment requirement to ensure DMA can be used. +const unsigned AOCL_ALIGNMENT = 64; + +#ifdef _WIN32 // Windows +void *alignedMalloc(size_t size) { + return _aligned_malloc (size, AOCL_ALIGNMENT); +} + +void alignedFree(void * ptr) { + _aligned_free(ptr); +} +#else // Linux +void *alignedMalloc(size_t size) { + void *result = NULL; + int rc; + rc = posix_memalign (&result, AOCL_ALIGNMENT, size); + (void) rc; + return result; +} + +void alignedFree(void * ptr) { + free (ptr); +} +#endif + +/////////////////////////////// +// Error functions +/////////////////////////////// + +// Print the error associciated with an error code +void printError(cl_int error) { + // Print error message + switch(error) + { + case -1: + printf("CL_DEVICE_NOT_FOUND "); + break; + case -2: + printf("CL_DEVICE_NOT_AVAILABLE "); + break; + case -3: + printf("CL_COMPILER_NOT_AVAILABLE "); + break; + case -4: + printf("CL_MEM_OBJECT_ALLOCATION_FAILURE "); + break; + case -5: + printf("CL_OUT_OF_RESOURCES "); + break; + case -6: + printf("CL_OUT_OF_HOST_MEMORY "); + break; + case -7: + printf("CL_PROFILING_INFO_NOT_AVAILABLE "); + break; + case -8: + printf("CL_MEM_COPY_OVERLAP "); + break; + case -9: + printf("CL_IMAGE_FORMAT_MISMATCH "); + break; + case -10: + printf("CL_IMAGE_FORMAT_NOT_SUPPORTED "); + break; + case -11: + printf("CL_BUILD_PROGRAM_FAILURE "); + break; + case -12: + printf("CL_MAP_FAILURE "); + break; + case -13: + printf("CL_MISALIGNED_SUB_BUFFER_OFFSET "); + break; + case -14: + printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST "); + break; + + case -30: + printf("CL_INVALID_VALUE "); + break; + case -31: + printf("CL_INVALID_DEVICE_TYPE "); + break; + case -32: + printf("CL_INVALID_PLATFORM "); + break; + case -33: + printf("CL_INVALID_DEVICE "); + break; + case -34: + printf("CL_INVALID_CONTEXT "); + break; + case -35: + printf("CL_INVALID_QUEUE_PROPERTIES "); + break; + case -36: + printf("CL_INVALID_COMMAND_QUEUE "); + break; + case -37: + printf("CL_INVALID_HOST_PTR "); + break; + case -38: + printf("CL_INVALID_MEM_OBJECT "); + break; + case -39: + printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR "); + break; + case -40: + printf("CL_INVALID_IMAGE_SIZE "); + break; + case -41: + printf("CL_INVALID_SAMPLER "); + break; + case -42: + printf("CL_INVALID_BINARY "); + break; + case -43: + printf("CL_INVALID_BUILD_OPTIONS "); + break; + case -44: + printf("CL_INVALID_PROGRAM "); + break; + case -45: + printf("CL_INVALID_PROGRAM_EXECUTABLE "); + break; + case -46: + printf("CL_INVALID_KERNEL_NAME "); + break; + case -47: + printf("CL_INVALID_KERNEL_DEFINITION "); + break; + case -48: + printf("CL_INVALID_KERNEL "); + break; + case -49: + printf("CL_INVALID_ARG_INDEX "); + break; + case -50: + 
printf("CL_INVALID_ARG_VALUE "); + break; + case -51: + printf("CL_INVALID_ARG_SIZE "); + break; + case -52: + printf("CL_INVALID_KERNEL_ARGS "); + break; + case -53: + printf("CL_INVALID_WORK_DIMENSION "); + break; + case -54: + printf("CL_INVALID_WORK_GROUP_SIZE "); + break; + case -55: + printf("CL_INVALID_WORK_ITEM_SIZE "); + break; + case -56: + printf("CL_INVALID_GLOBAL_OFFSET "); + break; + case -57: + printf("CL_INVALID_EVENT_WAIT_LIST "); + break; + case -58: + printf("CL_INVALID_EVENT "); + break; + case -59: + printf("CL_INVALID_OPERATION "); + break; + case -60: + printf("CL_INVALID_GL_OBJECT "); + break; + case -61: + printf("CL_INVALID_BUFFER_SIZE "); + break; + case -62: + printf("CL_INVALID_MIP_LEVEL "); + break; + case -63: + printf("CL_INVALID_GLOBAL_WORK_SIZE "); + break; + default: + printf("UNRECOGNIZED ERROR CODE (%d)", error); + } +} + +// Print line, file name, and error code if there is an error. Exits the +// application upon error. +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...) { + // If not successful + if(error != CL_SUCCESS) { + // Print line and file + printf("ERROR: "); + printError(error); + printf("\nLocation: %s:%d\n", file, line); + + // Print custom message. + va_list vl; + va_start(vl, msg); + vprintf(msg, vl); + printf("\n"); + va_end(vl); + + // Cleanup and bail. + cleanup(); + exit(error); + } +} + +// Sets the current working directory to be the same as the directory +// containing the running executable. +bool setCwdToExeDir() { +#ifdef _WIN32 // Windows + HMODULE hMod = GetModuleHandle(NULL); + char path[MAX_PATH]; + GetModuleFileNameA(hMod, path, MAX_PATH); + +#else // Linux + // Get path of executable. + char path[300]; + ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1); + if(n == -1) { + return false; + } + path[n] = 0; +#endif + + // Find the last '\' or '/' and terminate the path there; it is now + // the directory containing the executable. + size_t i; + for(i = strlen(path) - 1; i > 0 && path[i] != '/' && path[i] != '\\'; --i); + path[i] = '\0'; + + // Change the current directory. +#ifdef _WIN32 // Windows + SetCurrentDirectoryA(path); +#else // Linux + int rc; + rc = chdir(path); + (void) rc; +#endif + + return true; +} + +// Searches all platforms for the first platform whose name +// contains the search string (case-insensitive). +cl_platform_id findPlatform(const char *platform_name_search) { + cl_int status; + + std::string search = platform_name_search; + std::transform(search.begin(), search.end(), search.begin(), tolower); + + // Get number of platforms. + cl_uint num_platforms; + status = clGetPlatformIDs(0, NULL, &num_platforms); + checkError(status, "Query for number of platforms failed"); + + // Get a list of all platform ids. + scoped_array pids(num_platforms); + status = clGetPlatformIDs(num_platforms, pids, NULL); + checkError(status, "Query for all platform ids failed"); + + // For each platform, get name and compare against the search string. + for(unsigned i = 0; i < num_platforms; ++i) { + std::string name = getPlatformName(pids[i]); + + // Convert to lower case. + std::transform(name.begin(), name.end(), name.begin(), tolower); + + if(name.find(search) != std::string::npos) { + // Found! + return pids[i]; + } + } + + // No platform found. + return NULL; +} + +// Returns the platform name. 
+std::string getPlatformName(cl_platform_id pid) { + cl_int status; + + size_t sz; + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz); + checkError(status, "Query for platform name size failed"); + + scoped_array name(sz); + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL); + checkError(status, "Query for platform name failed"); + + return name.get(); +} + +// Returns the device name. +std::string getDeviceName(cl_device_id did) { + cl_int status; + + size_t sz; + status = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &sz); + checkError(status, "Failed to get device name size"); + + scoped_array name(sz); + status = clGetDeviceInfo(did, CL_DEVICE_NAME, sz, name, NULL); + checkError(status, "Failed to get device name"); + + return name.get(); +} + +// Returns the list of all devices. +cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) { + cl_int status; + + status = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices); + checkError(status, "Query for number of devices failed"); + + cl_device_id *dids = new cl_device_id[*num_devices]; + status = clGetDeviceIDs(pid, dev_type, *num_devices, dids, NULL); + checkError(status, "Query for device ids"); + + return dids; +} + +// Create a program for all devices associated with the context. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) { + // Early exit for potentially the most common way to fail: AOCX does not exist. + if(!fileExists(binary_file_name)) { + printf("AOCX file '%s' does not exist.\n", binary_file_name); + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + // Load the binary. + size_t binary_size; + scoped_array binary(loadBinaryFile(binary_file_name, &binary_size)); + if(binary == NULL) { + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + scoped_array binary_lengths(num_devices); + scoped_array binaries(num_devices); + for(unsigned i = 0; i < num_devices; ++i) { + binary_lengths[i] = binary_size; + binaries[i] = binary; + } + + cl_int status; + scoped_array binary_status(num_devices); + + cl_program program = clCreateProgramWithBinary(context, num_devices, devices, binary_lengths, + (const unsigned char **) binaries.get(), binary_status, &status); + checkError(status, "Failed to create program with binary"); + for(unsigned i = 0; i < num_devices; ++i) { + checkError(binary_status[i], "Failed to load binary for device"); + } + + return program; +} + +// Loads a file in binary form. 
+unsigned char *loadBinaryFile(const char *file_name, size_t *size) { + // Open the File + FILE* fp; +#ifdef _WIN32 + if(fopen_s(&fp, file_name, "rb") != 0) { + return NULL; + } +#else + fp = fopen(file_name, "rb"); + if(fp == 0) { + return NULL; + } +#endif + + // Get the size of the file + fseek(fp, 0, SEEK_END); + *size = ftell(fp); + + // Allocate space for the binary + unsigned char *binary = new unsigned char[*size]; + + // Go back to the file start + rewind(fp); + + // Read the file into the binary + if(fread((void*)binary, *size, 1, fp) == 0) { + delete[] binary; + fclose(fp); + return NULL; + } + + return binary; +} + +bool fileExists(const char *file_name) { +#ifdef _WIN32 // Windows + DWORD attrib = GetFileAttributesA(file_name); + return (attrib != INVALID_FILE_ATTRIBUTES && !(attrib & FILE_ATTRIBUTE_DIRECTORY)); +#else // Linux + return access(file_name, R_OK) != -1; +#endif +} + +std::string getBoardBinaryFile(const char *prefix, cl_device_id device) { + // First check if .aocx exists. Use it if it does. + std::string file_name = std::string(prefix) + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + + // Now get the name of the board. For Intel(R) FPGA SDK for OpenCL(TM) boards, + // the name of the device is presented as: + // : ... + std::string device_name = getDeviceName(device); + + // Now search for the " :" in the device name. + size_t end = device_name.find(" :"); + if(end != std::string::npos) { + std::string board_name(device_name, 0, end); + + // Look for a AOCX with the name __.aocx. + file_name = std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + } + + // At this point just use .aocx. This file doesn't exist + // and this should trigger an error later. + return std::string(prefix) + ".aocx"; +} + +// High-resolution timer. +double getCurrentTimestamp() { +#ifdef _WIN32 // Windows + // Use the high-resolution performance counter. + + static LARGE_INTEGER ticks_per_second = {}; + if(ticks_per_second.QuadPart == 0) { + // First call - get the frequency. 
+ QueryPerformanceFrequency(&ticks_per_second); + } + + LARGE_INTEGER counter; + QueryPerformanceCounter(&counter); + + double seconds = double(counter.QuadPart) / double(ticks_per_second.QuadPart); + return seconds; +#else // Linux + timespec a; + clock_gettime(CLOCK_MONOTONIC, &a); + return (double(a.tv_nsec) * 1.0e-9) + double(a.tv_sec); +#endif +} + +cl_ulong getStartEndTime(cl_event event) { + cl_int status; + + cl_ulong start, end; + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + return end - start; +} + +cl_ulong getStartEndTime(cl_event *events, unsigned num_events) { + cl_int status; + + cl_ulong min_start = 0; + cl_ulong max_end = 0; + for(unsigned i = 0; i < num_events; ++i) { + cl_ulong start, end; + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + if(i == 0) { + min_start = start; + max_end = end; + } + else { + if(start < min_start) { + min_start = start; + } + if(end > max_end) { + max_end = end; + } + } + } + + return max_end - min_start; +} + +void waitMilliseconds(unsigned ms) { +#ifdef _WIN32 // Windows + Sleep(ms); +#else // Linux + timespec sleeptime = {0, 0}; + sleeptime.tv_sec = ms / 1000; + sleeptime.tv_nsec = long(ms % 1000) * 1000000L; // convert to nanoseconds + nanosleep(&sleeptime, NULL); +#endif +} + +void oclContextCallback(const char *errinfo, const void *, size_t, void *) { + printf("Context callback: %s\n", errinfo); +} + +} // ns aocl_utils + diff --git a/vta/src/intelfocl/AOCLUtils/opencl.h b/vta/src/intelfocl/AOCLUtils/opencl.h new file mode 100644 index 000000000000..4aa5348b67b1 --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/opencl.h @@ -0,0 +1,122 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// OpenCL utility functions. 
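+// Covers platform and device discovery, program creation from .aocx binaries,
+// aligned host allocation, error checking, and event-based profiling helpers.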
+ +#ifndef AOCL_UTILS_OPENCL_H +#define AOCL_UTILS_OPENCL_H + +#include +#include +#include +#include + +#include "CL/opencl.h" + +// This is assumed to be externally provided by the application. +extern void cleanup(); + +namespace aocl_utils { + +// Host allocation functions +void *alignedMalloc(size_t size); +void alignedFree(void *ptr); + +// Error functions +void printError(cl_int error); +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...); // does not return +#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__) + +// Sets the current working directory to the same directory that contains +// this executable. Returns true on success. +bool setCwdToExeDir(); + +// Find a platform that contains the search string in its name (case-insensitive match). +// Returns NULL if no match is found. +cl_platform_id findPlatform(const char *platform_name_search); + +// Returns the name of the platform. +std::string getPlatformName(cl_platform_id pid); + +// Returns the name of the device. +std::string getDeviceName(cl_device_id did); + +// Returns an array of device ids for the given platform and the +// device type. +// Return value must be freed with delete[]. +cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices); + +// Create a OpenCL program from a binary file. +// The program is created for all given devices associated with the context. The same +// binary is used for all devices. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices); + +// Load binary file. +// Return value must be freed with delete[]. +unsigned char *loadBinaryFile(const char *file_name, size_t *size); + +// Checks if a file exists. +bool fileExists(const char *file_name); + +// Returns the path to the AOCX file to use for the given device. +// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM). +// It uses the device name to get the board name and then looks for a +// corresponding AOCX file. Specifically, it gets the device name and +// extracts the board name assuming the device name has the following format: +// : ... +// +// Then the AOCX file is __.aocx. If this +// file does not exist, then the file name defaults to .aocx. +std::string getBoardBinaryFile(const char *prefix, cl_device_id device); + +// Returns the time from a high-resolution timer in seconds. This value +// can be used with a value returned previously to measure a high-resolution +// time difference. +double getCurrentTimestamp(); + +// Returns the difference between the CL_PROFILING_COMMAND_END and +// CL_PROFILING_COMMAND_START values of a cl_event object. +// This requires that the command queue associated with the event be created +// with the CL_QUEUE_PROFILING_ENABLE property. +// +// The return value is in nanoseconds. +cl_ulong getStartEndTime(cl_event event); + +// Returns the maximum time span for the given set of events. +// The time span starts at the earliest event start time. +// The time span ends at the latest event end time. +cl_ulong getStartEndTime(cl_event *events, unsigned num_events); + +// Wait for the specified number of milliseconds. +void waitMilliseconds(unsigned ms); + +// OpenCL context callback function that simply prints the error information +// to stdout (via printf). 
+void oclContextCallback(const char *errinfo, const void *, size_t, void *); + +} // ns aocl_utils + +#endif + diff --git a/vta/src/intelfocl/AOCLUtils/options.cpp b/vta/src/intelfocl/AOCLUtils/options.cpp new file mode 100644 index 000000000000..05d025b43faf --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/options.cpp @@ -0,0 +1,105 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include +#include +#include + +namespace aocl_utils { + +Options::Options() { +} + +Options::Options(int num, char *argv[]) { + addFromCommandLine(num, argv); +} + +bool Options::has(const std::string &name) const { + return m_options.find(name) != m_options.end(); +} + +std::string &Options::get(const std::string &name) { + return m_options[name]; +} + +const std::string &Options::get(const std::string &name) const { + OptionMap::const_iterator it = m_options.find(name); + if(it == m_options.end()) { + errorNonExistent(name); + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); + } + return it->second; +} + +void Options::addFromCommandLine(int num, char *argv[]) { + for(int i = 1; i < num; ++i) { + const std::string arg = argv[i]; + + // Look for the first '-'. + if(arg.size() > 1 && arg[0] == '-') { + size_t eq = arg.find('='); + size_t name_start = 1; + + // Check if there's a second '-'. + if(arg.size() > 2 && arg[1] == '-') { + name_start = 2; + } + + if(eq == std::string::npos) { + // No '='; treat as a boolean option. + set(arg.substr(name_start), true); + } + else if(eq == name_start) { + // No name?! + errorNameless(); + } + else { + set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1)); + } + } + else { + // Not an option. 
+ m_nonoptions.push_back(arg); + } + } +} + +void Options::errorNameless() const { + std::cerr << "No name provided for option.\n"; + exit(1); +} + +void Options::errorNonExistent(const std::string &name) const { + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); +} + +void Options::errorWrongType(const std::string &name) const { + std::cerr << "Value for option '" << name << "' is not of the right type (value = '" + << get(name) << "').\n"; + exit(1); +} + +} // ns aocl_utils + diff --git a/vta/src/intelfocl/AOCLUtils/options.h b/vta/src/intelfocl/AOCLUtils/options.h new file mode 100644 index 000000000000..78d34605e60e --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/options.h @@ -0,0 +1,137 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Declares a utility class used to parse command-line options. + +#ifndef AOCL_UTILS_OPTIONS_H +#define AOCL_UTILS_OPTIONS_H + +#include +#include +#include +#include + +namespace aocl_utils { + +class Options { +public: + typedef std::vector StringVec; + + Options(); + Options(int num, char *argv[]); + + bool has(const std::string &name) const; + std::string &get(const std::string &name); // will create an empty option if it does not exist + const std::string &get(const std::string &name) const; // error if option does not exist + + void set(const std::string &name, const std::string &value) { get(name) = value; } + + // Command line options must be of the following form: + // [-]-name (indicates option exists) + // [-]-name=value + // + // This function assumes that the values are from main(int, char *). + // This means that the argv[0] is skipped. + void addFromCommandLine(int num, char *argv[]); + + // This templated function converts the option value to the given type. + // An assert is raised if the conversion fails. + template + T get(const std::string &name) const; + + template + void set(const std::string &name, const T &value); + + // Non-options are arguments processed in addFromCommandLine + // that were not recognized as options. 
+ const StringVec &getNonOptions() const { return m_nonoptions; } + size_t getNonOptionCount() const { return m_nonoptions.size(); } + const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; } + +private: + typedef std::map OptionMap; + + // Displays an error message indicating that a nameless option + // was provided. + void errorNameless() const; + + // Displays an error message indicating that the given option + // has the wrong type and then exits with an error code. + void errorWrongType(const std::string &name) const; + + // Displays an error message indicating that the given option + // does not exist and then exits with an error code. + void errorNonExistent(const std::string &name) const; + + OptionMap m_options; + StringVec m_nonoptions; + + Options(const Options &); // not implemented + void operator =(const Options &); // not implemented +}; + +template +T Options::get(const std::string &name) const { + std::stringstream ss; + ss << get(name); + + T v; + ss >> v; + if(ss.fail() || !ss.eof()) { + // Failed to parse or did not consume the whole string value. + errorWrongType(name); + } + return v; +} + +// Specialization for bool. +template<> +inline bool Options::get(const std::string &name) const { + if(has(name)) { + const std::string &v = get(name); + if(v == "1") { + return true; + } + } + return false; +} + +// Specialization for std::string. Simply returns the option string. +// Requires specialization because using stringstream to read the string +// will stop at the first whitespace character (which is wrong). +template<> +inline std::string Options::get(const std::string &name) const { + return get(name); +} + +// This assumes the type T can be serialized to a string and back (when get +// is called). +template +void Options::set(const std::string &name, const T &value) { + std::stringstream ss; + ss << value; + set(name, ss.str()); +} + +} // ns aocl_utils + +#endif + diff --git a/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h b/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h new file mode 100644 index 000000000000..b11085c5226e --- /dev/null +++ b/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h @@ -0,0 +1,165 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Scoped pointer definitions. 
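+// Defines scoped_ptr, scoped_array and scoped_aligned_ptr (plus
+// scoped_SVM_aligned_ptr when USE_SVM_API == 1): non-copyable RAII wrappers
+// that release their pointee on destruction or reset().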
+ +#ifndef AOCL_UTILS_SCOPED_PTRS_H +#define AOCL_UTILS_SCOPED_PTRS_H + +namespace aocl_utils { + +// Interface is essentially the combination of std::auto_ptr and boost's smart pointers, +// along with some small extensions (auto conversion to T*). + +// scoped_ptr: assumes pointer was allocated with operator new; destroys with operator delete +template +class scoped_ptr { +public: + typedef scoped_ptr this_type; + + scoped_ptr() : m_ptr(NULL) {} + scoped_ptr(T *ptr) : m_ptr(ptr) {} + ~scoped_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { delete m_ptr; m_ptr = ptr; } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_ptr(const this_type &); + this_type &operator =(const this_type &); +}; + +// scoped_array: assumes pointer was allocated with operator new[]; destroys with operator delete[] +// Also supports allocation/reset with a number, which is the number of +// elements of type T. +template +class scoped_array { +public: + typedef scoped_array this_type; + + scoped_array() : m_ptr(NULL) {} + scoped_array(T *ptr) : m_ptr(NULL) { reset(ptr); } + explicit scoped_array(size_t n) : m_ptr(NULL) { reset(n); } + ~scoped_array() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { delete[] m_ptr; m_ptr = ptr; } + void reset(size_t n) { reset(new T[n]); } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_array(const this_type &); + this_type &operator =(const this_type &); +}; + +// scoped_aligned_ptr: assumes pointer was allocated with alignedMalloc; destroys with alignedFree +// Also supports allocation/reset with a number, which is the number of +// elements of type T +template +class scoped_aligned_ptr { +public: + typedef scoped_aligned_ptr this_type; + + scoped_aligned_ptr() : m_ptr(NULL) {} + scoped_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); } + explicit scoped_aligned_ptr(size_t n) : m_ptr(NULL) { reset(n); } + ~scoped_aligned_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { if(m_ptr) alignedFree(m_ptr); m_ptr = ptr; } + void reset(size_t n) { reset((T*) alignedMalloc(sizeof(T) * n)); } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_aligned_ptr(const this_type &); + this_type &operator =(const this_type &); +}; + +#if USE_SVM_API == 1 +// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree +// Also supports allocation/reset with a number, which is the number of +// elements of type T +template +class scoped_SVM_aligned_ptr { +public: + typedef scoped_SVM_aligned_ptr this_type; + + scoped_SVM_aligned_ptr() : m_ptr(NULL) {} + scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL) 
{ reset(ptr); } + explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); } + ~scoped_SVM_aligned_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; } + void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + cl_context m_ctx; + + // noncopyable + scoped_SVM_aligned_ptr(const this_type &); + this_type &operator =(const this_type &); +}; +#endif /* USE_SVM_API == 1 */ + +} // ns aocl_utils + +#endif + diff --git a/vta/src/intelfocl/intelfocl_device.cc b/vta/src/intelfocl/intelfocl_device.cc new file mode 100644 index 000000000000..5eb1519b1124 --- /dev/null +++ b/vta/src/intelfocl/intelfocl_device.cc @@ -0,0 +1,181 @@ +#include +#include +#include "intelfocl_device.h" +#include "AOCLUtils/aocl_utils.h" + +#define MEM_ALIGNMENT (1024) + +#define CL_STATUS_SUCCESS(x) ((x) == CL_SUCCESS) + +void cleanup() {} + +int IntelFOCLDevice::init(size_t mem_size, std::string aocx_file) +{ + cl_int status; + cl_device_id device; + cl_platform_id platform; + unsigned int argi; + bool focl_device_avail; + unsigned int num_devices; + aocl_utils::scoped_array devices; + + platform = aocl_utils::findPlatform("Intel(R) FPGA SDK for OpenCL(TM)"); + CHECK(platform) << "Unable to find Intel(R) FPGA OpenCL platform"; + + devices.reset(aocl_utils::getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices)); + focl_device_avail = false; + for ( unsigned int i = 0; i < num_devices; i ++ ) + { + device = devices[i]; + _context = clCreateContext(NULL, 1, &device, &aocl_utils::oclContextCallback, NULL, &status); + if ( CL_STATUS_SUCCESS(status) ) + { + focl_device_avail = true; + LOG(INFO) << "Using device: " << aocl_utils::getDeviceName(device); + break; + } + } + CHECK(focl_device_avail) << "No FPGA device available"; + num_devices = 1; + + LOG(INFO) << "Using AOCX: " << aocx_file; + _program = aocl_utils::createProgramFromBinary(_context, aocx_file.c_str(), &device, num_devices); + status = clBuildProgram(_program, 0, NULL, "", NULL, NULL); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to build program"; + + for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) + { + _kernels[i] = clCreateKernel(_program, kernel_names[i].c_str(), &status); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create kernel"; + _queues[i] = clCreateCommandQueue(_context, device, 0, &status); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create command queue"; + } + + _mem = clCreateBuffer(_context, CL_MEM_READ_WRITE, mem_size, NULL, &status); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create buffer mem"; + mem_chunk_t init_chunk = {.offset = 0, .size = mem_size, .occupied = false}; + _mem_chunks.push_back(init_chunk); + + argi = 1; + status = clSetKernelArg(_kernels[KERNEL_FETCH], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + argi = 0; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + 
CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + + return 0; +} + +ifocl_mem_off_t IntelFOCLDevice::alloc(size_t size) +{ + auto iter = _mem_chunks.begin(); + size_t aligned_size = ((size + MEM_ALIGNMENT - 1) / MEM_ALIGNMENT) * MEM_ALIGNMENT; + + while ( iter != _mem_chunks.end() && (iter->occupied || (iter->size < aligned_size)) ) + { + iter++; + } + + if ( iter == _mem_chunks.end() ) return IFOCL_MEM_OFF_ERR; + + iter->occupied = true; + if ( iter->size != aligned_size ) + { + mem_chunk_t rem = {iter->offset + aligned_size, iter->size - aligned_size, false}; + iter->size = aligned_size; + _mem_chunks.insert(std::next(iter), rem); + } + + return iter->offset; +} + +void IntelFOCLDevice::free(ifocl_mem_off_t offset) +{ + auto iter = _mem_chunks.begin(); + while ( iter != _mem_chunks.end() && iter->offset < offset ) iter++; + + if ( iter == _mem_chunks.end() || iter->offset != offset || !iter->occupied ) + { + return; + } + + iter->occupied = false; + if ( iter != _mem_chunks.begin() && !std::prev(iter)->occupied ) iter--; + + while ( std::next(iter) != _mem_chunks.end() && !std::next(iter)->occupied ) + { + iter->size += std::next(iter)->size; + _mem_chunks.erase(std::next(iter)); + } +} + + +void IntelFOCLDevice::write_mem(ifocl_mem_off_t offset, const void *buf, size_t nbyte) +{ + cl_int status = clEnqueueWriteBuffer(_queues[0], _mem, CL_TRUE, offset, nbyte, buf, 0, NULL, NULL); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue write buffer"; +} + +void IntelFOCLDevice::read_mem(ifocl_mem_off_t offset, void *buf, size_t nbyte) +{ + cl_int status = clEnqueueReadBuffer(_queues[0], _mem, CL_TRUE, offset, nbyte, buf, 0, NULL, NULL); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue read buffer"; +}; + +int IntelFOCLDevice::execute_instructions(ifocl_mem_off_t offset, size_t count) +{ + cl_int status; + unsigned int argi; + unsigned int insn_offset = offset / VTA_INS_ELEM_BYTES; + unsigned int insn_count = count; + const size_t global_work_size = 1; + + argi = 0; + status = clSetKernelArg(_kernels[KERNEL_FETCH], argi, sizeof(unsigned int), &insn_count); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + argi = 2; + status = clSetKernelArg(_kernels[KERNEL_FETCH], argi, sizeof(unsigned int), &insn_offset); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; + + for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) + { + status = clEnqueueNDRangeKernel(_queues[i], _kernels[i], 1, NULL, &global_work_size, NULL, 0, NULL, NULL); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue kernel"; + } + + for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) + { + status = clFinish(_queues[i]); + CHECK(CL_STATUS_SUCCESS(status)) << "Failed to clFinish"; + } + + return 0; +}; + +void IntelFOCLDevice::deinit() +{ + for ( unsigned int i = 0; i < NUM_OCL_KERNELS; i++ ) + { + clReleaseKernel(_kernels[i]); + clReleaseCommandQueue(_queues[i]); + } + + clReleaseMemObject(_mem); + + clReleaseProgram(_program); + + clReleaseContext(_context); +} + 
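+// The destructor delegates to deinit(), which releases the kernels, command
+// queues, device buffer, program and context created in init().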
+IntelFOCLDevice::~IntelFOCLDevice() +{ + deinit(); +} diff --git a/vta/src/intelfocl/intelfocl_device.h b/vta/src/intelfocl/intelfocl_device.h new file mode 100644 index 000000000000..6c53a4d47323 --- /dev/null +++ b/vta/src/intelfocl/intelfocl_device.h @@ -0,0 +1,53 @@ +#ifndef VTA_INTEL_FOCL_DEVICE_H_ +#define VTA_INTEL_FOCL_DEVICE_H_ + +#include +#include + +#include "CL/opencl.h" + +#define NUM_OCL_KERNELS 3 +enum kernel_index {KERNEL_FETCH, KERNEL_COMPUTE, KERNEL_PROFILE}; +static std::string kernel_names[3] = {"fetch", "compute", "profile"}; + +typedef size_t ifocl_mem_off_t; +#define IFOCL_MEM_OFF_ERR (SIZE_MAX) + +typedef struct +{ + ifocl_mem_off_t offset; + size_t size; + bool occupied; +} mem_chunk_t; + +class IntelFOCLDevice { + private: + cl_context _context; + cl_program _program; + cl_mem _mem; + cl_kernel _kernels[NUM_OCL_KERNELS]; + cl_command_queue _queues[NUM_OCL_KERNELS]; + std::list _mem_chunks; + + public: + IntelFOCLDevice() { init(4*1024*1024*1024ULL, "vta_opencl.aocx"); } + + int init(size_t mem_size, std::string aocx_file); + + ifocl_mem_off_t alloc(size_t size); + + void free(ifocl_mem_off_t offset); + + void write_mem(ifocl_mem_off_t offset, const void *buf, size_t nbyte); + + void read_mem(ifocl_mem_off_t offset, void *buf, size_t nbyte); + + int execute_instructions(ifocl_mem_off_t offset, size_t count); + + void deinit(); + + ~IntelFOCLDevice(); +}; + +#endif // VTA_INTEL_FOCL_DEVICE_H_ + diff --git a/vta/src/intelfocl/intelfocl_driver.cc b/vta/src/intelfocl/intelfocl_driver.cc new file mode 100644 index 000000000000..a8db9cd0e394 --- /dev/null +++ b/vta/src/intelfocl/intelfocl_driver.cc @@ -0,0 +1,74 @@ +#include +#include +#include +#include "intelfocl_device.h" + +#define MEM_ADDR_IDENTIFIER (0x18000000) + +static IntelFOCLDevice focl_device; + +static inline void* mem_get_addr(ifocl_mem_off_t offset) +{ + void *ret = (void *) (offset + MEM_ADDR_IDENTIFIER); + return ret; +} + +static inline ifocl_mem_off_t mem_get_offset(const void *addr) +{ + ifocl_mem_off_t ret = (ifocl_mem_off_t) addr - MEM_ADDR_IDENTIFIER; + return ret; +} + +void* VTAMemAlloc(size_t size, int cached) { + (void) cached; + ifocl_mem_off_t offset = focl_device.alloc(size); + if ( offset == IFOCL_MEM_OFF_ERR ) return NULL; + void *addr = mem_get_addr(offset); + return addr; +} + +void VTAMemFree(void *buf) { + ifocl_mem_off_t offset = mem_get_offset(buf); + focl_device.free(offset); +} + +vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { + ifocl_mem_off_t offset = mem_get_offset(buf); + return (vta_phy_addr_t) offset; +} + +void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { + ifocl_mem_off_t dst_offset = mem_get_offset(dst); + focl_device.write_mem(dst_offset, src, size); +} + +void VTAMemCopyToHost(void* dst, const void* src, size_t size) { + ifocl_mem_off_t src_offset = mem_get_offset(src); + focl_device.read_mem(src_offset, dst, size); +} + +void VTAFlushCache(void * offset, vta_phy_addr_t buf, int size) { + std::cout << "VTAFlushCache not implemented for Intel OpenCL for FPGA devices" << std::endl; +} + +void VTAInvalidateCache(void * offset, vta_phy_addr_t buf, int size) { + std::cout << "VTAInvalidateCache not implemented for Intel OpenCL for FPGA devices" << std::endl; +} + +VTADeviceHandle VTADeviceAlloc() { + return (VTADeviceHandle) &focl_device; +} + +void VTADeviceFree(VTADeviceHandle handle) { + (void) handle; +} + +int VTADeviceRun(VTADeviceHandle handle, + vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles) +{ + (void) wait_cycles; + 
ifocl_mem_off_t offset = (ifocl_mem_off_t) insn_phy_addr; + return focl_device.execute_instructions(offset, insn_count); +} diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc new file mode 100644 index 000000000000..518b6c368926 --- /dev/null +++ b/vta/src/pynq/pynq_driver.cc @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * \file pynq_driver.c + * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io). + */ + +#include +#include +#include +#include "pynq_driver.h" + + +void* VTAMemAlloc(size_t size, int cached) { + assert(size <= VTA_MAX_XFER); + // Rely on the pynq-specific cma library + return cma_alloc(size, cached); +} + +void VTAMemFree(void* buf) { + // Rely on the pynq-specific cma library + cma_free(buf); +} + +vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { + return cma_get_phy_addr(buf); +} + +void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { + // For SoC-based FPGAs that used shared memory with the CPU, use memcopy() + memcpy(dst, src, size); +} + +void VTAMemCopyToHost(void* dst, const void* src, size_t size) { + // For SoC-based FPGAs that used shared memory with the CPU, use memcopy() + memcpy(dst, src, size); +} + +void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { + // Call the cma_flush_cache on the CMA buffer + // so that the FPGA can read the buffer data. + cma_flush_cache(vir_addr, phy_addr, size); +} + +void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { + // Call the cma_invalidate_cache on the CMA buffer + // so that the host needs to read the buffer data. 
+ cma_invalidate_cache(vir_addr, phy_addr, size); +} + +void *VTAMapRegister(uint32_t addr) { + // Align the base address with the pages + uint32_t virt_base = addr & ~(getpagesize() - 1); + // Calculate base address offset w.r.t the base address + uint32_t virt_offset = addr - virt_base; + // Open file and mmap + uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC); + return mmap(NULL, + (VTA_IP_REG_MAP_RANGE + virt_offset), + PROT_READ|PROT_WRITE, + MAP_SHARED, + mmap_file, + virt_base); +} + +void VTAUnmapRegister(void *vta) { + // Unmap memory + int status = munmap(vta, VTA_IP_REG_MAP_RANGE); + assert(status == 0); +} + +void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) { + *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)) = val; +} + +uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { + return *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)); +} + +class VTADevice { + public: + VTADevice() { + // VTA stage handles + vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR); + vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR); + vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR); + vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR); + } + + ~VTADevice() { + // Close VTA stage handle + VTAUnmapRegister(vta_fetch_handle_); + VTAUnmapRegister(vta_load_handle_); + VTAUnmapRegister(vta_compute_handle_); + VTAUnmapRegister(vta_store_handle_); + } + + int Run(vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles) { + VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count); + VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr); + VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0); + VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0); + + // VTA start + VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); + VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART); + VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART); + VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART); + + // Allow device to respond + struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 }; + nanosleep(&ts, &ts); + + // Loop until the VTA is done + unsigned t, flag = 0; + for (t = 0; t < wait_cycles; ++t) { + flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET); + if (flag == VTA_DONE) break; + std::this_thread::yield(); + } + // Report error if timeout + return t < wait_cycles ? 
0 : 1; + } + + private: + // VTA handles (register maps) + void* vta_fetch_handle_{nullptr}; + void* vta_load_handle_{nullptr}; + void* vta_compute_handle_{nullptr}; + void* vta_store_handle_{nullptr}; +}; + +VTADeviceHandle VTADeviceAlloc() { + return new VTADevice(); +} + +void VTADeviceFree(VTADeviceHandle handle) { + delete static_cast(handle); +} + +int VTADeviceRun(VTADeviceHandle handle, + vta_phy_addr_t insn_phy_addr, + uint32_t insn_count, + uint32_t wait_cycles) { + return static_cast(handle)->Run( + insn_phy_addr, insn_count, wait_cycles); +} From 3e51e493f07c41d75baf9ae05fcd1342b72d2375 Mon Sep 17 00:00:00 2001 From: zhanghao Date: Mon, 9 Mar 2020 09:42:36 +0800 Subject: [PATCH 02/44] put resnet18 middle layers to run on vta - add load acc_int8 in simulation - remove copy op - add vta schedule - add always 32-bits --- .gitignore | 3 + python/tvm/autotvm/measure/measure_methods.py | 5 +- python/tvm/autotvm/tuner/tuner.py | 13 ++-- python/tvm/contrib/util.py | 1 + python/tvm/relay/op/_tensor.py | 4 +- python/tvm/relay/op/op.py | 7 ++ python/tvm/relay/quantize/_partition.py | 12 +++ src/relay/backend/compile_engine.cc | 4 +- src/relay/quantize/realize.cc | 31 ++++++-- topi/python/topi/generic/injective.py | 4 + vta/python/vta/environment.py | 2 + vta/python/vta/top/graphpack.py | 7 +- vta/python/vta/top/op.py | 3 +- vta/python/vta/top/vta_conv2d.py | 77 +++++++++++++++++++ vta/runtime/runtime.cc | 54 +++++++------ 15 files changed, 183 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index b9357018a64c..dd3634bf2bb0 100644 --- a/.gitignore +++ b/.gitignore @@ -233,3 +233,6 @@ conda/pkg # antlr files *.tokens *.interp + +*log* +*.txt diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index b8969f55c00a..7f915132fdc8 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -468,8 +468,9 @@ def run_through_rpc(measure_input, build_result, measure_input.target.device_name == 'vta': # pylint: disable=import-outside-toplevel from vta import program_fpga, reconfig_runtime - program_fpga(remote, None) - reconfig_runtime(remote) + # FIXME(zhanghao): remove this + # program_fpga(remote, None) + # reconfig_runtime(remote) remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 2441a4ae642f..4f984aae701f 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -161,12 +161,13 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr logger.debug("Early stopped. Best iter: %d.", self.best_iter) break - if error_ct > 150: - logging.basicConfig() - logger.warning("Too many errors happen in the tuning. Now is in debug mode") - logger.setLevel(logging.DEBUG) - else: - logger.setLevel(old_level) + # NOTE(zhanghao): comment out as it will raise too many logs + # if error_ct > 150: + # logging.basicConfig() + # logger.warning("Too many errors happen in the tuning. 
Now is in debug mode") + # logger.setLevel(logging.DEBUG) + # else: + # logger.setLevel(old_level) GLOBAL_SCOPE.in_tuning = False del measure_batch diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 8f6dfc7f28ec..474741fc1e35 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -19,6 +19,7 @@ import contextlib import datetime import os +import sys import tempfile import threading import shutil diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index cd9e4ed050d2..cca44429f7df 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -51,7 +51,9 @@ register_broadcast_schedule("sign") register_broadcast_schedule("abs") register_broadcast_schedule("tanh") -register_broadcast_schedule("add") +# NOTE(zhanghao): use customized add schedule +register_schedule("add", schedule_add) +# register_broadcast_schedule("add") register_broadcast_schedule("subtract") register_broadcast_schedule("multiply") register_broadcast_schedule("divide") diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 7fad9a258f2b..f2428f9db2ef 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -394,6 +394,13 @@ def register_external_compiler(op_name, fexternal=None, level=10): return tvm.ir.register_op_attr(op_name, "FTVMExternalCompiler", fexternal, level) + +def schedule_add(attrs, outputs, target): + """Generic schedule for add.""" + with target: + return topi.generic.schedule_add(outputs) + + @tvm._ffi.register_func("relay.op.compiler._lower") def _lower(name, schedule, inputs, outputs): return lower(schedule, list(inputs) + list(outputs), name=name) diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index a607f4ea50b8..bba5a6d842f9 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -144,3 +144,15 @@ def multiply_partition_function(ref_call, new_args, ctx): return QPartitionExpr(_forward_op(ref_call, [lhs, rhs])) assert (not lhs_cond) and (not rhs_cond) return None + +# @register_partition_function("nn.global_avg_pool2d") +# def global_avg_pool2d_partition_function(ref_call, new_args, ctx): +# cond, expr = partition_expr_check(new_args[0]) +# eprint("global_avg_pool2d partition") +# if cond: +# expr = stop_fusion(new_args[0].realize()) +# return _forward_op(ref_call, [expr]) +# else: +# expr = stop_fusion(QPartitionExpr(new_args[0]).realize()) +# return _forward_op(ref_call, [expr]) +# return None diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 3687b75c8ce8..3b0b1b39c62c 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -123,7 +123,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> readable_name_stream_ << "fused"; cache_node->outputs = this->VisitExpr(prim_func->body); auto candidate_name = readable_name_stream_.str(); - constexpr static size_t kMaxFuncNameLength = 80; + constexpr static size_t kMaxFuncNameLength = 800; if (candidate_name.size() > kMaxFuncNameLength) { std::stringstream truncated_name; truncated_name << candidate_name.substr(0, kMaxFuncNameLength); @@ -343,7 +343,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> auto cache_node = make_object(); cache_node->outputs = VisitExpr(prim_func->body); auto candidate_name = readable_name_stream_.str(); - constexpr static size_t kMaxFuncNameLength = 80; + constexpr static size_t kMaxFuncNameLength = 800; if 
(candidate_name.size() > kMaxFuncNameLength) { std::stringstream truncated_name; truncated_name << candidate_name.substr(0, kMaxFuncNameLength); diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 49d1e522f7d7..41680b655a66 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -312,19 +312,38 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args CHECK_EQ(ref_args.size(), args.size()); DataType dtype; - if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { - dtype = cfg->dtype_input; - } else { - dtype = cfg->dtype_activation; - } + // FIXME(zhanghao): force to use add(int32, int32) in order to put in VTA ALU + // but this may be not necessary for other devices + // if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { + // dtype = cfg->dtype_input; + // } else { + // dtype = cfg->dtype_activation; + // } + dtype = cfg->dtype_activation; for (size_t i = 0; i < ret.size(); ++i) { auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { - ret.Set(i, Cast(ret[i], dtype)); + auto new_arg = Cast(ret[i], dtype); + + // NOTE(zhanghao) + // if you want to let cpu to do all the cast, use the following code + // ret.Set(i, StopFusion(new_arg)); + + // do not fuse float32 cast + if (nptrs[i]->dtype == DataType::Float(32)) { + ret.Set(i, StopFusion(new_arg)); + } else { + ret.Set(i, new_arg); + } } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) && ref_arg->attrs.as()->kind == kQInput) { auto new_arg = Cast(ret[i], cfg->dtype_input); new_arg = StopFusion(new_arg); + + // NOTE(zhanghao) + // if you want to let cpu to do all the cast, use the following code + // ret.Set(i, StopFusion(Cast(new_arg, dtype))); + ret.Set(i, Cast(new_arg, dtype)); } } diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index fa6aee4864ec..8aae9a3c5f14 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -63,5 +63,9 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s +@tvm.target.generic_func +def schedule_add(outs): + return schedule_injective(outs) + schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index c556352e4539..9f82d65f1d4e 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -62,11 +62,13 @@ class DevContext(object): MEM_ID_INP = 2 MEM_ID_ACC = 3 MEM_ID_OUT = 4 + MEM_ID_ACC_8 = 5 # VTA ALU Opcodes ALU_OPCODE_MIN = 0 ALU_OPCODE_MAX = 1 ALU_OPCODE_ADD = 2 ALU_OPCODE_SHR = 3 + # ALU_OPCODE_CAST = 4 # Task queue id (pipeline stage) QID_LOAD_INP = 1 QID_LOAD_WGT = 1 diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 231d40033350..f6b22ce67ce5 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -317,7 +317,12 @@ def visit_call(self, call): elif self.start_pack and call.op == op.op.get('cast') and \ input_types[0].dtype == 'int32': cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) - return relay.Call(op.op.get('copy'), [cast]) + # zhanghao: force separate cast and copy (to let copy do on cpu) + # cast = relay.Call(op.op.get('annotation.stop_fusion'), [cast]) + + # zhanghao: remove the redudant copy + # return relay.Call(op.op.get('copy'), [cast]) + return cast elif call.op == self.pad: pad_width = call.attrs.pad_width if len(pad_width) == 6: diff --git a/vta/python/vta/top/op.py 
b/vta/python/vta/top/op.py index 2198ed4c191f..010daaedf2bc 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -35,7 +35,8 @@ # override to force partition at copy -reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) +# TODO(zhanghao): remove all copy +# reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) # add clip vta strategy def compute_clip_vta(attrs, inputs, output_type): diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 5b23ddeba1c1..2f30aba45d10 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -25,6 +25,9 @@ from .util import is_packed_layout from ..environment import get_env +from tvm.relay import op as Op +from tvm.contrib.util import eprint + @autotvm.register_topi_compute("conv2d_packed.vta") def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): @@ -33,6 +36,7 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty raise topi.InvalidShapeError() assert dilation == (1, 1) + eprint("data.shape, kernel.shape", data.shape, kernel.shape) if padding[0]: pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") else: @@ -63,6 +67,79 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty return res + +# FIXME(zhanghao): move this code to a proper location +@topi.generic.schedule_add.register(["vta"]) +def _schedule_add(outs): + eprint("schedule_add vta") + assert len(outs) == 1 + + def is_cast_op(op): + # return op.same_as(Op.op.get("cast")) + # FIXME(zhanghao): find a better way to do compare + return op.name == 'T_cast' + + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + output = outs[0] + s = tvm.create_schedule([x.op for x in outs]) + tvm.schedule.AutoInlineInjective(s) + # s[output].fuse(s[output].op.axis) + + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.tensor.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + # only put the int-related ops to vta + if "int" in output.dtype: + env = get_env() + for eo in ewise_ops: + eprint("add ewise_ops ", eo) + s[eo].set_scope(env.acc_scope) + s[eo].pragma(s[eo].op.axis[0], env.alu) + s[eo].compute_at(s[output], s[output].op.axis[-2]) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + eprint("add dma_copy", consumer, tensor, tensor.op) + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + for tensor in cache_read_ewise: + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + s[tensor].compute_at(s[output], s[output].op.axis[-2]) + + for op in const_ops: + s[op].compute_inline() + + s[output].pragma(s[output].op.axis[-1], env.dma_copy) + + return s + + @autotvm.register_topi_schedule("conv2d_packed.vta") def schedule_conv2d_packed(cfg, outs): """Schedule packed conv2d""" diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index b1d3ad424d6e..314eb46fcf56 
100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -608,29 +608,29 @@ class InsnQueue : public BaseQueue { CommitPendingPop(kComputeStage); } // Helper function: Get Opcode string - const char* getOpcodeString(int opcode, bool use_imm) { - // The string name - if (opcode == VTA_ALU_OPCODE_MIN) { - if (use_imm) { - return "min imm"; - } else { - return "min"; - } - } else if (opcode == VTA_ALU_OPCODE_MAX) { - if (use_imm) { - return "max imm"; - } else { - return "max"; - } - } else if (opcode == VTA_ALU_OPCODE_ADD) { - if (use_imm) { - return "add imm"; - } else { - return "add"; + std::string getOpcodeString(int opcode, bool use_imm, int64_t imm) { + // The string name + if (opcode == VTA_ALU_OPCODE_MIN) { + if (use_imm) { + return std::string("min imm ") + std::to_string(imm); + } else { + return "min"; + } + } else if (opcode == VTA_ALU_OPCODE_MAX) { + if (use_imm) { + return (std::string("max imm ") + std::to_string(imm)); + } else { + return "max"; + } + } else if (opcode == VTA_ALU_OPCODE_ADD) { + if (use_imm) { + return (std::string("add imm ") + std::to_string(imm)); + } else { + return "add"; + } + } else if (opcode == VTA_ALU_OPCODE_SHR) { + return (std::string("shr ") + std::to_string(imm)); } - } else if (opcode == VTA_ALU_OPCODE_SHR) { - return "shr"; - } return "unknown op"; } @@ -692,6 +692,7 @@ class InsnQueue : public BaseQueue { if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n"); if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n"); if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n"); + if (c.mem.memory_type == VTA_MEM_ID_ACC_8) printf("ACC 8\n"); } if (c.mem.opcode == VTA_OPCODE_STORE) { printf("STORE:\n"); @@ -724,7 +725,7 @@ class InsnQueue : public BaseQueue { static_cast(c.gemm.src_factor_in), static_cast(c.gemm.dst_factor_in)); } else if (c.mem.opcode == VTA_OPCODE_ALU) { // Print instruction field information - printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); + printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm, c.alu.imm).c_str()); printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); @@ -829,7 +830,7 @@ class InsnQueue : public BaseQueue { } // Get stage of the memory static PipelineStage GetMemPipelineStage(int memory_type) { - if (memory_type == VTA_MEM_ID_ACC) return kComputeStage; + if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8) return kComputeStage; if (memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -839,7 +840,7 @@ class InsnQueue : public BaseQueue { if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage; if (insn->opcode == VTA_OPCODE_LOAD) { if (insn->x_size == 0) return kNoneStage; - if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage; + if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8) return kComputeStage; if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -922,6 +923,9 @@ class CommandQueue { case VTA_MEM_ID_OUT: elem_bytes = VTA_OUT_ELEM_BYTES; break; + case VTA_MEM_ID_ACC_8: + elem_bytes = VTA_ACC_ELEM_BYTES / 4; + break; default: LOG(FATAL) << "Memory id not recognized:" << memory_id; break; From 082f64e561b7834055c6b32119e02e0ef0af1329 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 9 Mar 2020 11:19:39 +0800 Subject: [PATCH 03/44] adapt to the code base --- 
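Note on the schedule registration touched by the previous two commits: they route the quantized "add" through TVM's generic-function dispatch so a VTA-specific schedule is picked when compiling under the VTA target, while other targets keep an injective-style fallback. The snippet below is a minimal illustrative sketch of that dispatch pattern only, assuming a TVM build of this era (tvm.target.generic_func, te.create_schedule); the names schedule_add_sketch and _schedule_add_vta_sketch are hypothetical placeholders, not identifiers added by these patches.

    # Illustrative sketch, not part of the patch series.
    import tvm
    from tvm import te

    @tvm.target.generic_func
    def schedule_add_sketch(outs):
        # Fallback schedule used for any target without a specialized override.
        outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
        return te.create_schedule([x.op for x in outs])

    @schedule_add_sketch.register(["vta"])
    def _schedule_add_vta_sketch(outs):
        # Override selected when the enclosing `with target:` scope carries the
        # "vta" key, mirroring how the relay-level schedule_add wrapper calls
        # topi.generic.schedule_add inside `with target:`.
        outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
        return te.create_schedule([x.op for x in outs])
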
python/tvm/relay/op/_tensor.py | 2 +- python/tvm/relay/op/op.py | 14 +++ python/tvm/relay/op/strategy/generic.py | 7 ++ vta/python/vta/top/vta_conv2d.py | 144 ++++++++++++------------ 4 files changed, 94 insertions(+), 73 deletions(-) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index cca44429f7df..44d0a60227d6 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -52,7 +52,7 @@ register_broadcast_schedule("abs") register_broadcast_schedule("tanh") # NOTE(zhanghao): use customized add schedule -register_schedule("add", schedule_add) +register_add_schedule("add") # register_broadcast_schedule("add") register_broadcast_schedule("subtract") register_broadcast_schedule("multiply") diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index f2428f9db2ef..5056825d007c 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -240,6 +240,20 @@ def register_injective_schedule(op_name, level=10): return register_schedule(op_name, _schedule_injective, level) +def register_add_schedule(op_name, level=10): + """Register schedule function for add. + + Parameters + ---------- + op_name : str + The name of the op. + + level : int + The priority level + """ + return register_schedule(op_name, _schedule_add, level) + + def register_broadcast_schedule(op_name, level=10): """Register broadcast schedule function for an op. diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 4fa2b11d554d..3d24cdf73e9d 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -69,6 +69,12 @@ def schedule_injective(attrs, outs, target): with target: return topi.generic.schedule_injective(outs) +@generic_func +def schedule_add(attrs, outputs, target): + """Generic schedule for add.""" + with target: + return topi.generic.schedule_add(outputs) + @generic_func def schedule_reduce(attrs, outs, target): """Schedule reduction ops""" @@ -77,6 +83,7 @@ def schedule_reduce(attrs, outs, target): _op._schedule_injective = schedule_injective _op._schedule_reduce = schedule_reduce +_op._schedule_add = schedule_add # concatenate @generic_func diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 2f30aba45d10..44430b9123c7 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -68,78 +68,6 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty return res -# FIXME(zhanghao): move this code to a proper location -@topi.generic.schedule_add.register(["vta"]) -def _schedule_add(outs): - eprint("schedule_add vta") - assert len(outs) == 1 - - def is_cast_op(op): - # return op.same_as(Op.op.get("cast")) - # FIXME(zhanghao): find a better way to do compare - return op.name == 'T_cast' - - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - output = outs[0] - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) - # s[output].fuse(s[output].op.axis) - - ewise_inputs = [] - ewise_ops = [] - const_ops = [] - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - elif not is_cast_op(op): - ewise_ops.append(op) - - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.PlaceholderOp): - ewise_inputs.append((op, tensor)) - elif is_cast_op(tensor.op) and not op.same_as(output.op): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - 
else: - for tensor in op.input_tensors: - if (not isinstance(tensor.op, tvm.tensor.PlaceholderOp)) \ - and (not is_cast_op(tensor.op)): - _traverse(tensor.op) - - op = output.op - _traverse(op) - # only put the int-related ops to vta - if "int" in output.dtype: - env = get_env() - for eo in ewise_ops: - eprint("add ewise_ops ", eo) - s[eo].set_scope(env.acc_scope) - s[eo].pragma(s[eo].op.axis[0], env.alu) - s[eo].compute_at(s[output], s[output].op.axis[-2]) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - eprint("add dma_copy", consumer, tensor, tensor.op) - cache_read_ewise.append( - s.cache_read(tensor, env.acc_scope, [consumer])) - - for tensor in cache_read_ewise: - s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - s[tensor].compute_at(s[output], s[output].op.axis[-2]) - - for op in const_ops: - s[op].compute_inline() - - s[output].pragma(s[output].op.axis[-1], env.dma_copy) - - return s - - @autotvm.register_topi_schedule("conv2d_packed.vta") def schedule_conv2d_packed(cfg, outs): """Schedule packed conv2d""" @@ -261,3 +189,75 @@ def _traverse(op): s[output].pragma(x_co1, env.dma_copy) return s + + +# FIXME(zhanghao): move this code to a proper location +@topi.generic.schedule_add.register(["vta"]) +def _schedule_add(outs): + eprint("schedule_add vta") + assert len(outs) == 1 + + def is_cast_op(op): + # return op.same_as(Op.op.get("cast")) + # FIXME(zhanghao): find a better way to do compare + return op.name == 'T_cast' + + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + output = outs[0] + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) + # s[output].fuse(s[output].op.axis) + + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + # only put the int-related ops to vta + if "int" in output.dtype: + env = get_env() + for eo in ewise_ops: + eprint("add ewise_ops ", eo) + s[eo].set_scope(env.acc_scope) + s[eo].pragma(s[eo].op.axis[0], env.alu) + s[eo].compute_at(s[output], s[output].op.axis[-2]) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + eprint("add dma_copy", consumer, tensor, tensor.op) + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + for tensor in cache_read_ewise: + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + s[tensor].compute_at(s[output], s[output].op.axis[-2]) + + for op in const_ops: + s[op].compute_inline() + + s[output].pragma(s[output].op.axis[-1], env.dma_copy) + + return s From b6bc82a6ac598234b2a6c01a02fc68cf730b4b2c Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 18 Mar 2020 16:45:07 +0800 Subject: [PATCH 04/44] auto device_copy feature for vta --- include/tvm/relay/transform.h | 28 +++++ python/tvm/relay/quantize/_partition.py | 23 ++-- python/tvm/relay/transform/transform.py | 4 + src/relay/backend/build_module.cc | 8 ++ src/relay/transforms/device_annotation.cc | 114 
++++++++++++++++- src/tir/transforms/lower_tvm_builtin.cc | 21 ++-- vta.resnet18_v1.log-manual-formatv0_2 | 10 ++ vta/python/vta/top/graphpack.py | 116 +++++++++++++++++- vta/runtime/runtime.cc | 27 +++- .../frontend/deploy_classification.py | 23 +++- 10 files changed, 338 insertions(+), 36 deletions(-) create mode 100644 vta.resnet18_v1.log-manual-formatv0_2 diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index b287c053e8a9..61eb6dd50ce2 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -116,6 +116,16 @@ TVM_DLL Pass FuseOps(int fuse_opt_level = -1); */ TVM_DLL Pass RewriteAnnotatedOps(int fallback_device); +/*! + * \brief add device_copy if two adjacent nodes are on different devices + * + * \param expr The expression. + * + * \return The updated program. + */ +TVM_DLL Pass AddDeviceCopyOps(); + + /*! * \brief turn a dataflow graph into Administrative Normal Form, or A-Normal Form (ANF). * @@ -418,6 +428,24 @@ TVM_DLL Expr ForwardRewrite(const Expr& expr, const FForwardRewrite& rewrite_fun */ TVM_DLL Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device); +/*! + * \brief add device_copy if two adjacent nodes are on different devices + * + * \param expr The expression. + * + * \return The updated program. + */ +TVM_DLL Expr AddDeviceCopyOps(const Expr& expr); + +/*! + * \brief Fuse operations into expr into seperate functions. + * + * \param fuse_opt_level Optimization level. If it is -1 it will be inferred from pass context. + * + * \return The pass. + */ +TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level, const IRModule& module); + /*! * \brief Turn an expression into continuation passing style(CPS). * diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index bba5a6d842f9..6ff2a8be0b4a 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -145,14 +145,15 @@ def multiply_partition_function(ref_call, new_args, ctx): assert (not lhs_cond) and (not rhs_cond) return None -# @register_partition_function("nn.global_avg_pool2d") -# def global_avg_pool2d_partition_function(ref_call, new_args, ctx): -# cond, expr = partition_expr_check(new_args[0]) -# eprint("global_avg_pool2d partition") -# if cond: -# expr = stop_fusion(new_args[0].realize()) -# return _forward_op(ref_call, [expr]) -# else: -# expr = stop_fusion(QPartitionExpr(new_args[0]).realize()) -# return _forward_op(ref_call, [expr]) -# return None + +# add cast after the relu op to make it run on vta +@register_partition_function("nn.global_avg_pool2d") +def global_avg_pool2d_partition_function(ref_call, new_args, ctx): + cond, expr = partition_expr_check(new_args[0]) + if cond: + expr = new_args[0].realize() + return _forward_op(ref_call, [expr]) + else: + expr = QPartitionExpr(new_args[0]).realize() + return _forward_op(ref_call, [expr]) + return None diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index 8f4ec1046500..d1a93fd5f9b8 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -441,6 +441,10 @@ def RewriteAnnotatedOps(fallback_device): return _ffi_api.RewriteDeviceAnnotation(fallback_device) +def AddDeviceCopy(): + return _transform.AddDeviceCopy() + + def ToANormalForm(): """Turn Graph Normal Form expression into A Normal Form Expression. The scope of the root expression is the global scope. 
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index f9ce24d410b7..cbe4ae2d4256 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -419,6 +419,14 @@ class RelayBuildModule : public runtime::ModuleNode { // Get the updated function. auto func = Downcast(relay_module->Lookup("main")); + // do extra pass to check to insert device_copy if necessary + if (targets_.size() > 1) { + func = Downcast(relay::AddDeviceCopyOps(func)); + // we have to do fuseops again as we may add new device_copy ops + func = Downcast(relay::FuseOps(func, -1, relay_module)); + func = Downcast(relay::InferType(func, relay_module)); + } + // Generate code for the updated function. graph_codegen_ = std::unique_ptr(new GraphCodegen()); graph_codegen_->Init(nullptr, targets_); diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 39cf563f730a..5a87b06b0540 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -60,8 +60,10 @@ bool IsDeviceCopyNode(const ExprNode* node) { class ValidateAnnotation : private ExprVisitor { public: - static std::unordered_map Validate(const Expr& expr) { - ValidateAnnotation valid; + ValidateAnnotation(int fallback_device): fallback_device_(fallback_device) {} + + static std::unordered_map Validate(const Expr& expr, int fallback_device) { + ValidateAnnotation valid(fallback_device); valid(expr); return valid.annotation_map_; } @@ -80,12 +82,24 @@ class ValidateAnnotation : private ExprVisitor { CHECK_EQ(call_node->args.size(), 1U); const auto* node = call_node->args[0].operator->(); + // LOG(WARNING) << "annotated node, device_type = " << device_type << " : " << GetRef(node); if (annotation_map_.count(node)) { CHECK_EQ(annotation_map_.at(node), device_type) << "An expression node can only be annotated to one device."; } else { annotation_map_.insert({node, GetDeviceId(call_node)}); } + + // FIXME(zhanghao): find a better way + // here assume there are max two device types + if (device_type == fallback_device_ && extra_device_ && extra_device_ != fallback_device_) { + const auto* child = GetRef(node).as()->args[0].operator->(); + // here we mark as negative to indicate this is for copy from only + int ext_dev = -extra_device_; + annotation_map_.insert({child, ext_dev}); + } + + if (device_type != fallback_device_) extra_device_ = device_type; } } @@ -109,6 +123,8 @@ class ValidateAnnotation : private ExprVisitor { } std::unordered_map annotation_map_; + int fallback_device_ = 0; + int extra_device_ = 0; }; // Replace the use of an expression with the output of a `copy_device` operator @@ -122,7 +138,7 @@ class RewriteAnnotation : public ExprMutator { public: Expr Rewrite(const Expr& expr, int fallback_device) { fallback_device_ = fallback_device; - annotation_map_ = ValidateAnnotation::Validate(expr); + annotation_map_ = ValidateAnnotation::Validate(expr, fallback_device); return this->VisitExpr(expr); } @@ -229,6 +245,7 @@ class RewriteAnnotation : public ExprMutator { CHECK(dit != annotation_map_.end()) << "Device copy op is not required when both src and dst ops are not " "annotated."; + // LOG(WARNING) << "Create device copy " << fallback_device_ << " to " << dit->second << ": " << src; return CreateDeviceCopy(src, fallback_device_, dit->second); } else { const auto dit = annotation_map_.find(dst); @@ -244,10 +261,15 @@ class RewriteAnnotation : public ExprMutator { if (annotation_map_.count(dst)) { return src_dev_type 
!= annotation_map_.at(dst); } else { - return src_dev_type != fallback_device_; + // TODO(zhanghao): for now, we only make a device_copy when dst is "on_device" marked + // This allows us to do a start-end mark (mark two points) + // to mark all the middle ops with a device_type + return false; + // return src_dev_type != fallback_device_; } } else { - if (annotation_map_.count(dst)) { + // if annotation value < 0, it means this is for "copy from" only + if (annotation_map_.count(dst) && annotation_map_.at(dst) > 0) { // Though data copy op could be inserted whenever the `src` and `dst` // ops are annotated to different devices, it leads to high overhead. // @@ -494,6 +516,66 @@ class DeviceInfo { Map device_map_; }; + +class AddDeviceCopy : public ExprMutator { + public: + Expr Rewrite(const Expr& expr) { + device_map_ = DeviceInfo::GetDeviceMap(expr); + return this->Mutate(expr); + } + + private: + // add device copy if two nodes not on the same device + Expr VisitExpr_(const CallNode* call_node) override { + auto func_node = call_node->op.as(); + bool src_is_copy_node = false; + if (func_node && IsDeviceCopyNode(func_node->body.as())) { + // LOG(WARNING) << "DeviceCopy skip device_copy node"; + src_is_copy_node = true; + } + + tvm::Array call_args; + auto call_expr = GetRef(call_node); + CHECK(device_map_.count(call_expr)); + + for (auto& arg: call_node->args) { + CHECK(device_map_.count(arg)); + bool dst_is_copy_node = false; + if (auto arg_node = arg.as()) { + auto func_node = arg_node->op.as(); + if (func_node && IsDeviceCopyNode(func_node->body.as())) { + // LOG(WARNING) << "DeviceCopy skip dst device_copy node"; + dst_is_copy_node = true; + } + } + + int src_dev_type = device_map_.count(arg) ? device_map_[arg]->value : 1; + int dst_dev_type = device_map_.count(call_expr) ? 
device_map_[call_expr]->value : 1; + if (!src_is_copy_node && !dst_is_copy_node && src_dev_type != dst_dev_type) { + // LOG(WARNING) << "Not consistent device type, src = " << src_dev_type << ":" << arg; + // LOG(WARNING) << "Not consistent device type, dst = " << dst_dev_type << ":" << call_expr; + auto attrs = make_object(); + attrs->src_dev_type = src_dev_type; + attrs->dst_dev_type = dst_dev_type; + static const Op& op = Op::Get("device_copy"); + Call device_copy = CallNode::make(op, {this->Mutate(arg)}, Attrs(attrs), {}); + device_copy->checked_type_ = arg->checked_type_; + call_args.push_back(device_copy); + } else { + call_args.push_back(this->Mutate(arg)); + } + } + + auto ret = CallNode::make(call_node->op, call_args, call_node->attrs, call_node->type_args); + // manually add the checked_type_ + // alternatively, can call InferType Pass after this + ret->checked_type_ = call_node->checked_type_; + return ret; + } + + Map device_map_; +}; + Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { RewriteAnnotation rewrote = RewriteAnnotation(); Expr new_expr = rewrote.Rewrite(expr, fallback_device); @@ -541,7 +623,15 @@ Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { } } -Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } +Expr AddDeviceCopyOps(const Expr& expr) { + auto rewrote = AddDeviceCopy(); + Expr new_expr = rewrote.Rewrite(expr); + return new_expr; +} + +Map CollectDeviceInfo(const Expr& expr) { + return DeviceInfo::GetDeviceMap(expr); +} Map CollectDeviceAnnotationOps(const Expr& expr) { return AnnotatationVisitor::GetAnnotations(expr); @@ -564,6 +654,18 @@ Pass RewriteAnnotatedOps(int fallback_device) { TVM_REGISTER_GLOBAL("relay._transform.RewriteDeviceAnnotation").set_body_typed(RewriteAnnotatedOps); +Pass AddDeviceCopyOps() { + runtime::TypedPackedFunc pass_func = + [=](Function f, IRModule m, PassContext pc) { + return Downcast(AddDeviceCopyOps(f)); + }; + return CreateFunctionPass(pass_func, 1, "AddDeviceCopyOps", + {tir::StringImmNode::make("InferType")}); +} + +TVM_REGISTER_GLOBAL("relay._transform.AddDeviceCopy") +.set_body_typed(AddDeviceCopyOps); + } // namespace transform } // namespace relay diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 7611e0fcc8b3..386e9885807b 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -86,16 +86,17 @@ class BuiltinLower : public StmtExprMutator { op = stmt.as(); // Get constant allocation bound. 
int64_t nbytes = GetVectorBytes(op->dtype); - if (device_type_.defined()) { - if (const auto* dev_type = device_type_.as()) { - if (dev_type->value == kDLCPU) { - int32_t constant_size = op->constant_allocation_size(); - if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { - return stmt; - } - } - } - } + // FIXME(zhanghao): remove special handling for kDLCPU + // if (device_type_.defined()) { + // if (arith::GetConst(device_type_, &dev_type)) { + // if (dev_type == kDLCPU) { + // int32_t constant_size = op->constant_allocation_size(); + // if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { + // return stmt; + // } + // } + // } + // } PrimExpr total_bytes = make_const(op->extents[0].dtype(), nbytes); for (size_t i = 0; i < op->extents.size(); ++i) { total_bytes = total_bytes * op->extents[i]; diff --git a/vta.resnet18_v1.log-manual-formatv0_2 b/vta.resnet18_v1.log-manual-formatv0_2 new file mode 100644 index 000000000000..7b3c9d61a318 --- /dev/null +++ b/vta.resnet18_v1.log-manual-formatv0_2 @@ -0,0 +1,10 @@ +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [16, 8, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 131, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 8]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.0014505], 0, 1.328160047531128, 1578987870.726089], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 16, 7, 7, 1, 32], "int8"], ["TENSOR", [16, 16, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 163, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 8]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.002734464], 0, 1.7085223197937012, 1578988000.5012062], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [8, 4, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 302, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.0008805], 0, 1.2376818656921387, 1578988097.9650147], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [8, 8, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 143, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 14]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.001309522], 0, 1.3671045303344727, 1578988174.358436], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu 
-model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [4, 2, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 177, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.00079938], 0, 1.1500802040100098, 1578988361.3194962], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [4, 4, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 681, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 14]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 1]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 2]]}, "result": [[0.001198882], 0, 1.2445652484893799, 1578988503.2178001], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [2, 2, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 570, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 4]], ["tile_w", "sp", [-1, 56]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 2]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 2]]}, "result": [[0.001230756], 0, 1.4033727645874023, 1578988610.0491438], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [4, 2, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 176, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.000339938], 0, 1.025542974472046, 1578988875.3407557], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [8, 4, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 299, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.000387532], 0, 1.095754861831665, 1578988972.0000997], "version": 0.2, "tvm_version": "0.7.dev0"} +{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [16, 8, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 67, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 16]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.000294566], 0, 0.9454472064971924, 1578989137.6281488], 
"version": 0.2, "tvm_version": "0.7.dev0"} diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index f6b22ce67ce5..0aff565cdf2e 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -21,6 +21,7 @@ from tvm import relay from tvm.relay import op, transform from tvm.relay import ExprMutator +from tvm.contrib.util import eprint def run_opt_pass(expr, opt_pass): """Exectue a relay pass.""" @@ -174,6 +175,95 @@ def _operator_idx_inc(expr, count_meta, operator_current_idx): operator_current_idx = operator_current_idx + 1 return operator_current_idx + +class ExprDeviceAnnot(ExprMutator): + """Visitor to perform graph annotation on an AST. + + Parameters + ---------- + start: int + the start location to mark run on vta (inclusive) + end: int + the end location to mark run on vta (exclusive) + + Returns + --------- + None + """ + def __init__(self, start=-1, end=-1): + self.ext_ctx = tvm.context("ext_dev") + self.cpu_ctx = tvm.context("cpu") + self.counter = -1 + self.start = start + self.end = end + super().__init__() + + def visit_call(self, call): + """ Visit the children. """ + # First visit the children. + oshape = _get_tensor_shape(call) + odtype = _get_tensor_type(call) + input_types = [arg.checked_type for arg in call.args] + args = [self.visit(arg) for arg in call.args] + + self.counter += 1 + if self.counter == self.start: + ret = relay.Call(call.op, args, call.attrs) + ret = relay.annotation.on_device(ret, self.ext_ctx) + eprint("add on_device {}: {}".format("ext", ret)) + return ret + elif self.counter == self.end: + ret = relay.Call(call.op, args, call.attrs) + ret = relay.annotation.on_device(ret, self.cpu_ctx) + eprint("add on_device {}: {}".format("cpu", ret)) + return ret + +# if call.op == self.global_avg_pool2d: +# eprint("graphpack call = ", call) +# eprint("graphpack call annot relu, ", args[0]) +# ret = relay.Call(call.op, args, call.attrs) +# ret = relay.annotation.on_device(ret, self.cpu_ctx) +# return ret +# +# if call.op == self.conv2d and odtype == 'int32': +# if not self.first_conv2d: +# ret = relay.Call(call.op, args, call.attrs) +# ret = relay.annotation.on_device(ret, self.ext_ctx) +# eprint("graphpack call conv2d", type(ret.op), ret.op, type(ret), ret) +# self.first_conv2d = True +# return ret + + return relay.Call( + self.visit(call.op), + args, + call.attrs) + + +class ExprLocater(ExprMutator): + """Visitor to locate op on an AST. + """ + def __init__(self): + self.counter = -1 + self.op2nodes = {} + super().__init__() + + def visit_call(self, call): + """ Visit the children. """ + # First visit the children. + args = [self.visit(arg) for arg in call.args] + + self.counter += 1 + if call.op in self.op2nodes: + self.op2nodes[call.op].append(self.counter) + else: + self.op2nodes[call.op] = [self.counter] + + return relay.Call( + self.visit(call.op), + args, + call.attrs) + + class ExprPack(ExprMutator): """Visitor to perform graph packing on an AST. 
""" @@ -468,4 +558,28 @@ def graph_pack(expr, weight_bits) expr = packer.visit(expr) assert not packer.start_pack - return run_opt_pass(expr, transform.InferType()) + expr = run_opt_pass(expr, transform.InferType()) + + expr_locator = ExprLocater() + expr_locator.visit(expr) + + # from the second conv2d to the global_avg_pool2d, all will run on vta + conv2d = op.op.get("nn.conv2d") + avg_pool2d = op.op.get("nn.global_avg_pool2d") + start = expr_locator.op2nodes[conv2d][1] + # preceeding the nn.global_avg_pool2d, it will look like this + # + # %310 = annotation.stop_fusion(%309) /* ty=Tensor[(1, 16, 7, 7, 1, 32), int8] */; + # %311 = cast(%310, dtype="int32") /* ty=Tensor[(1, 16, 7, 7, 1, 32), int32] */; + # %312 = transpose(%311, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 16, 32, 7, 7), int32] */; + # %313 = reshape(%312, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */; + # %314 = nn.global_avg_pool2d(%313) /* ty=Tensor[(1, 512, 1, 1), int32] */; + # + # we mark the preceeding three ops also on cpu device + end = expr_locator.op2nodes[avg_pool2d][0] - 3 + + device_annot = ExprDeviceAnnot(start=start, end=end) + expr = device_annot.visit(expr) + ret = run_opt_pass(expr, transform.InferType()) + + return ret diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 314eb46fcf56..c5c8ba44a0a2 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -36,6 +36,7 @@ #include #include #include +#include namespace vta { @@ -101,6 +102,8 @@ struct DataBuffer { DataBuffer* buffer = new DataBuffer(); buffer->data_ = data; buffer->phy_addr_ = VTAMemGetPhyAddr(data); + + allocated_.insert(buffer); return buffer; } /*! @@ -108,6 +111,7 @@ struct DataBuffer { * \param buffer The buffer to be freed. */ static void Free(DataBuffer* buffer) { + allocated_.erase(buffer); VTAMemFree(buffer->data_); delete buffer; } @@ -117,7 +121,12 @@ struct DataBuffer { * \return The corresponding data buffer header. */ static DataBuffer* FromHandle(const void* buffer) { - return const_cast(reinterpret_cast(buffer)); + if (allocated_.count(buffer)) { + return const_cast( + reinterpret_cast(buffer)); + } else { + return nullptr; + } } private: @@ -125,8 +134,13 @@ struct DataBuffer { void* data_; /*! \brief The physical address of the buffer, excluding header. */ vta_phy_addr_t phy_addr_; + + static std::set allocated_; }; +// init static member +std::set DataBuffer::allocated_; + /*! * \brief Micro op kernel. * Contains functions to construct the kernel with prefix Push. 
@@ -1207,10 +1221,12 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (kind_mask & 2) { from_buffer = vta::DataBuffer::FromHandle(from); from = from_buffer->virt_addr(); + // LOG(WARNING) << "BufferCopy from " << from << ", from_offset " << from_offset << ", size = " << size; } if (kind_mask & 1) { to_buffer = vta::DataBuffer::FromHandle(to); to = to_buffer->virt_addr(); + // LOG(WARNING) << "BufferCopy to " << to << ", to_offset " << to_offset << ", size = " << size; } if (from_buffer) { @@ -1234,8 +1250,15 @@ void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) { static_cast(cmd)->SetDebugFlag(debug_flag); } +// TODO(zhanghao): now we do the check here +// it would be better to do the check in ir_pass before adding the "VTABufferCPUPtr" void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) { - return vta::DataBuffer::FromHandle(buffer)->virt_addr(); + auto data_buf = vta::DataBuffer::FromHandle(buffer); + if (data_buf) { + return data_buf->virt_addr(); + } else { // it is a raw ptr allocated by CPU + return buffer; + } } void VTAWriteBarrier(VTACommandHandle cmd, void* buffer, uint32_t elem_bits, uint32_t start, diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 3a367851ed25..56abe6f70b76 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -59,6 +59,7 @@ import vta from vta.testing import simulator from vta.top import graph_pack +from tvm.contrib.util import eprint # Make sure that TVM was compiled with RPC=1 assert tvm.runtime.enabled("rpc") @@ -75,6 +76,11 @@ # or ``device=vta`` to run inference on the FPGA. device = "vta" target = env.target if device == "vta" else env.target_vta_cpu +# multiple targets to run both on cpu and vta +targets = { + "cpu": env.target_vta_cpu, + "ext_dev": env.target +} # Dictionary lookup for when to start/end bit packing pack_dict = { @@ -130,7 +136,8 @@ remote = rpc.LocalSession() # Get execution context from remote -ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +ctxes = [remote.ext_dev(0), remote.cpu(0)] ###################################################################### # Build the inference graph runtime @@ -149,7 +156,8 @@ # # Load pre-configured AutoTVM schedules -with autotvm.tophub.context(target): +log_file = "%s.%s.log-manual-formatv0_2" % (device, model) +with autotvm.tophub.context(target, extra_files=[log_file]): # Populate the shape and data type dictionary for ImageNet classifier input dtype_dict = {"data": 'float32'} @@ -163,6 +171,7 @@ # Start front end compilation mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + eprint("from_mxnet mod = ", mod) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -175,6 +184,7 @@ with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): mod = relay.quantize.quantize(mod, params=params) + eprint("done quantize", mod) # Perform graph packing and constant folding for VTA target assert env.BLOCK_IN == env.BLOCK_OUT relay_prog = graph_pack( @@ -184,6 +194,7 @@ env.WGT_WIDTH, start_name=pack_dict[model][0], stop_name=pack_dict[model][1]) + eprint("done graphpack ", relay_prog) else: relay_prog = mod["main"] @@ -196,7 +207,7 @@ else: with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( - relay_prog, target=target, + relay_prog, 
target=targets, params=params, target_host=env.target_host) # Measure Relay build time @@ -210,7 +221,7 @@ lib = remote.load_module("graphlib.o") # Graph runtime - m = graph_runtime.create(graph, lib, ctx) + m = graph_runtime.create(graph, lib, ctxes) ###################################################################### # Perform image classification inference @@ -245,10 +256,10 @@ m.set_input('data', image) # Perform inference and gather execution statistics -# More on: :py:method:`tvm.runtime.Module.time_evaluator` +# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator num = 4 # number of times we run module for a single measurement rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) +timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() From ef153e253bacb2443fd092694b759f5f5aa1cfbd Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Tue, 24 Mar 2020 14:19:10 +0800 Subject: [PATCH 05/44] bugfix for AddDeviceCopy pass; add Mul for vta simulation --- include/tvm/relay/attrs/device_copy.h | 1 + python/tvm/relay/op/_tensor.py | 5 ++++- python/tvm/relay/quantize/_partition.py | 4 +++- src/relay/transforms/device_annotation.cc | 10 ++++++---- vta/python/vta/environment.py | 2 +- vta/runtime/runtime.cc | 2 ++ 6 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/tvm/relay/attrs/device_copy.h b/include/tvm/relay/attrs/device_copy.h index 7da92b3ff763..c4a60c827048 100644 --- a/include/tvm/relay/attrs/device_copy.h +++ b/include/tvm/relay/attrs/device_copy.h @@ -37,6 +37,7 @@ namespace relay { struct DeviceCopyAttrs : public tvm::AttrsNode { int dst_dev_type; int src_dev_type; + bool used_for_propagate = true; TVM_DECLARE_ATTRS(DeviceCopyAttrs, "relay.attrs.DeviceCopyAttrs") { TVM_ATTR_FIELD(src_dev_type) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 44d0a60227d6..4f409ff4538f 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -82,7 +82,10 @@ register_broadcast_schedule("isinf") register_injective_schedule("maximum") register_injective_schedule("minimum") -register_injective_schedule("right_shift") +# NOTE(zhanghao): use customized add schedule +# TODO(zhanghao): change the schedule name +register_add_schedule("right_shift") +# register_injective_schedule("right_shift") register_injective_schedule("left_shift") register_injective_schedule("shape_of") register_injective_schedule("ndarray_size") diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index 6ff2a8be0b4a..315986d55607 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -21,6 +21,7 @@ from .. import analysis as _analysis from . import _quantize from .quantize import _forward_op +from tvm.contrib.util import eprint def register_partition_function(op_name, frewrite=None, level=10): return tvm.ir.register_op_attr(op_name, "FQPartitionRewrite", frewrite, level) @@ -81,7 +82,7 @@ def add_partition_generic(ref_call, new_args, ctx): # ... lhs = new_args[0].realize() rhs = new_args[1].realize() - return _forward_op(ref_call, [lhs, rhs]) + return QPartitionExpr(_forward_op(ref_call, [lhs, rhs])) if not lhs_cond and rhs_cond: # - introduced by residual connection in ResNet # ... 
@@ -141,6 +142,7 @@ def multiply_partition_function(ref_call, new_args, ctx): rhs_cond, rhs = partition_expr_check(new_args[1]) if lhs_cond: # introduced by bn: multiply(out, scale) + lhs = new_args[0].realize() return QPartitionExpr(_forward_op(ref_call, [lhs, rhs])) assert (not lhs_cond) and (not rhs_cond) return None diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 5a87b06b0540..2d53751665da 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -82,7 +82,7 @@ class ValidateAnnotation : private ExprVisitor { CHECK_EQ(call_node->args.size(), 1U); const auto* node = call_node->args[0].operator->(); - // LOG(WARNING) << "annotated node, device_type = " << device_type << " : " << GetRef(node); + // LOG(WARNING) << "annotated node, device_type = " << device_type << " : " << GetRef(node).as()->op; if (annotation_map_.count(node)) { CHECK_EQ(annotation_map_.at(node), device_type) << "An expression node can only be annotated to one device."; @@ -245,7 +245,7 @@ class RewriteAnnotation : public ExprMutator { CHECK(dit != annotation_map_.end()) << "Device copy op is not required when both src and dst ops are not " "annotated."; - // LOG(WARNING) << "Create device copy " << fallback_device_ << " to " << dit->second << ": " << src; + // LOG(WARNING) << "Create device copy " << fallback_device_ << " to " << dit->second << ": " << src.as()->op; return CreateDeviceCopy(src, fallback_device_, dit->second); } else { const auto dit = annotation_map_.find(dst); @@ -552,11 +552,13 @@ class AddDeviceCopy : public ExprMutator { int src_dev_type = device_map_.count(arg) ? device_map_[arg]->value : 1; int dst_dev_type = device_map_.count(call_expr) ? device_map_[call_expr]->value : 1; if (!src_is_copy_node && !dst_is_copy_node && src_dev_type != dst_dev_type) { - // LOG(WARNING) << "Not consistent device type, src = " << src_dev_type << ":" << arg; - // LOG(WARNING) << "Not consistent device type, dst = " << dst_dev_type << ":" << call_expr; + // auto arg_call = arg.as(); + // LOG(WARNING) << "Not consistent device type, src = " << src_dev_type << ":" << (arg_call ? 
arg_call->op : arg); + // LOG(WARNING) << "Not consistent device type, dst = " << dst_dev_type << ":" << call_node->op; auto attrs = make_object(); attrs->src_dev_type = src_dev_type; attrs->dst_dev_type = dst_dev_type; + attrs->used_for_propagate = false; static const Op& op = Op::Get("device_copy"); Call device_copy = CallNode::make(op, {this->Mutate(arg)}, Attrs(attrs), {}); device_copy->checked_type_ = arg->checked_type_; diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 9f82d65f1d4e..3aa63cbb3415 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -68,7 +68,7 @@ class DevContext(object): ALU_OPCODE_MAX = 1 ALU_OPCODE_ADD = 2 ALU_OPCODE_SHR = 3 - # ALU_OPCODE_CAST = 4 + ALU_OPCODE_MUL = 4 # Task queue id (pipeline stage) QID_LOAD_INP = 1 QID_LOAD_WGT = 1 diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index c5c8ba44a0a2..4ebf7bdab450 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -644,6 +644,8 @@ class InsnQueue : public BaseQueue { } } else if (opcode == VTA_ALU_OPCODE_SHR) { return (std::string("shr ") + std::to_string(imm)); + } else if (opcode == VTA_ALU_OPCODE_MUL) { + return "mul"; } return "unknown op"; From 87461d10973b0643f29a87f3717b85689ada2052 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 26 Mar 2020 16:21:35 +0800 Subject: [PATCH 06/44] intelfocl support in samples --- vta/python/vta/testing/util.py | 2 +- vta/python/vta/top/vta_conv2d.py | 19 ++++++++++++++++--- .../integration/test_benchmark_topi_conv2d.py | 6 +++--- .../frontend/deploy_classification.py | 9 +++++++-- vta/tutorials/vta_get_started.py | 2 +- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index afbf00ddac8c..83da2e157164 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -32,7 +32,7 @@ def run(run_func): """ env = get_env() - if env.TARGET in ["sim", "tsim"]: + if env.TARGET in ["sim", "tsim", "intelfocl"]: # Talk to local RPC if necessary to debug RPC server. # Compile vta on your host with make at the root. # Make sure TARGET is set to "sim" in the config.json file. 
diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 44430b9123c7..c87a89ecfe80 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -235,6 +235,19 @@ def _traverse(op): op = output.op _traverse(op) + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + + x_co_max = topi.util.get_const_int(x_bo.dom.extent) + x_i_max = topi.util.get_const_int(x_i.dom.extent) + x_j_max = topi.util.get_const_int(x_j.dom.extent) + + # TODO(zhanghao): auto-tune + x_co0, x_co1 = s[output].split(x_co, factor=1) + x_i0, x_i1 = s[output].split(x_i, factor=min(28, x_i_max)) + x_j0, x_j1 = s[output].split(x_j, factor=min(14, x_j_max)) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + # only put the int-related ops to vta if "int" in output.dtype: env = get_env() @@ -242,7 +255,7 @@ def _traverse(op): eprint("add ewise_ops ", eo) s[eo].set_scope(env.acc_scope) s[eo].pragma(s[eo].op.axis[0], env.alu) - s[eo].compute_at(s[output], s[output].op.axis[-2]) + s[eo].compute_at(s[output], store_pt) # cache read input cache_read_ewise = [] @@ -253,11 +266,11 @@ def _traverse(op): for tensor in cache_read_ewise: s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - s[tensor].compute_at(s[output], s[output].op.axis[-2]) + s[tensor].compute_at(s[output], store_pt) for op in const_ops: s[op].compute_inline() - s[output].pragma(s[output].op.axis[-1], env.dma_copy) + s[output].pragma(x_co1, env.dma_copy) return s diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index b3c36e85d56b..ea6b9cf1e9da 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -240,18 +240,18 @@ def test_conv2d(device): def _run(env, remote): if device == "vta": target = env.target - if env.TARGET not in ["sim", "tsim"]: + if env.TARGET not in ["sim", "tsim", "intelfocl"]: assert tvm.runtime.enabled("rpc") program_fpga(remote, bitstream=None) reconfig_runtime(remote) elif device == "arm_cpu": target = env.target_vta_cpu - with autotvm.tophub.context(target): # load pre-tuned schedule parameters + with autotvm.tophub.context(target, extra_files = ['vta.resnet18_v1.log-manual-formatv0_2']): # load pre-tuned schedule parameters for _, wl in resnet_wkls: print(wl) run_conv2d(env, remote, wl, target) vta.testing.run(_run) if __name__ == "__main__": - test_conv2d(device="arm_cpu") + # test_conv2d(device="arm_cpu") test_conv2d(device="vta") diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 56abe6f70b76..30fe7f2b0b06 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -105,7 +105,7 @@ # When target is 'pynq', reconfigure FPGA and runtime. # Otherwise, if target is 'sim', execute locally. -if env.TARGET not in ["sim", "tsim"]: +if env.TARGET not in ["sim", "tsim", "intelfocl"]: # Get remote from tracker node if environment variable is set. # To set up the tracker, you'll need to follow the "Auto-tuning @@ -127,7 +127,12 @@ # by passing the path to the bitstream file instead of None. 
reconfig_start = time.time() vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) + # vta.program_fpga(remote, bitstream=None) + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.program_fpga(remote, bitstream) + reconfig_time = time.time() - reconfig_start print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) diff --git a/vta/tutorials/vta_get_started.py b/vta/tutorials/vta_get_started.py index ab416874b71b..8ac7307f5a05 100644 --- a/vta/tutorials/vta_get_started.py +++ b/vta/tutorials/vta_get_started.py @@ -91,7 +91,7 @@ vta.program_fpga(remote, bitstream=None) # In simulation mode, host the RPC server locally. -elif env.TARGET == "sim": +elif env.TARGET in ("sim", "tsim", "intelfocl"): remote = rpc.LocalSession() ###################################################################### From bd79e8398ee33ae4eac6837d7362040519a8f873 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Fri, 3 Apr 2020 12:04:24 +0800 Subject: [PATCH 07/44] sync all insts and uops in one batch --- src/relay/backend/graph_plan_memory.cc | 57 ++++++------ vta/python/vta/environment.py | 2 +- vta/runtime/runtime.cc | 118 +++++++++++++++++++++++-- vta/runtime/runtime.h | 2 +- 4 files changed, 140 insertions(+), 39 deletions(-) diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 820e17f8a498..8ebf9847c3a7 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -309,34 +309,35 @@ class StorageAllocator : public StorageAllocaBaseVisitor { if (match_range_ == 0) { return this->Alloc(prototype, size); } - auto begin = free_.lower_bound(size / match_range_); - auto mid = free_.lower_bound(size); - auto end = free_.upper_bound(size * match_range_); - // search for memory blocks larger than requested - for (auto it = mid; it != end; ++it) { - StorageToken* tok = it->second; - if (tok->device_type != prototype->device_type) continue; - CHECK_EQ(tok->ref_counter, 0); - // Use exect matching strategy - tok->max_bytes = std::max(size, tok->max_bytes); - tok->ref_counter = prototype->ref_counter; - // find a exact match, erase from map and return - free_.erase(it); - return tok; - } - // then search for memory blocks smaller than requested space - for (auto it = mid; it != begin;) { - --it; - StorageToken* tok = it->second; - if (tok->device_type != prototype->device_type) continue; - CHECK_EQ(tok->ref_counter, 0); - // Use exect matching strategy - tok->max_bytes = std::max(size, tok->max_bytes); - tok->ref_counter = prototype->ref_counter; - // erase from map and return - free_.erase(it); - return tok; - } + // TODO(zhanghao): to avoid overwrite shared storage when we copy all the instructions in a single batch + // auto begin = free_.lower_bound(size / match_range_); + // auto mid = free_.lower_bound(size); + // auto end = free_.upper_bound(size * match_range_); + // // search for memory blocks larger than requested + // for (auto it = mid; it != end; ++it) { + // StorageToken *tok = it->second; + // if (tok->device_type != prototype->device_type) continue; + // CHECK_EQ(tok->ref_counter, 0); + // // Use exect matching strategy + // tok->max_bytes = std::max(size, tok->max_bytes); + // tok->ref_counter = prototype->ref_counter; + // // find a exact match, erase from map and return + // free_.erase(it); + // return tok; + // } + // // then search for memory blocks smaller than requested space + // for (auto it = mid; it != begin;) { + 
// --it; + // StorageToken *tok = it->second; + // if (tok->device_type != prototype->device_type) continue; + // CHECK_EQ(tok->ref_counter, 0); + // // Use exect matching strategy + // tok->max_bytes = std::max(size, tok->max_bytes); + // tok->ref_counter = prototype->ref_counter; + // // erase from map and return + // free_.erase(it); + // return tok; + // } // cannot find anything return a new one. return this->Alloc(prototype, size); } diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 3aa63cbb3415..548dc03aae78 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -297,7 +297,7 @@ def coproc_sync(op): return tvm.tir.call_extern( "int32", "VTASynchronize", get_env().dev.command_handle, - tvm.runtime.const(1<<31, dtype="uint32")) + tvm.runtime.const(1<<31, dtype="uint32"), True) @tvm.register_func("tvm.intrin.rule.default.vta.coproc_dep_push") diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 4ebf7bdab450..3e42727c5bfc 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -37,6 +37,8 @@ #include #include #include +#include +#include namespace vta { @@ -48,6 +50,72 @@ static const bool kBufferCoherent = VTA_COHERENT_ACCESSES; /*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */ static const bool kAlwaysCache = true; +template +class AlignmentAllocator { +public: + typedef T value_type; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + typedef T * pointer; + typedef const T * const_pointer; + + typedef T & reference; + typedef const T & const_reference; + + public: + inline AlignmentAllocator () throw () { } + + template + inline AlignmentAllocator (const AlignmentAllocator &) throw () { } + + inline ~AlignmentAllocator () throw () { } + + inline pointer adress (reference r) { + return &r; + } + + inline const_pointer adress (const_reference r) const { + return &r; + } + + inline pointer allocate (size_type n) { + return (pointer)memalign(N, n*sizeof(value_type)); + } + + inline void deallocate (pointer p, size_type) { + free(p); + } + + inline void construct (pointer p, const value_type & wert) { + new (p) value_type (wert); + } + + inline void destroy (pointer p) { + p->~value_type (); + } + + inline size_type max_size () const throw () { + return size_type (-1) / sizeof (value_type); + } + + template + struct rebind { + typedef AlignmentAllocator other; + }; + + bool operator!=(const AlignmentAllocator& other) const { + return !(*this == other); + } + + // Returns true if and only if storage allocated from *this + // can be deallocated from other, and vice versa. + // Always returns true for stateless allocators. + bool operator==(const AlignmentAllocator& other) const { + return true; + } +}; + /*! * \brief Data buffer represents data on CMA. */ @@ -84,6 +152,7 @@ struct DataBuffer { */ void MemCopyFromHost(void* dst, const void* src, size_t size) { VTAMemCopyFromHost(dst, src, size); + } /*! * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. 
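For context, the following is a usage sketch of the AlignmentAllocator defined above, assuming its two template parameters are the element type and the alignment in bytes (the angle brackets of the template header are stripped in this diff). The next hunk plugs it into the queues' dram_buffer_ so that the DRAM-side staging buffer handed to VTAMemCopyFromHost is 64-byte aligned (illustrative only, not part of the patch):

// Sketch only; assumes AlignmentAllocator<T, N> from above is in scope.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<uint64_t, AlignmentAllocator<uint64_t, 64> > buf;
  buf.resize(1024);  // backing storage comes from memalign(64, ...) in allocate()
  assert(reinterpret_cast<uintptr_t>(buf.data()) % 64 == 0);
  return 0;
}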
@@ -343,7 +412,7 @@ class BaseQueue { // End location of current SRAM write in FIFO mode uint32_t sram_end_{0}; // The buffer in DRAM - std::vector dram_buffer_; + std::vector> dram_buffer_; // FPGA accessible buffer void* fpga_buff_{NULL}; // Physical address of the FPGA buffer @@ -443,13 +512,33 @@ class UopQueue : public BaseQueue { } CHECK(buff_size <= kMaxBytes); // Move kernel contents to FPGA readable buffer + // uint32_t offset = 0; + // for (uint32_t i = 0; i < cache_.size(); ++i) { + // uint32_t ksize = cache_[i]->size() * kElemBytes; + // VTAMemCopyFromHost(static_cast(fpga_buff_) + offset, + // cache_[i]->data(), + // ksize); + // // Update offset + // offset += ksize; + // } + + // merge all the cache entries and do CopyFromHost once + uint32_t total_size = 0; + for (uint32_t i = 0; i < cache_.size(); ++i) { + uint32_t ksize = cache_[i]->size() * kElemBytes; + total_size += ksize; + } + + char *lbuf = (char*)memalign(64, total_size); uint32_t offset = 0; for (uint32_t i = 0; i < cache_.size(); ++i) { uint32_t ksize = cache_[i]->size() * kElemBytes; - VTAMemCopyFromHost(static_cast(fpga_buff_) + offset, cache_[i]->data(), ksize); - // Update offset + memcpy(lbuf + offset, cache_[i]->data(), ksize); offset += ksize; } + VTAMemCopyFromHost(static_cast(fpga_buff_), lbuf, total_size); + free(lbuf); + // Flush if we're using a shared memory system // and if interface is non-coherent if (!coherent_ && always_cache_) { @@ -904,6 +993,8 @@ class InsnQueue : public BaseQueue { int pending_pop_next_[4]; static constexpr int kElemBytes = sizeof(VTAGenericInsn); static constexpr int kMaxElems = kMaxBytes / kElemBytes; + + friend class CommandQueue; }; /*! @@ -1011,7 +1102,16 @@ class CommandQueue { } } - void Synchronize(uint32_t wait_cycles) { + void Synchronize(uint32_t wait_cycles, bool skip=true) { + // FIXME(zhanghao): It is required to use force_serial + // by using skip and sync at the final layer, we can avoid do DeviceCopy every time + if (skip) { + if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { + LOG(ERROR) << "Synchronizing all in one round requires to use force_serial to make things right"; + } + return; + } + // Insert dependences to force serialization if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) { insn_queue_.RewriteForceSerial(); @@ -1223,16 +1323,16 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (kind_mask & 2) { from_buffer = vta::DataBuffer::FromHandle(from); from = from_buffer->virt_addr(); - // LOG(WARNING) << "BufferCopy from " << from << ", from_offset " << from_offset << ", size = " << size; } if (kind_mask & 1) { to_buffer = vta::DataBuffer::FromHandle(to); to = to_buffer->virt_addr(); - // LOG(WARNING) << "BufferCopy to " << to << ", to_offset " << to_offset << ", size = " << size; } if (from_buffer) { // This is an FPGA to host mem transfer + // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly + VTASynchronize(VTATLSCommandHandle(), 1<<31, false); from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, static_cast(from) + from_offset, size); @@ -1323,6 +1423,6 @@ int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) { return 0; } -void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) { - static_cast(cmd)->Synchronize(wait_cycles); -} +void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip) { + static_cast(cmd)-> + Synchronize(wait_cycles, skip); } diff --git a/vta/runtime/runtime.h 
b/vta/runtime/runtime.h index 24ebb8e1247b..360970118144 100644 --- a/vta/runtime/runtime.h +++ b/vta/runtime/runtime.h @@ -251,7 +251,7 @@ TVM_DLL int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid); * \param wait_cycles The limit of poll cycles. * */ -TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles); +TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip=true); #ifdef __cplusplus } From f8eaef9d40436c97005fbf2e43a0a9778d422a35 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 8 Apr 2020 16:11:47 +0800 Subject: [PATCH 08/44] support for static auto-tune --- python/tvm/autotvm/measure/measure_methods.py | 110 +++++- vta/python/vta/top/graphpack.py | 51 +-- vta/runtime/runtime.cc | 369 +++++++++++++----- vta/runtime/runtime.h | 1 + vta/tutorials/autotvm/tune_relay_vta.py | 31 +- 5 files changed, 430 insertions(+), 132 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 7f915132fdc8..43ee291bfdd9 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -32,6 +32,7 @@ import tempfile import numpy as np +import json import tvm._ffi import tvm.ir.transform @@ -47,6 +48,8 @@ from .measure import MeasureResult, MeasureErrorNo, Builder, Runner from .local_executor import LocalExecutor +from tvm.contrib.util import eprint + logger = logging.getLogger('autotvm') class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))): @@ -186,6 +189,16 @@ def __init__(self, timeout=10, n_parallel=None, number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1, check_correctness=False): + static_tune = os.getenv("TVM_STATIC_TUNE") + if static_tune: + if n_parallel is None or n_parallel > 1: + print("static tune only allows n_parallel == 1") + n_parallel = 1 + + if check_correctness == True: + print("static tune does not support check_correctness") + check_correctness = False + super(RPCRunner, self).__init__(timeout, n_parallel) self.key = key @@ -369,7 +382,15 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti measure_input.target.device_name == 'vta': # pylint: disable=import-outside-toplevel import vta - func = vta.build(s, args, target_host=task.target_host) + + static_tune = os.getenv("TVM_STATIC_TUNE") + if static_tune: + debug_flag = 1 << 6 + else: + debug_flag = 0 + + with vta.build_config(debug_flag=debug_flag): + func = vta.build(s, args, target_host=task.target_host) else: with tvm.ir.transform.PassContext(config=opts): func = build(s, args, target_host=task.target_host) @@ -419,6 +440,63 @@ def _wrapped(measure_input, tmp_dir, **kwargs): return _wrapped +def cal_cost(insn): + """ + Cal the runtime cost statically + + Parameters + ------------ + insn: the insn (json) + + Returns + ------------ + the cost in s + """ + def alu_imm_cost(outer, inner, uops): + return 0.00001 + + def alu_cost(outer, inner, uops): + return 0.00001 + + def gemm_cost(outer, inner, uops): + return 0.00001 + + def load_inp_cost(y, x): + return 0.00001 + + def load_uop_cost(y, x): + return 0.00001 + + def load_wgt_cost(y, x): + return 0.00001 + + def store_cost(y, x): + return 0.00001 + + if insn['type'] == "ALU": + return alu_cost(insn['outer_loop'], insn['inner_loop'], + insn['range'][1] - insn['range'][0]) + elif insn['type'] == "ALU IMM": + return alu_imm_cost(insn['outer_loop'], insn['inner_loop'], + insn['range'][1] - insn['range'][0]) + elif insn['type'] == "GEMM": + 
return gemm_cost(insn['outer_loop'], insn['inner_loop'], + insn['range'][1] - insn['range'][0]) + elif insn['name'] == "LOAD INP": + return load_inp_cost(insn['y'][0], insn['x'][0]) + elif insn['name'] == "LOAD WGT": + return load_wgt_cost(insn['y'][0], insn['x'][0]) + elif insn['name'] == "LOAD UOP": + return load_uop_cost(insn['y'][0], insn['x'][0]) + elif insn['type'] == "STORE": + return store_cost(insn['y'][0], insn['x'][0]) + elif insn['type'] == "NOP": + return 0 + else: + print("Unknown op type: {}".format(insn['type'])) + return 0 + + def run_through_rpc(measure_input, build_result, number, repeat, min_repeat_ms, cooldown_interval, remote_args, ref_input=None, ref_output=None): @@ -460,6 +538,7 @@ def run_through_rpc(measure_input, build_result, tic = time.time() errno = MeasureErrorNo.NO_ERROR + static_tune = os.getenv("TVM_STATIC_TUNE") try: # upload built module remote = request_remote(*remote_args) @@ -474,8 +553,6 @@ def run_through_rpc(measure_input, build_result, remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) - time_f = func.time_evaluator( - func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) # set input if ref_input: @@ -487,12 +564,25 @@ def run_through_rpc(measure_input, build_result, args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() - costs = time_f(*args).results + if static_tune is None: + time_f = func.time_evaluator( + func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) + costs = time_f(*args).results + + # clean up remote files + remote.remove(build_result.filename) + remote.remove(os.path.splitext(build_result.filename)[0] + '.so') + remote.remove('') + else: + func(*args) + cost = 0 + insn_dump = os.getenv('TVM_INSN_DUMP', "insn.dump") + with open(insn_dump) as json_file: + insns = json.load(json_file) + for insn in insns: + cost += cal_cost(insn) - # clean up remote files - remote.remove(build_result.filename) - remote.remove(os.path.splitext(build_result.filename)[0] + '.so') - remote.remove('') + costs = [cost] * repeat if len(costs) > 2: # remove largest and smallest value to reduce variance costs = list(costs) @@ -540,6 +630,10 @@ def request_remote(device_key, host=None, port=None, priority=1, timeout=60): ------ session: RPCSession """ + static_tune = os.getenv("TVM_STATIC_TUNE") + if static_tune: + return _rpc.LocalSession() + # connect to the tracker host = host or os.environ['TVM_TRACKER_HOST'] port = port or int(os.environ['TVM_TRACKER_PORT']) diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 0aff565cdf2e..cdfd3c4281e2 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -507,7 +507,7 @@ def graph_pack(expr, stop_name="nn.global_avg_pool2d", start_name_idx=None, stop_name_idx=None, - count_meta=False): + count_meta=False, device_annot=True): """Pack the graph into batch&channel packed format. 
Parameters @@ -560,26 +560,29 @@ def graph_pack(expr, assert not packer.start_pack expr = run_opt_pass(expr, transform.InferType()) - expr_locator = ExprLocater() - expr_locator.visit(expr) - - # from the second conv2d to the global_avg_pool2d, all will run on vta - conv2d = op.op.get("nn.conv2d") - avg_pool2d = op.op.get("nn.global_avg_pool2d") - start = expr_locator.op2nodes[conv2d][1] - # preceeding the nn.global_avg_pool2d, it will look like this - # - # %310 = annotation.stop_fusion(%309) /* ty=Tensor[(1, 16, 7, 7, 1, 32), int8] */; - # %311 = cast(%310, dtype="int32") /* ty=Tensor[(1, 16, 7, 7, 1, 32), int32] */; - # %312 = transpose(%311, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 16, 32, 7, 7), int32] */; - # %313 = reshape(%312, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */; - # %314 = nn.global_avg_pool2d(%313) /* ty=Tensor[(1, 512, 1, 1), int32] */; - # - # we mark the preceeding three ops also on cpu device - end = expr_locator.op2nodes[avg_pool2d][0] - 3 - - device_annot = ExprDeviceAnnot(start=start, end=end) - expr = device_annot.visit(expr) - ret = run_opt_pass(expr, transform.InferType()) - - return ret + if device_annot: + expr_locator = ExprLocater() + expr_locator.visit(expr) + + # from the second conv2d to the global_avg_pool2d, all will run on vta + conv2d = op.op.get("nn.conv2d") + avg_pool2d = op.op.get("nn.global_avg_pool2d") + start = expr_locator.op2nodes[conv2d][1] + # preceeding the nn.global_avg_pool2d, it will look like this + # + # %310 = annotation.stop_fusion(%309) /* ty=Tensor[(1, 16, 7, 7, 1, 32), int8] */; + # %311 = cast(%310, dtype="int32") /* ty=Tensor[(1, 16, 7, 7, 1, 32), int32] */; + # %312 = transpose(%311, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 16, 32, 7, 7), int32] */; + # %313 = reshape(%312, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */; + # %314 = nn.global_avg_pool2d(%313) /* ty=Tensor[(1, 512, 1, 1), int32] */; + # + # we mark the preceeding three ops also on cpu device + end = expr_locator.op2nodes[avg_pool2d][0] - 3 + + device_annot = ExprDeviceAnnot(start=start, end=end) + expr = device_annot.visit(expr) + ret = run_opt_pass(expr, transform.InferType()) + + return ret + else: + return expr diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 3e42727c5bfc..911aac301ae6 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -40,6 +40,10 @@ #include #include +#include +#include +#include + namespace vta { // Avoid bad configurations. @@ -151,8 +155,12 @@ struct DataBuffer { * Bytes. */ void MemCopyFromHost(void* dst, const void* src, size_t size) { + // struct timespec start, stop; + // clock_gettime(CLOCK_REALTIME, &start); VTAMemCopyFromHost(dst, src, size); - + // clock_gettime(CLOCK_REALTIME, &stop); + // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; + // LOG(WARNING) << "DataBuffer VTAMemCopyFromHost: " << elapsed << " us"; } /*! * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. 
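For context, the hunks below add GetOpName/GetOpcodeName helpers and a JSON mode to DumpInsn, which writes one record per instruction to the file named by the TVM_INSN_DUMP environment variable. A hedged sketch of folding such a dump into the static cost estimate used for auto-tuning above; the direct import of cal_cost is hypothetical, since the patch defines it inline in measure_methods.py (illustrative only, not part of the patch):

# Sketch only.
import json
import os

from vta_cost import cal_cost  # hypothetical import; cal_cost is defined in the measure_methods.py change above

with open(os.getenv("TVM_INSN_DUMP", "insn.dump")) as json_file:
    insns = json.load(json_file)

total_cost = sum(cal_cost(insn) for insn in insns)
print("statically estimated runtime: {:.6f} s".format(total_cost))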
@@ -739,8 +747,79 @@ class InsnQueue : public BaseQueue { return "unknown op"; } + + std::string GetOpName(const union VTAInsn& c) { + switch (c.mem.opcode) { + case VTA_OPCODE_LOAD: + if (c.mem.x_size == 0) { + if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { + return "NOP-COMPUTE-STAGE"; + } else { + return "NOP-MEMORY-STAGE"; + } + } else { + if (c.mem.memory_type == VTA_MEM_ID_UOP) { + return "LOAD UOP"; + } else if (c.mem.memory_type == VTA_MEM_ID_WGT) { + return "LOAD WGT"; + } else if (c.mem.memory_type == VTA_MEM_ID_INP) { + return "LOAD INP"; + } else if (c.mem.memory_type == VTA_MEM_ID_ACC) { + return "LOAD ACC"; + } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8) { + return "LOAD ACC 8"; + } else { + return "LOAD"; + } + } + case VTA_OPCODE_STORE: + if (c.mem.x_size == 0) { + return "NOP-STORE-STAGE"; + } else { + return "STORE"; + } + case VTA_OPCODE_GEMM: + return "GEMM"; + case VTA_OPCODE_ALU: + return "ALU - " + getOpcodeString(c.alu.alu_opcode, c.alu.use_imm, c.alu.imm); + case VTA_OPCODE_FINISH: + return "FINISH"; + default: + return "Not recogonized"; + } + } + + std::string GetOpcodeName(const union VTAInsn& c) { + switch (c.mem.opcode) { + case VTA_OPCODE_LOAD: + if (c.mem.x_size == 0) { + return "NOP"; + } else { + return "LOAD"; + } + case VTA_OPCODE_STORE: + if (c.mem.x_size == 0) { + return "NOP"; + } else { + return "STORE"; + } + case VTA_OPCODE_GEMM: + return "GEMM"; + case VTA_OPCODE_ALU: + if (c.alu.use_imm) { + return "ALU IMM"; + } else { + return "ALU"; + } + case VTA_OPCODE_FINISH: + return "NOP"; + default: + return "Unknown"; + } + } + // Dump instructions in the queue - void DumpInsn() { + void DumpInsn(FILE* out = stderr, bool json=false) { // Keep tabs on dependence queues int l2g_queue = 0; int g2l_queue = 0; @@ -751,98 +830,158 @@ class InsnQueue : public BaseQueue { // Iterate over all instructions int insn_count = count(); const VTAGenericInsn* insn = data(); - printf("There are %u instructions\n", insn_count); + rapidjson::StringBuffer s; + rapidjson::Writer writer(s); + + if (!json) { + fprintf(out, "There are %u instructions\n", insn_count); + } else { + writer.StartArray(); + } + for (int i = 0; i < insn_count; ++i) { // Fetch instruction and decode opcode c.generic = insn[i]; - printf("INSTRUCTION %u: ", i); + if (json) { + writer.StartObject(); + writer.Key("name"); + writer.String(GetOpName(c).c_str()); + + writer.Key("type"); + writer.String(GetOpcodeName(c).c_str()); + + writer.Key("pop_prev"); + writer.Int(c.mem.pop_prev_dep); + writer.Key("pop_next"); + writer.Int(c.mem.pop_next_dep); + writer.Key("push_prev"); + writer.Int(c.mem.push_prev_dep); + writer.Key("push_next"); + writer.Int(c.mem.push_next_dep); + } else { + fprintf(out, "INSTRUCTION %u: ", i); + fprintf(out, "%s\n", GetOpName(c).c_str()); + + fprintf(out, "\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + } + if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { - if (c.mem.x_size == 0) { - if (c.mem.opcode == VTA_OPCODE_STORE) { - printf("NOP-STORE-STAGE\n"); - } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { - printf("NOP-COMPUTE-STAGE\n"); - } else { - printf("NOP-MEMORY-STAGE\n"); - } - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - 
static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - // Count status in queues - if (c.mem.opcode == VTA_OPCODE_STORE) { - CHECK(c.mem.pop_next_dep == false); - CHECK(c.mem.push_next_dep == false); - if (c.mem.pop_prev_dep) g2s_queue--; - if (c.mem.push_prev_dep) s2g_queue++; - } else if (c.mem.opcode == VTA_OPCODE_LOAD && - (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) { - CHECK(c.mem.pop_prev_dep == false); - CHECK(c.mem.push_prev_dep == false); - if (c.mem.pop_next_dep) g2l_queue--; - if (c.mem.push_next_dep) l2g_queue++; - } else { - if (c.mem.pop_prev_dep) l2g_queue--; - if (c.mem.push_prev_dep) g2l_queue++; - if (c.mem.pop_next_dep) s2g_queue--; - if (c.mem.push_next_dep) g2s_queue++; - } - printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); - printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); - continue; - } - // Print instruction field information - if (c.mem.opcode == VTA_OPCODE_LOAD) { - printf("LOAD "); - if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n"); - if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n"); - if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n"); - if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n"); - if (c.mem.memory_type == VTA_MEM_ID_ACC_8) printf("ACC 8\n"); - } - if (c.mem.opcode == VTA_OPCODE_STORE) { - printf("STORE:\n"); + if (json) { + writer.Key("dram"); + writer.Uint64(c.mem.dram_base); + writer.Key("sram"); + writer.Uint64(c.mem.sram_base); + + writer.Key("y"); + writer.StartArray(); + writer.Uint64(c.mem.y_size); + writer.Uint64(c.mem.y_pad_0); + writer.Uint64(c.mem.y_pad_1); + writer.EndArray(); + + writer.Key("x"); + writer.StartArray(); + writer.Uint64(c.mem.x_size); + writer.Uint64(c.mem.x_pad_0); + writer.Uint64(c.mem.x_pad_1); + writer.Uint64(c.mem.x_stride); + writer.EndArray(); + } else { + fprintf(out, "\tDRAM: 0x%08x, SRAM:0x%04x\n", + static_cast(c.mem.dram_base), + static_cast(c.mem.sram_base)); + fprintf(out, "\ty: size=%d, pad=[%d, %d]\n", + static_cast(c.mem.y_size), + static_cast(c.mem.y_pad_0), + static_cast(c.mem.y_pad_1)); + fprintf(out, "\tx: size=%d, stride=%d, pad=[%d, %d]\n", + static_cast(c.mem.x_size), + static_cast(c.mem.x_stride), + static_cast(c.mem.x_pad_0), + static_cast(c.mem.x_pad_1)); } - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", static_cast(c.mem.dram_base), - static_cast(c.mem.sram_base)); - printf("\ty: size=%d, pad=[%d, %d]\n", static_cast(c.mem.y_size), - static_cast(c.mem.y_pad_0), static_cast(c.mem.y_pad_1)); - printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", static_cast(c.mem.x_size), - static_cast(c.mem.x_stride), static_cast(c.mem.x_pad_0), - static_cast(c.mem.x_pad_1)); } else if (c.mem.opcode == VTA_OPCODE_GEMM) { - // Print instruction field information - printf("GEMM\n"); - - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\treset_out: %d\n", static_cast(c.gemm.reset_reg)); - printf("\trange (%d, %d)\n", static_cast(c.gemm.uop_bgn), - static_cast(c.gemm.uop_end)); - printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", - static_cast(c.gemm.iter_out), static_cast(c.gemm.wgt_factor_out), - 
static_cast(c.gemm.src_factor_out), static_cast(c.gemm.dst_factor_out)); - printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", - static_cast(c.gemm.iter_in), static_cast(c.gemm.wgt_factor_in), - static_cast(c.gemm.src_factor_in), static_cast(c.gemm.dst_factor_in)); + if (json) { + writer.Key("reset_out"); + writer.Int(c.gemm.reset_reg); + writer.Key("range"); + writer.StartArray(); + writer.Int(c.gemm.uop_bgn); + writer.Int(c.gemm.uop_end); + writer.EndArray(); + + writer.Key("outer_loop"); + writer.StartArray(); + writer.Int(c.gemm.iter_out); + writer.Int(c.gemm.wgt_factor_out), + writer.Int(c.gemm.src_factor_out), + writer.Int(c.gemm.dst_factor_out); + writer.EndArray(); + + writer.Key("inner_loop"); + writer.StartArray(); + writer.Int(c.gemm.iter_in); + writer.Int(c.gemm.wgt_factor_in), + writer.Int(c.gemm.src_factor_in), + writer.Int(c.gemm.dst_factor_in); + writer.EndArray(); + } else { + fprintf(out, "\treset_out: %d\n", static_cast(c.gemm.reset_reg)); + fprintf(out, "\trange (%d, %d)\n", + static_cast(c.gemm.uop_bgn), + static_cast(c.gemm.uop_end)); + fprintf(out, "\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", + static_cast(c.gemm.iter_out), + static_cast(c.gemm.wgt_factor_out), + static_cast(c.gemm.src_factor_out), + static_cast(c.gemm.dst_factor_out)); + fprintf(out, "\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", + static_cast(c.gemm.iter_in), + static_cast(c.gemm.wgt_factor_in), + static_cast(c.gemm.src_factor_in), + static_cast(c.gemm.dst_factor_in)); + } } else if (c.mem.opcode == VTA_OPCODE_ALU) { - // Print instruction field information - printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm, c.alu.imm).c_str()); - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\treset_out: %d\n", static_cast(c.alu.reset_reg)); - printf("\trange (%d, %d)\n", static_cast(c.alu.uop_bgn), - static_cast(c.alu.uop_end)); - printf("\touter loop - iter: %d, dst: %d, src: %d\n", static_cast(c.alu.iter_out), - static_cast(c.alu.dst_factor_out), static_cast(c.alu.src_factor_out)); - printf("\tinner loop - iter: %d, dst: %d, src: %d\n", static_cast(c.alu.iter_in), - static_cast(c.alu.dst_factor_in), static_cast(c.alu.src_factor_in)); - } else if (c.mem.opcode == VTA_OPCODE_FINISH) { - printf("FINISH\n"); + if (json) { + writer.Key("reset_out"); + writer.Int(c.alu.reset_reg); + writer.Key("range"); + writer.StartArray(); + writer.Int(c.alu.uop_bgn); + writer.Int(c.alu.uop_end); + writer.EndArray(); + + writer.Key("outer_loop"); + writer.StartArray(); + writer.Int(c.alu.iter_out); + writer.Int(c.alu.dst_factor_out), + writer.Int(c.alu.src_factor_out), + writer.EndArray(); + + writer.Key("inner_loop"); + writer.StartArray(); + writer.Int(c.alu.iter_in); + writer.Int(c.alu.dst_factor_in); + writer.Int(c.alu.src_factor_in), + writer.EndArray(); + } else { + fprintf(out, "\treset_out: %d\n", static_cast(c.alu.reset_reg)); + fprintf(out, "\trange (%d, %d)\n", + static_cast(c.alu.uop_bgn), + static_cast(c.alu.uop_end)); + fprintf(out, "\touter loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_out), + static_cast(c.alu.dst_factor_out), + static_cast(c.alu.src_factor_out)); + fprintf(out, "\tinner loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_in), + static_cast(c.alu.dst_factor_in), + static_cast(c.alu.src_factor_in)); + } } // Count status in queues @@ -871,8 
+1010,27 @@ class InsnQueue : public BaseQueue { if (c.gemm.pop_next_dep) s2g_queue--; if (c.gemm.push_next_dep) g2s_queue++; } - printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); - printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); + if (json) { + writer.Key("l2g_queue"); + writer.Int(l2g_queue); + writer.Key("g2l_queue"); + writer.Int(g2l_queue); + writer.Key("s2g_queue"); + writer.Int(s2g_queue); + writer.Key("g2s_queue"); + writer.Int(g2s_queue); + + writer.EndObject(); + } else { + fprintf(out, "\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); + fprintf(out, "\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); + } + } + + if (json) { + writer.EndArray(); + auto str = s.GetString(); + fwrite(str, 1, s.GetSize(), out); } } // Commit all pending pop of corresponding stage @@ -1103,11 +1261,27 @@ class CommandQueue { } void Synchronize(uint32_t wait_cycles, bool skip=true) { + if (debug_flag_ & VTA_DEBUG_AUTO_TUNE) { + const char* insn_file = std::getenv("TVM_INSN_DUMP"); + if (insn_file == nullptr) { + insn_file = "insn.dump"; + } + FILE* out = fopen(insn_file, "w+"); + if (out) { + insn_queue_.DumpInsn(out, true); + fclose(out); + } else { + LOG(ERROR) << insn_file << " open failed"; + } + return; + } + // FIXME(zhanghao): It is required to use force_serial // by using skip and sync at the final layer, we can avoid do DeviceCopy every time if (skip) { if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { - LOG(ERROR) << "Synchronizing all in one round requires to use force_serial to make things right"; + LOG(ERROR) << + "Synchronizing all in one round requires to use force_serial to make things right"; } return; } @@ -1130,8 +1304,18 @@ class CommandQueue { // Check if there are no instruction to execute at all if (insn_queue_.count() == 0) return; // Synchronization for the queues + // struct timespec start, stop; + // clock_gettime(CLOCK_REALTIME, &start); uop_queue_.AutoReadBarrier(); + // clock_gettime(CLOCK_REALTIME, &stop); + // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; + // LOG(WARNING) << "UopQueue VTAMemCopyFromHost: " << elapsed << " us"; + + // clock_gettime(CLOCK_REALTIME, &start); insn_queue_.AutoReadBarrier(); + // clock_gettime(CLOCK_REALTIME, &stop); + // elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; + // LOG(WARNING) << "InsnQueue VTAMemCopyFromHost: " << elapsed << " us"; // Dump instructions if debug enabled if (debug_flag_ & VTA_DEBUG_DUMP_INSN) { insn_queue_.DumpInsn(); @@ -1332,7 +1516,12 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (from_buffer) { // This is an FPGA to host mem transfer // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly + // struct timespec start, stop; + // clock_gettime(CLOCK_REALTIME, &start); VTASynchronize(VTATLSCommandHandle(), 1<<31, false); + // clock_gettime(CLOCK_REALTIME, &stop); + // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; + // LOG(WARNING) << "Final Synchronize: " << elapsed << " us"; from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, static_cast(from) + from_offset, size); diff --git a/vta/runtime/runtime.h b/vta/runtime/runtime.h index 360970118144..22cf15a91503 100644 --- a/vta/runtime/runtime.h +++ b/vta/runtime/runtime.h @@ -41,6 +41,7 @@ extern "C" { #define 
VTA_DEBUG_SKIP_READ_BARRIER (1 << 3) #define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4) #define VTA_DEBUG_FORCE_SERIAL (1 << 5) +#define VTA_DEBUG_AUTO_TUNE (1 << 6) /*! * \brief Allocate data buffer. diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index a92b1ee5d90b..bc819c20d470 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -76,7 +76,7 @@ # Perform vta-specific compilation with Relay from a Gluon model -def compile_network(env, target, model, start_pack, stop_pack): +def compile_network(env, target, model, start_pack, stop_pack, device_annot=False): # Populate the shape and data type dictionary dtype_dict = {"data": 'float32'} @@ -104,7 +104,8 @@ def compile_network(env, target, model, start_pack, stop_pack): env.BLOCK_OUT, env.WGT_WIDTH, start_name=start_pack, - stop_name=stop_pack) + stop_name=stop_pack, + device_annot=device_annot) return relay_prog, params @@ -341,8 +342,11 @@ def tune_and_evaluate(tuning_opt): tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream=None) + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() @@ -382,12 +386,14 @@ def tune_and_evaluate(tuning_opt): # We do not run the tuning in our webpage server since it takes too long. # Comment the following line to run it by yourself. - return + # return # run tuning tasks print("Tuning...") tune_tasks(tasks, **tuning_opt) + # recompile the programs with device annotations + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack, device_annot=True) # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[log_file]): # Compile network @@ -395,14 +401,18 @@ def tune_and_evaluate(tuning_opt): if target.device_name != "vta": with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build(relay_prog, - target=target, - params=params, - target_host=env.target_host) + target=target, + params=params, + target_host=env.target_host) else: + targets = { + "cpu": env.target_vta_cpu, + "ext_dev": env.target + } with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, - target=target, + target=targets, params=params, target_host=env.target_host) @@ -415,7 +425,8 @@ def tune_and_evaluate(tuning_opt): # Generate the graph runtime ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - m = graph_runtime.create(graph, lib, ctx) + ctxes = [ctx, remote.cpu(0)] + m = graph_runtime.create(graph, lib, ctxes) # upload parameters to device image = tvm.nd.array( From 82cbd4f41b21443fd7bfab04dd96b6d13c55ef98 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 9 Apr 2020 11:07:41 +0800 Subject: [PATCH 09/44] update cost calculation formula --- python/tvm/autotvm/measure/measure_methods.py | 110 +++++++++++++----- 1 file changed, 78 insertions(+), 32 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 43ee291bfdd9..c545fb7aa23c 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -452,46 +452,92 @@ def cal_cost(insn): ------------ the cost in s """ - def 
alu_imm_cost(outer, inner, uops): - return 0.00001 - - def alu_cost(outer, inner, uops): - return 0.00001 - - def gemm_cost(outer, inner, uops): - return 0.00001 - - def load_inp_cost(y, x): - return 0.00001 - - def load_uop_cost(y, x): - return 0.00001 - - def load_wgt_cost(y, x): - return 0.00001 - - def store_cost(y, x): - return 0.00001 + factor = 1000000.0 + def alu_imm_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = x + 46 + return cycles / factor + + def alu_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = 2 * x + 46 + return cycles / factor + + def gemm_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = x + 80 + return cycles / factor + + def load_acc_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = x + 150 + return cycles / factor + + def load_acc8_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 1.2 * x + 150 + return cycles / factor + + def load_inp_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = (x_size + x_pad_0 + x_pad_1) * (y_size + y_pad_0 + y_pad_1) + cycles = 1.1 * x + 150 + return cycles / factor + + def load_uop_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 1.1 * x + 150 + return cycles / factor + + def load_wgt_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 17 * x + 150 + return cycles / factor + + def store_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = x + 150 + return cycles / factor + + def nop_cost(name): + if name == "NOP-COMPUTE-STAGE": + return 38 / factor + elif name == "NOP-MEMORY-STAGE": + return 50 / factor + elif name == "NOP-STORE-STAGE": + return 39 / factor + else: + print("Unknown nop op {}".format(name)) + return 0 if insn['type'] == "ALU": - return alu_cost(insn['outer_loop'], insn['inner_loop'], - insn['range'][1] - insn['range'][0]) + return alu_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) elif insn['type'] == "ALU IMM": - return alu_imm_cost(insn['outer_loop'], insn['inner_loop'], - insn['range'][1] - insn['range'][0]) + return alu_imm_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) elif insn['type'] == "GEMM": - return gemm_cost(insn['outer_loop'], insn['inner_loop'], - insn['range'][1] - insn['range'][0]) + return gemm_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) elif insn['name'] == "LOAD INP": - return load_inp_cost(insn['y'][0], insn['x'][0]) + return load_inp_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) elif insn['name'] == "LOAD WGT": - return load_wgt_cost(insn['y'][0], insn['x'][0]) + return load_wgt_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) elif insn['name'] == "LOAD UOP": - return load_uop_cost(insn['y'][0], insn['x'][0]) - elif insn['type'] == "STORE": - return store_cost(insn['y'][0], insn['x'][0]) + return load_uop_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD ACC": + return load_acc_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD ACC 8": + return load_acc8_cost(insn['y'][0], insn['y'][1], 
insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "STORE": + return store_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) elif insn['type'] == "NOP": - return 0 + return nop_cost(insn['name']) else: print("Unknown op type: {}".format(insn['type'])) return 0 From a810b853d4d17ef974cfede11c47596cf7101d08 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 9 Apr 2020 16:02:38 +0800 Subject: [PATCH 10/44] bugfix for vta add schedule --- vta/python/vta/top/vta_conv2d.py | 80 ++++++++++++++++---------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index c87a89ecfe80..5c856384c605 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -208,48 +208,48 @@ def is_cast_op(op): te.schedule.AutoInlineInjective(s) # s[output].fuse(s[output].op.axis) - ewise_inputs = [] - ewise_ops = [] - const_ops = [] - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - elif not is_cast_op(op): - ewise_ops.append(op) - - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - elif is_cast_op(tensor.op) and not op.same_as(output.op): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - for tensor in op.input_tensors: - if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ - and (not is_cast_op(tensor.op)): - _traverse(tensor.op) - - op = output.op - _traverse(op) - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - - x_co_max = topi.util.get_const_int(x_bo.dom.extent) - x_i_max = topi.util.get_const_int(x_i.dom.extent) - x_j_max = topi.util.get_const_int(x_j.dom.extent) - - # TODO(zhanghao): auto-tune - x_co0, x_co1 = s[output].split(x_co, factor=1) - x_i0, x_i1 = s[output].split(x_i, factor=min(28, x_i_max)) - x_j0, x_j1 = s[output].split(x_j, factor=min(14, x_j_max)) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - # only put the int-related ops to vta if "int" in output.dtype: + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + + x_co_max = topi.util.get_const_int(x_bo.dom.extent) + x_i_max = topi.util.get_const_int(x_i.dom.extent) + x_j_max = topi.util.get_const_int(x_j.dom.extent) + + # TODO(zhanghao): auto-tune + x_co0, x_co1 = s[output].split(x_co, factor=1) + x_i0, x_i1 = s[output].split(x_i, factor=min(28, x_i_max)) + x_j0, x_j1 = s[output].split(x_j, factor=min(14, x_j_max)) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + env = get_env() for eo in ewise_ops: eprint("add ewise_ops ", eo) From 3a8e244f345a707940fd0b827bf7f0c8482ca34b Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Fri, 
10 Apr 2020 12:16:33 +0800 Subject: [PATCH 11/44] bugfix for insn buffer overflow --- vta/runtime/runtime.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 911aac301ae6..39038da00b51 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -1474,12 +1474,12 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: // one pending store, one pending load, and one uop - if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { + if ((insn_queue_.count() + 5) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { this->AutoSync(); } } // Auto sync when instruction overflow - void AutoSync() { this->Synchronize(1 << 31); } + void AutoSync() { this->Synchronize(1 << 31, false); } // Internal debug flag int debug_flag_{0}; From 5c7ead70ee6c47881edeb8e176380c9740f4616b Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 13 Apr 2020 11:13:27 +0800 Subject: [PATCH 12/44] tune vta relay refine --- vta/tutorials/autotvm/tune_relay_vta.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index bc819c20d470..7e537fae9128 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -195,7 +195,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals # The ``start_pack`` and ``stop_pack`` labels indicate where # to start and end the graph packing relay pass: in other words # where to start and finish offloading to VTA. -network = "resnet18_v1" +network = "resnet50_v2" start_pack = "nn.max_pool2d" stop_pack = "nn.global_avg_pool2d" @@ -368,7 +368,7 @@ def tune_and_evaluate(tuning_opt): tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) # We should have extracted 10 convolution tasks - assert len(tasks) == 10 + # assert len(tasks) == 10 print("Extracted {} conv2d tasks:".format(len(tasks))) for tsk in tasks: inp = tsk.args[0][1] @@ -392,10 +392,11 @@ def tune_and_evaluate(tuning_opt): print("Tuning...") tune_tasks(tasks, **tuning_opt) - # recompile the programs with device annotations - relay_prog, params = compile_network(env, target, network, start_pack, stop_pack, device_annot=True) # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[log_file]): + # recompile the programs with device annotations + print("Recompile") + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack, device_annot=True) # Compile network print("Compile...") if target.device_name != "vta": @@ -409,7 +410,7 @@ def tune_and_evaluate(tuning_opt): "cpu": env.target_vta_cpu, "ext_dev": env.target } - with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + with vta.build_config(opt_level=3, debug_flag=32, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, target=targets, From cc96cbbbc4bdf5bddb5d157e949d5c9904cfb0d1 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 13 Apr 2020 13:24:45 +0800 Subject: [PATCH 13/44] separate cost function from general method_methods --- python/tvm/autotvm/measure/measure_methods.py | 115 ++---------------- vta/config/vta_cost.py | 102 ++++++++++++++++ 2 files changed, 114 insertions(+), 103 deletions(-) create mode 100644 vta/config/vta_cost.py diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index c545fb7aa23c..f32725c2e9cc 
100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -33,6 +33,8 @@ import numpy as np import json +import sys +from importlib import import_module import tvm._ffi import tvm.ir.transform @@ -440,109 +442,6 @@ def _wrapped(measure_input, tmp_dir, **kwargs): return _wrapped -def cal_cost(insn): - """ - Cal the runtime cost statically - - Parameters - ------------ - insn: the insn (json) - - Returns - ------------ - the cost in s - """ - factor = 1000000.0 - def alu_imm_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = x + 46 - return cycles / factor - - def alu_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = 2 * x + 46 - return cycles / factor - - def gemm_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = x + 80 - return cycles / factor - - def load_acc_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = x + 150 - return cycles / factor - - def load_acc8_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 1.2 * x + 150 - return cycles / factor - - def load_inp_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = (x_size + x_pad_0 + x_pad_1) * (y_size + y_pad_0 + y_pad_1) - cycles = 1.1 * x + 150 - return cycles / factor - - def load_uop_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 1.1 * x + 150 - return cycles / factor - - def load_wgt_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 17 * x + 150 - return cycles / factor - - def store_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = x + 150 - return cycles / factor - - def nop_cost(name): - if name == "NOP-COMPUTE-STAGE": - return 38 / factor - elif name == "NOP-MEMORY-STAGE": - return 50 / factor - elif name == "NOP-STORE-STAGE": - return 39 / factor - else: - print("Unknown nop op {}".format(name)) - return 0 - - if insn['type'] == "ALU": - return alu_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['type'] == "ALU IMM": - return alu_imm_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['type'] == "GEMM": - return gemm_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['name'] == "LOAD INP": - return load_inp_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD WGT": - return load_wgt_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD UOP": - return load_uop_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD ACC": - return load_acc_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD ACC 8": - return load_acc8_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "STORE": - return store_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['type'] == "NOP": - return nop_cost(insn['name']) - else: - print("Unknown op type: {}".format(insn['type'])) - return 0 - - def 
run_through_rpc(measure_input, build_result, number, repeat, min_repeat_ms, cooldown_interval, remote_args, ref_input=None, ref_output=None): @@ -623,6 +522,12 @@ def run_through_rpc(measure_input, build_result, func(*args) cost = 0 insn_dump = os.getenv('TVM_INSN_DUMP', "insn.dump") + insn_cost_file = os.getenv('TVM_INSN_COST', "cost.py") + path, filename = os.path.split(insn_cost_file) + sys.path.append(path) + module_path = filename[:-3] # remove the .py suffix + module = import_module(module_path) + cal_cost = getattr(module, "cal_cost") with open(insn_dump) as json_file: insns = json.load(json_file) for insn in insns: @@ -649,6 +554,10 @@ def run_through_rpc(measure_input, build_result, msg = msg[:msg.index("CUDA Source")] costs = (RuntimeError(msg[:1024]),) errno = MeasureErrorNo.RUNTIME_DEVICE + except Exception as exc: + costs = (exc,) + errno = MeasureErrorNo.UNKNOWN_ERROR + tstamp = time.time() time.sleep(cooldown_interval) return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp) diff --git a/vta/config/vta_cost.py b/vta/config/vta_cost.py new file mode 100644 index 000000000000..9e1d7389b8c3 --- /dev/null +++ b/vta/config/vta_cost.py @@ -0,0 +1,102 @@ +# cost function for intelfocl 32*32 gemm version +def cal_cost(insn): + """ + Cal the runtime cost statically + + Parameters + ------------ + insn: the insn (json) + + Returns + ------------ + the cost in s + """ + factor = 1000000.0 + def alu_imm_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = x + 46 + return cycles / factor + + def alu_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = 2 * x + 46 + return cycles / factor + + def gemm_cost(iter_out, iter_in, uop_bgn, uop_end): + x = (uop_end - uop_bgn) * iter_out * iter_in + cycles = x + 80 + return cycles / factor + + def load_acc_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = x + 150 + return cycles / factor + + def load_acc8_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 1.2 * x + 150 + return cycles / factor + + def load_inp_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = (x_size + x_pad_0 + x_pad_1) * (y_size + y_pad_0 + y_pad_1) + cycles = 1.1 * x + 150 + return cycles / factor + + def load_uop_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 1.1 * x + 150 + return cycles / factor + + def load_wgt_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = 17 * x + 150 + return cycles / factor + + def store_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): + x = x_size * y_size + cycles = x + 150 + return cycles / factor + + def nop_cost(name): + if name == "NOP-COMPUTE-STAGE": + return 38 / factor + elif name == "NOP-MEMORY-STAGE": + return 50 / factor + elif name == "NOP-STORE-STAGE": + return 39 / factor + else: + print("Unknown nop op {}".format(name)) + return 0 + + if insn['type'] == "ALU": + return alu_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) + elif insn['type'] == "ALU IMM": + return alu_imm_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) + elif insn['type'] == "GEMM": + return gemm_cost(insn['outer_loop'][0], insn['inner_loop'][0], + insn['range'][0], insn['range'][1]) + elif insn['name'] == "LOAD INP": + return load_inp_cost(insn['y'][0], insn['y'][1], insn['y'][2], + 
insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD WGT": + return load_wgt_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD UOP": + return load_uop_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD ACC": + return load_acc_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "LOAD ACC 8": + return load_acc8_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['name'] == "STORE": + return store_cost(insn['y'][0], insn['y'][1], insn['y'][2], + insn['x'][0], insn['x'][1], insn['x'][2]) + elif insn['type'] == "NOP": + return nop_cost(insn['name']) + else: + print("Unknown op type: {}".format(insn['type'])) + return 0 From d880b3ba6e66f7ea9e5535591571e309747f9247 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 16 Apr 2020 14:35:14 +0800 Subject: [PATCH 14/44] vta mobilenetG prediction script --- python/tvm/relay/testing/mobilenet.py | 50 +++-- vta/tutorials/frontend/deploy_mobilenet.py | 226 +++++++++++++++++++++ 2 files changed, 258 insertions(+), 18 deletions(-) create mode 100644 vta/tutorials/frontend/deploy_mobilenet.py diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py index d5a4d5f1e08f..e83336525ea7 100644 --- a/python/tvm/relay/testing/mobilenet.py +++ b/python/tvm/relay/testing/mobilenet.py @@ -44,20 +44,22 @@ def conv_block(data, name, channels, kernel_size=(3, 3), strides=(1, 1), def separable_conv_block(data, name, depthwise_channels, pointwise_channels, kernel_size=(3, 3), downsample=False, padding=(1, 1), - epsilon=1e-5, layout='NCHW', dtype="float32"): + epsilon=1e-5, layout='NCHW', dtype="float32", depthwise_group_factor=1): """Helper function to get a separable conv block""" if downsample: strides = (2, 2) else: strides = (1, 1) # depthwise convolution + bn + relu - wshape = (depthwise_channels, 1) + kernel_size + wshape = (depthwise_channels, depthwise_group_factor) + kernel_size weight = relay.var(name + "_weight", shape=wshape, dtype=dtype) + depthwise_group_factor = min(depthwise_group_factor, depthwise_channels) + groups = int(depthwise_channels/depthwise_group_factor) conv1 = layers.conv2d( data=data, weight=weight, channels=depthwise_channels, - groups=depthwise_channels, + groups=groups, kernel_size=kernel_size, strides=strides, padding=padding, @@ -82,47 +84,59 @@ def separable_conv_block(data, name, depthwise_channels, pointwise_channels, def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), - dtype='float32', alpha=1.0, is_shallow=False, layout='NCHW'): + dtype='float32', alpha=1.0, is_shallow=False, layout='NCHW', + depthwise_group_factor=1): """Function to construct a MobileNet""" data = relay.var("data", shape=data_shape, dtype=dtype) body = conv_block(data, 'conv_block_1', int(32*alpha), strides=(2, 2), layout=layout) body = separable_conv_block(body, 'separable_conv_block_1', int(32*alpha), int(64*alpha), layout=layout, - dtype=dtype) + dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_2', int(64*alpha), int(128*alpha), downsample=True, - layout=layout, dtype=dtype) + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_3', int(128*alpha), int(128*alpha), layout=layout, - dtype=dtype) + dtype=dtype, 
+ depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_4', int(128*alpha), int(256*alpha), downsample=True, - layout=layout, dtype=dtype) + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_5', int(256*alpha), int(256*alpha), layout=layout, - dtype=dtype) + dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_6', int(256*alpha), int(512*alpha), downsample=True, - layout=layout, dtype=dtype) + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) if is_shallow: body = separable_conv_block(body, 'separable_conv_block_7', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype) + downsample=True, layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_8', int(1024*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype) + downsample=True, layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) else: for i in range(7, 12): body = separable_conv_block(body, 'separable_conv_block_%d' % i, int(512*alpha), int(512*alpha), - layout=layout, dtype=dtype) + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_12', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype) + downsample=True, layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) body = separable_conv_block(body, 'separable_conv_block_13', - int(1024*alpha), int(1024*alpha), - layout=layout, dtype=dtype) + int(1024*alpha), int(1024*alpha), + layout=layout, dtype=dtype, + depthwise_group_factor=depthwise_group_factor) pool = relay.nn.global_avg_pool2d(data=body, layout=layout) flatten = relay.nn.batch_flatten(data=pool) weight = relay.var('fc_weight') @@ -134,7 +148,7 @@ def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), - dtype='float32', layout='NCHW'): + dtype='float32', layout='NCHW', depthwise_group_factor=1): """Get benchmark workload for mobilenet Parameters @@ -166,5 +180,5 @@ def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), data_shape = tuple([batch_size] + list(image_shape)) net = mobile_net(num_classes=num_classes, data_shape=data_shape, dtype=dtype, alpha=1.0, is_shallow=False, - layout=layout) + layout=layout, depthwise_group_factor=depthwise_group_factor) return create_workload(net) diff --git a/vta/tutorials/frontend/deploy_mobilenet.py b/vta/tutorials/frontend/deploy_mobilenet.py new file mode 100644 index 000000000000..8a94a588741e --- /dev/null +++ b/vta/tutorials/frontend/deploy_mobilenet.py @@ -0,0 +1,226 @@ +from __future__ import absolute_import, print_function + +import argparse, json, os, requests, sys, time +from io import BytesIO +from os.path import join, isfile +from PIL import Image + +from mxnet.gluon.model_zoo import vision +import numpy as np +from matplotlib import pyplot as plt + +import tvm +from tvm import te +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime +from tvm.relay import transform +import tvm.relay.testing + +import vta +from vta.testing import simulator +from vta.top import graph_pack +from tvm.contrib.util import eprint 
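# Note on the grouped depthwise convolution this tutorial relies on: the
# mobilenet.py change above widens each depthwise group from 1 input channel
# to ``depthwise_group_factor`` input channels, so that each group is wide
# enough to map onto VTA's input tiling (this script passes ``env.BLOCK_IN``
# for that parameter further down). The helper below is an illustrative
# sketch of the shape arithmetic from ``separable_conv_block``; it is not
# part of the patched model builder.
def _grouped_depthwise_shapes(depthwise_channels, depthwise_group_factor,
                              kernel_size=(3, 3)):
    """Return (weight_shape, groups) as computed in separable_conv_block."""
    depthwise_group_factor = min(depthwise_group_factor, depthwise_channels)
    wshape = (depthwise_channels, depthwise_group_factor) + kernel_size
    groups = int(depthwise_channels / depthwise_group_factor)
    return wshape, groups

# Example: 64 depthwise channels with BLOCK_IN == 16 gives weight shape
# (64, 16, 3, 3) and groups == 4, instead of 64 single-channel groups.
assert _grouped_depthwise_shapes(64, 16) == ((64, 16, 3, 3), 4)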
+ +# Make sure that TVM was compiled with RPC=1 +assert tvm.runtime.enabled("rpc") + +###################################################################### +# Define the platform and model targets +# ------------------------------------- +# Execute on CPU vs. VTA, and define the model. + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu +# multiple targets to run both on cpu and vta +targets = { + "cpu": env.target_vta_cpu, + "ext_dev": env.target +} + +model = "mobilenetG" + +###################################################################### +# Obtain an execution remote +# -------------------------- +# When target is 'pynq', reconfigure FPGA and runtime. +# Otherwise, if target is 'sim', execute locally. + +if env.TARGET not in ["sim", "tsim", "intelfocl"]: + + # Get remote from tracker node if environment variable is set. + # To set up the tracker, you'll need to follow the "Auto-tuning + # a convolutional network for VTA" tutorial. + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = os.environ.get("TVM_TRACKER_PORT", None) + # Otherwise if you have a device you want to program directly from + # the host, make sure you've set the variables below to the IP of + # your board. + device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") + device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") + if not tracker_host or not tracker_port: + remote = rpc.connect(device_host, int(device_port)) + else: + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) + + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + reconfig_start = time.time() + vta.reconfig_runtime(remote) + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.program_fpga(remote, bitstream) + + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + +# In simulation mode, host the RPC server locally. 
+else: + remote = rpc.LocalSession() + +# Get execution context from remote +# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +ctxes = [remote.ext_dev(0), remote.cpu(0)] + +# Load pre-configured AutoTVM schedules +with autotvm.tophub.context(target): + + # Populate the shape and data type dictionary for ImageNet classifier input + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # get the mobilenet model + mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32", + depthwise_group_factor=env.BLOCK_IN) + + # Measure build start time + build_start = time.time() + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + if target.device_name == "vta": + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig(global_scale=8.0, + skip_conv_layers=[0]): + mod = relay.quantize.quantize(mod, params=params) + # Perform graph packing and constant folding for VTA target + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name="nn.conv2d", + stop_name="nn.global_avg_pool2d") + else: + relay_prog = mod["main"] + + # Compile Relay program with AlterOpLayout disabled + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name != "vta": + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + else: + with vta.build_config(debug_flag=32): + graph, lib, params = relay.build( + relay_prog, target=targets, + params=params, target_host=env.target_host) + + # Measure Relay build time + build_time = time.time() - build_start + print(model + " inference graph built in {0:.2f}s!".format(build_time)) + + # Graph runtime + m = graph_runtime.create(graph, lib, ctxes) + +###################################################################### +# Perform image classification inference +# -------------------------------------- +# We run classification on an image sample from ImageNet +# We just need to download the categories files, `synset.txt` +# and an input test image. 
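# The preprocessing a few lines below follows the standard ImageNet recipe on
# the 0-255 pixel scale: subtract the per-channel mean, divide by the
# per-channel std, reorder HWC -> NCHW, then repeat the sample ``env.BATCH``
# times to match VTA's batch tiling. The helper below is an illustrative,
# standalone sketch of those same steps and is not called by this script.
import numpy as np  # also imported at the top of this script

def _preprocess_imagenet(img_hwc, batch):
    """img_hwc: (224, 224, 3) RGB array with values in [0, 255]."""
    x = np.array(img_hwc, dtype="float32")
    x -= np.array([123., 117., 104.])       # per-channel mean
    x /= np.array([58.395, 57.12, 57.375])  # per-channel std
    x = x.transpose((2, 0, 1))              # HWC -> CHW
    x = x[np.newaxis, :]                    # add batch axis -> (1, 3, 224, 224)
    return np.repeat(x, batch, axis=0)      # tile to (batch, 3, 224, 224)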
+ +# Download ImageNet categories +categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" +categ_fn = "synset.txt" +download.download(join(categ_url, categ_fn), categ_fn) +synset = eval(open(categ_fn).read()) + +# Download test image +image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' +image_fn = 'cat.png' +download.download(image_url, image_fn) + +# Prepare test image for inference +image = Image.open(image_fn).resize((224, 224)) +plt.imshow(image) +plt.show() +image = np.array(image) - np.array([123., 117., 104.]) +image /= np.array([58.395, 57.12, 57.375]) +image = image.transpose((2, 0, 1)) +image = image[np.newaxis, :] +image = np.repeat(image, env.BATCH, axis=0) + +# Set the network parameters and inputs +m.set_input(**params) +m.set_input('data', image) + +# Perform inference and gather execution statistics +# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator +num = 3 # number of times we run module for a single measurement +rep = 3 # number of measurements (we derive std dev from this) +timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) + +if env.TARGET in ["sim", "tsim"]: + simulator.clear_stats() + timer() + + sim_stats = simulator.stats() + print("\nExecution statistics:") + for k, v in sim_stats.items(): + # Since we execute the workload many times, we need to normalize stats + # Note that there is always one warm up run + # Therefore we divide the overall stats by (num * rep + 1) + print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) +else: + tcost = timer() + std = np.std(tcost.results) * 1000 + mean = tcost.mean * 1000 + print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) + print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) + +# Get classification results +tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) +output = tvm_output.asnumpy() +for b in range(env.BATCH): + top_categories = np.argsort(tvm_output.asnumpy()[b]) + # print("top_categories = ", top_categories) + # Report top-5 classification results + print("\n{} prediction for sample {}".format(model, b)) + print("\t#1:", synset[top_categories[-1]], output[b][top_categories[-1]]) + print("\t#2:", synset[top_categories[-2]], output[b][top_categories[-2]]) + print("\t#3:", synset[top_categories[-3]], output[b][top_categories[-3]]) + print("\t#4:", synset[top_categories[-4]], output[b][top_categories[-4]]) + print("\t#5:", synset[top_categories[-5]], output[b][top_categories[-5]]) + # This just checks that one of the 5 top categories + # is one variety of cat; this is by no means an accurate + # assessment of how quantization affects classification + # accuracy but is meant to catch changes to the + # quantization pass that would accuracy in the CI. 
+ cat_detected = False + for k in top_categories[-5:]: + if "cat" in synset[k]: + cat_detected = True + assert(cat_detected) From f80c3e05adf22f07b22debe988fa1d60e184a89a Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 20 Apr 2020 16:03:01 +0800 Subject: [PATCH 15/44] quickfix for auto-tune segfault --- python/tvm/autotvm/measure/measure_methods.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index f32725c2e9cc..666d307247c1 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -85,6 +85,8 @@ class LocalBuilder(Builder): If is callable, use it as custom build function, expect lib_format field. """ def __init__(self, timeout=10, n_parallel=None, build_func='default'): + # FIXME(zhanghao): quickfix - use single thread. otherwise may cause seg fault + n_parallel = 1 super(LocalBuilder, self).__init__(timeout, n_parallel) if isinstance(build_func, str): From dadf0459b4b1e7cb878cbef94ea15aae1b6cdb4a Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 6 May 2020 13:26:41 +0800 Subject: [PATCH 16/44] add dcgan support (simulation) --- python/tvm/relay/op/strategy/generic.py | 6 +- python/tvm/relay/quantize/_annotate.py | 25 +++ python/tvm/relay/quantize/_partition.py | 13 ++ python/tvm/relay/quantize/quantize.py | 14 +- src/arith/detect_linear_equation.cc | 16 ++ src/relay/quantize/realize.cc | 37 ++++ src/tir/transforms/inject_copy_intrin.cc | 14 +- vta/python/vta/top/graphpack.py | 35 ++-- vta/python/vta/top/vta_conv2d.py | 24 ++- vta/python/vta/top/vta_conv2d_transpose.py | 21 ++- vta/tutorials/frontend/deploy_dcgan.py | 186 +++++++++++++++++++++ 11 files changed, 358 insertions(+), 33 deletions(-) create mode 100644 vta/tutorials/frontend/deploy_dcgan.py diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 3d24cdf73e9d..48944474e272 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -330,8 +330,10 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype): out = topi_compute( inputs[0], inputs[1], strides, padding, out_dtype) output_padding = get_const_tuple(attrs.output_padding) - out = topi.nn.pad(out, [0, 0, 0, 0], - [0, 0, output_padding[0], output_padding[1]]) + if output_padding[0] != 0 or output_padding[1] != 0: + pad_before = [0] * len(out.shape) + pad_after = [0, 0, output_padding[0], output_padding[1]] + [0] * (len(out.shape) - 4) + out = topi.nn.pad(out, pad_before, pad_after) return [out] return compute_conv2d_transpose diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 952a86466300..08930527b443 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -173,6 +173,30 @@ def conv2d_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) +@register_annotate_function("nn.conv2d_transpose") +def conv2d_transpose_rewrite(ref_call, new_args, ctx): + """Rewrite function for conv2d_transpose. Lhs of conv will be quantized to + input field, and rhs of conv will be quantized to weight field. 
+ Output would be in activation field""" + if quantize_context().check_to_skip(ref_call): + return None + + lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) + rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) + + if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: + lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) + + assert rhs_kind is None + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) + + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + + +# TODO(tmoreau89,ziheng) need to include an option to turn off dense quant +# @register_annotate_function("nn.dense") @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of @@ -281,6 +305,7 @@ def identity_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(ret_expr, x_kind) +register_annotate_function("reshape", identity_rewrite) register_annotate_function("clip", identity_rewrite) register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", identity_rewrite) diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index 315986d55607..f26e88301894 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -53,6 +53,19 @@ def conv2d_partition_function(ref_call, new_args, ctx): return QPartitionExpr(ret) +@register_partition_function("nn.conv2d_transpose") +def conv2d_partition_function(ref_call, new_args, ctx): + """Rewrite function for conv2d for partition""" + data_cond, data = partition_expr_check(new_args[0]) + kernel_cond, kernel = partition_expr_check(new_args[1]) + + assert not kernel_cond + if data_cond: + data = new_args[0].realize() + ret = _forward_op(ref_call, [data, kernel]) + return QPartitionExpr(ret) + + def identity_partition_function(ref_call, new_args, ctx): cond, expr = partition_expr_check(new_args[0]) if cond: diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 28ebf7f3032b..b7371a3c3068 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -209,10 +209,18 @@ def check_to_skip(self, ref_call): # check skip conv layers skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers] if self._conv2d_counter in skipped_indices: - if ref_call.op.name == 'nn.conv2d': + if ref_call.op.name == 'nn.conv2d' or ref_call.op.name == 'nn.conv2d_transpose': self._conv2d_counter += 1 - return True - if ref_call.op.name == 'nn.conv2d': + return True + else: + # counter is 0 before visiting the first conv2d + # if the first conv2d is skipped, all ops before it will also be skipped + # otherwise, we don't skip until the counter become +1 + if self._conv2d_counter == 0: + return True + else: + return False + if ref_call.op.name == 'nn.conv2d' or ref_call.op.name == 'nn.conv2d_transpose': self._conv2d_counter += 1 return False diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc index f0634feac083..18d28b53a431 100644 --- a/src/arith/detect_linear_equation.cc +++ b/src/arith/detect_linear_equation.cc @@ -71,6 +71,16 @@ class LinearEqDetector : public ExprFunctora, op->a); + LinearEqEntry b = VisitExpr(op->b, op->b); + LinearEqEntry ret; + ret.base = FloorDivCombine(a.base, b.base); + ret.coeff = FloorDivCombine(a.coeff, b.coeff); + return ret; + } + LinearEqEntry 
VisitExpr_(const SubNode* op, const PrimExpr& e) final { if (fail_) return LinearEqEntry(); LinearEqEntry a = VisitExpr(op->a, op->a); @@ -138,6 +148,12 @@ class LinearEqDetector : public ExprFunctor DetectLinearEquation(const PrimExpr& e, const Array& vars) { diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 41680b655a66..07e61de82958 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -220,6 +220,41 @@ Expr Conv2dRealize(const Call& ref_call, const Array& new_args, const Obje RELAY_REGISTER_OP("nn.conv2d").set_attr("FQRealizeRewrite", Conv2dRealize); +Expr Conv2dTransposeRealize(const Call& ref_call, + const Array& new_args, + const ObjectRef& ctx) { + const QConfig& cfg = QConfig::Current(); + CHECK_EQ(new_args.size(), 2); + if (!new_args[0]->IsInstance() || !new_args[1]->IsInstance()) { + return Expr(nullptr); + } + const auto* lhs = new_args[0].as(); + CHECK(lhs); + const auto* rhs = new_args[1].as(); + CHECK(rhs); + + Expr ldata = lhs->data; + if (lhs->dtype != cfg->dtype_input) { + ldata = Cast(ldata, cfg->dtype_input); + } + Expr rdata = Cast(rhs->data, cfg->dtype_weight); + + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_object(); + *attrs = *ref_attrs; + DataType out_dtype = cfg->dtype_activation; + attrs->out_dtype = out_dtype; + + Expr ret = CallNode::make(ref_call->op, + {ldata, rdata}, Attrs(attrs), ref_call->type_args); + Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); + Expr dom_scale = FoldConstantOpt(mul); + return QRealizeIntExprNode::make(ret, dom_scale, out_dtype); +} + +RELAY_REGISTER_OP("nn.conv2d_transpose") +.set_attr("FQRealizeRewrite", Conv2dTransposeRealize); + Expr DenseRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); CHECK_EQ(new_args.size(), 2); @@ -435,6 +470,8 @@ Expr IdentityRealize(const Call& ref_call, const Array& new_args, const Ob RELAY_REGISTER_OP("nn.relu").set_attr("FQRealizeRewrite", IdentityRealize); +RELAY_REGISTER_OP("reshape").set_attr("FQRealizeRewrite", IdentityRealize); + RELAY_REGISTER_OP("strided_slice").set_attr("FQRealizeRewrite", IdentityRealize); RELAY_REGISTER_OP("annotation.stop_fusion") diff --git a/src/tir/transforms/inject_copy_intrin.cc b/src/tir/transforms/inject_copy_intrin.cc index b27459f4bd45..279274632648 100644 --- a/src/tir/transforms/inject_copy_intrin.cc +++ b/src/tir/transforms/inject_copy_intrin.cc @@ -80,7 +80,19 @@ class CopyIntrinInjector : public StmtMutator { } // for now only support true condition matching if (has_cond) { - load = sel_true_value.Eval().as(); + auto true_val = sel_true_value.Eval(); + + // TODO(zhanghao): we do cond unfold one more further + // this is used to lift the pad(dilate) to one load op + // However, ignoring false condition may cause incorrect results + PVar sel_cond_extra, sel_true_value_extra, sel_false_value_extra; + bool has_cond_extra = if_then_else(sel_cond_extra, sel_true_value_extra, sel_false_value_extra).Match(true_val) || + select(sel_cond_extra, sel_true_value_extra, sel_false_value_extra).Match(true_val); + if (has_cond_extra) { + load = sel_true_value_extra.Eval().as(); + } else { + load = true_val.as(); + } } // cast can be part of the pattern if (cast != nullptr) { diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index cdfd3c4281e2..9cdc355f6c64 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -94,7 +94,7 @@ def _weight_shape_match_transpose(data, 
dshape, channels, cfactor_out): if pad_width != 0: pad_width = cfactor_out - pad_width data = op.nn.pad(data, [[0, 0], [0, pad_width], [0, 0], [0, 0]]) - dshape = tuple(dshape[0], [dshape[1] + pad_width, dshape[2], dshape[3]]) + dshape = tuple([dshape[0]] + [dshape[1] + pad_width, dshape[2], dshape[3]]) if channels_pad != 0: channels = channels + (cfactor_out - channels_pad) @@ -252,11 +252,12 @@ def visit_call(self, call): # First visit the children. args = [self.visit(arg) for arg in call.args] + odtype = _get_tensor_type(call) self.counter += 1 - if call.op in self.op2nodes: - self.op2nodes[call.op].append(self.counter) + if (call.op, odtype) in self.op2nodes: + self.op2nodes[(call.op, odtype)].append(self.counter) else: - self.op2nodes[call.op] = [self.counter] + self.op2nodes[(call.op, odtype)] = [self.counter] return relay.Call( self.visit(call.op), @@ -550,7 +551,8 @@ def graph_pack(expr, The transformed expression. """ assert isinstance(expr, relay.Function) - assert ((start_name != stop_name) or (start_name_idx < stop_name_idx)) + assert ((start_name != stop_name) or (start_name_idx is None != stop_name_idx is None) or \ + (not (start_name_idx is None and stop_name_idx is None)) or (start_name_idx < stop_name_idx)) expr = get_subgraph(expr, start_name, stop_name, start_name_idx, stop_name_idx, count_meta) expr = run_opt_pass(expr, transform.InferType()) packer = ExprPack( @@ -564,20 +566,17 @@ def graph_pack(expr, expr_locator = ExprLocater() expr_locator.visit(expr) - # from the second conv2d to the global_avg_pool2d, all will run on vta + # from the first int conv2d to the last int stop_fusion, all will run on vta conv2d = op.op.get("nn.conv2d") - avg_pool2d = op.op.get("nn.global_avg_pool2d") - start = expr_locator.op2nodes[conv2d][1] - # preceeding the nn.global_avg_pool2d, it will look like this - # - # %310 = annotation.stop_fusion(%309) /* ty=Tensor[(1, 16, 7, 7, 1, 32), int8] */; - # %311 = cast(%310, dtype="int32") /* ty=Tensor[(1, 16, 7, 7, 1, 32), int32] */; - # %312 = transpose(%311, axes=[0, 4, 1, 5, 2, 3]) /* ty=Tensor[(1, 1, 16, 32, 7, 7), int32] */; - # %313 = reshape(%312, newshape=[1, 512, 7, 7]) /* ty=Tensor[(1, 512, 7, 7), int32] */; - # %314 = nn.global_avg_pool2d(%313) /* ty=Tensor[(1, 512, 1, 1), int32] */; - # - # we mark the preceeding three ops also on cpu device - end = expr_locator.op2nodes[avg_pool2d][0] - 3 + conv2d_transpose = op.op.get("nn.conv2d_transpose") + stop_fusion = op.op.get("annotation.stop_fusion") + if (conv2d, "int32") in expr_locator.op2nodes: + start = expr_locator.op2nodes[(conv2d, "int32")][0] + else: + start = expr_locator.op2nodes[(conv2d_transpose, "int32")][0] + + # we mark the next op to the last stop_fusion on cpu device + end = expr_locator.op2nodes[(stop_fusion, "int8")][-1] + 1 device_annot = ExprDeviceAnnot(start=start, end=end) expr = device_annot.visit(expr) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 5c856384c605..7ef71074caa4 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -36,7 +36,6 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty raise topi.InvalidShapeError() assert dilation == (1, 1) - eprint("data.shape, kernel.shape", data.shape, kernel.shape) if padding[0]: pad_data = topi.nn.pad(data, [0, 0, padding[0], padding[1], 0, 0], name="pad_data") else: @@ -194,7 +193,6 @@ def _traverse(op): # FIXME(zhanghao): move this code to a proper location @topi.generic.schedule_add.register(["vta"]) def 
_schedule_add(outs): - eprint("schedule_add vta") assert len(outs) == 1 def is_cast_op(op): @@ -245,14 +243,29 @@ def _traverse(op): # TODO(zhanghao): auto-tune x_co0, x_co1 = s[output].split(x_co, factor=1) - x_i0, x_i1 = s[output].split(x_i, factor=min(28, x_i_max)) - x_j0, x_j1 = s[output].split(x_j, factor=min(14, x_j_max)) + + from functools import reduce + def factors(n): + return sorted(set(reduce(list.__add__, + ([i, n//i] for i in range(1, int(n**0.5) + 1) if n % i == 0)))) + + # FIXME(zhanghao): use auto-tune + i_factors = factors(x_i_max) + i_factor = i_factors[-1] + if i_factor > 28: + i_factor = i_factors[-2] + + j_factors = factors(x_j_max) + j_factor = j_factors[-1] + if j_factor > 14: + j_factor = j_factors[-2] + x_i0, x_i1 = s[output].split(x_i, factor=i_factor) + x_j0, x_j1 = s[output].split(x_j, factor=j_factor) s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) store_pt = x_j0 env = get_env() for eo in ewise_ops: - eprint("add ewise_ops ", eo) s[eo].set_scope(env.acc_scope) s[eo].pragma(s[eo].op.axis[0], env.alu) s[eo].compute_at(s[output], store_pt) @@ -260,7 +273,6 @@ def _traverse(op): # cache read input cache_read_ewise = [] for consumer, tensor in ewise_inputs: - eprint("add dma_copy", consumer, tensor, tensor.op) cache_read_ewise.append( s.cache_read(tensor, env.acc_scope, [consumer])) diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py index 4f213f64d0da..15383e557c3b 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -77,6 +77,7 @@ def schedule_conv2d_transpose_packed(cfg, outs): """Schedule packed conv2d_transpose""" assert len(outs) == 1 output = outs[0] + const_ops = [] ewise_inputs = [] ewise_ops = [] conv2d_res = [] @@ -86,7 +87,10 @@ def schedule_conv2d_transpose_packed(cfg, outs): def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - ewise_ops.append(op) + if not op.axis: + const_ops.append(op) + else: + ewise_ops.append(op) for tensor in op.input_tensors: if isinstance(tensor.op, tvm.te.PlaceholderOp): ewise_inputs.append((op, tensor)) @@ -116,8 +120,16 @@ def _traverse(op): data, kernel = conv2d_stage.op.input_tensors if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: temp = data.op.input_tensors[0] - pad_data = data - data = temp + # FIXME(zhanghao): force merge pad(dilate(xx)) to one load op + # this may cause results in-correct + # disable for now + if False and isinstance(temp.op, tvm.te.ComputeOp) and ("pad" in temp.op.tag or temp.op.name == "DilatedInput"): + pad_data = data + data = temp.op.input_tensors[0] + s[temp.op].compute_inline() + else: + pad_data = data + data = temp else: pad_data = None @@ -142,6 +154,9 @@ def _traverse(op): s[op].set_scope(env.acc_scope) s[op].pragma(s[op].op.axis[0], env.alu) + for op in const_ops: + s[op].compute_inline() + # tile x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) diff --git a/vta/tutorials/frontend/deploy_dcgan.py b/vta/tutorials/frontend/deploy_dcgan.py new file mode 100644 index 000000000000..95a3731f98f9 --- /dev/null +++ b/vta/tutorials/frontend/deploy_dcgan.py @@ -0,0 +1,186 @@ +from __future__ import absolute_import, print_function + +import argparse, json, os, requests, sys, time +from io import BytesIO +from os.path import join, isfile +from PIL import Image + +from mxnet.gluon.model_zoo import vision +import numpy as np +from matplotlib import pyplot as plt + 
+import tvm +from tvm import te +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.contrib.debugger import debug_runtime +from tvm.relay import transform +import tvm.relay.testing + +import vta +from vta.testing import simulator +from vta.top import graph_pack +from tvm.contrib.util import eprint + +# Make sure that TVM was compiled with RPC=1 +assert tvm.runtime.enabled("rpc") + +###################################################################### +# Define the platform and model targets +# ------------------------------------- +# Execute on CPU vs. VTA, and define the model. + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu +# multiple targets to run both on cpu and vta +targets = { + "cpu": env.target_vta_cpu, + "ext_dev": env.target +} + +model = "DCGAN" + +###################################################################### +# Obtain an execution remote +# -------------------------- +# When target is 'pynq', reconfigure FPGA and runtime. +# Otherwise, if target is 'sim', execute locally. + +if env.TARGET not in ["sim", "tsim", "intelfocl"]: + + # Get remote from tracker node if environment variable is set. + # To set up the tracker, you'll need to follow the "Auto-tuning + # a convolutional network for VTA" tutorial. + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = os.environ.get("TVM_TRACKER_PORT", None) + # Otherwise if you have a device you want to program directly from + # the host, make sure you've set the variables below to the IP of + # your board. + device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") + device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") + if not tracker_host or not tracker_port: + remote = rpc.connect(device_host, int(device_port)) + else: + remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) + + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + reconfig_start = time.time() + vta.reconfig_runtime(remote) + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.program_fpga(remote, bitstream) + + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + +# In simulation mode, host the RPC server locally. 
+else: + remote = rpc.LocalSession() + +# Get execution context from remote +# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) +ctxes = [remote.ext_dev(0), remote.cpu(0)] + +# Load pre-configured AutoTVM schedules +with autotvm.tophub.context(target): + + # Populate the shape and data type dictionary for ImageNet classifier input + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 100)} + + # get the mobilenet model + mod, params = relay.testing.dcgan.get_workload(batch_size=1, dtype="float32", oshape=(3, 64, 64)) + + # Measure build start time + build_start = time.time() + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + if target.device_name == "vta": + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig(global_scale=8.0, + skip_conv_layers=[3]): + mod = relay.quantize.quantize(mod, params=params) + # Perform graph packing and constant folding for VTA target + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack( + mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name="cast", + stop_name="cast", stop_name_idx=52, device_annot=True) + else: + relay_prog = mod["main"] + + # Compile Relay program with AlterOpLayout disabled + with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): + if target.device_name != "vta": + graph, lib, params = relay.build( + relay_prog, target=target, + params=params, target_host=env.target_host) + else: + with vta.build_config(debug_flag=38): + graph, lib, params = relay.build( + relay_prog, target=targets, + params=params, target_host=env.target_host) + + # Measure Relay build time + build_time = time.time() - build_start + print(model + " inference graph built in {0:.2f}s!".format(build_time)) + + # Graph runtime + m = graph_runtime.create(graph, lib, ctxes) + +image = np.zeros((1, 100), dtype=np.float32) +eprint("image", image.dtype, image) +image = np.repeat(image, env.BATCH, axis=0) + +# Set the network parameters and inputs +m.set_input(**params) +m.set_input('data', image) + +# Perform inference and gather execution statistics +# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator +num = 3 # number of times we run module for a single measurement +rep = 3 # number of measurements (we derive std dev from this) +timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) + +if env.TARGET in ["sim", "tsim"]: + simulator.clear_stats() + # timer() + m['run']() + + sim_stats = simulator.stats() + print("\nExecution statistics:") + for k, v in sim_stats.items(): + # Since we execute the workload many times, we need to normalize stats + # Note that there is always one warm up run + # Therefore we divide the overall stats by (num * rep + 1) + print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) +else: + m['run']() + print("Run done") + # tcost = timer() + # std = np.std(tcost.results) * 1000 + # mean = tcost.mean * 1000 + # print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) + # print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) + +# Get classification results +tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 3, 64, 64), "float32", remote.cpu(0))) +output = tvm_output.asnumpy() +for b in range(env.BATCH): + print(tvm_output.asnumpy()[b]) From 
bb3dc0eb570f96a5a5407885a12d0cf7e4b1e95b Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 6 May 2020 15:33:24 +0800 Subject: [PATCH 17/44] make sync in batch as an option --- vta/runtime/runtime.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 39038da00b51..cedecc59ba55 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -1278,7 +1278,8 @@ class CommandQueue { // FIXME(zhanghao): It is required to use force_serial // by using skip and sync at the final layer, we can avoid do DeviceCopy every time - if (skip) { + const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + if (sync_once && skip) { if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { LOG(ERROR) << "Synchronizing all in one round requires to use force_serial to make things right"; @@ -1518,7 +1519,8 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly // struct timespec start, stop; // clock_gettime(CLOCK_REALTIME, &start); - VTASynchronize(VTATLSCommandHandle(), 1<<31, false); + const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + if (sync_once) VTASynchronize(VTATLSCommandHandle(), 1<<31, false); // clock_gettime(CLOCK_REALTIME, &stop); // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; // LOG(WARNING) << "Final Synchronize: " << elapsed << " us"; From cb464779f42698d0fc6faa52c2927e551e0493a4 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 11 May 2020 09:51:49 +0800 Subject: [PATCH 18/44] quickfix for buffer overflow --- vta/python/vta/top/vta_conv2d.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 7ef71074caa4..40e2530ef63c 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -252,13 +252,16 @@ def factors(n): # FIXME(zhanghao): use auto-tune i_factors = factors(x_i_max) i_factor = i_factors[-1] - if i_factor > 28: - i_factor = i_factors[-2] + while i_factor > 28: + del i_factors[-1] + i_factor = i_factors[-1] j_factors = factors(x_j_max) j_factor = j_factors[-1] - if j_factor > 14: - j_factor = j_factors[-2] + while j_factor > 14: + del j_factors[-1] + j_factor = j_factors[-1] + x_i0, x_i1 = s[output].split(x_i, factor=i_factor) x_j0, x_j1 = s[output].split(x_j, factor=j_factor) s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) From 4ede46679fc2bb33b6a0eb9984ebf5b1eedbfb87 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 11 May 2020 18:02:38 +0800 Subject: [PATCH 19/44] bugfix for allocated_ destructor order --- vta/runtime/runtime.cc | 44 +++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index cedecc59ba55..92d5ab06cc8f 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -43,6 +43,8 @@ #include #include #include +#include +#include namespace vta { @@ -120,10 +122,39 @@ class AlignmentAllocator { } }; +class DeviceAllocStat { + public: + void AddAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + allocated_.insert(ptr); + } + + bool CheckAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + return allocated_.count(ptr); + } + + void DelAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + allocated_.erase(ptr); + } + + private: + std::set 
allocated_; + std::mutex mtx_; +}; + +// here we use a global variable to memorize the allocation stats +static std::shared_ptr alloc_stat(new DeviceAllocStat()); + /*! * \brief Data buffer represents data on CMA. */ struct DataBuffer { + DataBuffer() { + alloc_stat_ = alloc_stat; + } + /*! \return Virtual address of the data. */ void* virt_addr() const { return data_; } /*! \return Physical address of the data. */ @@ -180,7 +211,7 @@ struct DataBuffer { buffer->data_ = data; buffer->phy_addr_ = VTAMemGetPhyAddr(data); - allocated_.insert(buffer); + alloc_stat->AddAlloc(buffer); return buffer; } /*! @@ -188,7 +219,7 @@ struct DataBuffer { * \param buffer The buffer to be freed. */ static void Free(DataBuffer* buffer) { - allocated_.erase(buffer); + alloc_stat->DelAlloc(buffer); VTAMemFree(buffer->data_); delete buffer; } @@ -198,7 +229,7 @@ struct DataBuffer { * \return The corresponding data buffer header. */ static DataBuffer* FromHandle(const void* buffer) { - if (allocated_.count(buffer)) { + if (alloc_stat->CheckAlloc(buffer)) { return const_cast( reinterpret_cast(buffer)); } else { @@ -212,12 +243,11 @@ struct DataBuffer { /*! \brief The physical address of the buffer, excluding header. */ vta_phy_addr_t phy_addr_; - static std::set allocated_; + // a copy of global shared_ptr instance + // to avoid the global instance is destructed before there are still some pending DataBuffers not destructed + std::shared_ptr alloc_stat_; }; -// init static member -std::set DataBuffer::allocated_; - /*! * \brief Micro op kernel. * Contains functions to construct the kernel with prefix Push. From 4f375d5cd205f51c635d01eeb4d278f92e4992fc Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 14 May 2020 17:57:43 +0800 Subject: [PATCH 20/44] refine device annotation --- include/tvm/relay/attrs/device_copy.h | 1 - python/tvm/relay/op/_tensor.py | 3 + src/relay/op/annotation/annotation.cc | 8 +- src/relay/transforms/device_annotation.cc | 92 +++++++++-------------- vta/python/vta/top/graphpack.py | 48 +++++++----- 5 files changed, 74 insertions(+), 78 deletions(-) diff --git a/include/tvm/relay/attrs/device_copy.h b/include/tvm/relay/attrs/device_copy.h index c4a60c827048..7da92b3ff763 100644 --- a/include/tvm/relay/attrs/device_copy.h +++ b/include/tvm/relay/attrs/device_copy.h @@ -37,7 +37,6 @@ namespace relay { struct DeviceCopyAttrs : public tvm::AttrsNode { int dst_dev_type; int src_dev_type; - bool used_for_propagate = true; TVM_DECLARE_ATTRS(DeviceCopyAttrs, "relay.attrs.DeviceCopyAttrs") { TVM_ATTR_FIELD(src_dev_type) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 4f409ff4538f..1dd431ac2785 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -92,6 +92,9 @@ register_broadcast_schedule("fast_exp") register_broadcast_schedule("fast_tanh") register_broadcast_schedule("fast_erf") +# a fake on_device schedule. 
+# this will not be used in actual computation as on_device will be removed during DeviceAnnotation pass +register_injective_schedule("on_device") # zeros diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 6be9b0d4a3d5..4db3f930d3b5 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -54,7 +54,13 @@ RELAY_REGISTER_OP("on_device") .add_type_rel("Identity", IdentityRel) .set_attr("TOpPattern", kOpaque) .set_attr("TOpIsStateful", false) - .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); + .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("FTVMCompute", + [] (const Attrs& attrs, + const Array& inputs, + const Type& out_type) -> Array { + return {topi::identity(inputs[0])}; + }); Expr StopFusion(Expr data) { static const Op& op = Op::Get("annotation.stop_fusion"); diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 2d53751665da..4862a999b85c 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -90,15 +90,6 @@ class ValidateAnnotation : private ExprVisitor { annotation_map_.insert({node, GetDeviceId(call_node)}); } - // FIXME(zhanghao): find a better way - // here assume there are max two device types - if (device_type == fallback_device_ && extra_device_ && extra_device_ != fallback_device_) { - const auto* child = GetRef(node).as()->args[0].operator->(); - // here we mark as negative to indicate this is for copy from only - int ext_dev = -extra_device_; - annotation_map_.insert({child, ext_dev}); - } - if (device_type != fallback_device_) extra_device_ = device_type; } } @@ -261,11 +252,7 @@ class RewriteAnnotation : public ExprMutator { if (annotation_map_.count(dst)) { return src_dev_type != annotation_map_.at(dst); } else { - // TODO(zhanghao): for now, we only make a device_copy when dst is "on_device" marked - // This allows us to do a start-end mark (mark two points) - // to mark all the middle ops with a device_type - return false; - // return src_dev_type != fallback_device_; + return src_dev_type != fallback_device_; } } else { // if annotation value < 0, it means this is for "copy from" only @@ -408,22 +395,38 @@ class DeviceInfo { } void VisitExpr_(const ConstantNode* cn) final { - post_dfs_order_.push_back(std::make_pair(cn, has_copy_)); + device_tag_[cn] = dev_type_; } void VisitExpr_(const CallNode* call) final { // Skip annotation nodes. 
if (!IsOnDeviceNode(call)) { - if (GetDeviceCopyNode(call)) { + if (const auto* node = GetDeviceCopyNode(call)) { + CHECK(node->IsInstance()); + const auto* call_node = static_cast(node); + auto attrs = call_node->attrs.as(); + num_device_copy_ops_++; bool has_copy_prev = has_copy_; has_copy_ = true; - ExprVisitor::VisitExpr_(call); - post_dfs_order_.push_back(std::make_pair(call, has_copy_)); + dev_type_ = attrs->src_dev_type; + for (auto& arg : call->args) { + Visit(arg); + // restore the type for remaining arguments + dev_type_ = attrs->src_dev_type; + } + device_tag_[call] = attrs->dst_dev_type; + // update the out_dev_type_, which should be the dst_dev_type of last copy + out_dev_type_ = attrs->dst_dev_type; has_copy_ = has_copy_prev; } else { - ExprVisitor::VisitExpr_(call); - post_dfs_order_.push_back(std::make_pair(call, has_copy_)); + for (auto& arg : call->args) { + int cur_dev_type = dev_type_; + Visit(arg); + // restore the type for remaining arguments + dev_type_ = cur_dev_type; + } + device_tag_[call] = dev_type_; } } } @@ -436,22 +439,24 @@ class DeviceInfo { void VisitExpr_(const TupleGetItemNode* op) final { ExprVisitor::VisitExpr_(op); } void VisitExpr_(const VarNode* vn) final { - post_dfs_order_.push_back(std::make_pair(vn, has_copy_)); + device_tag_[vn] = dev_type_; } void VisitExpr_(const LetNode* ln) final { ExprVisitor::VisitExpr_(ln); - post_dfs_order_.push_back(std::make_pair(ln, has_copy_)); + device_tag_[ln] = dev_type_; } void VisitExpr_(const IfNode* in) final { ExprVisitor::VisitExpr_(in); - post_dfs_order_.push_back(std::make_pair(in, has_copy_)); + device_tag_[in] = dev_type_; } int num_device_copy_ops_{0}; bool has_copy_ = false; - std::vector> post_dfs_order_; + int dev_type_ = -1; + int out_dev_type_ = -1; + std::unordered_map device_tag_; friend DeviceInfo; }; @@ -477,39 +482,14 @@ class DeviceInfo { } void PropagateDeviceId() { - // Bottom-up propagation. - int out_dev_type = BottomUpPropagation(); - // propagation for remained nodes. 
- FillPropagation(out_dev_type); - } - - int BottomUpPropagation() { - const CallNode* last_copy_node = nullptr; - int cur_dev_type = -1; - int out_dev_type = -1; - for (auto it = post_visitor_.post_dfs_order_.crbegin(); - it != post_visitor_.post_dfs_order_.crend(); ++it) { - if (const auto* node = GetDeviceCopyNode(it->first)) { - CHECK(node->IsInstance()); - last_copy_node = static_cast(node); - const auto* attrs = last_copy_node->attrs.as(); - cur_dev_type = attrs->src_dev_type; - if (out_dev_type == -1) out_dev_type = attrs->dst_dev_type; - if (it->second) device_map_.Set(GetRef(it->first), attrs->dst_dev_type); - } else if (last_copy_node) { - Expr expr = GetRef(it->first); - CHECK_EQ(device_map_.count(expr), 0U); - if (it->second) device_map_.Set(expr, cur_dev_type); + int out_dev_type = post_visitor_.out_dev_type_; + for (auto& it : post_visitor_.device_tag_) { + if (it.second != -1) { + device_map_.Set(GetRef(it.first), it.second); + } else { + device_map_.Set(GetRef(it.first), out_dev_type); } } - return out_dev_type; - } - - void FillPropagation(int out_dev_type) { - for (const auto& it : post_visitor_.post_dfs_order_) { - Expr expr = GetRef(it.first); - if (!it.second) device_map_.Set(expr, out_dev_type); - } } PostDfsOrderVisitor post_visitor_; @@ -517,6 +497,7 @@ class DeviceInfo { }; +// TODO(zhanghao): consider to remove this as I think it is not necessary for now class AddDeviceCopy : public ExprMutator { public: Expr Rewrite(const Expr& expr) { @@ -558,7 +539,6 @@ class AddDeviceCopy : public ExprMutator { auto attrs = make_object(); attrs->src_dev_type = src_dev_type; attrs->dst_dev_type = dst_dev_type; - attrs->used_for_propagate = false; static const Op& op = Op::Get("device_copy"); Call device_copy = CallNode::make(op, {this->Mutate(arg)}, Attrs(attrs), {}); device_copy->checked_type_ = arg->checked_type_; diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 9cdc355f6c64..ac4c8aac4539 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -193,6 +193,7 @@ class ExprDeviceAnnot(ExprMutator): def __init__(self, start=-1, end=-1): self.ext_ctx = tvm.context("ext_dev") self.cpu_ctx = tvm.context("cpu") + self.cast = op.op.get("cast") self.counter = -1 self.start = start self.end = end @@ -210,33 +211,40 @@ def visit_call(self, call): if self.counter == self.start: ret = relay.Call(call.op, args, call.attrs) ret = relay.annotation.on_device(ret, self.ext_ctx) - eprint("add on_device {}: {}".format("ext", ret)) return ret elif self.counter == self.end: ret = relay.Call(call.op, args, call.attrs) ret = relay.annotation.on_device(ret, self.cpu_ctx) - eprint("add on_device {}: {}".format("cpu", ret)) return ret + elif self.counter > self.start and self.counter < self.end: + ret = relay.Call(call.op, args, call.attrs) -# if call.op == self.global_avg_pool2d: -# eprint("graphpack call = ", call) -# eprint("graphpack call annot relu, ", args[0]) -# ret = relay.Call(call.op, args, call.attrs) -# ret = relay.annotation.on_device(ret, self.cpu_ctx) -# return ret -# -# if call.op == self.conv2d and odtype == 'int32': -# if not self.first_conv2d: -# ret = relay.Call(call.op, args, call.attrs) -# ret = relay.annotation.on_device(ret, self.ext_ctx) -# eprint("graphpack call conv2d", type(ret.op), ret.op, type(ret), ret) -# self.first_conv2d = True -# return ret + # skip the float op, i.e., float->int cast + if self.is_float_op(call): + return ret - return relay.Call( - self.visit(call.op), - args, - call.attrs) + return 
relay.annotation.on_device(ret, self.ext_ctx) + + return relay.Call(self.visit(call.op), args, call.attrs) + + def is_float_op(self, call): + """check if this op belongs to a float op + in general, float op's odtype is float; + a special case is float->int cast, which follow this op sequence: + multiply(float) -> round(float) -> clip(float) -> cast(int); + """ + args = call.args + odtype = _get_tensor_type(call) + op = call.op + + if odtype == "float32": + return True + elif op == self.cast: + idtype = _get_tensor_type(args[0]) + if idtype == "float32": + return True + + return False class ExprLocater(ExprMutator): From d16d5ecdb92affae76b1887fb8d5f7eae8096da2 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 21 May 2020 13:28:08 +0800 Subject: [PATCH 21/44] auto-tune for vta alu ops --- python/tvm/autotvm/task/space.py | 2 +- python/tvm/autotvm/task/topi_integration.py | 8 +- python/tvm/autotvm/tuner/callback.py | 4 +- python/tvm/relay/op/_tensor.py | 9 +- python/tvm/relay/op/op.py | 20 -- python/tvm/relay/op/strategy/generic.py | 15 +- src/relay/backend/compile_engine.cc | 4 +- topi/python/topi/generic/injective.py | 4 - vta/python/vta/top/op.py | 152 +++++++++ vta/python/vta/top/vta_conv2d.py | 98 ------ vta/tutorials/autotvm/tune_alu_vta.py | 341 ++++++++++++++++++++ 11 files changed, 513 insertions(+), 144 deletions(-) create mode 100644 vta/tutorials/autotvm/tune_alu_vta.py diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index fbf474fc4df7..53ed78a7570d 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -779,7 +779,7 @@ def _add_new_transform(self, space_class, name, axes, policy, **kwargs): return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))] def __len__(self): - if self._length is None: + if self._length is None or self._length <= 1: self._length = int(np.prod([len(x) for x in self.space_map.values()])) return self._length diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 59e77f7d0098..d7fa69d571b1 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -215,7 +215,7 @@ def _decorate(topi_schedule): @_register_task_schedule(task_name) def wrapper(outs, *args, **kwargs): """wrapper function for topi schedule""" - workload = get_workload(outs) + workload = get_workload(outs, task_name) if workload is None: raise RuntimeError("Cannot find workload in attribute of this schedule") tgt = _target.Target.current() @@ -227,14 +227,16 @@ def wrapper(outs, *args, **kwargs): return _decorate -def get_workload(outs): +def get_workload(outs, task_name=None): """Retrieve the workload from outputs""" def traverse(tensors): """traverse all ops to find attached workload""" for t in tensors: op = t.op if 'workload' in op.attrs: - return args_to_workload(op.attrs['workload']) + ret = args_to_workload(op.attrs['workload']) + if ret[0] == task_name: + return ret wkl = traverse(op.input_tensors) if wkl: return wkl diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index cfc1b2c38f85..6c53be582b40 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -137,7 +137,7 @@ def __del__(self): format_si_prefix(0, si_prefix) if logger.level < logging.DEBUG: # only print progress bar in non-debug mode - sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) ' + sys.stdout.write('\r%s Current/Best: 
%7.4f/%7.4f GFLOPS | Progress: (%d/%d) ' '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic)) sys.stdout.flush() @@ -153,7 +153,7 @@ def _callback(tuner, inputs, results): ctx.cur_flops = flops ctx.best_flops = tuner.best_flops - sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f %sFLOPS | Progress: (%d/%d) ' + sys.stdout.write('\r%s Current/Best: %7.4f/%7.4f %sFLOPS | Progress: (%d/%d) ' '| %.2f s' % (prefix, format_si_prefix(ctx.cur_flops, si_prefix), format_si_prefix(ctx.best_flops, si_prefix), si_prefix, diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 1dd431ac2785..4c3a2378e9d4 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -51,9 +51,7 @@ register_broadcast_schedule("sign") register_broadcast_schedule("abs") register_broadcast_schedule("tanh") -# NOTE(zhanghao): use customized add schedule -register_add_schedule("add") -# register_broadcast_schedule("add") +register_broadcast_schedule("add") register_broadcast_schedule("subtract") register_broadcast_schedule("multiply") register_broadcast_schedule("divide") @@ -82,10 +80,7 @@ register_broadcast_schedule("isinf") register_injective_schedule("maximum") register_injective_schedule("minimum") -# NOTE(zhanghao): use customized add schedule -# TODO(zhanghao): change the schedule name -register_add_schedule("right_shift") -# register_injective_schedule("right_shift") +register_injective_schedule("right_shift") register_injective_schedule("left_shift") register_injective_schedule("shape_of") register_injective_schedule("ndarray_size") diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 5056825d007c..8ef51cf595fc 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -240,20 +240,6 @@ def register_injective_schedule(op_name, level=10): return register_schedule(op_name, _schedule_injective, level) -def register_add_schedule(op_name, level=10): - """Register schedule function for add. - - Parameters - ---------- - op_name : str - The name of the op. - - level : int - The priority level - """ - return register_schedule(op_name, _schedule_add, level) - - def register_broadcast_schedule(op_name, level=10): """Register broadcast schedule function for an op. 
@@ -409,12 +395,6 @@ def register_external_compiler(op_name, fexternal=None, level=10): -def schedule_add(attrs, outputs, target): - """Generic schedule for add.""" - with target: - return topi.generic.schedule_add(outputs) - - @tvm._ffi.register_func("relay.op.compiler._lower") def _lower(name, schedule, inputs, outputs): return lower(schedule, list(inputs) + list(outputs), name=name) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 48944474e272..025d67630cf9 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -33,6 +33,14 @@ def wrapper(attrs, outs, target): return topi_schedule(outs) return wrapper + +def wrap_topi_compute(topi_compute): + """Wrap TOPI schedule which doesn't use attrs""" + def wrapper(attrs, inputs, out_type): + return [topi_compute(*inputs)] + return wrapper + + def get_conv2d_in_channels(data_shape, data_layout): """Get conv2d input channels""" data_shape = get_const_tuple(data_shape) @@ -69,12 +77,6 @@ def schedule_injective(attrs, outs, target): with target: return topi.generic.schedule_injective(outs) -@generic_func -def schedule_add(attrs, outputs, target): - """Generic schedule for add.""" - with target: - return topi.generic.schedule_add(outputs) - @generic_func def schedule_reduce(attrs, outs, target): """Schedule reduction ops""" @@ -83,7 +85,6 @@ def schedule_reduce(attrs, outs, target): _op._schedule_injective = schedule_injective _op._schedule_reduce = schedule_reduce -_op._schedule_add = schedule_add # concatenate @generic_func diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 3b0b1b39c62c..37fb0108f111 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -230,7 +230,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> << "Two complicated op in a primitive function " << " master=" << master_op_ << " current=" << op; } - if (op_pattern >= master_op_pattern_) { + if (op_pattern > master_op_pattern_) { master_op_ = op; master_attrs_ = call_node->attrs; master_op_pattern_ = op_pattern; @@ -288,7 +288,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> tvm::Target target_; Op master_op_; Attrs master_attrs_; - int master_op_pattern_{0}; + int master_op_pattern_{-1}; OpImplementation master_implementation_; std::ostringstream readable_name_stream_; Array scalars_; diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 8aae9a3c5f14..fa6aee4864ec 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -63,9 +63,5 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@tvm.target.generic_func -def schedule_add(outs): - return schedule_injective(outs) - schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 010daaedf2bc..ae9ca1a90142 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -20,6 +20,7 @@ import tvm from tvm import te +from tvm import autotvm import topi from tvm.relay.op import op as reg @@ -63,6 +64,157 @@ def clip_strategy_vta(attrs, inputs, out_type, target): reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta") + +@autotvm.register_topi_compute("add.vta") +def add_packed(cfg, lhs, rhs): + ret = topi.add(lhs, rhs) + return ret + + +@autotvm.register_topi_compute("multiply.vta") +def 
multiply_packed(cfg, lhs, rhs): + return topi.multiply(lhs, rhs) + + +@autotvm.register_topi_compute("copy.vta") +def copy_packed(cfg, i): + return topi.identify(i) + + +def schedule_alu_packed(cfg, outs): + assert len(outs) == 1 + + def is_cast_op(op): + # return op.same_as(Op.op.get("cast")) + # FIXME(zhanghao): find a better way to do compare + return op.name == 'T_cast' + + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + output = outs[0] + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) + # s[output].fuse(s[output].op.axis) + + # only put the int-related ops to vta + if "int" in output.dtype and len(output.shape) == 6: + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + for _, t in ewise_inputs: + if t.dtype == 'float32': + return s + + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + + cfg.define_split('tile_co', x_co, num_outputs=2) + cfg.define_split('tile_h', x_i, num_outputs=2) + cfg.define_split('tile_w', x_j, num_outputs=2) + + x_co_max = topi.util.get_const_int(x_bo.dom.extent) + x_i_max = topi.util.get_const_int(x_i.dom.extent) + x_j_max = topi.util.get_const_int(x_j.dom.extent) + + x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) + x_i0, x_i1 = cfg['tile_h'].apply(s, output, x_i) + x_j0, x_j1 = cfg['tile_w'].apply(s, output, x_j) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + + env = get_env() + for eo in ewise_ops: + s[eo].set_scope(env.acc_scope) + s[eo].pragma(s[eo].op.axis[0], env.alu) + s[eo].compute_at(s[output], store_pt) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + for tensor in cache_read_ewise: + if s[tensor].op.axis: + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + s[tensor].compute_at(s[output], store_pt) + + for op in const_ops: + s[op].compute_inline() + + s[output].pragma(x_co1, env.dma_copy) + + return s + + +@autotvm.register_topi_schedule("add.vta") +def schedule_add_packed(cfg, outs): + return schedule_alu_packed(cfg, outs) + + +@autotvm.register_topi_schedule("multiply.vta") +def schedule_multiply_packed(cfg, outs): + return schedule_alu_packed(cfg, outs) + + +@autotvm.register_topi_schedule("copy.vta") +def schedule_copy_packed(cfg, outs): + return schedule_alu_packed(cfg, outs) + + +def add_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_topi_compute(add_packed), + _strategy.wrap_topi_schedule(schedule_add_packed), + name="add.vta") + return strategy + + +def multiply_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_topi_compute(multiply_packed), + _strategy.wrap_topi_schedule(schedule_multiply_packed), + name="multiply.vta") + return strategy + + +def 
copy_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_topi_compute(copy_packed), + _strategy.wrap_topi_schedule(schedule_copy_packed), + name="copy.vta") + return strategy + + +reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") +reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") +reg.get("copy").get_attr("FTVMStrategy").register(copy_strategy_vta, "vta") + @_strategy.conv2d_strategy.register("vta") def conv2d_strategy_vta(attrs, inputs, out_type, target): """conv2d vta strategy""" diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 40e2530ef63c..525d60ae383d 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -190,102 +190,4 @@ def _traverse(op): return s -# FIXME(zhanghao): move this code to a proper location -@topi.generic.schedule_add.register(["vta"]) -def _schedule_add(outs): - assert len(outs) == 1 - - def is_cast_op(op): - # return op.same_as(Op.op.get("cast")) - # FIXME(zhanghao): find a better way to do compare - return op.name == 'T_cast' - outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - output = outs[0] - s = te.create_schedule([x.op for x in outs]) - te.schedule.AutoInlineInjective(s) - # s[output].fuse(s[output].op.axis) - - # only put the int-related ops to vta - if "int" in output.dtype: - ewise_inputs = [] - ewise_ops = [] - const_ops = [] - - def _traverse(op): - if topi.tag.is_broadcast(op.tag): - if not op.same_as(output.op): - if not op.axis: - const_ops.append(op) - elif not is_cast_op(op): - ewise_ops.append(op) - - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.te.PlaceholderOp): - ewise_inputs.append((op, tensor)) - elif is_cast_op(tensor.op) and not op.same_as(output.op): - ewise_inputs.append((op, tensor)) - else: - _traverse(tensor.op) - else: - for tensor in op.input_tensors: - if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ - and (not is_cast_op(tensor.op)): - _traverse(tensor.op) - - op = output.op - _traverse(op) - x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis - - x_co_max = topi.util.get_const_int(x_bo.dom.extent) - x_i_max = topi.util.get_const_int(x_i.dom.extent) - x_j_max = topi.util.get_const_int(x_j.dom.extent) - - # TODO(zhanghao): auto-tune - x_co0, x_co1 = s[output].split(x_co, factor=1) - - from functools import reduce - def factors(n): - return sorted(set(reduce(list.__add__, - ([i, n//i] for i in range(1, int(n**0.5) + 1) if n % i == 0)))) - - # FIXME(zhanghao): use auto-tune - i_factors = factors(x_i_max) - i_factor = i_factors[-1] - while i_factor > 28: - del i_factors[-1] - i_factor = i_factors[-1] - - j_factors = factors(x_j_max) - j_factor = j_factors[-1] - while j_factor > 14: - del j_factors[-1] - j_factor = j_factors[-1] - - x_i0, x_i1 = s[output].split(x_i, factor=i_factor) - x_j0, x_j1 = s[output].split(x_j, factor=j_factor) - s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) - store_pt = x_j0 - - env = get_env() - for eo in ewise_ops: - s[eo].set_scope(env.acc_scope) - s[eo].pragma(s[eo].op.axis[0], env.alu) - s[eo].compute_at(s[output], store_pt) - - # cache read input - cache_read_ewise = [] - for consumer, tensor in ewise_inputs: - cache_read_ewise.append( - s.cache_read(tensor, env.acc_scope, [consumer])) - - for tensor in cache_read_ewise: - s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) - s[tensor].compute_at(s[output], store_pt) - - for op in const_ops: - 
s[op].compute_inline() - - s[output].pragma(x_co1, env.dma_copy) - - return s diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py new file mode 100644 index 000000000000..8a9a09c76856 --- /dev/null +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -0,0 +1,341 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-tuning a ALU fused op on VTA +""" + +import os +from mxnet.gluon.model_zoo import vision +import numpy as np +from PIL import Image + +import topi +import tvm +from tvm import te +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.autotvm import record + +import vta +from vta.testing import simulator +from vta.top import graph_pack +import copy + +from tvm.contrib.util import eprint + +################################################################# +# Compile network +# --------------- +# Perform vta-specific compilation with Relay from a Gluon model +def compile_network(env, target, model, start_pack, stop_pack, device_annot=False): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): + mod = relay.quantize.quantize(mod, params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack(mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=start_pack, + stop_name=stop_pack, + device_annot=device_annot) + + return relay_prog, params + + +########################################### +# Set Tuning Options +# ------------------ +# Before tuning, we should apply some configurations. +# Here we use an Pynq-Z1 board as an example. + +# Tracker host and port can be set by your environment +tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0') +tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190)) + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# This target is used for cross compilation. 
You can query it by :code:`gcc -v` on your device. +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu + +# Name of Gluon model to compile +# The ``start_pack`` and ``stop_pack`` labels indicate where +# to start and end the graph packing relay pass: in other words +# where to start and finish offloading to VTA. +network = "resnet50_v2" +start_pack = "nn.max_pool2d" +stop_pack = "nn.global_avg_pool2d" + +# Tuning option +log_file = "%s.%s.log" % (device, network) +tuning_option = { + 'log_filename': log_file, + + 'tuner': 'random', + 'n_trial': 1000, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(), + runner=autotvm.RPCRunner(env.TARGET, + host=tracker_host, + port=tracker_port, + number=5, + timeout=60, + check_correctness=True), + ), +} + + +def log_to_file(file_out, protocol='json'): + """Log the tuning records into file. + The rows of the log are stored in the format of autotvm.record.encode. + for lhs == rhs, we add an extra rhs = [] record + + Parameters + ---------- + file_out : str + The file to log to. + protocol: str, optional + The log protocol. Can be 'json' or 'pickle' + + Returns + ------- + callback : callable + Callback function to do the logging. + """ + def _callback(_, inputs, results): + with open(file_out, "a") as f: + for inp, result in zip(inputs, results): + eprint("inp = {}, result = {}".format(inp, result)) + f.write(record.encode(inp, result, protocol) + "\n") + + # we only consider task with same lhs and rhs + if inp.task.args[0] == inp.task.args[1]: + args = list(inp.task.args) + args[1] = (args[0][0], (), args[0][2]) + inp_copy = copy.deepcopy(inp) + inp_copy.task.args = tuple(args) + f.write(record.encode(inp_copy, result, protocol) + "\n") + + return _callback + + +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=10, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True): + + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'xgb_knob': + tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + tsk_trial = min(n_trial, len(tsk.config_space)) + tuner_obj.tune(n_trial=tsk_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(tsk_trial, prefix=prefix), + log_to_file(tmp_log_file) + ]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + + +######################################################################## +# Register VTA-specific tuning tasks +def register_vta_tuning_tasks(): + from tvm.autotvm.task import TaskExtractEnv + + @tvm.te.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's 
current clip, put min and max into two stages.""" + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.register_customized_task("add.vta") + def _topi_add(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + A, B = args[:2] + + with tvm.target.vta(): + res = vta.top.op.add_packed(*args, **kwargs) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.Target.current().device_name == 'vta': + s = vta.top.op.schedule_add_packed([res]) + else: + s = te.create_schedule([res.op]) + return s, [A, B, res] + + @autotvm.register_customized_task("multiply.vta") + def _topi_multiply(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + A, B = args[:2] + + with tvm.target.vta(): + res = vta.top.op.multiply_packed(*args, **kwargs) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.Target.current().device_name == 'vta': + s = vta.top.op.schedule_multiply_packed([res]) + else: + s = te.create_schedule([res.op]) + return s, [A, B, res] + + @autotvm.register_customized_task("copy.vta") + def _topi_identity(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + A = args[0] + + with tvm.target.vta(): + res = vta.top.op.copy_packed(*args, **kwargs) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.Target.current().device_name == 'vta': + s = vta.top.op.schedule_copy_packed([res]) + else: + s = te.create_schedule([res.op]) + return s, [A, res] + + +######################################################################## +# Finally, we launch tuning jobs and evaluate the end-to-end performance. +def tune_and_evaluate(tuning_opt): + + if env.TARGET != "sim": + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, + tracker_host, + tracker_port, + timeout=10000) + # Reconfigure the JIT runtime and FPGA. + bitstream = os.environ.get("TVM_BIT", None) + if bitstream: + print("Program fpga with {}".format(bitstream)) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream) + else: + # In simulation mode, host the RPC server locally. 
+ remote = rpc.LocalSession() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Perform task extraction on Relay program + print("Extract tasks...") + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) + mod = tvm.IRModule.from_expr(relay_prog) + tasks = autotvm.task.extract_from_program(mod, + params=params, + ops=(relay.op.get("add"), relay.op.get("multiply"),), + target=target, + target_host=env.target_host) + + # filter out non-packed alu task + tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) + # filter out float alu task + tasks = list(filter(lambda t: t.args[0][2] != "float32", tasks)) + # filter const rhs, which will be fused with conv2d + # tasks = list(filter(lambda t: len(t.args[1][1]) < 1, tasks)) + + # We should have extracted 10 convolution tasks + tasks_set = {} + print("Extracted {} alu tasks:".format(len(tasks))) + for tsk in tasks: + print("tsk = ", tsk) + + if len(tsk.args[1][1]) == 0: + args = list(tsk.args) + args[1] = args[0] + tsk.args = tuple(args) + + if (tsk.name, tsk.args) in tasks_set: + print("task {} already exists".format(tsk)) + tasks_set[(tsk.name, tsk.args)] = tsk + + tasks = list(tasks_set.values()) + print("After merged, final #tasks={}, tasks = {}".format(len(tasks), tasks)) + + # run tuning tasks + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + +# Run the tuning and evaluate the results +tune_and_evaluate(tuning_option) From a7526389f104d9f92c4b2e510bc44040cf2a20f5 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Fri, 22 May 2020 16:01:01 +0800 Subject: [PATCH 22/44] bugfix: make get_workload consistent with master_op selection --- python/tvm/autotvm/task/topi_integration.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index d7fa69d571b1..25d1156e2af8 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -233,13 +233,15 @@ def traverse(tensors): """traverse all ops to find attached workload""" for t in tensors: op = t.op + wkl = traverse(op.input_tensors) + if wkl: + return wkl + if 'workload' in op.attrs: ret = args_to_workload(op.attrs['workload']) if ret[0] == task_name: return ret - wkl = traverse(op.input_tensors) - if wkl: - return wkl return None + outs = [outs] if isinstance(outs, tensor.Tensor) else outs return traverse(outs) From 1b7aa5889d2dafcdd0b4195b350dcc2bd719688a Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 10 Jun 2020 18:47:53 +0800 Subject: [PATCH 23/44] some fixes after rebase with master --- cmake/modules/VTA.cmake | 3 +- include/tvm/relay/transform.h | 1 + python/tvm/contrib/util.py | 5 + src/relay/quantize/realize.cc | 4 +- src/relay/transforms/device_annotation.cc | 7 +- vta/config/vta_cost.py | 102 ---- vta/python/vta/transform.py | 12 + vta/src/intelfocl/AOCLUtils/aocl_utils.h | 32 - vta/src/intelfocl/AOCLUtils/opencl.cpp | 555 ------------------ vta/src/intelfocl/AOCLUtils/opencl.h | 122 ---- vta/src/intelfocl/AOCLUtils/options.cpp | 105 ---- vta/src/intelfocl/AOCLUtils/options.h | 137 ----- vta/src/intelfocl/AOCLUtils/scoped_ptrs.h | 165 ------ vta/src/intelfocl/intelfocl_device.cc | 181 ------ vta/src/intelfocl/intelfocl_device.h | 53 -- vta/src/intelfocl/intelfocl_driver.cc | 74 --- vta/src/pynq/pynq_driver.cc | 167 ------ .../frontend/deploy_classification.py | 8 +- 18 files changed, 28 insertions(+), 1705 deletions(-) delete mode 100644 vta/config/vta_cost.py delete mode 
100644 vta/src/intelfocl/AOCLUtils/aocl_utils.h delete mode 100644 vta/src/intelfocl/AOCLUtils/opencl.cpp delete mode 100644 vta/src/intelfocl/AOCLUtils/opencl.h delete mode 100644 vta/src/intelfocl/AOCLUtils/options.cpp delete mode 100644 vta/src/intelfocl/AOCLUtils/options.h delete mode 100644 vta/src/intelfocl/AOCLUtils/scoped_ptrs.h delete mode 100644 vta/src/intelfocl/intelfocl_device.cc delete mode 100644 vta/src/intelfocl/intelfocl_device.h delete mode 100644 vta/src/intelfocl/intelfocl_driver.cc delete mode 100644 vta/src/pynq/pynq_driver.cc diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 33fe0016fe4a..371bd27fa80e 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -103,7 +103,7 @@ elseif(PYTHON) file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cpp) list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) - list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc vta/src/vmem/virtual_memory.h) + list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h) endif() # Target lib: vta add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) @@ -124,6 +124,7 @@ elseif(PYTHON) "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") + target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() endif() diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 61eb6dd50ce2..a7f5fea98ea2 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -384,6 +384,7 @@ TVM_DLL Expr Bind(const Expr& expr, const tvm::Map& binds); * \note this function mutates mod and is not thread-safe. */ TVM_DLL Function InferType(const Function& f, const IRModule& mod, const GlobalVar& var); +TVM_DLL Expr InferType(const Expr& expr, const IRModule& mod); /*! * \brief Apply rewrite rules to rewrite the expr in post DFS order. 
This diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 474741fc1e35..20854ab3fb27 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -29,6 +29,11 @@ fcntl = None +def eprint(*args, **kwargs): + # return + print(*args, file=sys.stderr, flush=True, **kwargs) + + class DirectoryCreatedPastAtExit(Exception): """Raised when a TempDirectory is created after the atexit hook runs.""" diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 07e61de82958..b71249c8c755 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -245,11 +245,11 @@ Expr Conv2dTransposeRealize(const Call& ref_call, DataType out_dtype = cfg->dtype_activation; attrs->out_dtype = out_dtype; - Expr ret = CallNode::make(ref_call->op, + Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args); Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); Expr dom_scale = FoldConstantOpt(mul); - return QRealizeIntExprNode::make(ret, dom_scale, out_dtype); + return QRealizeIntExpr(ret, dom_scale, out_dtype); } RELAY_REGISTER_OP("nn.conv2d_transpose") diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 4862a999b85c..3609ee0bacc4 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -540,7 +540,7 @@ class AddDeviceCopy : public ExprMutator { attrs->src_dev_type = src_dev_type; attrs->dst_dev_type = dst_dev_type; static const Op& op = Op::Get("device_copy"); - Call device_copy = CallNode::make(op, {this->Mutate(arg)}, Attrs(attrs), {}); + Call device_copy = Call(op, {this->Mutate(arg)}, Attrs(attrs), {}); device_copy->checked_type_ = arg->checked_type_; call_args.push_back(device_copy); } else { @@ -548,7 +548,7 @@ class AddDeviceCopy : public ExprMutator { } } - auto ret = CallNode::make(call_node->op, call_args, call_node->attrs, call_node->type_args); + auto ret = Call(call_node->op, call_args, call_node->attrs, call_node->type_args); // manually add the checked_type_ // alternatively, can call InferType Pass after this ret->checked_type_ = call_node->checked_type_; @@ -641,8 +641,7 @@ Pass AddDeviceCopyOps() { [=](Function f, IRModule m, PassContext pc) { return Downcast(AddDeviceCopyOps(f)); }; - return CreateFunctionPass(pass_func, 1, "AddDeviceCopyOps", - {tir::StringImmNode::make("InferType")}); + return CreateFunctionPass(pass_func, 1, "AddDeviceCopyOps", {"InferType"}); } TVM_REGISTER_GLOBAL("relay._transform.AddDeviceCopy") diff --git a/vta/config/vta_cost.py b/vta/config/vta_cost.py deleted file mode 100644 index 9e1d7389b8c3..000000000000 --- a/vta/config/vta_cost.py +++ /dev/null @@ -1,102 +0,0 @@ -# cost function for intelfocl 32*32 gemm version -def cal_cost(insn): - """ - Cal the runtime cost statically - - Parameters - ------------ - insn: the insn (json) - - Returns - ------------ - the cost in s - """ - factor = 1000000.0 - def alu_imm_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = x + 46 - return cycles / factor - - def alu_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = 2 * x + 46 - return cycles / factor - - def gemm_cost(iter_out, iter_in, uop_bgn, uop_end): - x = (uop_end - uop_bgn) * iter_out * iter_in - cycles = x + 80 - return cycles / factor - - def load_acc_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = x + 150 - return cycles / factor - - 
def load_acc8_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 1.2 * x + 150 - return cycles / factor - - def load_inp_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = (x_size + x_pad_0 + x_pad_1) * (y_size + y_pad_0 + y_pad_1) - cycles = 1.1 * x + 150 - return cycles / factor - - def load_uop_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 1.1 * x + 150 - return cycles / factor - - def load_wgt_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = 17 * x + 150 - return cycles / factor - - def store_cost(y_size, y_pad_0, y_pad_1, x_size, x_pad_0, x_pad_1): - x = x_size * y_size - cycles = x + 150 - return cycles / factor - - def nop_cost(name): - if name == "NOP-COMPUTE-STAGE": - return 38 / factor - elif name == "NOP-MEMORY-STAGE": - return 50 / factor - elif name == "NOP-STORE-STAGE": - return 39 / factor - else: - print("Unknown nop op {}".format(name)) - return 0 - - if insn['type'] == "ALU": - return alu_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['type'] == "ALU IMM": - return alu_imm_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['type'] == "GEMM": - return gemm_cost(insn['outer_loop'][0], insn['inner_loop'][0], - insn['range'][0], insn['range'][1]) - elif insn['name'] == "LOAD INP": - return load_inp_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD WGT": - return load_wgt_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD UOP": - return load_uop_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD ACC": - return load_acc_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "LOAD ACC 8": - return load_acc8_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['name'] == "STORE": - return store_cost(insn['y'][0], insn['y'][1], insn['y'][2], - insn['x'][0], insn['x'][1], insn['x'][2]) - elif insn['type'] == "NOP": - return nop_cost(insn['name']) - else: - print("Unknown op type: {}".format(insn['type'])) - return 0 diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index 207f784b5885..3b13c1769103 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -548,10 +548,22 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): allow_fold = True _check_compact(dst) + + # for int8 -> int32 cast/load + orig_dtype = src.dtype + if src.dtype != data_type: + assert(data_type == "int%d" % env.ACC_WIDTH and \ + src.dtype == "int%d" % env.INP_WIDTH) + src.dtype = data_type + x_size, y_size, x_stride, offset = _get_2d_pattern( src, elem_width, elem_bytes, data_type, dst.scope, allow_fold=allow_fold) + if orig_dtype != src.dtype: + src.dtype = orig_dtype + mem_type = env.dev.MEM_ID_ACC_8 + irb = tvm.tir.ir_builder.create() irb.scope_attr(env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(task_qid)) diff --git a/vta/src/intelfocl/AOCLUtils/aocl_utils.h b/vta/src/intelfocl/AOCLUtils/aocl_utils.h deleted file mode 100644 index 70e0fc6bcc0a..000000000000 --- a/vta/src/intelfocl/AOCLUtils/aocl_utils.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, 
California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -// Main include file for AOCLUtils. Includes all other utility header files. - -#ifndef AOCL_UTILS_H -#define AOCL_UTILS_H - -#include "opencl.h" -#include "scoped_ptrs.h" -#include "options.h" - -#endif - diff --git a/vta/src/intelfocl/AOCLUtils/opencl.cpp b/vta/src/intelfocl/AOCLUtils/opencl.cpp deleted file mode 100644 index 04d989d7c9ea..000000000000 --- a/vta/src/intelfocl/AOCLUtils/opencl.cpp +++ /dev/null @@ -1,555 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -#include "aocl_utils.h" -#include -#include - -#ifdef _WIN32 // Windows -#include -#else // Linux -#include -#include // readlink, chdir -#endif - -namespace aocl_utils { - -static const char *const VERSION_STR = "161"; - -////////////////////////////////////////// -// Host allocation functions for alignment -////////////////////////////////////////// - -// This is the minimum alignment requirement to ensure DMA can be used. 
-const unsigned AOCL_ALIGNMENT = 64; - -#ifdef _WIN32 // Windows -void *alignedMalloc(size_t size) { - return _aligned_malloc (size, AOCL_ALIGNMENT); -} - -void alignedFree(void * ptr) { - _aligned_free(ptr); -} -#else // Linux -void *alignedMalloc(size_t size) { - void *result = NULL; - int rc; - rc = posix_memalign (&result, AOCL_ALIGNMENT, size); - (void) rc; - return result; -} - -void alignedFree(void * ptr) { - free (ptr); -} -#endif - -/////////////////////////////// -// Error functions -/////////////////////////////// - -// Print the error associciated with an error code -void printError(cl_int error) { - // Print error message - switch(error) - { - case -1: - printf("CL_DEVICE_NOT_FOUND "); - break; - case -2: - printf("CL_DEVICE_NOT_AVAILABLE "); - break; - case -3: - printf("CL_COMPILER_NOT_AVAILABLE "); - break; - case -4: - printf("CL_MEM_OBJECT_ALLOCATION_FAILURE "); - break; - case -5: - printf("CL_OUT_OF_RESOURCES "); - break; - case -6: - printf("CL_OUT_OF_HOST_MEMORY "); - break; - case -7: - printf("CL_PROFILING_INFO_NOT_AVAILABLE "); - break; - case -8: - printf("CL_MEM_COPY_OVERLAP "); - break; - case -9: - printf("CL_IMAGE_FORMAT_MISMATCH "); - break; - case -10: - printf("CL_IMAGE_FORMAT_NOT_SUPPORTED "); - break; - case -11: - printf("CL_BUILD_PROGRAM_FAILURE "); - break; - case -12: - printf("CL_MAP_FAILURE "); - break; - case -13: - printf("CL_MISALIGNED_SUB_BUFFER_OFFSET "); - break; - case -14: - printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST "); - break; - - case -30: - printf("CL_INVALID_VALUE "); - break; - case -31: - printf("CL_INVALID_DEVICE_TYPE "); - break; - case -32: - printf("CL_INVALID_PLATFORM "); - break; - case -33: - printf("CL_INVALID_DEVICE "); - break; - case -34: - printf("CL_INVALID_CONTEXT "); - break; - case -35: - printf("CL_INVALID_QUEUE_PROPERTIES "); - break; - case -36: - printf("CL_INVALID_COMMAND_QUEUE "); - break; - case -37: - printf("CL_INVALID_HOST_PTR "); - break; - case -38: - printf("CL_INVALID_MEM_OBJECT "); - break; - case -39: - printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR "); - break; - case -40: - printf("CL_INVALID_IMAGE_SIZE "); - break; - case -41: - printf("CL_INVALID_SAMPLER "); - break; - case -42: - printf("CL_INVALID_BINARY "); - break; - case -43: - printf("CL_INVALID_BUILD_OPTIONS "); - break; - case -44: - printf("CL_INVALID_PROGRAM "); - break; - case -45: - printf("CL_INVALID_PROGRAM_EXECUTABLE "); - break; - case -46: - printf("CL_INVALID_KERNEL_NAME "); - break; - case -47: - printf("CL_INVALID_KERNEL_DEFINITION "); - break; - case -48: - printf("CL_INVALID_KERNEL "); - break; - case -49: - printf("CL_INVALID_ARG_INDEX "); - break; - case -50: - printf("CL_INVALID_ARG_VALUE "); - break; - case -51: - printf("CL_INVALID_ARG_SIZE "); - break; - case -52: - printf("CL_INVALID_KERNEL_ARGS "); - break; - case -53: - printf("CL_INVALID_WORK_DIMENSION "); - break; - case -54: - printf("CL_INVALID_WORK_GROUP_SIZE "); - break; - case -55: - printf("CL_INVALID_WORK_ITEM_SIZE "); - break; - case -56: - printf("CL_INVALID_GLOBAL_OFFSET "); - break; - case -57: - printf("CL_INVALID_EVENT_WAIT_LIST "); - break; - case -58: - printf("CL_INVALID_EVENT "); - break; - case -59: - printf("CL_INVALID_OPERATION "); - break; - case -60: - printf("CL_INVALID_GL_OBJECT "); - break; - case -61: - printf("CL_INVALID_BUFFER_SIZE "); - break; - case -62: - printf("CL_INVALID_MIP_LEVEL "); - break; - case -63: - printf("CL_INVALID_GLOBAL_WORK_SIZE "); - break; - default: - printf("UNRECOGNIZED ERROR CODE (%d)", error); - } -} - 
-// Print line, file name, and error code if there is an error. Exits the -// application upon error. -void _checkError(int line, - const char *file, - cl_int error, - const char *msg, - ...) { - // If not successful - if(error != CL_SUCCESS) { - // Print line and file - printf("ERROR: "); - printError(error); - printf("\nLocation: %s:%d\n", file, line); - - // Print custom message. - va_list vl; - va_start(vl, msg); - vprintf(msg, vl); - printf("\n"); - va_end(vl); - - // Cleanup and bail. - cleanup(); - exit(error); - } -} - -// Sets the current working directory to be the same as the directory -// containing the running executable. -bool setCwdToExeDir() { -#ifdef _WIN32 // Windows - HMODULE hMod = GetModuleHandle(NULL); - char path[MAX_PATH]; - GetModuleFileNameA(hMod, path, MAX_PATH); - -#else // Linux - // Get path of executable. - char path[300]; - ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1); - if(n == -1) { - return false; - } - path[n] = 0; -#endif - - // Find the last '\' or '/' and terminate the path there; it is now - // the directory containing the executable. - size_t i; - for(i = strlen(path) - 1; i > 0 && path[i] != '/' && path[i] != '\\'; --i); - path[i] = '\0'; - - // Change the current directory. -#ifdef _WIN32 // Windows - SetCurrentDirectoryA(path); -#else // Linux - int rc; - rc = chdir(path); - (void) rc; -#endif - - return true; -} - -// Searches all platforms for the first platform whose name -// contains the search string (case-insensitive). -cl_platform_id findPlatform(const char *platform_name_search) { - cl_int status; - - std::string search = platform_name_search; - std::transform(search.begin(), search.end(), search.begin(), tolower); - - // Get number of platforms. - cl_uint num_platforms; - status = clGetPlatformIDs(0, NULL, &num_platforms); - checkError(status, "Query for number of platforms failed"); - - // Get a list of all platform ids. - scoped_array pids(num_platforms); - status = clGetPlatformIDs(num_platforms, pids, NULL); - checkError(status, "Query for all platform ids failed"); - - // For each platform, get name and compare against the search string. - for(unsigned i = 0; i < num_platforms; ++i) { - std::string name = getPlatformName(pids[i]); - - // Convert to lower case. - std::transform(name.begin(), name.end(), name.begin(), tolower); - - if(name.find(search) != std::string::npos) { - // Found! - return pids[i]; - } - } - - // No platform found. - return NULL; -} - -// Returns the platform name. -std::string getPlatformName(cl_platform_id pid) { - cl_int status; - - size_t sz; - status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz); - checkError(status, "Query for platform name size failed"); - - scoped_array name(sz); - status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL); - checkError(status, "Query for platform name failed"); - - return name.get(); -} - -// Returns the device name. -std::string getDeviceName(cl_device_id did) { - cl_int status; - - size_t sz; - status = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &sz); - checkError(status, "Failed to get device name size"); - - scoped_array name(sz); - status = clGetDeviceInfo(did, CL_DEVICE_NAME, sz, name, NULL); - checkError(status, "Failed to get device name"); - - return name.get(); -} - -// Returns the list of all devices. 
-cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) { - cl_int status; - - status = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices); - checkError(status, "Query for number of devices failed"); - - cl_device_id *dids = new cl_device_id[*num_devices]; - status = clGetDeviceIDs(pid, dev_type, *num_devices, dids, NULL); - checkError(status, "Query for device ids"); - - return dids; -} - -// Create a program for all devices associated with the context. -cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) { - // Early exit for potentially the most common way to fail: AOCX does not exist. - if(!fileExists(binary_file_name)) { - printf("AOCX file '%s' does not exist.\n", binary_file_name); - checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); - } - - // Load the binary. - size_t binary_size; - scoped_array binary(loadBinaryFile(binary_file_name, &binary_size)); - if(binary == NULL) { - checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); - } - - scoped_array binary_lengths(num_devices); - scoped_array binaries(num_devices); - for(unsigned i = 0; i < num_devices; ++i) { - binary_lengths[i] = binary_size; - binaries[i] = binary; - } - - cl_int status; - scoped_array binary_status(num_devices); - - cl_program program = clCreateProgramWithBinary(context, num_devices, devices, binary_lengths, - (const unsigned char **) binaries.get(), binary_status, &status); - checkError(status, "Failed to create program with binary"); - for(unsigned i = 0; i < num_devices; ++i) { - checkError(binary_status[i], "Failed to load binary for device"); - } - - return program; -} - -// Loads a file in binary form. -unsigned char *loadBinaryFile(const char *file_name, size_t *size) { - // Open the File - FILE* fp; -#ifdef _WIN32 - if(fopen_s(&fp, file_name, "rb") != 0) { - return NULL; - } -#else - fp = fopen(file_name, "rb"); - if(fp == 0) { - return NULL; - } -#endif - - // Get the size of the file - fseek(fp, 0, SEEK_END); - *size = ftell(fp); - - // Allocate space for the binary - unsigned char *binary = new unsigned char[*size]; - - // Go back to the file start - rewind(fp); - - // Read the file into the binary - if(fread((void*)binary, *size, 1, fp) == 0) { - delete[] binary; - fclose(fp); - return NULL; - } - - return binary; -} - -bool fileExists(const char *file_name) { -#ifdef _WIN32 // Windows - DWORD attrib = GetFileAttributesA(file_name); - return (attrib != INVALID_FILE_ATTRIBUTES && !(attrib & FILE_ATTRIBUTE_DIRECTORY)); -#else // Linux - return access(file_name, R_OK) != -1; -#endif -} - -std::string getBoardBinaryFile(const char *prefix, cl_device_id device) { - // First check if .aocx exists. Use it if it does. - std::string file_name = std::string(prefix) + ".aocx"; - if(fileExists(file_name.c_str())) { - return file_name; - } - - // Now get the name of the board. For Intel(R) FPGA SDK for OpenCL(TM) boards, - // the name of the device is presented as: - // : ... - std::string device_name = getDeviceName(device); - - // Now search for the " :" in the device name. - size_t end = device_name.find(" :"); - if(end != std::string::npos) { - std::string board_name(device_name, 0, end); - - // Look for a AOCX with the name __.aocx. - file_name = std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx"; - if(fileExists(file_name.c_str())) { - return file_name; - } - } - - // At this point just use .aocx. 
This file doesn't exist - // and this should trigger an error later. - return std::string(prefix) + ".aocx"; -} - -// High-resolution timer. -double getCurrentTimestamp() { -#ifdef _WIN32 // Windows - // Use the high-resolution performance counter. - - static LARGE_INTEGER ticks_per_second = {}; - if(ticks_per_second.QuadPart == 0) { - // First call - get the frequency. - QueryPerformanceFrequency(&ticks_per_second); - } - - LARGE_INTEGER counter; - QueryPerformanceCounter(&counter); - - double seconds = double(counter.QuadPart) / double(ticks_per_second.QuadPart); - return seconds; -#else // Linux - timespec a; - clock_gettime(CLOCK_MONOTONIC, &a); - return (double(a.tv_nsec) * 1.0e-9) + double(a.tv_sec); -#endif -} - -cl_ulong getStartEndTime(cl_event event) { - cl_int status; - - cl_ulong start, end; - status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); - checkError(status, "Failed to query event start time"); - status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); - checkError(status, "Failed to query event end time"); - - return end - start; -} - -cl_ulong getStartEndTime(cl_event *events, unsigned num_events) { - cl_int status; - - cl_ulong min_start = 0; - cl_ulong max_end = 0; - for(unsigned i = 0; i < num_events; ++i) { - cl_ulong start, end; - status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); - checkError(status, "Failed to query event start time"); - status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); - checkError(status, "Failed to query event end time"); - - if(i == 0) { - min_start = start; - max_end = end; - } - else { - if(start < min_start) { - min_start = start; - } - if(end > max_end) { - max_end = end; - } - } - } - - return max_end - min_start; -} - -void waitMilliseconds(unsigned ms) { -#ifdef _WIN32 // Windows - Sleep(ms); -#else // Linux - timespec sleeptime = {0, 0}; - sleeptime.tv_sec = ms / 1000; - sleeptime.tv_nsec = long(ms % 1000) * 1000000L; // convert to nanoseconds - nanosleep(&sleeptime, NULL); -#endif -} - -void oclContextCallback(const char *errinfo, const void *, size_t, void *) { - printf("Context callback: %s\n", errinfo); -} - -} // ns aocl_utils - diff --git a/vta/src/intelfocl/AOCLUtils/opencl.h b/vta/src/intelfocl/AOCLUtils/opencl.h deleted file mode 100644 index 4aa5348b67b1..000000000000 --- a/vta/src/intelfocl/AOCLUtils/opencl.h +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -// OpenCL utility functions. - -#ifndef AOCL_UTILS_OPENCL_H -#define AOCL_UTILS_OPENCL_H - -#include -#include -#include -#include - -#include "CL/opencl.h" - -// This is assumed to be externally provided by the application. -extern void cleanup(); - -namespace aocl_utils { - -// Host allocation functions -void *alignedMalloc(size_t size); -void alignedFree(void *ptr); - -// Error functions -void printError(cl_int error); -void _checkError(int line, - const char *file, - cl_int error, - const char *msg, - ...); // does not return -#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__) - -// Sets the current working directory to the same directory that contains -// this executable. Returns true on success. -bool setCwdToExeDir(); - -// Find a platform that contains the search string in its name (case-insensitive match). -// Returns NULL if no match is found. -cl_platform_id findPlatform(const char *platform_name_search); - -// Returns the name of the platform. -std::string getPlatformName(cl_platform_id pid); - -// Returns the name of the device. -std::string getDeviceName(cl_device_id did); - -// Returns an array of device ids for the given platform and the -// device type. -// Return value must be freed with delete[]. -cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices); - -// Create a OpenCL program from a binary file. -// The program is created for all given devices associated with the context. The same -// binary is used for all devices. -cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices); - -// Load binary file. -// Return value must be freed with delete[]. -unsigned char *loadBinaryFile(const char *file_name, size_t *size); - -// Checks if a file exists. -bool fileExists(const char *file_name); - -// Returns the path to the AOCX file to use for the given device. -// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM). -// It uses the device name to get the board name and then looks for a -// corresponding AOCX file. Specifically, it gets the device name and -// extracts the board name assuming the device name has the following format: -// : ... -// -// Then the AOCX file is __.aocx. If this -// file does not exist, then the file name defaults to .aocx. -std::string getBoardBinaryFile(const char *prefix, cl_device_id device); - -// Returns the time from a high-resolution timer in seconds. This value -// can be used with a value returned previously to measure a high-resolution -// time difference. -double getCurrentTimestamp(); - -// Returns the difference between the CL_PROFILING_COMMAND_END and -// CL_PROFILING_COMMAND_START values of a cl_event object. -// This requires that the command queue associated with the event be created -// with the CL_QUEUE_PROFILING_ENABLE property. -// -// The return value is in nanoseconds. -cl_ulong getStartEndTime(cl_event event); - -// Returns the maximum time span for the given set of events. -// The time span starts at the earliest event start time. 
-// The time span ends at the latest event end time. -cl_ulong getStartEndTime(cl_event *events, unsigned num_events); - -// Wait for the specified number of milliseconds. -void waitMilliseconds(unsigned ms); - -// OpenCL context callback function that simply prints the error information -// to stdout (via printf). -void oclContextCallback(const char *errinfo, const void *, size_t, void *); - -} // ns aocl_utils - -#endif - diff --git a/vta/src/intelfocl/AOCLUtils/options.cpp b/vta/src/intelfocl/AOCLUtils/options.cpp deleted file mode 100644 index 05d025b43faf..000000000000 --- a/vta/src/intelfocl/AOCLUtils/options.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -#include "aocl_utils.h" -#include -#include -#include -#include - -namespace aocl_utils { - -Options::Options() { -} - -Options::Options(int num, char *argv[]) { - addFromCommandLine(num, argv); -} - -bool Options::has(const std::string &name) const { - return m_options.find(name) != m_options.end(); -} - -std::string &Options::get(const std::string &name) { - return m_options[name]; -} - -const std::string &Options::get(const std::string &name) const { - OptionMap::const_iterator it = m_options.find(name); - if(it == m_options.end()) { - errorNonExistent(name); - std::cerr << "Option '" << name << "' does not exist.\n"; - exit(1); - } - return it->second; -} - -void Options::addFromCommandLine(int num, char *argv[]) { - for(int i = 1; i < num; ++i) { - const std::string arg = argv[i]; - - // Look for the first '-'. - if(arg.size() > 1 && arg[0] == '-') { - size_t eq = arg.find('='); - size_t name_start = 1; - - // Check if there's a second '-'. - if(arg.size() > 2 && arg[1] == '-') { - name_start = 2; - } - - if(eq == std::string::npos) { - // No '='; treat as a boolean option. - set(arg.substr(name_start), true); - } - else if(eq == name_start) { - // No name?! - errorNameless(); - } - else { - set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1)); - } - } - else { - // Not an option. 
- m_nonoptions.push_back(arg); - } - } -} - -void Options::errorNameless() const { - std::cerr << "No name provided for option.\n"; - exit(1); -} - -void Options::errorNonExistent(const std::string &name) const { - std::cerr << "Option '" << name << "' does not exist.\n"; - exit(1); -} - -void Options::errorWrongType(const std::string &name) const { - std::cerr << "Value for option '" << name << "' is not of the right type (value = '" - << get(name) << "').\n"; - exit(1); -} - -} // ns aocl_utils - diff --git a/vta/src/intelfocl/AOCLUtils/options.h b/vta/src/intelfocl/AOCLUtils/options.h deleted file mode 100644 index 78d34605e60e..000000000000 --- a/vta/src/intelfocl/AOCLUtils/options.h +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -// Declares a utility class used to parse command-line options. - -#ifndef AOCL_UTILS_OPTIONS_H -#define AOCL_UTILS_OPTIONS_H - -#include -#include -#include -#include - -namespace aocl_utils { - -class Options { -public: - typedef std::vector StringVec; - - Options(); - Options(int num, char *argv[]); - - bool has(const std::string &name) const; - std::string &get(const std::string &name); // will create an empty option if it does not exist - const std::string &get(const std::string &name) const; // error if option does not exist - - void set(const std::string &name, const std::string &value) { get(name) = value; } - - // Command line options must be of the following form: - // [-]-name (indicates option exists) - // [-]-name=value - // - // This function assumes that the values are from main(int, char *). - // This means that the argv[0] is skipped. - void addFromCommandLine(int num, char *argv[]); - - // This templated function converts the option value to the given type. - // An assert is raised if the conversion fails. - template - T get(const std::string &name) const; - - template - void set(const std::string &name, const T &value); - - // Non-options are arguments processed in addFromCommandLine - // that were not recognized as options. 
- const StringVec &getNonOptions() const { return m_nonoptions; } - size_t getNonOptionCount() const { return m_nonoptions.size(); } - const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; } - -private: - typedef std::map OptionMap; - - // Displays an error message indicating that a nameless option - // was provided. - void errorNameless() const; - - // Displays an error message indicating that the given option - // has the wrong type and then exits with an error code. - void errorWrongType(const std::string &name) const; - - // Displays an error message indicating that the given option - // does not exist and then exits with an error code. - void errorNonExistent(const std::string &name) const; - - OptionMap m_options; - StringVec m_nonoptions; - - Options(const Options &); // not implemented - void operator =(const Options &); // not implemented -}; - -template -T Options::get(const std::string &name) const { - std::stringstream ss; - ss << get(name); - - T v; - ss >> v; - if(ss.fail() || !ss.eof()) { - // Failed to parse or did not consume the whole string value. - errorWrongType(name); - } - return v; -} - -// Specialization for bool. -template<> -inline bool Options::get(const std::string &name) const { - if(has(name)) { - const std::string &v = get(name); - if(v == "1") { - return true; - } - } - return false; -} - -// Specialization for std::string. Simply returns the option string. -// Requires specialization because using stringstream to read the string -// will stop at the first whitespace character (which is wrong). -template<> -inline std::string Options::get(const std::string &name) const { - return get(name); -} - -// This assumes the type T can be serialized to a string and back (when get -// is called). -template -void Options::set(const std::string &name, const T &value) { - std::stringstream ss; - ss << value; - set(name, ss.str()); -} - -} // ns aocl_utils - -#endif - diff --git a/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h b/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h deleted file mode 100644 index b11085c5226e..000000000000 --- a/vta/src/intelfocl/AOCLUtils/scoped_ptrs.h +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. -// Permission is hereby granted, free of charge, to any person obtaining a copy of this -// software and associated documentation files (the "Software"), to deal in the Software -// without restriction, including without limitation the rights to use, copy, modify, merge, -// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to -// whom the Software is furnished to do so, subject to the following conditions: -// The above copyright notice and this permission notice shall be included in all copies or -// substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// This agreement shall be governed in all respects by the laws of the State of California and -// by the laws of the United States of America. - -// Scoped pointer definitions. 
- -#ifndef AOCL_UTILS_SCOPED_PTRS_H -#define AOCL_UTILS_SCOPED_PTRS_H - -namespace aocl_utils { - -// Interface is essentially the combination of std::auto_ptr and boost's smart pointers, -// along with some small extensions (auto conversion to T*). - -// scoped_ptr: assumes pointer was allocated with operator new; destroys with operator delete -template -class scoped_ptr { -public: - typedef scoped_ptr this_type; - - scoped_ptr() : m_ptr(NULL) {} - scoped_ptr(T *ptr) : m_ptr(ptr) {} - ~scoped_ptr() { reset(); } - - T *get() const { return m_ptr; } - operator T *() const { return m_ptr; } - T *operator ->() const { return m_ptr; } - T &operator *() const { return *m_ptr; } - - this_type &operator =(T *ptr) { reset(ptr); return *this; } - - void reset(T *ptr = NULL) { delete m_ptr; m_ptr = ptr; } - T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } - -private: - T *m_ptr; - - // noncopyable - scoped_ptr(const this_type &); - this_type &operator =(const this_type &); -}; - -// scoped_array: assumes pointer was allocated with operator new[]; destroys with operator delete[] -// Also supports allocation/reset with a number, which is the number of -// elements of type T. -template -class scoped_array { -public: - typedef scoped_array this_type; - - scoped_array() : m_ptr(NULL) {} - scoped_array(T *ptr) : m_ptr(NULL) { reset(ptr); } - explicit scoped_array(size_t n) : m_ptr(NULL) { reset(n); } - ~scoped_array() { reset(); } - - T *get() const { return m_ptr; } - operator T *() const { return m_ptr; } - T *operator ->() const { return m_ptr; } - T &operator *() const { return *m_ptr; } - T &operator [](int index) const { return m_ptr[index]; } - - this_type &operator =(T *ptr) { reset(ptr); return *this; } - - void reset(T *ptr = NULL) { delete[] m_ptr; m_ptr = ptr; } - void reset(size_t n) { reset(new T[n]); } - T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } - -private: - T *m_ptr; - - // noncopyable - scoped_array(const this_type &); - this_type &operator =(const this_type &); -}; - -// scoped_aligned_ptr: assumes pointer was allocated with alignedMalloc; destroys with alignedFree -// Also supports allocation/reset with a number, which is the number of -// elements of type T -template -class scoped_aligned_ptr { -public: - typedef scoped_aligned_ptr this_type; - - scoped_aligned_ptr() : m_ptr(NULL) {} - scoped_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); } - explicit scoped_aligned_ptr(size_t n) : m_ptr(NULL) { reset(n); } - ~scoped_aligned_ptr() { reset(); } - - T *get() const { return m_ptr; } - operator T *() const { return m_ptr; } - T *operator ->() const { return m_ptr; } - T &operator *() const { return *m_ptr; } - T &operator [](int index) const { return m_ptr[index]; } - - this_type &operator =(T *ptr) { reset(ptr); return *this; } - - void reset(T *ptr = NULL) { if(m_ptr) alignedFree(m_ptr); m_ptr = ptr; } - void reset(size_t n) { reset((T*) alignedMalloc(sizeof(T) * n)); } - T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } - -private: - T *m_ptr; - - // noncopyable - scoped_aligned_ptr(const this_type &); - this_type &operator =(const this_type &); -}; - -#if USE_SVM_API == 1 -// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree -// Also supports allocation/reset with a number, which is the number of -// elements of type T -template -class scoped_SVM_aligned_ptr { -public: - typedef scoped_SVM_aligned_ptr this_type; - - scoped_SVM_aligned_ptr() : m_ptr(NULL) {} - scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL) 
{ reset(ptr); } - explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); } - ~scoped_SVM_aligned_ptr() { reset(); } - - T *get() const { return m_ptr; } - operator T *() const { return m_ptr; } - T *operator ->() const { return m_ptr; } - T &operator *() const { return *m_ptr; } - T &operator [](int index) const { return m_ptr[index]; } - - this_type &operator =(T *ptr) { reset(ptr); return *this; } - - void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; } - void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; } - T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } - -private: - T *m_ptr; - cl_context m_ctx; - - // noncopyable - scoped_SVM_aligned_ptr(const this_type &); - this_type &operator =(const this_type &); -}; -#endif /* USE_SVM_API == 1 */ - -} // ns aocl_utils - -#endif - diff --git a/vta/src/intelfocl/intelfocl_device.cc b/vta/src/intelfocl/intelfocl_device.cc deleted file mode 100644 index 5eb1519b1124..000000000000 --- a/vta/src/intelfocl/intelfocl_device.cc +++ /dev/null @@ -1,181 +0,0 @@ -#include -#include -#include "intelfocl_device.h" -#include "AOCLUtils/aocl_utils.h" - -#define MEM_ALIGNMENT (1024) - -#define CL_STATUS_SUCCESS(x) ((x) == CL_SUCCESS) - -void cleanup() {} - -int IntelFOCLDevice::init(size_t mem_size, std::string aocx_file) -{ - cl_int status; - cl_device_id device; - cl_platform_id platform; - unsigned int argi; - bool focl_device_avail; - unsigned int num_devices; - aocl_utils::scoped_array devices; - - platform = aocl_utils::findPlatform("Intel(R) FPGA SDK for OpenCL(TM)"); - CHECK(platform) << "Unable to find Intel(R) FPGA OpenCL platform"; - - devices.reset(aocl_utils::getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices)); - focl_device_avail = false; - for ( unsigned int i = 0; i < num_devices; i ++ ) - { - device = devices[i]; - _context = clCreateContext(NULL, 1, &device, &aocl_utils::oclContextCallback, NULL, &status); - if ( CL_STATUS_SUCCESS(status) ) - { - focl_device_avail = true; - LOG(INFO) << "Using device: " << aocl_utils::getDeviceName(device); - break; - } - } - CHECK(focl_device_avail) << "No FPGA device available"; - num_devices = 1; - - LOG(INFO) << "Using AOCX: " << aocx_file; - _program = aocl_utils::createProgramFromBinary(_context, aocx_file.c_str(), &device, num_devices); - status = clBuildProgram(_program, 0, NULL, "", NULL, NULL); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to build program"; - - for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) - { - _kernels[i] = clCreateKernel(_program, kernel_names[i].c_str(), &status); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create kernel"; - _queues[i] = clCreateCommandQueue(_context, device, 0, &status); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create command queue"; - } - - _mem = clCreateBuffer(_context, CL_MEM_READ_WRITE, mem_size, NULL, &status); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to create buffer mem"; - mem_chunk_t init_chunk = {.offset = 0, .size = mem_size, .occupied = false}; - _mem_chunks.push_back(init_chunk); - - argi = 1; - status = clSetKernelArg(_kernels[KERNEL_FETCH], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - argi = 0; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - 
CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - status = clSetKernelArg(_kernels[KERNEL_COMPUTE], argi++, sizeof(cl_mem), &_mem); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - - return 0; -} - -ifocl_mem_off_t IntelFOCLDevice::alloc(size_t size) -{ - auto iter = _mem_chunks.begin(); - size_t aligned_size = ((size + MEM_ALIGNMENT - 1) / MEM_ALIGNMENT) * MEM_ALIGNMENT; - - while ( iter != _mem_chunks.end() && (iter->occupied || (iter->size < aligned_size)) ) - { - iter++; - } - - if ( iter == _mem_chunks.end() ) return IFOCL_MEM_OFF_ERR; - - iter->occupied = true; - if ( iter->size != aligned_size ) - { - mem_chunk_t rem = {iter->offset + aligned_size, iter->size - aligned_size, false}; - iter->size = aligned_size; - _mem_chunks.insert(std::next(iter), rem); - } - - return iter->offset; -} - -void IntelFOCLDevice::free(ifocl_mem_off_t offset) -{ - auto iter = _mem_chunks.begin(); - while ( iter != _mem_chunks.end() && iter->offset < offset ) iter++; - - if ( iter == _mem_chunks.end() || iter->offset != offset || !iter->occupied ) - { - return; - } - - iter->occupied = false; - if ( iter != _mem_chunks.begin() && !std::prev(iter)->occupied ) iter--; - - while ( std::next(iter) != _mem_chunks.end() && !std::next(iter)->occupied ) - { - iter->size += std::next(iter)->size; - _mem_chunks.erase(std::next(iter)); - } -} - - -void IntelFOCLDevice::write_mem(ifocl_mem_off_t offset, const void *buf, size_t nbyte) -{ - cl_int status = clEnqueueWriteBuffer(_queues[0], _mem, CL_TRUE, offset, nbyte, buf, 0, NULL, NULL); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue write buffer"; -} - -void IntelFOCLDevice::read_mem(ifocl_mem_off_t offset, void *buf, size_t nbyte) -{ - cl_int status = clEnqueueReadBuffer(_queues[0], _mem, CL_TRUE, offset, nbyte, buf, 0, NULL, NULL); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue read buffer"; -}; - -int IntelFOCLDevice::execute_instructions(ifocl_mem_off_t offset, size_t count) -{ - cl_int status; - unsigned int argi; - unsigned int insn_offset = offset / VTA_INS_ELEM_BYTES; - unsigned int insn_count = count; - const size_t global_work_size = 1; - - argi = 0; - status = clSetKernelArg(_kernels[KERNEL_FETCH], argi, sizeof(unsigned int), &insn_count); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - argi = 2; - status = clSetKernelArg(_kernels[KERNEL_FETCH], argi, sizeof(unsigned int), &insn_offset); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to set argument " << argi; - - for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) - { - status = clEnqueueNDRangeKernel(_queues[i], _kernels[i], 1, NULL, &global_work_size, NULL, 0, NULL, NULL); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to enqueue kernel"; - } - - for ( unsigned int i = 0; i < KERNEL_PROFILE; i++ ) - { - status = clFinish(_queues[i]); - CHECK(CL_STATUS_SUCCESS(status)) << "Failed to clFinish"; - } - - return 0; -}; - -void IntelFOCLDevice::deinit() -{ - for ( unsigned int i = 0; i < NUM_OCL_KERNELS; i++ ) - { - clReleaseKernel(_kernels[i]); - clReleaseCommandQueue(_queues[i]); - } - - clReleaseMemObject(_mem); - - clReleaseProgram(_program); - - clReleaseContext(_context); -} - 
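For context, the device methods above are what the VTAMem*/VTADeviceRun shims in intelfocl_driver.cc further down in this diff call into. The sketch below is illustrative only and not part of the patch: it uses just the IntelFOCLDevice members declared in intelfocl_device.h (alloc, write_mem, execute_instructions, free); the helper name run_instruction_stream and the insns buffer are hypothetical, and VTA_INS_ELEM_BYTES is assumed to come from the VTA hardware spec header.

// Illustrative sketch only: exercises the device API above with minimal error handling.
#include <cstdint>
#include <vector>
#include <vta/hw_spec.h>        // assumed source of VTA_INS_ELEM_BYTES
#include "intelfocl_device.h"

int run_instruction_stream(IntelFOCLDevice &dev, const std::vector<uint8_t> &insns) {
  ifocl_mem_off_t off = dev.alloc(insns.size());    // first-fit, 1 KiB-aligned chunk
  if (off == IFOCL_MEM_OFF_ERR) return -1;
  dev.write_mem(off, insns.data(), insns.size());   // host -> shared cl_mem buffer
  int rc = dev.execute_instructions(off, insns.size() / VTA_INS_ELEM_BYTES);
  dev.free(off);                                    // freed chunk is coalesced with idle neighbors
  return rc;
}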
-IntelFOCLDevice::~IntelFOCLDevice() -{ - deinit(); -} diff --git a/vta/src/intelfocl/intelfocl_device.h b/vta/src/intelfocl/intelfocl_device.h deleted file mode 100644 index 6c53a4d47323..000000000000 --- a/vta/src/intelfocl/intelfocl_device.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef VTA_INTEL_FOCL_DEVICE_H_ -#define VTA_INTEL_FOCL_DEVICE_H_ - -#include -#include - -#include "CL/opencl.h" - -#define NUM_OCL_KERNELS 3 -enum kernel_index {KERNEL_FETCH, KERNEL_COMPUTE, KERNEL_PROFILE}; -static std::string kernel_names[3] = {"fetch", "compute", "profile"}; - -typedef size_t ifocl_mem_off_t; -#define IFOCL_MEM_OFF_ERR (SIZE_MAX) - -typedef struct -{ - ifocl_mem_off_t offset; - size_t size; - bool occupied; -} mem_chunk_t; - -class IntelFOCLDevice { - private: - cl_context _context; - cl_program _program; - cl_mem _mem; - cl_kernel _kernels[NUM_OCL_KERNELS]; - cl_command_queue _queues[NUM_OCL_KERNELS]; - std::list _mem_chunks; - - public: - IntelFOCLDevice() { init(4*1024*1024*1024ULL, "vta_opencl.aocx"); } - - int init(size_t mem_size, std::string aocx_file); - - ifocl_mem_off_t alloc(size_t size); - - void free(ifocl_mem_off_t offset); - - void write_mem(ifocl_mem_off_t offset, const void *buf, size_t nbyte); - - void read_mem(ifocl_mem_off_t offset, void *buf, size_t nbyte); - - int execute_instructions(ifocl_mem_off_t offset, size_t count); - - void deinit(); - - ~IntelFOCLDevice(); -}; - -#endif // VTA_INTEL_FOCL_DEVICE_H_ - diff --git a/vta/src/intelfocl/intelfocl_driver.cc b/vta/src/intelfocl/intelfocl_driver.cc deleted file mode 100644 index a8db9cd0e394..000000000000 --- a/vta/src/intelfocl/intelfocl_driver.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include -#include -#include "intelfocl_device.h" - -#define MEM_ADDR_IDENTIFIER (0x18000000) - -static IntelFOCLDevice focl_device; - -static inline void* mem_get_addr(ifocl_mem_off_t offset) -{ - void *ret = (void *) (offset + MEM_ADDR_IDENTIFIER); - return ret; -} - -static inline ifocl_mem_off_t mem_get_offset(const void *addr) -{ - ifocl_mem_off_t ret = (ifocl_mem_off_t) addr - MEM_ADDR_IDENTIFIER; - return ret; -} - -void* VTAMemAlloc(size_t size, int cached) { - (void) cached; - ifocl_mem_off_t offset = focl_device.alloc(size); - if ( offset == IFOCL_MEM_OFF_ERR ) return NULL; - void *addr = mem_get_addr(offset); - return addr; -} - -void VTAMemFree(void *buf) { - ifocl_mem_off_t offset = mem_get_offset(buf); - focl_device.free(offset); -} - -vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { - ifocl_mem_off_t offset = mem_get_offset(buf); - return (vta_phy_addr_t) offset; -} - -void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { - ifocl_mem_off_t dst_offset = mem_get_offset(dst); - focl_device.write_mem(dst_offset, src, size); -} - -void VTAMemCopyToHost(void* dst, const void* src, size_t size) { - ifocl_mem_off_t src_offset = mem_get_offset(src); - focl_device.read_mem(src_offset, dst, size); -} - -void VTAFlushCache(void * offset, vta_phy_addr_t buf, int size) { - std::cout << "VTAFlushCache not implemented for Intel OpenCL for FPGA devices" << std::endl; -} - -void VTAInvalidateCache(void * offset, vta_phy_addr_t buf, int size) { - std::cout << "VTAInvalidateCache not implemented for Intel OpenCL for FPGA devices" << std::endl; -} - -VTADeviceHandle VTADeviceAlloc() { - return (VTADeviceHandle) &focl_device; -} - -void VTADeviceFree(VTADeviceHandle handle) { - (void) handle; -} - -int VTADeviceRun(VTADeviceHandle handle, - vta_phy_addr_t insn_phy_addr, - uint32_t insn_count, - uint32_t wait_cycles) -{ - (void) 
wait_cycles; - ifocl_mem_off_t offset = (ifocl_mem_off_t) insn_phy_addr; - return focl_device.execute_instructions(offset, insn_count); -} diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc deleted file mode 100644 index 518b6c368926..000000000000 --- a/vta/src/pynq/pynq_driver.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * - * \file pynq_driver.c - * \brief VTA driver for Zynq SoC boards with Pynq support (see pynq.io). - */ - -#include -#include -#include -#include "pynq_driver.h" - - -void* VTAMemAlloc(size_t size, int cached) { - assert(size <= VTA_MAX_XFER); - // Rely on the pynq-specific cma library - return cma_alloc(size, cached); -} - -void VTAMemFree(void* buf) { - // Rely on the pynq-specific cma library - cma_free(buf); -} - -vta_phy_addr_t VTAMemGetPhyAddr(void* buf) { - return cma_get_phy_addr(buf); -} - -void VTAMemCopyFromHost(void* dst, const void* src, size_t size) { - // For SoC-based FPGAs that used shared memory with the CPU, use memcopy() - memcpy(dst, src, size); -} - -void VTAMemCopyToHost(void* dst, const void* src, size_t size) { - // For SoC-based FPGAs that used shared memory with the CPU, use memcopy() - memcpy(dst, src, size); -} - -void VTAFlushCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { - // Call the cma_flush_cache on the CMA buffer - // so that the FPGA can read the buffer data. - cma_flush_cache(vir_addr, phy_addr, size); -} - -void VTAInvalidateCache(void* vir_addr, vta_phy_addr_t phy_addr, int size) { - // Call the cma_invalidate_cache on the CMA buffer - // so that the host needs to read the buffer data. 
- cma_invalidate_cache(vir_addr, phy_addr, size); -} - -void *VTAMapRegister(uint32_t addr) { - // Align the base address with the pages - uint32_t virt_base = addr & ~(getpagesize() - 1); - // Calculate base address offset w.r.t the base address - uint32_t virt_offset = addr - virt_base; - // Open file and mmap - uint32_t mmap_file = open("/dev/mem", O_RDWR|O_SYNC); - return mmap(NULL, - (VTA_IP_REG_MAP_RANGE + virt_offset), - PROT_READ|PROT_WRITE, - MAP_SHARED, - mmap_file, - virt_base); -} - -void VTAUnmapRegister(void *vta) { - // Unmap memory - int status = munmap(vta, VTA_IP_REG_MAP_RANGE); - assert(status == 0); -} - -void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) { - *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)) = val; -} - -uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) { - return *((volatile uint32_t *) (reinterpret_cast(base_addr) + offset)); -} - -class VTADevice { - public: - VTADevice() { - // VTA stage handles - vta_fetch_handle_ = VTAMapRegister(VTA_FETCH_ADDR); - vta_load_handle_ = VTAMapRegister(VTA_LOAD_ADDR); - vta_compute_handle_ = VTAMapRegister(VTA_COMPUTE_ADDR); - vta_store_handle_ = VTAMapRegister(VTA_STORE_ADDR); - } - - ~VTADevice() { - // Close VTA stage handle - VTAUnmapRegister(vta_fetch_handle_); - VTAUnmapRegister(vta_load_handle_); - VTAUnmapRegister(vta_compute_handle_); - VTAUnmapRegister(vta_store_handle_); - } - - int Run(vta_phy_addr_t insn_phy_addr, - uint32_t insn_count, - uint32_t wait_cycles) { - VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_COUNT_OFFSET, insn_count); - VTAWriteMappedReg(vta_fetch_handle_, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy_addr); - VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_INP_ADDR_OFFSET, 0); - VTAWriteMappedReg(vta_load_handle_, VTA_LOAD_WGT_ADDR_OFFSET, 0); - VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_UOP_ADDR_OFFSET, 0); - VTAWriteMappedReg(vta_compute_handle_, VTA_COMPUTE_BIAS_ADDR_OFFSET, 0); - VTAWriteMappedReg(vta_store_handle_, VTA_STORE_OUT_ADDR_OFFSET, 0); - - // VTA start - VTAWriteMappedReg(vta_fetch_handle_, 0x0, VTA_START); - VTAWriteMappedReg(vta_load_handle_, 0x0, VTA_AUTORESTART); - VTAWriteMappedReg(vta_compute_handle_, 0x0, VTA_AUTORESTART); - VTAWriteMappedReg(vta_store_handle_, 0x0, VTA_AUTORESTART); - - // Allow device to respond - struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 }; - nanosleep(&ts, &ts); - - // Loop until the VTA is done - unsigned t, flag = 0; - for (t = 0; t < wait_cycles; ++t) { - flag = VTAReadMappedReg(vta_compute_handle_, VTA_COMPUTE_DONE_RD_OFFSET); - if (flag == VTA_DONE) break; - std::this_thread::yield(); - } - // Report error if timeout - return t < wait_cycles ? 
0 : 1; - } - - private: - // VTA handles (register maps) - void* vta_fetch_handle_{nullptr}; - void* vta_load_handle_{nullptr}; - void* vta_compute_handle_{nullptr}; - void* vta_store_handle_{nullptr}; -}; - -VTADeviceHandle VTADeviceAlloc() { - return new VTADevice(); -} - -void VTADeviceFree(VTADeviceHandle handle) { - delete static_cast(handle); -} - -int VTADeviceRun(VTADeviceHandle handle, - vta_phy_addr_t insn_phy_addr, - uint32_t insn_count, - uint32_t wait_cycles) { - return static_cast(handle)->Run( - insn_phy_addr, insn_count, wait_cycles); -} diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 30fe7f2b0b06..b61f594872c1 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -96,7 +96,7 @@ # The ``start_pack`` and ``stop_pack`` labels indicate where # to start and end the graph packing relay pass: in other words # where to start and finish offloading to VTA. -model = "resnet18_v1" +model = "resnet50_v2" assert model in pack_dict ###################################################################### @@ -162,7 +162,8 @@ # Load pre-configured AutoTVM schedules log_file = "%s.%s.log-manual-formatv0_2" % (device, model) -with autotvm.tophub.context(target, extra_files=[log_file]): +alu_log_file = "%s.alu.%s.log" % (device, model) +with autotvm.tophub.context(target, extra_files=[log_file, alu_log_file]): # Populate the shape and data type dictionary for ImageNet classifier input dtype_dict = {"data": 'float32'} @@ -176,7 +177,6 @@ # Start front end compilation mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) - eprint("from_mxnet mod = ", mod) # Update shape and type dictionary shape_dict.update({k: v.shape for k, v in params.items()}) @@ -189,7 +189,6 @@ with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): mod = relay.quantize.quantize(mod, params=params) - eprint("done quantize", mod) # Perform graph packing and constant folding for VTA target assert env.BLOCK_IN == env.BLOCK_OUT relay_prog = graph_pack( @@ -199,7 +198,6 @@ env.WGT_WIDTH, start_name=pack_dict[model][0], stop_name=pack_dict[model][1]) - eprint("done graphpack ", relay_prog) else: relay_prog = mod["main"] From 127ae4a72ce794495242a064e1d32b33d9265b38 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Wed, 10 Jun 2020 23:31:04 +0800 Subject: [PATCH 24/44] update vta-hw commit --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index db65157208ec..7d9629d58945 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit db65157208ec8fabb7b548c94596211b9db04190 +Subproject commit 7d9629d58945f0f042fb1690847d09f2e3e7781c From a6cd975e11ba53e754ae6438145d193600d74de1 Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Fri, 12 Jun 2020 02:06:13 +0800 Subject: [PATCH 25/44] Rename VTA_MEM_ID_ACC_8 to VTA_MEM_ID_ACC_8BIT --- 3rdparty/vta-hw | 2 +- vta/runtime/runtime.cc | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index 7d9629d58945..410049f9340a 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit 7d9629d58945f0f042fb1690847d09f2e3e7781c +Subproject commit 410049f9340a0ab1552655f8c8bfc1a833851e89 diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 92d5ab06cc8f..d3cfce3e2b66 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -796,7 +796,7 @@ class InsnQueue : public 
BaseQueue { return "LOAD INP"; } else if (c.mem.memory_type == VTA_MEM_ID_ACC) { return "LOAD ACC"; - } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8) { + } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8BIT) { return "LOAD ACC 8"; } else { return "LOAD"; @@ -1123,7 +1123,7 @@ class InsnQueue : public BaseQueue { } // Get stage of the memory static PipelineStage GetMemPipelineStage(int memory_type) { - if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8) return kComputeStage; + if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage; if (memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -1133,7 +1133,7 @@ class InsnQueue : public BaseQueue { if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage; if (insn->opcode == VTA_OPCODE_LOAD) { if (insn->x_size == 0) return kNoneStage; - if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8) return kComputeStage; + if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage; if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -1218,7 +1218,7 @@ class CommandQueue { case VTA_MEM_ID_OUT: elem_bytes = VTA_OUT_ELEM_BYTES; break; - case VTA_MEM_ID_ACC_8: + case VTA_MEM_ID_ACC_8BIT: elem_bytes = VTA_ACC_ELEM_BYTES / 4; break; default: From 06af08b9a3f09a1aadcc151d0878674759dcdeba Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Fri, 12 Jun 2020 12:15:34 +0800 Subject: [PATCH 26/44] back-compatible other vta hardware impl --- vta/python/vta/testing/simulator.py | 2 +- vta/python/vta/top/op.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index bf89107f9f79..5ac8c80fed8d 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -25,7 +25,7 @@ def _load_sw(): """Load hardware library for simulator.""" env = get_env() - lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" if env.TARGET == "sim" else "libvta" + lib_driver_name = "libvta_tsim" if env.TARGET == "" else "libvta" if env.TARGET == "intelfocl" else "libvta_fsim" # Load driver library lib_driver = find_libvta(lib_driver_name, optional=True) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index ae9ca1a90142..69eee2aad94c 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -95,6 +95,11 @@ def is_cast_op(op): te.schedule.AutoInlineInjective(s) # s[output].fuse(s[output].op.axis) + env = get_env() + # other target does not support alu-only ops + if not (env.TARGET in ["sim", "tsim", "intelfocl"]): + return s + # only put the int-related ops to vta if "int" in output.dtype and len(output.shape) == 6: ewise_inputs = [] @@ -144,7 +149,6 @@ def _traverse(op): s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) store_pt = x_j0 - env = get_env() for eo in ewise_ops: s[eo].set_scope(env.acc_scope) s[eo].pragma(s[eo].op.axis[0], env.alu) From 0855a4a2d6f0f0b69bff6ff000838d78fca8ac5e Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Fri, 12 Jun 2020 15:54:12 +0800 Subject: [PATCH 27/44] update vta-hw commit --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index 410049f9340a..5f0a28671be9 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit 410049f9340a0ab1552655f8c8bfc1a833851e89 +Subproject commit 
5f0a28671be9f2c621253c7a33c2dcb678a20ae2 From 63977922c63ba157bf73cb6b8f810f411b1aa7ec Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Fri, 12 Jun 2020 16:22:17 +0800 Subject: [PATCH 28/44] update vta-hw commit --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index 5f0a28671be9..f0347e202966 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit 5f0a28671be9f2c621253c7a33c2dcb678a20ae2 +Subproject commit f0347e202966322fe6a961eab2f4ff963bced2d5 From e43981f284198f2f570ec80e86df3e4ba48c9419 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Sun, 14 Jun 2020 22:34:37 +0800 Subject: [PATCH 29/44] remove unneeded code --- cmake/modules/VTA.cmake | 1 + include/tvm/relay/transform.h | 29 ------ python/tvm/autotvm/measure/measure_methods.py | 10 -- python/tvm/autotvm/tuner/tuner.py | 13 ++- python/tvm/contrib/util.py | 5 - python/tvm/relay/op/op.py | 1 - python/tvm/relay/op/strategy/generic.py | 2 +- python/tvm/relay/quantize/_partition.py | 3 +- python/tvm/relay/transform/transform.py | 4 - src/relay/backend/build_module.cc | 8 -- src/relay/backend/compile_engine.cc | 4 +- src/relay/backend/graph_plan_memory.cc | 64 +++++++------ src/relay/quantize/realize.cc | 11 +-- src/relay/transforms/device_annotation.cc | 96 +------------------ src/tir/transforms/lower_tvm_builtin.cc | 2 + vta/python/vta/environment.py | 2 +- vta/python/vta/top/graphpack.py | 9 +- vta/python/vta/top/op.py | 25 +---- vta/python/vta/top/vta_conv2d.py | 7 -- vta/python/vta/transform.py | 3 +- vta/runtime/runtime.cc | 38 +------- .../integration/test_benchmark_topi_conv2d.py | 4 +- vta/tutorials/autotvm/tune_alu_vta.py | 2 - vta/tutorials/autotvm/tune_relay_vta.py | 17 ++-- .../frontend/deploy_classification.py | 5 +- vta/tutorials/frontend/deploy_dcgan.py | 2 - vta/tutorials/frontend/deploy_mobilenet.py | 1 - 27 files changed, 75 insertions(+), 293 deletions(-) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 371bd27fa80e..4193fbaf657f 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -125,6 +125,7 @@ elseif(PYTHON) elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() endif() diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index a7f5fea98ea2..b287c053e8a9 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -116,16 +116,6 @@ TVM_DLL Pass FuseOps(int fuse_opt_level = -1); */ TVM_DLL Pass RewriteAnnotatedOps(int fallback_device); -/*! - * \brief add device_copy if two adjacent nodes are on different devices - * - * \param expr The expression. - * - * \return The updated program. - */ -TVM_DLL Pass AddDeviceCopyOps(); - - /*! * \brief turn a dataflow graph into Administrative Normal Form, or A-Normal Form (ANF). * @@ -384,7 +374,6 @@ TVM_DLL Expr Bind(const Expr& expr, const tvm::Map& binds); * \note this function mutates mod and is not thread-safe. */ TVM_DLL Function InferType(const Function& f, const IRModule& mod, const GlobalVar& var); -TVM_DLL Expr InferType(const Expr& expr, const IRModule& mod); /*! * \brief Apply rewrite rules to rewrite the expr in post DFS order. 
This @@ -429,24 +418,6 @@ TVM_DLL Expr ForwardRewrite(const Expr& expr, const FForwardRewrite& rewrite_fun */ TVM_DLL Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device); -/*! - * \brief add device_copy if two adjacent nodes are on different devices - * - * \param expr The expression. - * - * \return The updated program. - */ -TVM_DLL Expr AddDeviceCopyOps(const Expr& expr); - -/*! - * \brief Fuse operations into expr into seperate functions. - * - * \param fuse_opt_level Optimization level. If it is -1 it will be inferred from pass context. - * - * \return The pass. - */ -TVM_DLL Expr FuseOps(const Expr& expr, int fuse_opt_level, const IRModule& module); - /*! * \brief Turn an expression into continuation passing style(CPS). * diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 666d307247c1..d6b5defb710c 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -50,8 +50,6 @@ from .measure import MeasureResult, MeasureErrorNo, Builder, Runner from .local_executor import LocalExecutor -from tvm.contrib.util import eprint - logger = logging.getLogger('autotvm') class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))): @@ -489,14 +487,6 @@ def run_through_rpc(measure_input, build_result, try: # upload built module remote = request_remote(*remote_args) - # Program the FPGA every single time when targeting VTA - if hasattr(measure_input.target, 'device_name') and \ - measure_input.target.device_name == 'vta': - # pylint: disable=import-outside-toplevel - from vta import program_fpga, reconfig_runtime - # FIXME(zhanghao): remove this - # program_fpga(remote, None) - # reconfig_runtime(remote) remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 4f984aae701f..2441a4ae642f 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -161,13 +161,12 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=(), si_pr logger.debug("Early stopped. Best iter: %d.", self.best_iter) break - # NOTE(zhanghao): comment out as it will raise too many logs - # if error_ct > 150: - # logging.basicConfig() - # logger.warning("Too many errors happen in the tuning. Now is in debug mode") - # logger.setLevel(logging.DEBUG) - # else: - # logger.setLevel(old_level) + if error_ct > 150: + logging.basicConfig() + logger.warning("Too many errors happen in the tuning. 
Now is in debug mode") + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(old_level) GLOBAL_SCOPE.in_tuning = False del measure_batch diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 20854ab3fb27..474741fc1e35 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -29,11 +29,6 @@ fcntl = None -def eprint(*args, **kwargs): - # return - print(*args, file=sys.stderr, flush=True, **kwargs) - - class DirectoryCreatedPastAtExit(Exception): """Raised when a TempDirectory is created after the atexit hook runs.""" diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 8ef51cf595fc..7fad9a258f2b 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -394,7 +394,6 @@ def register_external_compiler(op_name, fexternal=None, level=10): return tvm.ir.register_op_attr(op_name, "FTVMExternalCompiler", fexternal, level) - @tvm._ffi.register_func("relay.op.compiler._lower") def _lower(name, schedule, inputs, outputs): return lower(schedule, list(inputs) + list(outputs), name=name) diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 025d67630cf9..63ad1127bbc0 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -35,7 +35,7 @@ def wrapper(attrs, outs, target): def wrap_topi_compute(topi_compute): - """Wrap TOPI schedule which doesn't use attrs""" + """Wrap TOPI compute which doesn't use attrs""" def wrapper(attrs, inputs, out_type): return [topi_compute(*inputs)] return wrapper diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index f26e88301894..c0234594a38f 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -21,7 +21,6 @@ from .. import analysis as _analysis from . import _quantize from .quantize import _forward_op -from tvm.contrib.util import eprint def register_partition_function(op_name, frewrite=None, level=10): return tvm.ir.register_op_attr(op_name, "FQPartitionRewrite", frewrite, level) @@ -55,7 +54,7 @@ def conv2d_partition_function(ref_call, new_args, ctx): @register_partition_function("nn.conv2d_transpose") def conv2d_partition_function(ref_call, new_args, ctx): - """Rewrite function for conv2d for partition""" + """Rewrite function for conv2d_transpose for partition""" data_cond, data = partition_expr_check(new_args[0]) kernel_cond, kernel = partition_expr_check(new_args[1]) diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index d1a93fd5f9b8..8f4ec1046500 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -441,10 +441,6 @@ def RewriteAnnotatedOps(fallback_device): return _ffi_api.RewriteDeviceAnnotation(fallback_device) -def AddDeviceCopy(): - return _transform.AddDeviceCopy() - - def ToANormalForm(): """Turn Graph Normal Form expression into A Normal Form Expression. The scope of the root expression is the global scope. diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index cbe4ae2d4256..f9ce24d410b7 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -419,14 +419,6 @@ class RelayBuildModule : public runtime::ModuleNode { // Get the updated function. 
auto func = Downcast(relay_module->Lookup("main")); - // do extra pass to check to insert device_copy if necessary - if (targets_.size() > 1) { - func = Downcast(relay::AddDeviceCopyOps(func)); - // we have to do fuseops again as we may add new device_copy ops - func = Downcast(relay::FuseOps(func, -1, relay_module)); - func = Downcast(relay::InferType(func, relay_module)); - } - // Generate code for the updated function. graph_codegen_ = std::unique_ptr(new GraphCodegen()); graph_codegen_->Init(nullptr, targets_); diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 37fb0108f111..8079fdbe76d8 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -123,7 +123,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> readable_name_stream_ << "fused"; cache_node->outputs = this->VisitExpr(prim_func->body); auto candidate_name = readable_name_stream_.str(); - constexpr static size_t kMaxFuncNameLength = 800; + constexpr static size_t kMaxFuncNameLength = 80; if (candidate_name.size() > kMaxFuncNameLength) { std::stringstream truncated_name; truncated_name << candidate_name.substr(0, kMaxFuncNameLength); @@ -343,7 +343,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> auto cache_node = make_object(); cache_node->outputs = VisitExpr(prim_func->body); auto candidate_name = readable_name_stream_.str(); - constexpr static size_t kMaxFuncNameLength = 800; + constexpr static size_t kMaxFuncNameLength = 80; if (candidate_name.size() > kMaxFuncNameLength) { std::stringstream truncated_name; truncated_name << candidate_name.substr(0, kMaxFuncNameLength); diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 8ebf9847c3a7..66de20dcf4c0 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -309,35 +309,41 @@ class StorageAllocator : public StorageAllocaBaseVisitor { if (match_range_ == 0) { return this->Alloc(prototype, size); } - // TODO(zhanghao): to avoid overwrite shared storage when we copy all the instructions in a single batch - // auto begin = free_.lower_bound(size / match_range_); - // auto mid = free_.lower_bound(size); - // auto end = free_.upper_bound(size * match_range_); - // // search for memory blocks larger than requested - // for (auto it = mid; it != end; ++it) { - // StorageToken *tok = it->second; - // if (tok->device_type != prototype->device_type) continue; - // CHECK_EQ(tok->ref_counter, 0); - // // Use exect matching strategy - // tok->max_bytes = std::max(size, tok->max_bytes); - // tok->ref_counter = prototype->ref_counter; - // // find a exact match, erase from map and return - // free_.erase(it); - // return tok; - // } - // // then search for memory blocks smaller than requested space - // for (auto it = mid; it != begin;) { - // --it; - // StorageToken *tok = it->second; - // if (tok->device_type != prototype->device_type) continue; - // CHECK_EQ(tok->ref_counter, 0); - // // Use exect matching strategy - // tok->max_bytes = std::max(size, tok->max_bytes); - // tok->ref_counter = prototype->ref_counter; - // // erase from map and return - // free_.erase(it); - // return tok; - // } + // quickfix(zhanghao): we copy all the instructions in a single batch + // to avoid overwrite shared storage, we do not re-use allocation + const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + if (sync_once) { + return this->Alloc(prototype, size); + } + + auto begin = free_.lower_bound(size / 
match_range_); + auto mid = free_.lower_bound(size); + auto end = free_.upper_bound(size * match_range_); + // search for memory blocks larger than requested + for (auto it = mid; it != end; ++it) { + StorageToken* tok = it->second; + if (tok->device_type != prototype->device_type) continue; + CHECK_EQ(tok->ref_counter, 0); + // Use exect matching strategy + tok->max_bytes = std::max(size, tok->max_bytes); + tok->ref_counter = prototype->ref_counter; + // find a exact match, erase from map and return + free_.erase(it); + return tok; + } + // then search for memory blocks smaller than requested space + for (auto it = mid; it != begin;) { + --it; + StorageToken* tok = it->second; + if (tok->device_type != prototype->device_type) continue; + CHECK_EQ(tok->ref_counter, 0); + // Use exect matching strategy + tok->max_bytes = std::max(size, tok->max_bytes); + tok->ref_counter = prototype->ref_counter; + // erase from map and return + free_.erase(it); + return tok; + } // cannot find anything return a new one. return this->Alloc(prototype, size); } diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index b71249c8c755..9dbc27d2c5a3 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -360,11 +360,7 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args if (nptrs[i]->dtype != dtype) { auto new_arg = Cast(ret[i], dtype); - // NOTE(zhanghao) - // if you want to let cpu to do all the cast, use the following code - // ret.Set(i, StopFusion(new_arg)); - - // do not fuse float32 cast + // FIXME(zhanghao): do not fuse float32 cast if (nptrs[i]->dtype == DataType::Float(32)) { ret.Set(i, StopFusion(new_arg)); } else { @@ -374,11 +370,6 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args ref_arg->attrs.as()->kind == kQInput) { auto new_arg = Cast(ret[i], cfg->dtype_input); new_arg = StopFusion(new_arg); - - // NOTE(zhanghao) - // if you want to let cpu to do all the cast, use the following code - // ret.Set(i, StopFusion(Cast(new_arg, dtype))); - ret.Set(i, Cast(new_arg, dtype)); } } diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 3609ee0bacc4..fe3cfebf7fe3 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -60,10 +60,8 @@ bool IsDeviceCopyNode(const ExprNode* node) { class ValidateAnnotation : private ExprVisitor { public: - ValidateAnnotation(int fallback_device): fallback_device_(fallback_device) {} - - static std::unordered_map Validate(const Expr& expr, int fallback_device) { - ValidateAnnotation valid(fallback_device); + static std::unordered_map Validate(const Expr& expr) { + ValidateAnnotation valid; valid(expr); return valid.annotation_map_; } @@ -82,15 +80,12 @@ class ValidateAnnotation : private ExprVisitor { CHECK_EQ(call_node->args.size(), 1U); const auto* node = call_node->args[0].operator->(); - // LOG(WARNING) << "annotated node, device_type = " << device_type << " : " << GetRef(node).as()->op; if (annotation_map_.count(node)) { CHECK_EQ(annotation_map_.at(node), device_type) << "An expression node can only be annotated to one device."; } else { annotation_map_.insert({node, GetDeviceId(call_node)}); } - - if (device_type != fallback_device_) extra_device_ = device_type; } } @@ -114,8 +109,6 @@ class ValidateAnnotation : private ExprVisitor { } std::unordered_map annotation_map_; - int fallback_device_ = 0; - int extra_device_ = 0; }; // Replace the use of an expression with the output of a `copy_device` 
operator @@ -129,7 +122,7 @@ class RewriteAnnotation : public ExprMutator { public: Expr Rewrite(const Expr& expr, int fallback_device) { fallback_device_ = fallback_device; - annotation_map_ = ValidateAnnotation::Validate(expr, fallback_device); + annotation_map_ = ValidateAnnotation::Validate(expr); return this->VisitExpr(expr); } @@ -236,7 +229,6 @@ class RewriteAnnotation : public ExprMutator { CHECK(dit != annotation_map_.end()) << "Device copy op is not required when both src and dst ops are not " "annotated."; - // LOG(WARNING) << "Create device copy " << fallback_device_ << " to " << dit->second << ": " << src.as()->op; return CreateDeviceCopy(src, fallback_device_, dit->second); } else { const auto dit = annotation_map_.find(dst); @@ -255,8 +247,7 @@ class RewriteAnnotation : public ExprMutator { return src_dev_type != fallback_device_; } } else { - // if annotation value < 0, it means this is for "copy from" only - if (annotation_map_.count(dst) && annotation_map_.at(dst) > 0) { + if (annotation_map_.count(dst)) { // Though data copy op could be inserted whenever the `src` and `dst` // ops are annotated to different devices, it leads to high overhead. // @@ -496,68 +487,6 @@ class DeviceInfo { Map device_map_; }; - -// TODO(zhanghao): consider to remove this as I think it is not necessary for now -class AddDeviceCopy : public ExprMutator { - public: - Expr Rewrite(const Expr& expr) { - device_map_ = DeviceInfo::GetDeviceMap(expr); - return this->Mutate(expr); - } - - private: - // add device copy if two nodes not on the same device - Expr VisitExpr_(const CallNode* call_node) override { - auto func_node = call_node->op.as(); - bool src_is_copy_node = false; - if (func_node && IsDeviceCopyNode(func_node->body.as())) { - // LOG(WARNING) << "DeviceCopy skip device_copy node"; - src_is_copy_node = true; - } - - tvm::Array call_args; - auto call_expr = GetRef(call_node); - CHECK(device_map_.count(call_expr)); - - for (auto& arg: call_node->args) { - CHECK(device_map_.count(arg)); - bool dst_is_copy_node = false; - if (auto arg_node = arg.as()) { - auto func_node = arg_node->op.as(); - if (func_node && IsDeviceCopyNode(func_node->body.as())) { - // LOG(WARNING) << "DeviceCopy skip dst device_copy node"; - dst_is_copy_node = true; - } - } - - int src_dev_type = device_map_.count(arg) ? device_map_[arg]->value : 1; - int dst_dev_type = device_map_.count(call_expr) ? device_map_[call_expr]->value : 1; - if (!src_is_copy_node && !dst_is_copy_node && src_dev_type != dst_dev_type) { - // auto arg_call = arg.as(); - // LOG(WARNING) << "Not consistent device type, src = " << src_dev_type << ":" << (arg_call ? 
arg_call->op : arg); - // LOG(WARNING) << "Not consistent device type, dst = " << dst_dev_type << ":" << call_node->op; - auto attrs = make_object(); - attrs->src_dev_type = src_dev_type; - attrs->dst_dev_type = dst_dev_type; - static const Op& op = Op::Get("device_copy"); - Call device_copy = Call(op, {this->Mutate(arg)}, Attrs(attrs), {}); - device_copy->checked_type_ = arg->checked_type_; - call_args.push_back(device_copy); - } else { - call_args.push_back(this->Mutate(arg)); - } - } - - auto ret = Call(call_node->op, call_args, call_node->attrs, call_node->type_args); - // manually add the checked_type_ - // alternatively, can call InferType Pass after this - ret->checked_type_ = call_node->checked_type_; - return ret; - } - - Map device_map_; -}; - Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { RewriteAnnotation rewrote = RewriteAnnotation(); Expr new_expr = rewrote.Rewrite(expr, fallback_device); @@ -605,12 +534,6 @@ Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { } } -Expr AddDeviceCopyOps(const Expr& expr) { - auto rewrote = AddDeviceCopy(); - Expr new_expr = rewrote.Rewrite(expr); - return new_expr; -} - Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } @@ -636,17 +559,6 @@ Pass RewriteAnnotatedOps(int fallback_device) { TVM_REGISTER_GLOBAL("relay._transform.RewriteDeviceAnnotation").set_body_typed(RewriteAnnotatedOps); -Pass AddDeviceCopyOps() { - runtime::TypedPackedFunc pass_func = - [=](Function f, IRModule m, PassContext pc) { - return Downcast(AddDeviceCopyOps(f)); - }; - return CreateFunctionPass(pass_func, 1, "AddDeviceCopyOps", {"InferType"}); -} - -TVM_REGISTER_GLOBAL("relay._transform.AddDeviceCopy") -.set_body_typed(AddDeviceCopyOps); - } // namespace transform } // namespace relay diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 386e9885807b..3d54d45015c6 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -87,6 +87,8 @@ class BuiltinLower : public StmtExprMutator { // Get constant allocation bound. 
int64_t nbytes = GetVectorBytes(op->dtype); // FIXME(zhanghao): remove special handling for kDLCPU + // otherwise, may cause LLVM parameters match error + // if in heterogenous targets // if (device_type_.defined()) { // if (arith::GetConst(device_type_, &dev_type)) { // if (dev_type == kDLCPU) { diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 548dc03aae78..cfed3f77def1 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -62,7 +62,7 @@ class DevContext(object): MEM_ID_INP = 2 MEM_ID_ACC = 3 MEM_ID_OUT = 4 - MEM_ID_ACC_8 = 5 + MEM_ID_ACC_8BIT = 5 # VTA ALU Opcodes ALU_OPCODE_MIN = 0 ALU_OPCODE_MAX = 1 diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index ac4c8aac4539..0934ed15d8b9 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -21,7 +21,6 @@ from tvm import relay from tvm.relay import op, transform from tvm.relay import ExprMutator -from tvm.contrib.util import eprint def run_opt_pass(expr, opt_pass): """Exectue a relay pass.""" @@ -416,11 +415,6 @@ def visit_call(self, call): elif self.start_pack and call.op == op.op.get('cast') and \ input_types[0].dtype == 'int32': cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) - # zhanghao: force separate cast and copy (to let copy do on cpu) - # cast = relay.Call(op.op.get('annotation.stop_fusion'), [cast]) - - # zhanghao: remove the redudant copy - # return relay.Call(op.op.get('copy'), [cast]) return cast elif call.op == self.pad: pad_width = call.attrs.pad_width @@ -516,7 +510,7 @@ def graph_pack(expr, stop_name="nn.global_avg_pool2d", start_name_idx=None, stop_name_idx=None, - count_meta=False, device_annot=True): + count_meta=False, device_annot=False): """Pack the graph into batch&channel packed format. Parameters @@ -574,6 +568,7 @@ def graph_pack(expr, expr_locator = ExprLocater() expr_locator.visit(expr) + # FIXME(zhanghao): generalize this part # from the first int conv2d to the last int stop_fusion, all will run on vta conv2d = op.op.get("nn.conv2d") conv2d_transpose = op.op.get("nn.conv2d_transpose") diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 69eee2aad94c..617be4b56d19 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -35,10 +35,6 @@ from ..environment import get_env -# override to force partition at copy -# TODO(zhanghao): remove all copy -# reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) - # add clip vta strategy def compute_clip_vta(attrs, inputs, output_type): """ Clip operator. 
""" @@ -76,11 +72,6 @@ def multiply_packed(cfg, lhs, rhs): return topi.multiply(lhs, rhs) -@autotvm.register_topi_compute("copy.vta") -def copy_packed(cfg, i): - return topi.identify(i) - - def schedule_alu_packed(cfg, outs): assert len(outs) == 1 @@ -183,11 +174,6 @@ def schedule_multiply_packed(cfg, outs): return schedule_alu_packed(cfg, outs) -@autotvm.register_topi_schedule("copy.vta") -def schedule_copy_packed(cfg, outs): - return schedule_alu_packed(cfg, outs) - - def add_strategy_vta(attrs, inputs, out_type, target): strategy = OpStrategy() strategy.add_implementation( @@ -206,18 +192,9 @@ def multiply_strategy_vta(attrs, inputs, out_type, target): return strategy -def copy_strategy_vta(attrs, inputs, out_type, target): - strategy = OpStrategy() - strategy.add_implementation( - _strategy.wrap_topi_compute(copy_packed), - _strategy.wrap_topi_schedule(schedule_copy_packed), - name="copy.vta") - return strategy - - reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") -reg.get("copy").get_attr("FTVMStrategy").register(copy_strategy_vta, "vta") + @_strategy.conv2d_strategy.register("vta") def conv2d_strategy_vta(attrs, inputs, out_type, target): diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index 525d60ae383d..5b23ddeba1c1 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -25,9 +25,6 @@ from .util import is_packed_layout from ..environment import get_env -from tvm.relay import op as Op -from tvm.contrib.util import eprint - @autotvm.register_topi_compute("conv2d_packed.vta") def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): @@ -66,7 +63,6 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty return res - @autotvm.register_topi_schedule("conv2d_packed.vta") def schedule_conv2d_packed(cfg, outs): """Schedule packed conv2d""" @@ -188,6 +184,3 @@ def _traverse(op): s[output].pragma(x_co1, env.dma_copy) return s - - - diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index 3b13c1769103..a8ecb1099a89 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -549,6 +549,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _check_compact(dst) + # FIXME(zhanghao): optimize # for int8 -> int32 cast/load orig_dtype = src.dtype if src.dtype != data_type: @@ -562,7 +563,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): if orig_dtype != src.dtype: src.dtype = orig_dtype - mem_type = env.dev.MEM_ID_ACC_8 + mem_type = env.dev.MEM_ID_ACC_8BIT irb = tvm.tir.ir_builder.create() irb.scope_attr(env.dev.vta_axis, "coproc_scope", diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index d3cfce3e2b66..cf70f7e19361 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -186,12 +186,7 @@ struct DataBuffer { * Bytes. */ void MemCopyFromHost(void* dst, const void* src, size_t size) { - // struct timespec start, stop; - // clock_gettime(CLOCK_REALTIME, &start); VTAMemCopyFromHost(dst, src, size); - // clock_gettime(CLOCK_REALTIME, &stop); - // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; - // LOG(WARNING) << "DataBuffer VTAMemCopyFromHost: " << elapsed << " us"; } /*! * \brief Performs a copy operation from buffer allocated with VTAMemAlloc to host memory. 
@@ -549,16 +544,6 @@ class UopQueue : public BaseQueue { buff_size += cache_[i]->size() * kElemBytes; } CHECK(buff_size <= kMaxBytes); - // Move kernel contents to FPGA readable buffer - // uint32_t offset = 0; - // for (uint32_t i = 0; i < cache_.size(); ++i) { - // uint32_t ksize = cache_[i]->size() * kElemBytes; - // VTAMemCopyFromHost(static_cast(fpga_buff_) + offset, - // cache_[i]->data(), - // ksize); - // // Update offset - // offset += ksize; - // } // merge all the cache entries and do CopyFromHost once uint32_t total_size = 0; @@ -797,7 +782,7 @@ class InsnQueue : public BaseQueue { } else if (c.mem.memory_type == VTA_MEM_ID_ACC) { return "LOAD ACC"; } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8BIT) { - return "LOAD ACC 8"; + return "LOAD ACC 8BIT"; } else { return "LOAD"; } @@ -860,6 +845,7 @@ class InsnQueue : public BaseQueue { // Iterate over all instructions int insn_count = count(); const VTAGenericInsn* insn = data(); + // FIXME(zhanghao): rapidjson dep rapidjson::StringBuffer s; rapidjson::Writer writer(s); @@ -1335,18 +1321,9 @@ class CommandQueue { // Check if there are no instruction to execute at all if (insn_queue_.count() == 0) return; // Synchronization for the queues - // struct timespec start, stop; - // clock_gettime(CLOCK_REALTIME, &start); uop_queue_.AutoReadBarrier(); - // clock_gettime(CLOCK_REALTIME, &stop); - // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; - // LOG(WARNING) << "UopQueue VTAMemCopyFromHost: " << elapsed << " us"; - // clock_gettime(CLOCK_REALTIME, &start); insn_queue_.AutoReadBarrier(); - // clock_gettime(CLOCK_REALTIME, &stop); - // elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; - // LOG(WARNING) << "InsnQueue VTAMemCopyFromHost: " << elapsed << " us"; // Dump instructions if debug enabled if (debug_flag_ & VTA_DEBUG_DUMP_INSN) { insn_queue_.DumpInsn(); @@ -1505,6 +1482,7 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: // one pending store, one pending load, and one uop + // FIXME(zhanghao): check why there are 5 insns if ((insn_queue_.count() + 5) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { this->AutoSync(); } @@ -1547,13 +1525,8 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (from_buffer) { // This is an FPGA to host mem transfer // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly - // struct timespec start, stop; - // clock_gettime(CLOCK_REALTIME, &start); const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); if (sync_once) VTASynchronize(VTATLSCommandHandle(), 1<<31, false); - // clock_gettime(CLOCK_REALTIME, &stop); - // uint64_t elapsed = 1000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec) / 1000; - // LOG(WARNING) << "Final Synchronize: " << elapsed << " us"; from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, static_cast(from) + from_offset, size); @@ -1573,8 +1546,6 @@ void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) { static_cast(cmd)->SetDebugFlag(debug_flag); } -// TODO(zhanghao): now we do the check here -// it would be better to do the check in ir_pass before adding the "VTABufferCPUPtr" void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) { auto data_buf = vta::DataBuffer::FromHandle(buffer); if (data_buf) { @@ -1645,5 +1616,4 @@ int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) { 
} void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip) { - static_cast(cmd)-> - Synchronize(wait_cycles, skip); } + static_cast(cmd)->Synchronize(wait_cycles, skip); } diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index ea6b9cf1e9da..1d940c2ac9be 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -246,12 +246,12 @@ def _run(env, remote): reconfig_runtime(remote) elif device == "arm_cpu": target = env.target_vta_cpu - with autotvm.tophub.context(target, extra_files = ['vta.resnet18_v1.log-manual-formatv0_2']): # load pre-tuned schedule parameters + with autotvm.tophub.context(target): # load pre-tuned schedule parameters for _, wl in resnet_wkls: print(wl) run_conv2d(env, remote, wl, target) vta.testing.run(_run) if __name__ == "__main__": - # test_conv2d(device="arm_cpu") + test_conv2d(device="arm_cpu") test_conv2d(device="vta") diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index 8a9a09c76856..a5f03cdc22c7 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -37,7 +37,6 @@ from vta.top import graph_pack import copy -from tvm.contrib.util import eprint ################################################################# # Compile network @@ -145,7 +144,6 @@ def log_to_file(file_out, protocol='json'): def _callback(_, inputs, results): with open(file_out, "a") as f: for inp, result in zip(inputs, results): - eprint("inp = {}, result = {}".format(inp, result)) f.write(record.encode(inp, result, protocol) + "\n") # we only consider task with same lhs and rhs diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 7e537fae9128..3f62f15b6490 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -76,7 +76,7 @@ # Perform vta-specific compilation with Relay from a Gluon model -def compile_network(env, target, model, start_pack, stop_pack, device_annot=False): +def compile_network(env, target, model, start_pack, stop_pack): # Populate the shape and data type dictionary dtype_dict = {"data": 'float32'} @@ -104,8 +104,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals env.BLOCK_OUT, env.WGT_WIDTH, start_name=start_pack, - stop_name=stop_pack, - device_annot=device_annot) + stop_name=stop_pack) return relay_prog, params @@ -195,7 +194,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals # The ``start_pack`` and ``stop_pack`` labels indicate where # to start and end the graph packing relay pass: in other words # where to start and finish offloading to VTA. -network = "resnet50_v2" +network = "resnet18_v1" start_pack = "nn.max_pool2d" stop_pack = "nn.global_avg_pool2d" @@ -368,7 +367,7 @@ def tune_and_evaluate(tuning_opt): tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) # We should have extracted 10 convolution tasks - # assert len(tasks) == 10 + assert len(tasks) == 10 print("Extracted {} conv2d tasks:".format(len(tasks))) for tsk in tasks: inp = tsk.args[0][1] @@ -386,7 +385,7 @@ def tune_and_evaluate(tuning_opt): # We do not run the tuning in our webpage server since it takes too long. # Comment the following line to run it by yourself. 
- # return + return # run tuning tasks print("Tuning...") @@ -402,9 +401,9 @@ def tune_and_evaluate(tuning_opt): if target.device_name != "vta": with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build(relay_prog, - target=target, - params=params, - target_host=env.target_host) + target=target, + params=params, + target_host=env.target_host) else: targets = { "cpu": env.target_vta_cpu, diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index b61f594872c1..a168d30c7498 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -59,7 +59,6 @@ import vta from vta.testing import simulator from vta.top import graph_pack -from tvm.contrib.util import eprint # Make sure that TVM was compiled with RPC=1 assert tvm.runtime.enabled("rpc") @@ -96,7 +95,7 @@ # The ``start_pack`` and ``stop_pack`` labels indicate where # to start and end the graph packing relay pass: in other words # where to start and finish offloading to VTA. -model = "resnet50_v2" +model = "resnet18_v1" assert model in pack_dict ###################################################################### @@ -197,7 +196,7 @@ env.BLOCK_OUT, env.WGT_WIDTH, start_name=pack_dict[model][0], - stop_name=pack_dict[model][1]) + stop_name=pack_dict[model][1], device_annot=True) else: relay_prog = mod["main"] diff --git a/vta/tutorials/frontend/deploy_dcgan.py b/vta/tutorials/frontend/deploy_dcgan.py index 95a3731f98f9..6aaff4301258 100644 --- a/vta/tutorials/frontend/deploy_dcgan.py +++ b/vta/tutorials/frontend/deploy_dcgan.py @@ -20,7 +20,6 @@ import vta from vta.testing import simulator from vta.top import graph_pack -from tvm.contrib.util import eprint # Make sure that TVM was compiled with RPC=1 assert tvm.runtime.enabled("rpc") @@ -145,7 +144,6 @@ m = graph_runtime.create(graph, lib, ctxes) image = np.zeros((1, 100), dtype=np.float32) -eprint("image", image.dtype, image) image = np.repeat(image, env.BATCH, axis=0) # Set the network parameters and inputs diff --git a/vta/tutorials/frontend/deploy_mobilenet.py b/vta/tutorials/frontend/deploy_mobilenet.py index 8a94a588741e..9cf9dd98b09c 100644 --- a/vta/tutorials/frontend/deploy_mobilenet.py +++ b/vta/tutorials/frontend/deploy_mobilenet.py @@ -20,7 +20,6 @@ import vta from vta.testing import simulator from vta.top import graph_pack -from tvm.contrib.util import eprint # Make sure that TVM was compiled with RPC=1 assert tvm.runtime.enabled("rpc") From d6993845ea2891fdefb782de5ac0dc997427fd20 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 15 Jun 2020 00:27:32 +0800 Subject: [PATCH 30/44] refine graphpack and deploy exp --- vta/python/vta/top/graphpack.py | 28 ++++++++++------- .../frontend/deploy_classification.py | 31 ++++++++++--------- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 0934ed15d8b9..ea2a20dd8797 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -510,7 +510,10 @@ def graph_pack(expr, stop_name="nn.global_avg_pool2d", start_name_idx=None, stop_name_idx=None, - count_meta=False, device_annot=False): + count_meta=False, + device_annot=False, + annot_start_name="nn.conv2d", + annot_end_name="annotation.stop_fusion"): """Pack the graph into batch&channel packed format. Parameters @@ -547,6 +550,15 @@ def graph_pack(expr, 'expr.astext(show_meta_data=False)'. 
When count_meta is True, the operator increase logic would count the meta. + device_annot: boolean, optional + if we want to annoate the device_type + + annot_start_name: str, optional + device annotation start node, from which we mark the nodes as `ext_dev` + + annot_end_name: str, optional + device annotation end node, after which we mark the nodes as 'cpu' + Returns ------- expr : Expr @@ -568,18 +580,12 @@ def graph_pack(expr, expr_locator = ExprLocater() expr_locator.visit(expr) - # FIXME(zhanghao): generalize this part - # from the first int conv2d to the last int stop_fusion, all will run on vta - conv2d = op.op.get("nn.conv2d") - conv2d_transpose = op.op.get("nn.conv2d_transpose") - stop_fusion = op.op.get("annotation.stop_fusion") - if (conv2d, "int32") in expr_locator.op2nodes: - start = expr_locator.op2nodes[(conv2d, "int32")][0] - else: - start = expr_locator.op2nodes[(conv2d_transpose, "int32")][0] + annot_start = op.op.get(annot_start_name) + start = expr_locator.op2nodes[(annot_start, "int32")][0] + annot_end = op.op.get(annot_end_name) # we mark the next op to the last stop_fusion on cpu device - end = expr_locator.op2nodes[(stop_fusion, "int8")][-1] + 1 + end = expr_locator.op2nodes[(annot_end, "int8")][-1] + 1 device_annot = ExprDeviceAnnot(start=start, end=end) expr = device_annot.visit(expr) diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index a168d30c7498..fe5b62890922 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -75,11 +75,6 @@ # or ``device=vta`` to run inference on the FPGA. device = "vta" target = env.target if device == "vta" else env.target_vta_cpu -# multiple targets to run both on cpu and vta -targets = { - "cpu": env.target_vta_cpu, - "ext_dev": env.target -} # Dictionary lookup for when to start/end bit packing pack_dict = { @@ -140,8 +135,7 @@ remote = rpc.LocalSession() # Get execution context from remote -# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) -ctxes = [remote.ext_dev(0), remote.cpu(0)] +ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) ###################################################################### # Build the inference graph runtime @@ -160,9 +154,7 @@ # # Load pre-configured AutoTVM schedules -log_file = "%s.%s.log-manual-formatv0_2" % (device, model) -alu_log_file = "%s.alu.%s.log" % (device, model) -with autotvm.tophub.context(target, extra_files=[log_file, alu_log_file]): +with autotvm.tophub.context(target): # Populate the shape and data type dictionary for ImageNet classifier input dtype_dict = {"data": 'float32'} @@ -207,9 +199,15 @@ relay_prog, target=target, params=params, target_host=env.target_host) else: + if env.TARGET == "intelfocl": + # multiple targets to run both on cpu and vta + target = { + "cpu": env.target_vta_cpu, + "ext_dev": target + } with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( - relay_prog, target=targets, + relay_prog, target=target, params=params, target_host=env.target_host) # Measure Relay build time @@ -222,8 +220,13 @@ remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") - # Graph runtime - m = graph_runtime.create(graph, lib, ctxes) + + if env.TARGET == "intelfocl": + ctxes = [remote.ext_dev(0), remote.cpu(0)] + m = graph_runtime.create(graph, lib, ctxes) + else: + # Graph runtime + m = graph_runtime.create(graph, lib, ctx) 
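Taken together, the hunks above wire up the heterogeneous intelfocl flow: graph_pack can annotate the packed conv2d region for the FPGA, relay.build accepts a per-device target dict, and the graph runtime is created with one context per device. The following is a minimal sketch of how those pieces fit together, using only the APIs exercised in this patch (graph_pack, vta.build_config, relay.build, graph_runtime.create); `mod`, `params`, and `remote` are assumed to come from the quantization and RPC setup earlier in the tutorial, and the start/stop names correspond to the resnet18_v1 entry of pack_dict.

import vta
from tvm import relay
from tvm.contrib import graph_runtime
from vta.top import graph_pack

env = vta.get_env()

# Pack the quantized module and, for intelfocl, annotate the packed
# region so it is placed on ext_dev while the rest stays on the CPU.
relay_prog = graph_pack(
    mod["main"], env.BATCH, env.BLOCK_OUT, env.WGT_WIDTH,
    start_name="nn.max_pool2d", stop_name="nn.global_avg_pool2d",
    device_annot=(env.TARGET == "intelfocl"))

# Heterogeneous build: one target per device type.
targets = {"cpu": env.target_vta_cpu, "ext_dev": env.target}
with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
    graph, lib, params = relay.build(
        relay_prog, target=targets, params=params,
        target_host=env.target_host)

# One context per device type; the runtime dispatches each node to the
# context that matches its device annotation.
ctxes = [remote.ext_dev(0), remote.cpu(0)]
m = graph_runtime.create(graph, lib, ctxes)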
###################################################################### # Perform image classification inference @@ -261,7 +264,7 @@ # More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator num = 4 # number of times we run module for a single measurement rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) +timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() From 4dbcdf58c76084e08737d700036a765924d8d7e3 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 15 Jun 2020 12:19:17 +0800 Subject: [PATCH 31/44] some bugfix --- src/arith/detect_linear_equation.cc | 2 +- vta/tutorials/frontend/deploy_classification.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc index 18d28b53a431..c9704e3fff4b 100644 --- a/src/arith/detect_linear_equation.cc +++ b/src/arith/detect_linear_equation.cc @@ -152,7 +152,7 @@ class LinearEqDetector : public ExprFunctor Date: Mon, 15 Jun 2020 12:50:59 +0800 Subject: [PATCH 32/44] remove dcgan and mobilenet tutorial --- python/tvm/relay/testing/mobilenet.py | 50 ++--- vta/python/vta/top/op.py | 2 - vta/python/vta/top/vta_conv2d_transpose.py | 12 +- vta/tutorials/frontend/deploy_dcgan.py | 184 ----------------- vta/tutorials/frontend/deploy_mobilenet.py | 225 --------------------- 5 files changed, 20 insertions(+), 453 deletions(-) delete mode 100644 vta/tutorials/frontend/deploy_dcgan.py delete mode 100644 vta/tutorials/frontend/deploy_mobilenet.py diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py index e83336525ea7..d5a4d5f1e08f 100644 --- a/python/tvm/relay/testing/mobilenet.py +++ b/python/tvm/relay/testing/mobilenet.py @@ -44,22 +44,20 @@ def conv_block(data, name, channels, kernel_size=(3, 3), strides=(1, 1), def separable_conv_block(data, name, depthwise_channels, pointwise_channels, kernel_size=(3, 3), downsample=False, padding=(1, 1), - epsilon=1e-5, layout='NCHW', dtype="float32", depthwise_group_factor=1): + epsilon=1e-5, layout='NCHW', dtype="float32"): """Helper function to get a separable conv block""" if downsample: strides = (2, 2) else: strides = (1, 1) # depthwise convolution + bn + relu - wshape = (depthwise_channels, depthwise_group_factor) + kernel_size + wshape = (depthwise_channels, 1) + kernel_size weight = relay.var(name + "_weight", shape=wshape, dtype=dtype) - depthwise_group_factor = min(depthwise_group_factor, depthwise_channels) - groups = int(depthwise_channels/depthwise_group_factor) conv1 = layers.conv2d( data=data, weight=weight, channels=depthwise_channels, - groups=groups, + groups=depthwise_channels, kernel_size=kernel_size, strides=strides, padding=padding, @@ -84,59 +82,47 @@ def separable_conv_block(data, name, depthwise_channels, pointwise_channels, def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), - dtype='float32', alpha=1.0, is_shallow=False, layout='NCHW', - depthwise_group_factor=1): + dtype='float32', alpha=1.0, is_shallow=False, layout='NCHW'): """Function to construct a MobileNet""" data = relay.var("data", shape=data_shape, dtype=dtype) body = conv_block(data, 'conv_block_1', int(32*alpha), strides=(2, 2), layout=layout) body = separable_conv_block(body, 'separable_conv_block_1', int(32*alpha), int(64*alpha), layout=layout, - dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + 
dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_2', int(64*alpha), int(128*alpha), downsample=True, - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_3', int(128*alpha), int(128*alpha), layout=layout, - dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_4', int(128*alpha), int(256*alpha), downsample=True, - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_5', int(256*alpha), int(256*alpha), layout=layout, - dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_6', int(256*alpha), int(512*alpha), downsample=True, - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + layout=layout, dtype=dtype) if is_shallow: body = separable_conv_block(body, 'separable_conv_block_7', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + downsample=True, layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_8', int(1024*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + downsample=True, layout=layout, dtype=dtype) else: for i in range(7, 12): body = separable_conv_block(body, 'separable_conv_block_%d' % i, int(512*alpha), int(512*alpha), - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_12', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + downsample=True, layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_13', - int(1024*alpha), int(1024*alpha), - layout=layout, dtype=dtype, - depthwise_group_factor=depthwise_group_factor) + int(1024*alpha), int(1024*alpha), + layout=layout, dtype=dtype) pool = relay.nn.global_avg_pool2d(data=body, layout=layout) flatten = relay.nn.batch_flatten(data=pool) weight = relay.var('fc_weight') @@ -148,7 +134,7 @@ def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), - dtype='float32', layout='NCHW', depthwise_group_factor=1): + dtype='float32', layout='NCHW'): """Get benchmark workload for mobilenet Parameters @@ -180,5 +166,5 @@ def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), data_shape = tuple([batch_size] + list(image_shape)) net = mobile_net(num_classes=num_classes, data_shape=data_shape, dtype=dtype, alpha=1.0, is_shallow=False, - layout=layout, depthwise_group_factor=depthwise_group_factor) + layout=layout) return create_workload(net) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 617be4b56d19..938fefa1e1cc 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -76,8 +76,6 @@ def schedule_alu_packed(cfg, outs): assert len(outs) == 1 def is_cast_op(op): - # return op.same_as(Op.op.get("cast")) - # FIXME(zhanghao): find a better way to do compare return op.name == 'T_cast' outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs diff --git a/vta/python/vta/top/vta_conv2d_transpose.py 
b/vta/python/vta/top/vta_conv2d_transpose.py index 15383e557c3b..91434e62c79f 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -120,16 +120,8 @@ def _traverse(op): data, kernel = conv2d_stage.op.input_tensors if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: temp = data.op.input_tensors[0] - # FIXME(zhanghao): force merge pad(dilate(xx)) to one load op - # this may cause results in-correct - # disable for now - if False and isinstance(temp.op, tvm.te.ComputeOp) and ("pad" in temp.op.tag or temp.op.name == "DilatedInput"): - pad_data = data - data = temp.op.input_tensors[0] - s[temp.op].compute_inline() - else: - pad_data = data - data = temp + pad_data = data + data = temp else: pad_data = None diff --git a/vta/tutorials/frontend/deploy_dcgan.py b/vta/tutorials/frontend/deploy_dcgan.py deleted file mode 100644 index 6aaff4301258..000000000000 --- a/vta/tutorials/frontend/deploy_dcgan.py +++ /dev/null @@ -1,184 +0,0 @@ -from __future__ import absolute_import, print_function - -import argparse, json, os, requests, sys, time -from io import BytesIO -from os.path import join, isfile -from PIL import Image - -from mxnet.gluon.model_zoo import vision -import numpy as np -from matplotlib import pyplot as plt - -import tvm -from tvm import te -from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, util, download -from tvm.contrib.debugger import debug_runtime -from tvm.relay import transform -import tvm.relay.testing - -import vta -from vta.testing import simulator -from vta.top import graph_pack - -# Make sure that TVM was compiled with RPC=1 -assert tvm.runtime.enabled("rpc") - -###################################################################### -# Define the platform and model targets -# ------------------------------------- -# Execute on CPU vs. VTA, and define the model. - -# Load VTA parameters from the vta/config/vta_config.json file -env = vta.get_env() - -# Set ``device=arm_cpu`` to run inference on the CPU -# or ``device=vta`` to run inference on the FPGA. -device = "vta" -target = env.target if device == "vta" else env.target_vta_cpu -# multiple targets to run both on cpu and vta -targets = { - "cpu": env.target_vta_cpu, - "ext_dev": env.target -} - -model = "DCGAN" - -###################################################################### -# Obtain an execution remote -# -------------------------- -# When target is 'pynq', reconfigure FPGA and runtime. -# Otherwise, if target is 'sim', execute locally. - -if env.TARGET not in ["sim", "tsim", "intelfocl"]: - - # Get remote from tracker node if environment variable is set. - # To set up the tracker, you'll need to follow the "Auto-tuning - # a convolutional network for VTA" tutorial. - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - # Otherwise if you have a device you want to program directly from - # the host, make sure you've set the variables below to the IP of - # your board. - device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") - device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") - if not tracker_host or not tracker_port: - remote = rpc.connect(device_host, int(device_port)) - else: - remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) - - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. 
- reconfig_start = time.time() - vta.reconfig_runtime(remote) - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.program_fpga(remote, bitstream) - - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - -# In simulation mode, host the RPC server locally. -else: - remote = rpc.LocalSession() - -# Get execution context from remote -# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) -ctxes = [remote.ext_dev(0), remote.cpu(0)] - -# Load pre-configured AutoTVM schedules -with autotvm.tophub.context(target): - - # Populate the shape and data type dictionary for ImageNet classifier input - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 100)} - - # get the mobilenet model - mod, params = relay.testing.dcgan.get_workload(batch_size=1, dtype="float32", oshape=(3, 64, 64)) - - # Measure build start time - build_start = time.time() - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - if target.device_name == "vta": - # Perform quantization in Relay - # Note: We set opt_level to 3 in order to fold batch norm - with relay.build_config(opt_level=3): - with relay.quantize.qconfig(global_scale=8.0, - skip_conv_layers=[3]): - mod = relay.quantize.quantize(mod, params=params) - # Perform graph packing and constant folding for VTA target - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - mod["main"], - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name="cast", - stop_name="cast", stop_name_idx=52, device_annot=True) - else: - relay_prog = mod["main"] - - # Compile Relay program with AlterOpLayout disabled - with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): - if target.device_name != "vta": - graph, lib, params = relay.build( - relay_prog, target=target, - params=params, target_host=env.target_host) - else: - with vta.build_config(debug_flag=38): - graph, lib, params = relay.build( - relay_prog, target=targets, - params=params, target_host=env.target_host) - - # Measure Relay build time - build_time = time.time() - build_start - print(model + " inference graph built in {0:.2f}s!".format(build_time)) - - # Graph runtime - m = graph_runtime.create(graph, lib, ctxes) - -image = np.zeros((1, 100), dtype=np.float32) -image = np.repeat(image, env.BATCH, axis=0) - -# Set the network parameters and inputs -m.set_input(**params) -m.set_input('data', image) - -# Perform inference and gather execution statistics -# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator -num = 3 # number of times we run module for a single measurement -rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) - -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - # timer() - m['run']() - - sim_stats = simulator.stats() - print("\nExecution statistics:") - for k, v in sim_stats.items(): - # Since we execute the workload many times, we need to normalize stats - # Note that there is always one warm up run - # Therefore we divide the overall stats by (num * rep + 1) - print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) -else: - m['run']() - print("Run done") - # tcost = timer() - # std = np.std(tcost.results) * 1000 - # mean = tcost.mean * 1000 - # print("\nPerformed inference in %.2fms 
(std = %.2f) for %d samples" % (mean, std, env.BATCH)) - # print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) - -# Get classification results -tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 3, 64, 64), "float32", remote.cpu(0))) -output = tvm_output.asnumpy() -for b in range(env.BATCH): - print(tvm_output.asnumpy()[b]) diff --git a/vta/tutorials/frontend/deploy_mobilenet.py b/vta/tutorials/frontend/deploy_mobilenet.py deleted file mode 100644 index 9cf9dd98b09c..000000000000 --- a/vta/tutorials/frontend/deploy_mobilenet.py +++ /dev/null @@ -1,225 +0,0 @@ -from __future__ import absolute_import, print_function - -import argparse, json, os, requests, sys, time -from io import BytesIO -from os.path import join, isfile -from PIL import Image - -from mxnet.gluon.model_zoo import vision -import numpy as np -from matplotlib import pyplot as plt - -import tvm -from tvm import te -from tvm import rpc, autotvm, relay -from tvm.contrib import graph_runtime, util, download -from tvm.contrib.debugger import debug_runtime -from tvm.relay import transform -import tvm.relay.testing - -import vta -from vta.testing import simulator -from vta.top import graph_pack - -# Make sure that TVM was compiled with RPC=1 -assert tvm.runtime.enabled("rpc") - -###################################################################### -# Define the platform and model targets -# ------------------------------------- -# Execute on CPU vs. VTA, and define the model. - -# Load VTA parameters from the vta/config/vta_config.json file -env = vta.get_env() - -# Set ``device=arm_cpu`` to run inference on the CPU -# or ``device=vta`` to run inference on the FPGA. -device = "vta" -target = env.target if device == "vta" else env.target_vta_cpu -# multiple targets to run both on cpu and vta -targets = { - "cpu": env.target_vta_cpu, - "ext_dev": env.target -} - -model = "mobilenetG" - -###################################################################### -# Obtain an execution remote -# -------------------------- -# When target is 'pynq', reconfigure FPGA and runtime. -# Otherwise, if target is 'sim', execute locally. - -if env.TARGET not in ["sim", "tsim", "intelfocl"]: - - # Get remote from tracker node if environment variable is set. - # To set up the tracker, you'll need to follow the "Auto-tuning - # a convolutional network for VTA" tutorial. - tracker_host = os.environ.get("TVM_TRACKER_HOST", None) - tracker_port = os.environ.get("TVM_TRACKER_PORT", None) - # Otherwise if you have a device you want to program directly from - # the host, make sure you've set the variables below to the IP of - # your board. - device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") - device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") - if not tracker_host or not tracker_port: - remote = rpc.connect(device_host, int(device_port)) - else: - remote = autotvm.measure.request_remote(env.TARGET, tracker_host, int(tracker_port), timeout=10000) - - # Reconfigure the JIT runtime and FPGA. - # You can program the FPGA with your own custom bitstream - # by passing the path to the bitstream file instead of None. - reconfig_start = time.time() - vta.reconfig_runtime(remote) - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.program_fpga(remote, bitstream) - - reconfig_time = time.time() - reconfig_start - print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) - -# In simulation mode, host the RPC server locally. 
-else: - remote = rpc.LocalSession() - -# Get execution context from remote -# ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) -ctxes = [remote.ext_dev(0), remote.cpu(0)] - -# Load pre-configured AutoTVM schedules -with autotvm.tophub.context(target): - - # Populate the shape and data type dictionary for ImageNet classifier input - dtype_dict = {"data": 'float32'} - shape_dict = {"data": (env.BATCH, 3, 224, 224)} - - # get the mobilenet model - mod, params = relay.testing.mobilenet.get_workload(batch_size=1, dtype="float32", - depthwise_group_factor=env.BLOCK_IN) - - # Measure build start time - build_start = time.time() - - # Update shape and type dictionary - shape_dict.update({k: v.shape for k, v in params.items()}) - dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) - - if target.device_name == "vta": - # Perform quantization in Relay - # Note: We set opt_level to 3 in order to fold batch norm - with relay.build_config(opt_level=3): - with relay.quantize.qconfig(global_scale=8.0, - skip_conv_layers=[0]): - mod = relay.quantize.quantize(mod, params=params) - # Perform graph packing and constant folding for VTA target - assert env.BLOCK_IN == env.BLOCK_OUT - relay_prog = graph_pack( - mod["main"], - env.BATCH, - env.BLOCK_OUT, - env.WGT_WIDTH, - start_name="nn.conv2d", - stop_name="nn.global_avg_pool2d") - else: - relay_prog = mod["main"] - - # Compile Relay program with AlterOpLayout disabled - with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): - if target.device_name != "vta": - graph, lib, params = relay.build( - relay_prog, target=target, - params=params, target_host=env.target_host) - else: - with vta.build_config(debug_flag=32): - graph, lib, params = relay.build( - relay_prog, target=targets, - params=params, target_host=env.target_host) - - # Measure Relay build time - build_time = time.time() - build_start - print(model + " inference graph built in {0:.2f}s!".format(build_time)) - - # Graph runtime - m = graph_runtime.create(graph, lib, ctxes) - -###################################################################### -# Perform image classification inference -# -------------------------------------- -# We run classification on an image sample from ImageNet -# We just need to download the categories files, `synset.txt` -# and an input test image. 
- -# Download ImageNet categories -categ_url = "https://github.com/uwsaml/web-data/raw/master/vta/models/" -categ_fn = "synset.txt" -download.download(join(categ_url, categ_fn), categ_fn) -synset = eval(open(categ_fn).read()) - -# Download test image -image_url = 'https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg' -image_fn = 'cat.png' -download.download(image_url, image_fn) - -# Prepare test image for inference -image = Image.open(image_fn).resize((224, 224)) -plt.imshow(image) -plt.show() -image = np.array(image) - np.array([123., 117., 104.]) -image /= np.array([58.395, 57.12, 57.375]) -image = image.transpose((2, 0, 1)) -image = image[np.newaxis, :] -image = np.repeat(image, env.BATCH, axis=0) - -# Set the network parameters and inputs -m.set_input(**params) -m.set_input('data', image) - -# Perform inference and gather execution statistics -# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator -num = 3 # number of times we run module for a single measurement -rep = 3 # number of measurements (we derive std dev from this) -timer = m.module.time_evaluator("run", ctxes[0], number=num, repeat=rep) - -if env.TARGET in ["sim", "tsim"]: - simulator.clear_stats() - timer() - - sim_stats = simulator.stats() - print("\nExecution statistics:") - for k, v in sim_stats.items(): - # Since we execute the workload many times, we need to normalize stats - # Note that there is always one warm up run - # Therefore we divide the overall stats by (num * rep + 1) - print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) -else: - tcost = timer() - std = np.std(tcost.results) * 1000 - mean = tcost.mean * 1000 - print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) - print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) - -# Get classification results -tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) -output = tvm_output.asnumpy() -for b in range(env.BATCH): - top_categories = np.argsort(tvm_output.asnumpy()[b]) - # print("top_categories = ", top_categories) - # Report top-5 classification results - print("\n{} prediction for sample {}".format(model, b)) - print("\t#1:", synset[top_categories[-1]], output[b][top_categories[-1]]) - print("\t#2:", synset[top_categories[-2]], output[b][top_categories[-2]]) - print("\t#3:", synset[top_categories[-3]], output[b][top_categories[-3]]) - print("\t#4:", synset[top_categories[-4]], output[b][top_categories[-4]]) - print("\t#5:", synset[top_categories[-5]], output[b][top_categories[-5]]) - # This just checks that one of the 5 top categories - # is one variety of cat; this is by no means an accurate - # assessment of how quantization affects classification - # accuracy but is meant to catch changes to the - # quantization pass that would accuracy in the CI. 
- cat_detected = False - for k in top_categories[-5:]: - if "cat" in synset[k]: - cat_detected = True - assert(cat_detected) From 75f7272552c8f8ff3f76c754b0b5008860f1ca05 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 15 Jun 2020 20:10:12 +0800 Subject: [PATCH 33/44] some bugfix and code optimize --- src/relay/quantize/realize.cc | 32 +++++++++++++------------------- vta/python/vta/top/op.py | 8 +++++--- vta/python/vta/transform.py | 20 +++++++------------- vta/runtime/runtime.cc | 9 ++++----- 4 files changed, 29 insertions(+), 40 deletions(-) diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 9dbc27d2c5a3..74bef7d1e4ed 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -330,7 +330,7 @@ float ChooseDomScale(const std::vector& nptrs) { /* \brief Unify the dom scale of arguments */ Array UnifyDTypeScale(const Array& ref_args, const Array& args, - DataType* dtype_ptr, Expr* scale_ptr) { + DataType* dtype_ptr, Expr* scale_ptr, DataType dtype = DataType::Void()) { static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); @@ -345,27 +345,19 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args // unify the data type CHECK_EQ(ref_args.size(), args.size()); - DataType dtype; - // FIXME(zhanghao): force to use add(int32, int32) in order to put in VTA ALU - // but this may be not necessary for other devices - // if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { - // dtype = cfg->dtype_input; - // } else { - // dtype = cfg->dtype_activation; - // } - dtype = cfg->dtype_activation; + if (dtype.is_void()) { + if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { + dtype = cfg->dtype_input; + } else { + dtype = cfg->dtype_activation; + } + } + for (size_t i = 0; i < ret.size(); ++i) { auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { - auto new_arg = Cast(ret[i], dtype); - - // FIXME(zhanghao): do not fuse float32 cast - if (nptrs[i]->dtype == DataType::Float(32)) { - ret.Set(i, StopFusion(new_arg)); - } else { - ret.Set(i, new_arg); - } + ret.Set(i, Cast(ret[i], dtype)); } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) && ref_arg->attrs.as()->kind == kQInput) { auto new_arg = Cast(ret[i], cfg->dtype_input); @@ -392,7 +384,9 @@ Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectR if (new_args[0].as() && new_args[1].as()) { DataType dtype; Expr dom_scale; - Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale); + // execute the operation with activation data type. 
+ const QConfig& cfg = QConfig::Current(); + Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation); Expr ret = ForwardOp(ref_call, ret_args); return QRealizeIntExpr(ret, dom_scale, dtype); } diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 938fefa1e1cc..20a7af2c5c1b 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -82,7 +82,6 @@ def is_cast_op(op): output = outs[0] s = te.create_schedule([x.op for x in outs]) te.schedule.AutoInlineInjective(s) - # s[output].fuse(s[output].op.axis) env = get_env() # other target does not support alu-only ops @@ -190,8 +189,11 @@ def multiply_strategy_vta(attrs, inputs, out_type, target): return strategy -reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") -reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") +env = get_env() +# other target does not support alu-only ops +if env.TARGET in ["sim", "tsim", "intelfocl"]: + reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") + reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") @_strategy.conv2d_strategy.register("vta") diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index a8ecb1099a89..abb152d32314 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -381,9 +381,10 @@ def _fold_buffer_dim(buf, scope, elem_block): def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold): elem_block = elem_bytes * 8 // elem_width - if buf.dtype != dtype: - raise RuntimeError("Expect buffer type to be %s instead of %s" % - (dtype, buf.dtype)) + # remove the checking as we have load_int8 insn + # if buf.dtype != dtype: + # raise RuntimeError("Expect buffer type to be %s instead of %s" % + # (dtype, buf.dtype)) shape, strides = buf.shape, buf.strides if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0): raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block)) @@ -549,20 +550,13 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _check_compact(dst) - # FIXME(zhanghao): optimize - # for int8 -> int32 cast/load - orig_dtype = src.dtype - if src.dtype != data_type: - assert(data_type == "int%d" % env.ACC_WIDTH and \ - src.dtype == "int%d" % env.INP_WIDTH) - src.dtype = data_type - x_size, y_size, x_stride, offset = _get_2d_pattern( src, elem_width, elem_bytes, data_type, dst.scope, allow_fold=allow_fold) - if orig_dtype != src.dtype: - src.dtype = orig_dtype + if data_type != src.dtype: + assert(data_type == "int%d" % env.ACC_WIDTH and \ + src.dtype == "int%d" % env.INP_WIDTH) mem_type = env.dev.MEM_ID_ACC_8BIT irb = tvm.tir.ir_builder.create() diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index cf70f7e19361..67f055a04538 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -1078,6 +1078,7 @@ class InsnQueue : public BaseQueue { CHECK(fpga_buff_ != nullptr); CHECK(fpga_buff_phy_); uint32_t buff_size = dram_buffer_.size() * elem_bytes_; + CHECK(buff_size <= kMaxBytes); // Copy contents of DRAM buffer to FPGA buff VTAMemCopyFromHost(fpga_buff_, dram_buffer_.data(), buff_size); @@ -1322,7 +1323,6 @@ class CommandQueue { if (insn_queue_.count() == 0) return; // Synchronization for the queues uop_queue_.AutoReadBarrier(); - insn_queue_.AutoReadBarrier(); // Dump instructions if debug enabled if (debug_flag_ & VTA_DEBUG_DUMP_INSN) { @@ -1333,7 +1333,7 @@ class CommandQueue { VTA_OPCODE_FINISH); // Make sure 
that we don't exceed contiguous physical memory limits - CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER); + CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) <= VTA_MAX_XFER); int timeout = VTADeviceRun(device_, insn_queue_.dram_phy_addr(), insn_queue_.count(), wait_cycles); CHECK_EQ(timeout, 0); @@ -1481,9 +1481,8 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: - // one pending store, one pending load, and one uop - // FIXME(zhanghao): check why there are 5 insns - if ((insn_queue_.count() + 5) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { + // at most: 2 NOP-COMPUTE-STAGE -> 2 NOP-MEMORY-STAGE -> 1 NOP-COMPUTE-STAGE -> 1 FINISH + if ((insn_queue_.count() + 6) * sizeof(VTAGenericInsn) > VTA_MAX_XFER) { this->AutoSync(); } } From c8a357424d9d6e5e77a8ea169e10488d49e73a93 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Tue, 16 Jun 2020 11:29:07 +0800 Subject: [PATCH 34/44] some minor fix and code refine --- python/tvm/autotvm/measure/measure_methods.py | 14 ++++++-------- src/relay/backend/graph_plan_memory.cc | 6 ++++-- src/relay/quantize/realize.cc | 6 ++++++ src/tir/transforms/lower_tvm_builtin.cc | 2 +- vta/runtime/runtime.cc | 14 ++++++++------ vta/tutorials/autotvm/tune_alu_vta.py | 2 +- vta/tutorials/autotvm/tune_relay_vta.py | 4 ++-- vta/tutorials/frontend/deploy_classification.py | 6 +++--- 8 files changed, 31 insertions(+), 23 deletions(-) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index d6b5defb710c..26e13f85c964 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -83,8 +83,6 @@ class LocalBuilder(Builder): If is callable, use it as custom build function, expect lib_format field. """ def __init__(self, timeout=10, n_parallel=None, build_func='default'): - # FIXME(zhanghao): quickfix - use single thread. 
otherwise may cause seg fault - n_parallel = 1 super(LocalBuilder, self).__init__(timeout, n_parallel) if isinstance(build_func, str): @@ -191,7 +189,7 @@ def __init__(self, timeout=10, n_parallel=None, number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1, check_correctness=False): - static_tune = os.getenv("TVM_STATIC_TUNE") + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") if static_tune: if n_parallel is None or n_parallel > 1: print("static tune only allows n_parallel == 1") @@ -385,7 +383,7 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti # pylint: disable=import-outside-toplevel import vta - static_tune = os.getenv("TVM_STATIC_TUNE") + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") if static_tune: debug_flag = 1 << 6 else: @@ -483,7 +481,7 @@ def run_through_rpc(measure_input, build_result, tic = time.time() errno = MeasureErrorNo.NO_ERROR - static_tune = os.getenv("TVM_STATIC_TUNE") + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") try: # upload built module remote = request_remote(*remote_args) @@ -513,8 +511,8 @@ def run_through_rpc(measure_input, build_result, else: func(*args) cost = 0 - insn_dump = os.getenv('TVM_INSN_DUMP', "insn.dump") - insn_cost_file = os.getenv('TVM_INSN_COST', "cost.py") + insn_dump = os.getenv('TVM_INSN_DUMP_FILE', "insn.json") + insn_cost_file = os.getenv('TVM_INSN_COST_FILE', "cost.py") path, filename = os.path.split(insn_cost_file) sys.path.append(path) module_path = filename[:-3] # remove the .py suffix @@ -577,7 +575,7 @@ def request_remote(device_key, host=None, port=None, priority=1, timeout=60): ------ session: RPCSession """ - static_tune = os.getenv("TVM_STATIC_TUNE") + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") if static_tune: return _rpc.LocalSession() diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 66de20dcf4c0..9dfc54212f2e 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -309,9 +309,11 @@ class StorageAllocator : public StorageAllocaBaseVisitor { if (match_range_ == 0) { return this->Alloc(prototype, size); } - // quickfix(zhanghao): we copy all the instructions in a single batch + + // TODO(zhanghao): find a better way to do this + // we copy all the instructions in a single batch // to avoid overwrite shared storage, we do not re-use allocation - const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once) { return this->Alloc(prototype, size); } diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 74bef7d1e4ed..cafae6c2146c 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -387,6 +387,12 @@ Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectR // execute the operation with activation data type. 
const QConfig& cfg = QConfig::Current(); Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation); + for (size_t i = 0; i < ret_args.size(); ++i) { + // do not fuse float32 arg + if (new_args[i].as()->dtype == DataType::Float(32)) { + ret_args.Set(i, StopFusion(ret_args[i])); + } + } Expr ret = ForwardOp(ref_call, ret_args); return QRealizeIntExpr(ret, dom_scale, dtype); } diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 3d54d45015c6..628de0604042 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -86,7 +86,7 @@ class BuiltinLower : public StmtExprMutator { op = stmt.as(); // Get constant allocation bound. int64_t nbytes = GetVectorBytes(op->dtype); - // FIXME(zhanghao): remove special handling for kDLCPU + // NOTE(zhanghao): remove special handling for kDLCPU // otherwise, may cause LLVM parameters match error // if in heterogenous targets // if (device_type_.defined()) { diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 67f055a04538..835f65a7947e 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -1278,10 +1278,10 @@ class CommandQueue { } void Synchronize(uint32_t wait_cycles, bool skip=true) { - if (debug_flag_ & VTA_DEBUG_AUTO_TUNE) { - const char* insn_file = std::getenv("TVM_INSN_DUMP"); + if (debug_flag_ & VTA_DEBUG_LOG_INSN) { + const char* insn_file = std::getenv("TVM_INSN_DUMP_FILE"); if (insn_file == nullptr) { - insn_file = "insn.dump"; + insn_file = "insn.json"; } FILE* out = fopen(insn_file, "w+"); if (out) { @@ -1294,8 +1294,10 @@ class CommandQueue { } // FIXME(zhanghao): It is required to use force_serial - // by using skip and sync at the final layer, we can avoid do DeviceCopy every time - const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + // by using skip and sync at the final layer. 
+ // By doing this, we can avoid do DeviceCopy every time + // consider to make it as a flag when mature + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once && skip) { if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { LOG(ERROR) << @@ -1524,7 +1526,7 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (from_buffer) { // This is an FPGA to host mem transfer // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly - const char* sync_once = std::getenv("TVM_VTA_SYNC_ONCE"); + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once) VTASynchronize(VTATLSCommandHandle(), 1<<31, false); from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index a5f03cdc22c7..68ea96ec4b64 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -113,7 +113,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals 'early_stopping': None, 'measure_option': autotvm.measure_option( - builder=autotvm.LocalBuilder(), + builder=autotvm.LocalBuilder(n_parallel=1), runner=autotvm.RPCRunner(env.TARGET, host=tracker_host, port=tracker_port, diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 3f62f15b6490..8e2da559c6c2 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -208,7 +208,7 @@ def compile_network(env, target, model, start_pack, stop_pack): 'early_stopping': None, 'measure_option': autotvm.measure_option( - builder=autotvm.LocalBuilder(), + builder=autotvm.LocalBuilder(n_parallel=1), runner=autotvm.RPCRunner(env.TARGET, host=tracker_host, port=tracker_port, @@ -395,7 +395,7 @@ def tune_and_evaluate(tuning_opt): with autotvm.tophub.context(target, extra_files=[log_file]): # recompile the programs with device annotations print("Recompile") - relay_prog, params = compile_network(env, target, network, start_pack, stop_pack, device_annot=True) + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) # Compile network print("Compile...") if target.device_name != "vta": diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 73f13b3bf792..63907b996734 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -188,7 +188,7 @@ env.BLOCK_OUT, env.WGT_WIDTH, start_name=pack_dict[model][0], - stop_name=pack_dict[model][1], device_annot=env.TARGET == "intelfocl") + stop_name=pack_dict[model][1], device_annot=(env.TARGET == "intelfocl" or env.TARGET == "sim")) else: relay_prog = mod["main"] @@ -199,7 +199,7 @@ relay_prog, target=target, params=params, target_host=env.target_host) else: - if env.TARGET == "intelfocl": + if env.TARGET == "intelfocl" or env.TARGET == "sim": # multiple targets to run both on cpu and vta target = { "cpu": env.target_vta_cpu, @@ -221,7 +221,7 @@ lib = remote.load_module("graphlib.o") - if env.TARGET == "intelfocl": + if env.TARGET == "intelfocl" or env.TARGET == "sim": ctxes = [remote.ext_dev(0), remote.cpu(0)] m = graph_runtime.create(graph, lib, ctxes) else: From 7ca6f4098505172e7354174343d1e3b82140bbae Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Tue, 16 Jun 2020 17:55:07 +0800 Subject: [PATCH 35/44] remove rapidjson 
dep (use picojson) --- cmake/modules/VTA.cmake | 6 +- vta/runtime/runtime.cc | 168 +++++++++++++++++----------------------- vta/runtime/runtime.h | 2 +- 3 files changed, 74 insertions(+), 102 deletions(-) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 4193fbaf657f..b586800efe2d 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -27,6 +27,9 @@ endif() message(STATUS "VTA build with VTA_HW_PATH=" ${VTA_HW_PATH}) +# enable picojson int type support +add_definitions(-DPICOJSON_USE_INT64) + if(MSVC) message(STATUS "VTA build is skipped in Windows..") elseif(PYTHON) @@ -108,13 +111,13 @@ elseif(PYTHON) # Target lib: vta add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) target_include_directories(vta PUBLIC vta/runtime) + target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta PUBLIC ${__strip_def}) endforeach() if(${VTA_TARGET} STREQUAL "pynq" OR ${VTA_TARGET} STREQUAL "ultra96") - target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) target_link_libraries(vta ${__cma_lib}) elseif(${VTA_TARGET} STREQUAL "de10nano") # DE10-Nano rules #target_compile_definitions(vta PUBLIC VTA_MAX_XFER=2097152) # (1<<21) @@ -124,7 +127,6 @@ elseif(PYTHON) "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") - target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 835f65a7947e..628f702de7fc 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -40,9 +40,7 @@ #include #include -#include -#include -#include +#include #include #include @@ -845,35 +843,23 @@ class InsnQueue : public BaseQueue { // Iterate over all instructions int insn_count = count(); const VTAGenericInsn* insn = data(); - // FIXME(zhanghao): rapidjson dep - rapidjson::StringBuffer s; - rapidjson::Writer writer(s); + picojson::array jarr; if (!json) { fprintf(out, "There are %u instructions\n", insn_count); - } else { - writer.StartArray(); } for (int i = 0; i < insn_count; ++i) { // Fetch instruction and decode opcode c.generic = insn[i]; + picojson::object kv; if (json) { - writer.StartObject(); - writer.Key("name"); - writer.String(GetOpName(c).c_str()); - - writer.Key("type"); - writer.String(GetOpcodeName(c).c_str()); - - writer.Key("pop_prev"); - writer.Int(c.mem.pop_prev_dep); - writer.Key("pop_next"); - writer.Int(c.mem.pop_next_dep); - writer.Key("push_prev"); - writer.Int(c.mem.push_prev_dep); - writer.Key("push_next"); - writer.Int(c.mem.push_next_dep); + kv["name"] = picojson::value(GetOpName(c).c_str()); + kv["type"] = picojson::value(GetOpcodeName(c).c_str()); + kv["pop_prev"] = picojson::value(static_cast(c.mem.pop_prev_dep)); + kv["pop_next"] = picojson::value(static_cast(c.mem.pop_next_dep)); + kv["push_prev"] = picojson::value(static_cast(c.mem.push_prev_dep)); + kv["push_next"] = picojson::value(static_cast(c.mem.push_next_dep)); } else { fprintf(out, "INSTRUCTION %u: ", i); fprintf(out, "%s\n", GetOpName(c).c_str()); @@ -887,25 +873,21 @@ class InsnQueue : public BaseQueue { if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { if 
(json) { - writer.Key("dram"); - writer.Uint64(c.mem.dram_base); - writer.Key("sram"); - writer.Uint64(c.mem.sram_base); - - writer.Key("y"); - writer.StartArray(); - writer.Uint64(c.mem.y_size); - writer.Uint64(c.mem.y_pad_0); - writer.Uint64(c.mem.y_pad_1); - writer.EndArray(); - - writer.Key("x"); - writer.StartArray(); - writer.Uint64(c.mem.x_size); - writer.Uint64(c.mem.x_pad_0); - writer.Uint64(c.mem.x_pad_1); - writer.Uint64(c.mem.x_stride); - writer.EndArray(); + kv["dram"] = picojson::value(static_cast(c.mem.dram_base)); + kv["sram"] = picojson::value(static_cast(c.mem.sram_base)); + + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.mem.y_size))); + arr.push_back(picojson::value(static_cast(c.mem.y_pad_0))); + arr.push_back(picojson::value(static_cast(c.mem.y_pad_1))); + kv["y"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.mem.x_size))); + arr.push_back(picojson::value(static_cast(c.mem.x_pad_0))); + arr.push_back(picojson::value(static_cast(c.mem.x_pad_1))); + arr.push_back(picojson::value(static_cast(c.mem.x_stride))); + kv["x"] = picojson::value(arr); } else { fprintf(out, "\tDRAM: 0x%08x, SRAM:0x%04x\n", static_cast(c.mem.dram_base), @@ -922,29 +904,26 @@ class InsnQueue : public BaseQueue { } } else if (c.mem.opcode == VTA_OPCODE_GEMM) { if (json) { - writer.Key("reset_out"); - writer.Int(c.gemm.reset_reg); - writer.Key("range"); - writer.StartArray(); - writer.Int(c.gemm.uop_bgn); - writer.Int(c.gemm.uop_end); - writer.EndArray(); - - writer.Key("outer_loop"); - writer.StartArray(); - writer.Int(c.gemm.iter_out); - writer.Int(c.gemm.wgt_factor_out), - writer.Int(c.gemm.src_factor_out), - writer.Int(c.gemm.dst_factor_out); - writer.EndArray(); - - writer.Key("inner_loop"); - writer.StartArray(); - writer.Int(c.gemm.iter_in); - writer.Int(c.gemm.wgt_factor_in), - writer.Int(c.gemm.src_factor_in), - writer.Int(c.gemm.dst_factor_in); - writer.EndArray(); + kv["reset_out"] = picojson::value(static_cast(c.gemm.reset_reg)); + + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.gemm.uop_bgn))); + arr.push_back(picojson::value(static_cast(c.gemm.uop_end))); + kv["range"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.gemm.iter_out))); + arr.push_back(picojson::value(static_cast(c.gemm.wgt_factor_out))); + arr.push_back(picojson::value(static_cast(c.gemm.src_factor_out))); + arr.push_back(picojson::value(static_cast(c.gemm.dst_factor_out))); + kv["outer_loop"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.gemm.iter_in))); + arr.push_back(picojson::value(static_cast(c.gemm.wgt_factor_in))); + arr.push_back(picojson::value(static_cast(c.gemm.src_factor_in))); + arr.push_back(picojson::value(static_cast(c.gemm.dst_factor_in))); + kv["inner_loop"] = picojson::value(arr); } else { fprintf(out, "\treset_out: %d\n", static_cast(c.gemm.reset_reg)); fprintf(out, "\trange (%d, %d)\n", @@ -963,27 +942,23 @@ class InsnQueue : public BaseQueue { } } else if (c.mem.opcode == VTA_OPCODE_ALU) { if (json) { - writer.Key("reset_out"); - writer.Int(c.alu.reset_reg); - writer.Key("range"); - writer.StartArray(); - writer.Int(c.alu.uop_bgn); - writer.Int(c.alu.uop_end); - writer.EndArray(); - - writer.Key("outer_loop"); - writer.StartArray(); - writer.Int(c.alu.iter_out); - writer.Int(c.alu.dst_factor_out), - writer.Int(c.alu.src_factor_out), - writer.EndArray(); - - writer.Key("inner_loop"); - writer.StartArray(); - writer.Int(c.alu.iter_in); - 
writer.Int(c.alu.dst_factor_in); - writer.Int(c.alu.src_factor_in), - writer.EndArray(); + kv["reset_out"] = picojson::value(static_cast(c.alu.reset_reg)); + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.alu.uop_bgn))); + arr.push_back(picojson::value(static_cast(c.alu.uop_end))); + kv["range"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.alu.iter_out))); + arr.push_back(picojson::value(static_cast(c.alu.dst_factor_out))); + arr.push_back(picojson::value(static_cast(c.alu.src_factor_out))); + kv["outer_loop"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.alu.iter_in))); + arr.push_back(picojson::value(static_cast(c.alu.dst_factor_in))); + arr.push_back(picojson::value(static_cast(c.alu.src_factor_in))); + kv["inner_loop"] = picojson::value(arr); } else { fprintf(out, "\treset_out: %d\n", static_cast(c.alu.reset_reg)); fprintf(out, "\trange (%d, %d)\n", @@ -1027,16 +1002,12 @@ class InsnQueue : public BaseQueue { if (c.gemm.push_next_dep) g2s_queue++; } if (json) { - writer.Key("l2g_queue"); - writer.Int(l2g_queue); - writer.Key("g2l_queue"); - writer.Int(g2l_queue); - writer.Key("s2g_queue"); - writer.Int(s2g_queue); - writer.Key("g2s_queue"); - writer.Int(g2s_queue); - - writer.EndObject(); + kv["l2g_queue"] = picojson::value(static_cast(l2g_queue)); + kv["g2l_queue"] = picojson::value(static_cast(g2l_queue)); + kv["s2g_queue"] = picojson::value(static_cast(s2g_queue)); + kv["g2s_queue"] = picojson::value(static_cast(g2s_queue)); + + jarr.push_back(picojson::value(kv)); } else { fprintf(out, "\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); fprintf(out, "\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); @@ -1044,9 +1015,8 @@ class InsnQueue : public BaseQueue { } if (json) { - writer.EndArray(); - auto str = s.GetString(); - fwrite(str, 1, s.GetSize(), out); + auto str = picojson::value(jarr).serialize(); + fwrite(str.c_str(), 1, str.size(), out); } } // Commit all pending pop of corresponding stage diff --git a/vta/runtime/runtime.h b/vta/runtime/runtime.h index 22cf15a91503..a61906e98ff6 100644 --- a/vta/runtime/runtime.h +++ b/vta/runtime/runtime.h @@ -41,7 +41,7 @@ extern "C" { #define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3) #define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4) #define VTA_DEBUG_FORCE_SERIAL (1 << 5) -#define VTA_DEBUG_AUTO_TUNE (1 << 6) +#define VTA_DEBUG_LOG_INSN (1 << 6) /*! * \brief Allocate data buffer. 
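
Note on the picojson pattern used in the DumpInsn rewrite above: each instruction becomes a picojson::object, the objects are collected into a picojson::array, and the whole array is serialized once at the end instead of streaming through a rapidjson writer. A minimal standalone sketch of that pattern follows; it assumes picojson.h is on the include path and PICOJSON_USE_INT64 is defined (as the CMake change in this patch does), and the field names and values are illustrative only, not the exact VTA instruction fields:

    // Sketch of the array-of-objects pattern used by DumpInsn when json == true.
    // Build (path is a placeholder): g++ -DPICOJSON_USE_INT64 -I<picojson dir> sketch.cc -o sketch
    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <picojson.h>

    int main() {
      picojson::array jarr;                                   // one entry per instruction
      for (int i = 0; i < 2; ++i) {
        picojson::object kv;                                  // one JSON object per instruction
        kv["name"] = picojson::value("LOAD");                 // strings are wrapped directly
        kv["pop_prev"] = picojson::value(static_cast<int64_t>(i));  // integer fields must be widened to int64_t

        picojson::array range;                                // nested arrays are built, then wrapped as a value
        range.push_back(picojson::value(static_cast<int64_t>(0)));
        range.push_back(picojson::value(static_cast<int64_t>(16)));
        kv["range"] = picojson::value(range);

        jarr.push_back(picojson::value(kv));                  // append the finished object
      }
      std::string str = picojson::value(jarr).serialize();    // serialize the whole array once
      fwrite(str.c_str(), 1, str.size(), stdout);             // same fwrite-based output as DumpInsn
      return 0;
    }
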
From 12554d51e45cb7b31e2d51cb91d85f7d2e153de9 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Tue, 16 Jun 2020 18:14:06 +0800 Subject: [PATCH 36/44] bugfix for tune alu vta --- vta/tutorials/autotvm/tune_alu_vta.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index 68ea96ec4b64..2998b1c57fc9 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -223,7 +223,7 @@ def my_clip(x, a_min, a_max): # init autotvm env to register VTA operator TaskExtractEnv() - @autotvm.register_customized_task("add.vta") + @autotvm.template("add.vta") def _topi_add(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" A, B = args[:2] @@ -239,7 +239,7 @@ def _topi_add(*args, **kwargs): s = te.create_schedule([res.op]) return s, [A, B, res] - @autotvm.register_customized_task("multiply.vta") + @autotvm.template("multiply.vta") def _topi_multiply(*args, **kwargs): assert not kwargs, "Do not support kwargs in template function call" A, B = args[:2] @@ -255,22 +255,6 @@ def _topi_multiply(*args, **kwargs): s = te.create_schedule([res.op]) return s, [A, B, res] - @autotvm.register_customized_task("copy.vta") - def _topi_identity(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - A = args[0] - - with tvm.target.vta(): - res = vta.top.op.copy_packed(*args, **kwargs) - res = my_clip(res, 0, 127) - res = topi.cast(res, "int8") - - if tvm.target.Target.current().device_name == 'vta': - s = vta.top.op.schedule_copy_packed([res]) - else: - s = te.create_schedule([res.op]) - return s, [A, res] - ######################################################################## # Finally, we launch tuning jobs and evaluate the end-to-end performance. From b8d842ed5c9baa6c5a27c419895cf40da21f2c02 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 18 Jun 2020 16:02:20 +0800 Subject: [PATCH 37/44] cleanup --- python/tvm/relay/quantize/_annotate.py | 2 - src/relay/backend/graph_plan_memory.cc | 4 +- src/relay/transforms/device_annotation.cc | 4 +- src/tir/transforms/inject_copy_intrin.cc | 14 +---- vta.resnet18_v1.log-manual-formatv0_2 | 10 ---- vta/python/vta/top/op.py | 6 ++- vta/python/vta/transform.py | 1 - vta/runtime/runtime.cc | 54 +++++++++---------- vta/tutorials/autotvm/tune_alu_vta.py | 9 ++-- vta/tutorials/autotvm/tune_relay_vta.py | 21 ++------ .../frontend/deploy_classification.py | 9 +--- 11 files changed, 45 insertions(+), 89 deletions(-) delete mode 100644 vta.resnet18_v1.log-manual-formatv0_2 diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 08930527b443..f902a0abf80e 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -195,8 +195,6 @@ def conv2d_transpose_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) -# TODO(tmoreau89,ziheng) need to include an option to turn off dense quant -# @register_annotate_function("nn.dense") @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. 
Lhs of dense will be quantized to input field, and rhs of diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 9dfc54212f2e..4a1bfd874b5c 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -311,8 +311,8 @@ class StorageAllocator : public StorageAllocaBaseVisitor { } // TODO(zhanghao): find a better way to do this - // we copy all the instructions in a single batch - // to avoid overwrite shared storage, we do not re-use allocation + // We copy all the instructions of all layers in a single batch. + // To avoid overwrite shared storage, we do not re-use allocation const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once) { return this->Alloc(prototype, size); diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index fe3cfebf7fe3..319f9ba59064 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -538,9 +538,7 @@ Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } -Map CollectDeviceAnnotationOps(const Expr& expr) { - return AnnotatationVisitor::GetAnnotations(expr); -} +Map CollectDeviceAnnotationOps(const Expr& expr) { return AnnotatationVisitor::GetAnnotations(expr); } TVM_REGISTER_GLOBAL("relay.analysis.CollectDeviceInfo").set_body_typed(CollectDeviceInfo); diff --git a/src/tir/transforms/inject_copy_intrin.cc b/src/tir/transforms/inject_copy_intrin.cc index 279274632648..b27459f4bd45 100644 --- a/src/tir/transforms/inject_copy_intrin.cc +++ b/src/tir/transforms/inject_copy_intrin.cc @@ -80,19 +80,7 @@ class CopyIntrinInjector : public StmtMutator { } // for now only support true condition matching if (has_cond) { - auto true_val = sel_true_value.Eval(); - - // TODO(zhanghao): we do cond unfold one more further - // this is used to lift the pad(dilate) to one load op - // However, ignoring false condition may cause incorrect results - PVar sel_cond_extra, sel_true_value_extra, sel_false_value_extra; - bool has_cond_extra = if_then_else(sel_cond_extra, sel_true_value_extra, sel_false_value_extra).Match(true_val) || - select(sel_cond_extra, sel_true_value_extra, sel_false_value_extra).Match(true_val); - if (has_cond_extra) { - load = sel_true_value_extra.Eval().as(); - } else { - load = true_val.as(); - } + load = sel_true_value.Eval().as(); } // cast can be part of the pattern if (cast != nullptr) { diff --git a/vta.resnet18_v1.log-manual-formatv0_2 b/vta.resnet18_v1.log-manual-formatv0_2 deleted file mode 100644 index 7b3c9d61a318..000000000000 --- a/vta.resnet18_v1.log-manual-formatv0_2 +++ /dev/null @@ -1,10 +0,0 @@ -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [16, 8, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 131, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 8]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.0014505], 0, 1.328160047531128, 1578987870.726089], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 16, 7, 7, 1, 32], "int8"], ["TENSOR", [16, 16, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", 
"int32"], {}], "config": {"index": 163, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 8]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.002734464], 0, 1.7085223197937012, 1578988000.5012062], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [8, 4, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 302, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.0008805], 0, 1.2376818656921387, 1578988097.9650147], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [8, 8, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 143, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 14]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.001309522], 0, 1.3671045303344727, 1578988174.358436], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [4, 2, 3, 3, 32, 32], "int8"], [2, 2], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 177, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.00079938], 0, 1.1500802040100098, 1578988361.3194962], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [4, 4, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 681, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 14]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 1]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 2]]}, "result": [[0.001198882], 0, 1.2445652484893799, 1578988503.2178001], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], ["TENSOR", [2, 2, 3, 3, 32, 32], "int8"], [1, 1], [1, 1, 1, 1], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 570, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 4]], ["tile_w", "sp", [-1, 56]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 2]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 2]]}, "result": [[0.001230756], 0, 1.4033727645874023, 1578988610.0491438], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 2, 56, 56, 1, 32], "int8"], 
["TENSOR", [4, 2, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 176, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 28]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.000339938], 0, 1.025542974472046, 1578988875.3407557], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 4, 28, 28, 1, 32], "int8"], ["TENSOR", [8, 4, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 299, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 14]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 4]], ["oc_nthread", "ot", 2], ["h_nthread", "ot", 1]]}, "result": [[0.000387532], 0, 1.095754861831665, 1578988972.0000997], "version": 0.2, "tvm_version": "0.7.dev0"} -{"input": ["ext_dev -device=vta -keys=cpu -model=ultra96_1x32_i8w8a32_15_15_18_17", "conv2d_packed.vta", [["TENSOR", [1, 8, 14, 14, 1, 32], "int8"], ["TENSOR", [16, 8, 1, 1, 32, 32], "int8"], [2, 2], [0, 0, 0, 0], [1, 1], "NCHW1n32c", "int32"], {}], "config": {"index": 67, "code_hash": null, "entity": [["tile_b", "sp", [-1, 1]], ["tile_h", "sp", [-1, 7]], ["tile_w", "sp", [-1, 7]], ["tile_ci", "sp", [-1, 1]], ["tile_co", "sp", [-1, 16]], ["oc_nthread", "ot", 1], ["h_nthread", "ot", 1]]}, "result": [[0.000294566], 0, 0.9454472064971924, 1578989137.6281488], "version": 0.2, "tvm_version": "0.7.dev0"} diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 20a7af2c5c1b..dca42de95ffc 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -35,6 +35,9 @@ from ..environment import get_env +# override to force partition at copy +reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) + # add clip vta strategy def compute_clip_vta(attrs, inputs, output_type): """ Clip operator. 
""" @@ -63,8 +66,7 @@ def clip_strategy_vta(attrs, inputs, out_type, target): @autotvm.register_topi_compute("add.vta") def add_packed(cfg, lhs, rhs): - ret = topi.add(lhs, rhs) - return ret + return topi.add(lhs, rhs) @autotvm.register_topi_compute("multiply.vta") diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index abb152d32314..9a340c6d0406 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -549,7 +549,6 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): allow_fold = True _check_compact(dst) - x_size, y_size, x_stride, offset = _get_2d_pattern( src, elem_width, elem_bytes, data_type, dst.scope, allow_fold=allow_fold) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 628f702de7fc..df20a8e87ed7 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -733,30 +733,30 @@ class InsnQueue : public BaseQueue { } // Helper function: Get Opcode string std::string getOpcodeString(int opcode, bool use_imm, int64_t imm) { - // The string name - if (opcode == VTA_ALU_OPCODE_MIN) { - if (use_imm) { - return std::string("min imm ") + std::to_string(imm); - } else { - return "min"; - } - } else if (opcode == VTA_ALU_OPCODE_MAX) { - if (use_imm) { - return (std::string("max imm ") + std::to_string(imm)); - } else { - return "max"; - } - } else if (opcode == VTA_ALU_OPCODE_ADD) { - if (use_imm) { - return (std::string("add imm ") + std::to_string(imm)); - } else { - return "add"; - } - } else if (opcode == VTA_ALU_OPCODE_SHR) { - return (std::string("shr ") + std::to_string(imm)); - } else if (opcode == VTA_ALU_OPCODE_MUL) { - return "mul"; - } + // The string name + if (opcode == VTA_ALU_OPCODE_MIN) { + if (use_imm) { + return std::string("min imm ") + std::to_string(imm); + } else { + return "min"; + } + } else if (opcode == VTA_ALU_OPCODE_MAX) { + if (use_imm) { + return (std::string("max imm ") + std::to_string(imm)); + } else { + return "max"; + } + } else if (opcode == VTA_ALU_OPCODE_ADD) { + if (use_imm) { + return (std::string("add imm ") + std::to_string(imm)); + } else { + return "add"; + } + } else if (opcode == VTA_ALU_OPCODE_SHR) { + return (std::string("shr ") + std::to_string(imm)); + } else if (opcode == VTA_ALU_OPCODE_MUL) { + return "mul"; + } return "unknown op"; } @@ -832,7 +832,7 @@ class InsnQueue : public BaseQueue { } // Dump instructions in the queue - void DumpInsn(FILE* out = stderr, bool json=false) { + void DumpInsn(FILE* out = stderr, bool json = false) { // Keep tabs on dependence queues int l2g_queue = 0; int g2l_queue = 0; @@ -1265,8 +1265,8 @@ class CommandQueue { // FIXME(zhanghao): It is required to use force_serial // by using skip and sync at the final layer. - // By doing this, we can avoid do DeviceCopy every time - // consider to make it as a flag when mature + // By doing this, we can avoid do DeviceCopy every time. 
+ // TODO: Consider to make it as a flag when mature const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); if (sync_once && skip) { if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index 2998b1c57fc9..f1638ba49432 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -104,7 +104,7 @@ def compile_network(env, target, model, start_pack, stop_pack, device_annot=Fals stop_pack = "nn.global_avg_pool2d" # Tuning option -log_file = "%s.%s.log" % (device, network) +log_file = "%s.alu.%s.log" % (device, network) tuning_option = { 'log_filename': log_file, @@ -267,11 +267,8 @@ def tune_and_evaluate(tuning_opt): tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 8e2da559c6c2..e9d4c48e55e7 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -341,11 +341,8 @@ def tune_and_evaluate(tuning_opt): tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() @@ -393,9 +390,6 @@ def tune_and_evaluate(tuning_opt): # compile kernels with history best records with autotvm.tophub.context(target, extra_files=[log_file]): - # recompile the programs with device annotations - print("Recompile") - relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) # Compile network print("Compile...") if target.device_name != "vta": @@ -405,14 +399,10 @@ def tune_and_evaluate(tuning_opt): params=params, target_host=env.target_host) else: - targets = { - "cpu": env.target_vta_cpu, - "ext_dev": env.target - } - with vta.build_config(opt_level=3, debug_flag=32, disabled_pass={"AlterOpLayout"}): + with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, - target=targets, + target=target, params=params, target_host=env.target_host) @@ -425,8 +415,7 @@ def tune_and_evaluate(tuning_opt): # Generate the graph runtime ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) - ctxes = [ctx, remote.cpu(0)] - m = graph_runtime.create(graph, lib, ctxes) + m = graph_runtime.create(graph, lib, ctx) # upload parameters to device image = tvm.nd.array( diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 63907b996734..33f59bd0e701 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -121,12 +121,7 @@ # by passing the path to the bitstream file instead of None. 
reconfig_start = time.time() vta.reconfig_runtime(remote) - # vta.program_fpga(remote, bitstream=None) - bitstream = os.environ.get("TVM_BIT", None) - if bitstream: - print("Program fpga with {}".format(bitstream)) - vta.program_fpga(remote, bitstream) - + vta.program_fpga(remote, bitstream=None) reconfig_time = time.time() - reconfig_start print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) @@ -261,7 +256,7 @@ m.set_input('data', image) # Perform inference and gather execution statistics -# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator +# More on: :py:method:`tvm.runtime.Module.time_evaluator` num = 4 # number of times we run module for a single measurement rep = 3 # number of measurements (we derive std dev from this) timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) From 02b3ea0cb187dde99b342d64b40bfbf047c67464 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 18 Jun 2020 17:42:55 +0800 Subject: [PATCH 38/44] coding style --- cmake/modules/VTA.cmake | 2 +- src/relay/quantize/realize.cc | 6 ++++-- src/relay/transforms/device_annotation.cc | 4 +++- tests/lint/check_file_type.py | 2 ++ 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index b586800efe2d..cf21ca7c0495 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -104,7 +104,7 @@ elseif(PYTHON) file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc) elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) - file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cpp) + file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cc) list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h) endif() diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index cafae6c2146c..dcf58f12ea56 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -330,7 +330,8 @@ float ChooseDomScale(const std::vector& nptrs) { /* \brief Unify the dom scale of arguments */ Array UnifyDTypeScale(const Array& ref_args, const Array& args, - DataType* dtype_ptr, Expr* scale_ptr, DataType dtype = DataType::Void()) { + DataType* dtype_ptr, Expr* scale_ptr, + DataType dtype = DataType::Void()) { static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); @@ -386,7 +387,8 @@ Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectR Expr dom_scale; // execute the operation with activation data type. 
const QConfig& cfg = QConfig::Current(); - Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale, cfg->dtype_activation); + Array ret_args = UnifyDTypeScale(ref_call->args, new_args, + &dtype, &dom_scale, cfg->dtype_activation); for (size_t i = 0; i < ret_args.size(); ++i) { // do not fuse float32 arg if (new_args[i].as()->dtype == DataType::Float(32)) { diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 319f9ba59064..fe3cfebf7fe3 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -538,7 +538,9 @@ Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } -Map CollectDeviceAnnotationOps(const Expr& expr) { return AnnotatationVisitor::GetAnnotations(expr); } +Map CollectDeviceAnnotationOps(const Expr& expr) { + return AnnotatationVisitor::GetAnnotations(expr); +} TVM_REGISTER_GLOBAL("relay.analysis.CollectDeviceInfo").set_body_typed(CollectDeviceInfo); diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index da3a456dafb6..36bc66ec1784 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -77,6 +77,8 @@ "tokens", # interface definition "idl", + # opencl file + "cl", } # List of file names allowed From a1cd048bea8ffe17f41dcd0595b97cb9d4e15253 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 18 Jun 2020 23:05:15 +0800 Subject: [PATCH 39/44] update vta-hw commit --- 3rdparty/vta-hw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index f0347e202966..ed466d70d01c 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit f0347e202966322fe6a961eab2f4ff963bced2d5 +Subproject commit ed466d70d01c57cde4fde602c8c593b6a8acc531 From 6960c6a38933704d520ace51751a1e866fb8d494 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Thu, 18 Jun 2020 23:52:08 +0800 Subject: [PATCH 40/44] lint --- vta/tutorials/autotvm/tune_alu_vta.py | 2 +- vta/tutorials/autotvm/tune_relay_vta.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index f1638ba49432..3e2c1c57b41b 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -268,7 +268,7 @@ def tune_and_evaluate(tuning_opt): timeout=10000) # Reconfigure the JIT runtime and FPGA. vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream) + vta.program_fpga(remote, bitstream=None) else: # In simulation mode, host the RPC server locally. remote = rpc.LocalSession() diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index e9d4c48e55e7..9ae54cba0992 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -341,8 +341,8 @@ def tune_and_evaluate(tuning_opt): tracker_port, timeout=10000) # Reconfigure the JIT runtime and FPGA. - vta.reconfig_runtime(remote) - vta.program_fpga(remote, bitstream) + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) else: # In simulation mode, host the RPC server locally. 
remote = rpc.LocalSession() From 14020b7ac8319b1766e82950c99eea9a60d49e6c Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Sat, 20 Jun 2020 23:27:49 +0800 Subject: [PATCH 41/44] clean up unneeded code --- vta/tutorials/autotvm/tune_alu_vta.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py index 3e2c1c57b41b..cf4922450ce5 100644 --- a/vta/tutorials/autotvm/tune_alu_vta.py +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -290,8 +290,6 @@ def tune_and_evaluate(tuning_opt): tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) # filter out float alu task tasks = list(filter(lambda t: t.args[0][2] != "float32", tasks)) - # filter const rhs, which will be fused with conv2d - # tasks = list(filter(lambda t: len(t.args[1][1]) < 1, tasks)) # We should have extracted 10 convolution tasks tasks_set = {} From b6c1763ffcc04682fca2a62e3d873561185df68c Mon Sep 17 00:00:00 2001 From: Li Jiashu Date: Thu, 16 Jul 2020 01:45:27 +0800 Subject: [PATCH 42/44] Move AOCLUtils from Intel FPGA into 3rdparty directory --- 3rdparty/aoclutils/aocl_utils.h | 32 ++ 3rdparty/aoclutils/opencl.cc | 555 +++++++++++++++++++++++++++++++ 3rdparty/aoclutils/opencl.h | 122 +++++++ 3rdparty/aoclutils/options.cc | 105 ++++++ 3rdparty/aoclutils/options.h | 137 ++++++++ 3rdparty/aoclutils/scoped_ptrs.h | 165 +++++++++ 3rdparty/vta-hw | 2 +- cmake/modules/VTA.cmake | 3 +- 8 files changed, 1119 insertions(+), 2 deletions(-) create mode 100644 3rdparty/aoclutils/aocl_utils.h create mode 100644 3rdparty/aoclutils/opencl.cc create mode 100644 3rdparty/aoclutils/opencl.h create mode 100644 3rdparty/aoclutils/options.cc create mode 100644 3rdparty/aoclutils/options.h create mode 100644 3rdparty/aoclutils/scoped_ptrs.h diff --git a/3rdparty/aoclutils/aocl_utils.h b/3rdparty/aoclutils/aocl_utils.h new file mode 100644 index 000000000000..70e0fc6bcc0a --- /dev/null +++ b/3rdparty/aoclutils/aocl_utils.h @@ -0,0 +1,32 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Main include file for AOCLUtils. Includes all other utility header files. 
+ +#ifndef AOCL_UTILS_H +#define AOCL_UTILS_H + +#include "opencl.h" +#include "scoped_ptrs.h" +#include "options.h" + +#endif + diff --git a/3rdparty/aoclutils/opencl.cc b/3rdparty/aoclutils/opencl.cc new file mode 100644 index 000000000000..04d989d7c9ea --- /dev/null +++ b/3rdparty/aoclutils/opencl.cc @@ -0,0 +1,555 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include + +#ifdef _WIN32 // Windows +#include +#else // Linux +#include +#include // readlink, chdir +#endif + +namespace aocl_utils { + +static const char *const VERSION_STR = "161"; + +////////////////////////////////////////// +// Host allocation functions for alignment +////////////////////////////////////////// + +// This is the minimum alignment requirement to ensure DMA can be used. 
+const unsigned AOCL_ALIGNMENT = 64; + +#ifdef _WIN32 // Windows +void *alignedMalloc(size_t size) { + return _aligned_malloc (size, AOCL_ALIGNMENT); +} + +void alignedFree(void * ptr) { + _aligned_free(ptr); +} +#else // Linux +void *alignedMalloc(size_t size) { + void *result = NULL; + int rc; + rc = posix_memalign (&result, AOCL_ALIGNMENT, size); + (void) rc; + return result; +} + +void alignedFree(void * ptr) { + free (ptr); +} +#endif + +/////////////////////////////// +// Error functions +/////////////////////////////// + +// Print the error associciated with an error code +void printError(cl_int error) { + // Print error message + switch(error) + { + case -1: + printf("CL_DEVICE_NOT_FOUND "); + break; + case -2: + printf("CL_DEVICE_NOT_AVAILABLE "); + break; + case -3: + printf("CL_COMPILER_NOT_AVAILABLE "); + break; + case -4: + printf("CL_MEM_OBJECT_ALLOCATION_FAILURE "); + break; + case -5: + printf("CL_OUT_OF_RESOURCES "); + break; + case -6: + printf("CL_OUT_OF_HOST_MEMORY "); + break; + case -7: + printf("CL_PROFILING_INFO_NOT_AVAILABLE "); + break; + case -8: + printf("CL_MEM_COPY_OVERLAP "); + break; + case -9: + printf("CL_IMAGE_FORMAT_MISMATCH "); + break; + case -10: + printf("CL_IMAGE_FORMAT_NOT_SUPPORTED "); + break; + case -11: + printf("CL_BUILD_PROGRAM_FAILURE "); + break; + case -12: + printf("CL_MAP_FAILURE "); + break; + case -13: + printf("CL_MISALIGNED_SUB_BUFFER_OFFSET "); + break; + case -14: + printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST "); + break; + + case -30: + printf("CL_INVALID_VALUE "); + break; + case -31: + printf("CL_INVALID_DEVICE_TYPE "); + break; + case -32: + printf("CL_INVALID_PLATFORM "); + break; + case -33: + printf("CL_INVALID_DEVICE "); + break; + case -34: + printf("CL_INVALID_CONTEXT "); + break; + case -35: + printf("CL_INVALID_QUEUE_PROPERTIES "); + break; + case -36: + printf("CL_INVALID_COMMAND_QUEUE "); + break; + case -37: + printf("CL_INVALID_HOST_PTR "); + break; + case -38: + printf("CL_INVALID_MEM_OBJECT "); + break; + case -39: + printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR "); + break; + case -40: + printf("CL_INVALID_IMAGE_SIZE "); + break; + case -41: + printf("CL_INVALID_SAMPLER "); + break; + case -42: + printf("CL_INVALID_BINARY "); + break; + case -43: + printf("CL_INVALID_BUILD_OPTIONS "); + break; + case -44: + printf("CL_INVALID_PROGRAM "); + break; + case -45: + printf("CL_INVALID_PROGRAM_EXECUTABLE "); + break; + case -46: + printf("CL_INVALID_KERNEL_NAME "); + break; + case -47: + printf("CL_INVALID_KERNEL_DEFINITION "); + break; + case -48: + printf("CL_INVALID_KERNEL "); + break; + case -49: + printf("CL_INVALID_ARG_INDEX "); + break; + case -50: + printf("CL_INVALID_ARG_VALUE "); + break; + case -51: + printf("CL_INVALID_ARG_SIZE "); + break; + case -52: + printf("CL_INVALID_KERNEL_ARGS "); + break; + case -53: + printf("CL_INVALID_WORK_DIMENSION "); + break; + case -54: + printf("CL_INVALID_WORK_GROUP_SIZE "); + break; + case -55: + printf("CL_INVALID_WORK_ITEM_SIZE "); + break; + case -56: + printf("CL_INVALID_GLOBAL_OFFSET "); + break; + case -57: + printf("CL_INVALID_EVENT_WAIT_LIST "); + break; + case -58: + printf("CL_INVALID_EVENT "); + break; + case -59: + printf("CL_INVALID_OPERATION "); + break; + case -60: + printf("CL_INVALID_GL_OBJECT "); + break; + case -61: + printf("CL_INVALID_BUFFER_SIZE "); + break; + case -62: + printf("CL_INVALID_MIP_LEVEL "); + break; + case -63: + printf("CL_INVALID_GLOBAL_WORK_SIZE "); + break; + default: + printf("UNRECOGNIZED ERROR CODE (%d)", error); + } +} + 
+// Print line, file name, and error code if there is an error. Exits the +// application upon error. +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...) { + // If not successful + if(error != CL_SUCCESS) { + // Print line and file + printf("ERROR: "); + printError(error); + printf("\nLocation: %s:%d\n", file, line); + + // Print custom message. + va_list vl; + va_start(vl, msg); + vprintf(msg, vl); + printf("\n"); + va_end(vl); + + // Cleanup and bail. + cleanup(); + exit(error); + } +} + +// Sets the current working directory to be the same as the directory +// containing the running executable. +bool setCwdToExeDir() { +#ifdef _WIN32 // Windows + HMODULE hMod = GetModuleHandle(NULL); + char path[MAX_PATH]; + GetModuleFileNameA(hMod, path, MAX_PATH); + +#else // Linux + // Get path of executable. + char path[300]; + ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1); + if(n == -1) { + return false; + } + path[n] = 0; +#endif + + // Find the last '\' or '/' and terminate the path there; it is now + // the directory containing the executable. + size_t i; + for(i = strlen(path) - 1; i > 0 && path[i] != '/' && path[i] != '\\'; --i); + path[i] = '\0'; + + // Change the current directory. +#ifdef _WIN32 // Windows + SetCurrentDirectoryA(path); +#else // Linux + int rc; + rc = chdir(path); + (void) rc; +#endif + + return true; +} + +// Searches all platforms for the first platform whose name +// contains the search string (case-insensitive). +cl_platform_id findPlatform(const char *platform_name_search) { + cl_int status; + + std::string search = platform_name_search; + std::transform(search.begin(), search.end(), search.begin(), tolower); + + // Get number of platforms. + cl_uint num_platforms; + status = clGetPlatformIDs(0, NULL, &num_platforms); + checkError(status, "Query for number of platforms failed"); + + // Get a list of all platform ids. + scoped_array pids(num_platforms); + status = clGetPlatformIDs(num_platforms, pids, NULL); + checkError(status, "Query for all platform ids failed"); + + // For each platform, get name and compare against the search string. + for(unsigned i = 0; i < num_platforms; ++i) { + std::string name = getPlatformName(pids[i]); + + // Convert to lower case. + std::transform(name.begin(), name.end(), name.begin(), tolower); + + if(name.find(search) != std::string::npos) { + // Found! + return pids[i]; + } + } + + // No platform found. + return NULL; +} + +// Returns the platform name. +std::string getPlatformName(cl_platform_id pid) { + cl_int status; + + size_t sz; + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz); + checkError(status, "Query for platform name size failed"); + + scoped_array name(sz); + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL); + checkError(status, "Query for platform name failed"); + + return name.get(); +} + +// Returns the device name. +std::string getDeviceName(cl_device_id did) { + cl_int status; + + size_t sz; + status = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &sz); + checkError(status, "Failed to get device name size"); + + scoped_array name(sz); + status = clGetDeviceInfo(did, CL_DEVICE_NAME, sz, name, NULL); + checkError(status, "Failed to get device name"); + + return name.get(); +} + +// Returns the list of all devices. 
+cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) { + cl_int status; + + status = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices); + checkError(status, "Query for number of devices failed"); + + cl_device_id *dids = new cl_device_id[*num_devices]; + status = clGetDeviceIDs(pid, dev_type, *num_devices, dids, NULL); + checkError(status, "Query for device ids"); + + return dids; +} + +// Create a program for all devices associated with the context. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) { + // Early exit for potentially the most common way to fail: AOCX does not exist. + if(!fileExists(binary_file_name)) { + printf("AOCX file '%s' does not exist.\n", binary_file_name); + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + // Load the binary. + size_t binary_size; + scoped_array binary(loadBinaryFile(binary_file_name, &binary_size)); + if(binary == NULL) { + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + scoped_array binary_lengths(num_devices); + scoped_array binaries(num_devices); + for(unsigned i = 0; i < num_devices; ++i) { + binary_lengths[i] = binary_size; + binaries[i] = binary; + } + + cl_int status; + scoped_array binary_status(num_devices); + + cl_program program = clCreateProgramWithBinary(context, num_devices, devices, binary_lengths, + (const unsigned char **) binaries.get(), binary_status, &status); + checkError(status, "Failed to create program with binary"); + for(unsigned i = 0; i < num_devices; ++i) { + checkError(binary_status[i], "Failed to load binary for device"); + } + + return program; +} + +// Loads a file in binary form. +unsigned char *loadBinaryFile(const char *file_name, size_t *size) { + // Open the File + FILE* fp; +#ifdef _WIN32 + if(fopen_s(&fp, file_name, "rb") != 0) { + return NULL; + } +#else + fp = fopen(file_name, "rb"); + if(fp == 0) { + return NULL; + } +#endif + + // Get the size of the file + fseek(fp, 0, SEEK_END); + *size = ftell(fp); + + // Allocate space for the binary + unsigned char *binary = new unsigned char[*size]; + + // Go back to the file start + rewind(fp); + + // Read the file into the binary + if(fread((void*)binary, *size, 1, fp) == 0) { + delete[] binary; + fclose(fp); + return NULL; + } + + return binary; +} + +bool fileExists(const char *file_name) { +#ifdef _WIN32 // Windows + DWORD attrib = GetFileAttributesA(file_name); + return (attrib != INVALID_FILE_ATTRIBUTES && !(attrib & FILE_ATTRIBUTE_DIRECTORY)); +#else // Linux + return access(file_name, R_OK) != -1; +#endif +} + +std::string getBoardBinaryFile(const char *prefix, cl_device_id device) { + // First check if .aocx exists. Use it if it does. + std::string file_name = std::string(prefix) + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + + // Now get the name of the board. For Intel(R) FPGA SDK for OpenCL(TM) boards, + // the name of the device is presented as: + // : ... + std::string device_name = getDeviceName(device); + + // Now search for the " :" in the device name. + size_t end = device_name.find(" :"); + if(end != std::string::npos) { + std::string board_name(device_name, 0, end); + + // Look for a AOCX with the name __.aocx. + file_name = std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + } + + // At this point just use .aocx. 
This file doesn't exist + // and this should trigger an error later. + return std::string(prefix) + ".aocx"; +} + +// High-resolution timer. +double getCurrentTimestamp() { +#ifdef _WIN32 // Windows + // Use the high-resolution performance counter. + + static LARGE_INTEGER ticks_per_second = {}; + if(ticks_per_second.QuadPart == 0) { + // First call - get the frequency. + QueryPerformanceFrequency(&ticks_per_second); + } + + LARGE_INTEGER counter; + QueryPerformanceCounter(&counter); + + double seconds = double(counter.QuadPart) / double(ticks_per_second.QuadPart); + return seconds; +#else // Linux + timespec a; + clock_gettime(CLOCK_MONOTONIC, &a); + return (double(a.tv_nsec) * 1.0e-9) + double(a.tv_sec); +#endif +} + +cl_ulong getStartEndTime(cl_event event) { + cl_int status; + + cl_ulong start, end; + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + return end - start; +} + +cl_ulong getStartEndTime(cl_event *events, unsigned num_events) { + cl_int status; + + cl_ulong min_start = 0; + cl_ulong max_end = 0; + for(unsigned i = 0; i < num_events; ++i) { + cl_ulong start, end; + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + if(i == 0) { + min_start = start; + max_end = end; + } + else { + if(start < min_start) { + min_start = start; + } + if(end > max_end) { + max_end = end; + } + } + } + + return max_end - min_start; +} + +void waitMilliseconds(unsigned ms) { +#ifdef _WIN32 // Windows + Sleep(ms); +#else // Linux + timespec sleeptime = {0, 0}; + sleeptime.tv_sec = ms / 1000; + sleeptime.tv_nsec = long(ms % 1000) * 1000000L; // convert to nanoseconds + nanosleep(&sleeptime, NULL); +#endif +} + +void oclContextCallback(const char *errinfo, const void *, size_t, void *) { + printf("Context callback: %s\n", errinfo); +} + +} // ns aocl_utils + diff --git a/3rdparty/aoclutils/opencl.h b/3rdparty/aoclutils/opencl.h new file mode 100644 index 000000000000..4aa5348b67b1 --- /dev/null +++ b/3rdparty/aoclutils/opencl.h @@ -0,0 +1,122 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// OpenCL utility functions. + +#ifndef AOCL_UTILS_OPENCL_H +#define AOCL_UTILS_OPENCL_H + +#include +#include +#include +#include + +#include "CL/opencl.h" + +// This is assumed to be externally provided by the application. +extern void cleanup(); + +namespace aocl_utils { + +// Host allocation functions +void *alignedMalloc(size_t size); +void alignedFree(void *ptr); + +// Error functions +void printError(cl_int error); +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...); // does not return +#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__) + +// Sets the current working directory to the same directory that contains +// this executable. Returns true on success. +bool setCwdToExeDir(); + +// Find a platform that contains the search string in its name (case-insensitive match). +// Returns NULL if no match is found. +cl_platform_id findPlatform(const char *platform_name_search); + +// Returns the name of the platform. +std::string getPlatformName(cl_platform_id pid); + +// Returns the name of the device. +std::string getDeviceName(cl_device_id did); + +// Returns an array of device ids for the given platform and the +// device type. +// Return value must be freed with delete[]. +cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices); + +// Create a OpenCL program from a binary file. +// The program is created for all given devices associated with the context. The same +// binary is used for all devices. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices); + +// Load binary file. +// Return value must be freed with delete[]. +unsigned char *loadBinaryFile(const char *file_name, size_t *size); + +// Checks if a file exists. +bool fileExists(const char *file_name); + +// Returns the path to the AOCX file to use for the given device. +// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM). +// It uses the device name to get the board name and then looks for a +// corresponding AOCX file. Specifically, it gets the device name and +// extracts the board name assuming the device name has the following format: +// : ... +// +// Then the AOCX file is __.aocx. If this +// file does not exist, then the file name defaults to .aocx. +std::string getBoardBinaryFile(const char *prefix, cl_device_id device); + +// Returns the time from a high-resolution timer in seconds. This value +// can be used with a value returned previously to measure a high-resolution +// time difference. +double getCurrentTimestamp(); + +// Returns the difference between the CL_PROFILING_COMMAND_END and +// CL_PROFILING_COMMAND_START values of a cl_event object. +// This requires that the command queue associated with the event be created +// with the CL_QUEUE_PROFILING_ENABLE property. +// +// The return value is in nanoseconds. +cl_ulong getStartEndTime(cl_event event); + +// Returns the maximum time span for the given set of events. +// The time span starts at the earliest event start time. 
+// The time span ends at the latest event end time. +cl_ulong getStartEndTime(cl_event *events, unsigned num_events); + +// Wait for the specified number of milliseconds. +void waitMilliseconds(unsigned ms); + +// OpenCL context callback function that simply prints the error information +// to stdout (via printf). +void oclContextCallback(const char *errinfo, const void *, size_t, void *); + +} // ns aocl_utils + +#endif + diff --git a/3rdparty/aoclutils/options.cc b/3rdparty/aoclutils/options.cc new file mode 100644 index 000000000000..05d025b43faf --- /dev/null +++ b/3rdparty/aoclutils/options.cc @@ -0,0 +1,105 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include +#include +#include + +namespace aocl_utils { + +Options::Options() { +} + +Options::Options(int num, char *argv[]) { + addFromCommandLine(num, argv); +} + +bool Options::has(const std::string &name) const { + return m_options.find(name) != m_options.end(); +} + +std::string &Options::get(const std::string &name) { + return m_options[name]; +} + +const std::string &Options::get(const std::string &name) const { + OptionMap::const_iterator it = m_options.find(name); + if(it == m_options.end()) { + errorNonExistent(name); + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); + } + return it->second; +} + +void Options::addFromCommandLine(int num, char *argv[]) { + for(int i = 1; i < num; ++i) { + const std::string arg = argv[i]; + + // Look for the first '-'. + if(arg.size() > 1 && arg[0] == '-') { + size_t eq = arg.find('='); + size_t name_start = 1; + + // Check if there's a second '-'. + if(arg.size() > 2 && arg[1] == '-') { + name_start = 2; + } + + if(eq == std::string::npos) { + // No '='; treat as a boolean option. + set(arg.substr(name_start), true); + } + else if(eq == name_start) { + // No name?! + errorNameless(); + } + else { + set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1)); + } + } + else { + // Not an option. 
+ m_nonoptions.push_back(arg); + } + } +} + +void Options::errorNameless() const { + std::cerr << "No name provided for option.\n"; + exit(1); +} + +void Options::errorNonExistent(const std::string &name) const { + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); +} + +void Options::errorWrongType(const std::string &name) const { + std::cerr << "Value for option '" << name << "' is not of the right type (value = '" + << get(name) << "').\n"; + exit(1); +} + +} // ns aocl_utils + diff --git a/3rdparty/aoclutils/options.h b/3rdparty/aoclutils/options.h new file mode 100644 index 000000000000..78d34605e60e --- /dev/null +++ b/3rdparty/aoclutils/options.h @@ -0,0 +1,137 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Declares a utility class used to parse command-line options. + +#ifndef AOCL_UTILS_OPTIONS_H +#define AOCL_UTILS_OPTIONS_H + +#include +#include +#include +#include + +namespace aocl_utils { + +class Options { +public: + typedef std::vector StringVec; + + Options(); + Options(int num, char *argv[]); + + bool has(const std::string &name) const; + std::string &get(const std::string &name); // will create an empty option if it does not exist + const std::string &get(const std::string &name) const; // error if option does not exist + + void set(const std::string &name, const std::string &value) { get(name) = value; } + + // Command line options must be of the following form: + // [-]-name (indicates option exists) + // [-]-name=value + // + // This function assumes that the values are from main(int, char *). + // This means that the argv[0] is skipped. + void addFromCommandLine(int num, char *argv[]); + + // This templated function converts the option value to the given type. + // An assert is raised if the conversion fails. + template + T get(const std::string &name) const; + + template + void set(const std::string &name, const T &value); + + // Non-options are arguments processed in addFromCommandLine + // that were not recognized as options. 
+  const StringVec &getNonOptions() const { return m_nonoptions; }
+  size_t getNonOptionCount() const { return m_nonoptions.size(); }
+  const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; }
+
+private:
+  typedef std::map<std::string, std::string> OptionMap;
+
+  // Displays an error message indicating that a nameless option
+  // was provided.
+  void errorNameless() const;
+
+  // Displays an error message indicating that the given option
+  // has the wrong type and then exits with an error code.
+  void errorWrongType(const std::string &name) const;
+
+  // Displays an error message indicating that the given option
+  // does not exist and then exits with an error code.
+  void errorNonExistent(const std::string &name) const;
+
+  OptionMap m_options;
+  StringVec m_nonoptions;
+
+  Options(const Options &); // not implemented
+  void operator =(const Options &); // not implemented
+};
+
+template<typename T>
+T Options::get(const std::string &name) const {
+  std::stringstream ss;
+  ss << get(name);
+
+  T v;
+  ss >> v;
+  if(ss.fail() || !ss.eof()) {
+    // Failed to parse or did not consume the whole string value.
+    errorWrongType(name);
+  }
+  return v;
+}
+
+// Specialization for bool.
+template<>
+inline bool Options::get<bool>(const std::string &name) const {
+  if(has(name)) {
+    const std::string &v = get(name);
+    if(v == "1") {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Specialization for std::string. Simply returns the option string.
+// Requires specialization because using stringstream to read the string
+// will stop at the first whitespace character (which is wrong).
+template<>
+inline std::string Options::get<std::string>(const std::string &name) const {
+  return get(name);
+}
+
+// This assumes the type T can be serialized to a string and back (when get
+// is called).
+template<typename T>
+void Options::set(const std::string &name, const T &value) {
+  std::stringstream ss;
+  ss << value;
+  set(name, ss.str());
+}
+
+} // ns aocl_utils
+
+#endif
+
diff --git a/3rdparty/aoclutils/scoped_ptrs.h b/3rdparty/aoclutils/scoped_ptrs.h
new file mode 100644
index 000000000000..b11085c5226e
--- /dev/null
+++ b/3rdparty/aoclutils/scoped_ptrs.h
@@ -0,0 +1,165 @@
+// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this
+// software and associated documentation files (the "Software"), to deal in the Software
+// without restriction, including without limitation the rights to use, copy, modify, merge,
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
+// whom the Software is furnished to do so, subject to the following conditions:
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of California and
+// by the laws of the United States of America.
+
+// Scoped pointer definitions.
+
+#ifndef AOCL_UTILS_SCOPED_PTRS_H
+#define AOCL_UTILS_SCOPED_PTRS_H
+
+namespace aocl_utils {
+
+// Interface is essentially the combination of std::auto_ptr and boost's smart pointers,
+// along with some small extensions (auto conversion to T*).
+
+// scoped_ptr: assumes pointer was allocated with operator new; destroys with operator delete
+template<class T>
+class scoped_ptr {
+public:
+  typedef scoped_ptr<T> this_type;
+
+  scoped_ptr() : m_ptr(NULL) {}
+  scoped_ptr(T *ptr) : m_ptr(ptr) {}
+  ~scoped_ptr() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { delete m_ptr; m_ptr = ptr; }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_ptr(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+// scoped_array: assumes pointer was allocated with operator new[]; destroys with operator delete[]
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T.
+template<class T>
+class scoped_array {
+public:
+  typedef scoped_array<T> this_type;
+
+  scoped_array() : m_ptr(NULL) {}
+  scoped_array(T *ptr) : m_ptr(NULL) { reset(ptr); }
+  explicit scoped_array(size_t n) : m_ptr(NULL) { reset(n); }
+  ~scoped_array() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+  T &operator [](int index) const { return m_ptr[index]; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { delete[] m_ptr; m_ptr = ptr; }
+  void reset(size_t n) { reset(new T[n]); }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_array(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+// scoped_aligned_ptr: assumes pointer was allocated with alignedMalloc; destroys with alignedFree
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T
+template<class T>
+class scoped_aligned_ptr {
+public:
+  typedef scoped_aligned_ptr<T> this_type;
+
+  scoped_aligned_ptr() : m_ptr(NULL) {}
+  scoped_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); }
+  explicit scoped_aligned_ptr(size_t n) : m_ptr(NULL) { reset(n); }
+  ~scoped_aligned_ptr() { reset(); }
+
+  T *get() const { return m_ptr; }
+  operator T *() const { return m_ptr; }
+  T *operator ->() const { return m_ptr; }
+  T &operator *() const { return *m_ptr; }
+  T &operator [](int index) const { return m_ptr[index]; }
+
+  this_type &operator =(T *ptr) { reset(ptr); return *this; }
+
+  void reset(T *ptr = NULL) { if(m_ptr) alignedFree(m_ptr); m_ptr = ptr; }
+  void reset(size_t n) { reset((T*) alignedMalloc(sizeof(T) * n)); }
+  T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; }
+
+private:
+  T *m_ptr;
+
+  // noncopyable
+  scoped_aligned_ptr(const this_type &);
+  this_type &operator =(const this_type &);
+};
+
+#if USE_SVM_API == 1
+// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree
+// Also supports allocation/reset with a number, which is the number of
+// elements of type T
+template<class T>
+class scoped_SVM_aligned_ptr {
+public:
+  typedef scoped_SVM_aligned_ptr<T> this_type;
+
+  scoped_SVM_aligned_ptr() : m_ptr(NULL) {}
+  scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL)
{ reset(ptr); } + explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); } + ~scoped_SVM_aligned_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; } + void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + cl_context m_ctx; + + // noncopyable + scoped_SVM_aligned_ptr(const this_type &); + this_type &operator =(const this_type &); +}; +#endif /* USE_SVM_API == 1 */ + +} // ns aocl_utils + +#endif + diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index ed466d70d01c..98860a2a31ec 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit ed466d70d01c57cde4fde602c8c593b6a8acc531 +Subproject commit 98860a2a31ecc4aaf7c3346daa750d26193847e4 diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index cf21ca7c0495..6c35b8df07d0 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -104,7 +104,7 @@ elseif(PYTHON) file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc) elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) - file(GLOB AOCLUTIL_SRC ${VTA_HW_PATH}/src/intelfocl/AOCLUtils/*.cc) + file(GLOB AOCLUTIL_SRC 3rdparty/aoclutils/*.cc) list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h) endif() @@ -126,6 +126,7 @@ elseif(PYTHON) target_include_directories(vta PUBLIC "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules + target_include_directories(vta PUBLIC 3rdparty) target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) From c0f918ccbbd20e3141ac3f3e2c4b3fab2f41e581 Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Sat, 18 Jul 2020 23:04:06 +0800 Subject: [PATCH 43/44] remove unnecessary comment --- vta/python/vta/transform.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index f53a8ae7923e..c6ba1b95a8cb 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -382,10 +382,6 @@ def _fold_buffer_dim(buf, scope, elem_block): def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold): elem_block = elem_bytes * 8 // elem_width - # remove the checking as we have load_int8 insn - # if buf.dtype != dtype: - # raise RuntimeError("Expect buffer type to be %s instead of %s" % - # (dtype, buf.dtype)) shape, strides = buf.shape, buf.strides if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0): raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block)) From 348fb91b6ba34b2df771d73d588370e612b5cd1e Mon Sep 17 00:00:00 2001 From: Zhang Hao Date: Mon, 20 Jul 2020 15:25:15 +0800 Subject: [PATCH 44/44] api to program intelfocl aocx --- 
vta/python/vta/program_bitstream.py | 10 +++++++++- vta/python/vta/rpc_client.py | 14 ++++++++++---- vta/tutorials/frontend/deploy_classification.py | 4 ++++ 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/vta/python/vta/program_bitstream.py b/vta/python/vta/program_bitstream.py index 62cb5f21d02a..9a48ba75378e 100644 --- a/vta/python/vta/program_bitstream.py +++ b/vta/python/vta/program_bitstream.py @@ -54,7 +54,13 @@ def de10nano_bitstream_program(bitstream_path): program = get_global_func("vta.de10nano.program") program(bitstream_path) -def bitstream_program(target, bitstream): +def intelfocl_bitstream_program(bitstream_path, mem_size=4*1024*1024*1024): + # pylint: disable=import-outside-toplevel + from tvm import get_global_func + program = get_global_func("vta.intelfocl.program") + program(bitstream_path, mem_size) + +def bitstream_program(target, bitstream, *args): if target in ['pynq', 'ultra96']: pynq_bitstream_program(bitstream) elif target in ['de10nano']: @@ -62,6 +68,8 @@ def bitstream_program(target, bitstream): elif target in ['sim', 'tsim']: # In simulation, bit stream programming is a no-op return + elif target in ['intelfocl']: + intelfocl_bitstream_program(bitstream, *args) else: raise RuntimeError("Unknown target {}".format(target)) diff --git a/vta/python/vta/rpc_client.py b/vta/python/vta/rpc_client.py index 097ea8e4a5cc..c76a8c77cb67 100644 --- a/vta/python/vta/rpc_client.py +++ b/vta/python/vta/rpc_client.py @@ -19,6 +19,8 @@ from .environment import get_env from .bitstream import download_bitstream, get_bitstream_path +from tvm import rpc +from vta import program_bitstream def reconfig_runtime(remote): """Reconfigure remote runtime based on current hardware spec. @@ -44,16 +46,20 @@ def program_fpga(remote, bitstream=None): bitstream : str, optional Path to a local bistream file. If unset, tries to download from cache server. """ + env = get_env() + if bitstream: assert os.path.isfile(bitstream) else: bitstream = get_bitstream_path() if not os.path.isfile(bitstream): - env = get_env() if env.TARGET == 'de10nano': return download_bitstream() - fprogram = remote.get_function("tvm.contrib.vta.init") - remote.upload(bitstream) - fprogram(os.path.basename(bitstream)) + if isinstance(remote, rpc.LocalSession): + program_bitstream.bitstream_program(env.TARGET, bitstream) + else: + fprogram = remote.get_function("tvm.contrib.vta.init") + remote.upload(bitstream) + fprogram(os.path.basename(bitstream)) diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 33f59bd0e701..a9676a0096e8 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -129,6 +129,10 @@ else: remote = rpc.LocalSession() + if env.TARGET in ["intelfocl"]: + # program intelfocl aocx + vta.program_fpga(remote, bitstream="vta_opencl.aocx") + # Get execution context from remote ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
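
For reference, a minimal sketch of how the programming API added in this patch can be exercised from Python. It is not part of the patch: the bitstream file name simply follows the tutorial hunk above, the explicit 2 GiB memory size is only an illustrative override of the 4 GiB default, and the snippet assumes a local VTA build whose config selects the "intelfocl" target.

    import vta
    from tvm import rpc
    from vta import program_bitstream

    env = vta.get_env()
    assert env.TARGET == "intelfocl"   # target comes from the VTA config

    # intelfocl boards are programmed from the local host, so a LocalSession is
    # used; program_fpga() detects it and calls bitstream_program() directly
    # instead of uploading the bitstream over RPC.
    remote = rpc.LocalSession()
    vta.program_fpga(remote, bitstream="vta_opencl.aocx")

    # Equivalent low-level call; the extra positional argument is forwarded to
    # intelfocl_bitstream_program() as mem_size (default: 4 GiB).
    program_bitstream.bitstream_program("intelfocl", "vta_opencl.aocx",
                                        2 * 1024 * 1024 * 1024)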