diff --git a/.gitignore b/.gitignore index 506e54d93067..d61b23ee5248 100644 --- a/.gitignore +++ b/.gitignore @@ -234,3 +234,6 @@ conda/pkg # antlr files *.tokens *.interp + +*log* +*.txt diff --git a/3rdparty/aoclutils/aocl_utils.h b/3rdparty/aoclutils/aocl_utils.h new file mode 100644 index 000000000000..70e0fc6bcc0a --- /dev/null +++ b/3rdparty/aoclutils/aocl_utils.h @@ -0,0 +1,32 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Main include file for AOCLUtils. Includes all other utility header files. + +#ifndef AOCL_UTILS_H +#define AOCL_UTILS_H + +#include "opencl.h" +#include "scoped_ptrs.h" +#include "options.h" + +#endif + diff --git a/3rdparty/aoclutils/opencl.cc b/3rdparty/aoclutils/opencl.cc new file mode 100644 index 000000000000..04d989d7c9ea --- /dev/null +++ b/3rdparty/aoclutils/opencl.cc @@ -0,0 +1,555 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include + +#ifdef _WIN32 // Windows +#include +#else // Linux +#include +#include // readlink, chdir +#endif + +namespace aocl_utils { + +static const char *const VERSION_STR = "161"; + +////////////////////////////////////////// +// Host allocation functions for alignment +////////////////////////////////////////// + +// This is the minimum alignment requirement to ensure DMA can be used. +const unsigned AOCL_ALIGNMENT = 64; + +#ifdef _WIN32 // Windows +void *alignedMalloc(size_t size) { + return _aligned_malloc (size, AOCL_ALIGNMENT); +} + +void alignedFree(void * ptr) { + _aligned_free(ptr); +} +#else // Linux +void *alignedMalloc(size_t size) { + void *result = NULL; + int rc; + rc = posix_memalign (&result, AOCL_ALIGNMENT, size); + (void) rc; + return result; +} + +void alignedFree(void * ptr) { + free (ptr); +} +#endif + +/////////////////////////////// +// Error functions +/////////////////////////////// + +// Print the error associciated with an error code +void printError(cl_int error) { + // Print error message + switch(error) + { + case -1: + printf("CL_DEVICE_NOT_FOUND "); + break; + case -2: + printf("CL_DEVICE_NOT_AVAILABLE "); + break; + case -3: + printf("CL_COMPILER_NOT_AVAILABLE "); + break; + case -4: + printf("CL_MEM_OBJECT_ALLOCATION_FAILURE "); + break; + case -5: + printf("CL_OUT_OF_RESOURCES "); + break; + case -6: + printf("CL_OUT_OF_HOST_MEMORY "); + break; + case -7: + printf("CL_PROFILING_INFO_NOT_AVAILABLE "); + break; + case -8: + printf("CL_MEM_COPY_OVERLAP "); + break; + case -9: + printf("CL_IMAGE_FORMAT_MISMATCH "); + break; + case -10: + printf("CL_IMAGE_FORMAT_NOT_SUPPORTED "); + break; + case -11: + printf("CL_BUILD_PROGRAM_FAILURE "); + break; + case -12: + printf("CL_MAP_FAILURE "); + break; + case -13: + printf("CL_MISALIGNED_SUB_BUFFER_OFFSET "); + break; + case -14: + printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST "); + break; + + case -30: + printf("CL_INVALID_VALUE "); + break; + case -31: + printf("CL_INVALID_DEVICE_TYPE "); + break; + case -32: + printf("CL_INVALID_PLATFORM "); + break; + case -33: + printf("CL_INVALID_DEVICE "); + break; + case -34: + printf("CL_INVALID_CONTEXT "); + break; + case -35: + printf("CL_INVALID_QUEUE_PROPERTIES "); + break; + case -36: + printf("CL_INVALID_COMMAND_QUEUE "); + break; + case -37: + printf("CL_INVALID_HOST_PTR "); + break; + case -38: + printf("CL_INVALID_MEM_OBJECT "); + break; + case -39: + printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR "); + break; + case -40: + printf("CL_INVALID_IMAGE_SIZE "); + break; + case -41: + printf("CL_INVALID_SAMPLER "); + break; + case -42: + printf("CL_INVALID_BINARY "); + break; + case -43: + printf("CL_INVALID_BUILD_OPTIONS "); + break; + case -44: + printf("CL_INVALID_PROGRAM "); + break; + case -45: + printf("CL_INVALID_PROGRAM_EXECUTABLE "); + break; + case -46: + printf("CL_INVALID_KERNEL_NAME "); + break; + case -47: + printf("CL_INVALID_KERNEL_DEFINITION "); + break; + case -48: + printf("CL_INVALID_KERNEL "); + break; + case -49: + printf("CL_INVALID_ARG_INDEX "); + break; + case -50: + printf("CL_INVALID_ARG_VALUE "); + break; + case -51: + printf("CL_INVALID_ARG_SIZE "); + break; + case -52: + printf("CL_INVALID_KERNEL_ARGS "); + break; + case -53: + printf("CL_INVALID_WORK_DIMENSION "); + break; + case -54: + printf("CL_INVALID_WORK_GROUP_SIZE "); + break; + case -55: + printf("CL_INVALID_WORK_ITEM_SIZE "); + break; + case -56: + printf("CL_INVALID_GLOBAL_OFFSET "); + break; + case -57: + printf("CL_INVALID_EVENT_WAIT_LIST "); + break; + case -58: + printf("CL_INVALID_EVENT "); + break; + case -59: + printf("CL_INVALID_OPERATION "); + break; + case -60: + printf("CL_INVALID_GL_OBJECT "); + break; + case -61: + printf("CL_INVALID_BUFFER_SIZE "); + break; + case -62: + printf("CL_INVALID_MIP_LEVEL "); + break; + case -63: + printf("CL_INVALID_GLOBAL_WORK_SIZE "); + break; + default: + printf("UNRECOGNIZED ERROR CODE (%d)", error); + } +} + +// Print line, file name, and error code if there is an error. Exits the +// application upon error. +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...) { + // If not successful + if(error != CL_SUCCESS) { + // Print line and file + printf("ERROR: "); + printError(error); + printf("\nLocation: %s:%d\n", file, line); + + // Print custom message. + va_list vl; + va_start(vl, msg); + vprintf(msg, vl); + printf("\n"); + va_end(vl); + + // Cleanup and bail. + cleanup(); + exit(error); + } +} + +// Sets the current working directory to be the same as the directory +// containing the running executable. +bool setCwdToExeDir() { +#ifdef _WIN32 // Windows + HMODULE hMod = GetModuleHandle(NULL); + char path[MAX_PATH]; + GetModuleFileNameA(hMod, path, MAX_PATH); + +#else // Linux + // Get path of executable. + char path[300]; + ssize_t n = readlink("/proc/self/exe", path, sizeof(path)/sizeof(path[0]) - 1); + if(n == -1) { + return false; + } + path[n] = 0; +#endif + + // Find the last '\' or '/' and terminate the path there; it is now + // the directory containing the executable. + size_t i; + for(i = strlen(path) - 1; i > 0 && path[i] != '/' && path[i] != '\\'; --i); + path[i] = '\0'; + + // Change the current directory. +#ifdef _WIN32 // Windows + SetCurrentDirectoryA(path); +#else // Linux + int rc; + rc = chdir(path); + (void) rc; +#endif + + return true; +} + +// Searches all platforms for the first platform whose name +// contains the search string (case-insensitive). +cl_platform_id findPlatform(const char *platform_name_search) { + cl_int status; + + std::string search = platform_name_search; + std::transform(search.begin(), search.end(), search.begin(), tolower); + + // Get number of platforms. + cl_uint num_platforms; + status = clGetPlatformIDs(0, NULL, &num_platforms); + checkError(status, "Query for number of platforms failed"); + + // Get a list of all platform ids. + scoped_array pids(num_platforms); + status = clGetPlatformIDs(num_platforms, pids, NULL); + checkError(status, "Query for all platform ids failed"); + + // For each platform, get name and compare against the search string. + for(unsigned i = 0; i < num_platforms; ++i) { + std::string name = getPlatformName(pids[i]); + + // Convert to lower case. + std::transform(name.begin(), name.end(), name.begin(), tolower); + + if(name.find(search) != std::string::npos) { + // Found! + return pids[i]; + } + } + + // No platform found. + return NULL; +} + +// Returns the platform name. +std::string getPlatformName(cl_platform_id pid) { + cl_int status; + + size_t sz; + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz); + checkError(status, "Query for platform name size failed"); + + scoped_array name(sz); + status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL); + checkError(status, "Query for platform name failed"); + + return name.get(); +} + +// Returns the device name. +std::string getDeviceName(cl_device_id did) { + cl_int status; + + size_t sz; + status = clGetDeviceInfo(did, CL_DEVICE_NAME, 0, NULL, &sz); + checkError(status, "Failed to get device name size"); + + scoped_array name(sz); + status = clGetDeviceInfo(did, CL_DEVICE_NAME, sz, name, NULL); + checkError(status, "Failed to get device name"); + + return name.get(); +} + +// Returns the list of all devices. +cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices) { + cl_int status; + + status = clGetDeviceIDs(pid, dev_type, 0, NULL, num_devices); + checkError(status, "Query for number of devices failed"); + + cl_device_id *dids = new cl_device_id[*num_devices]; + status = clGetDeviceIDs(pid, dev_type, *num_devices, dids, NULL); + checkError(status, "Query for device ids"); + + return dids; +} + +// Create a program for all devices associated with the context. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices) { + // Early exit for potentially the most common way to fail: AOCX does not exist. + if(!fileExists(binary_file_name)) { + printf("AOCX file '%s' does not exist.\n", binary_file_name); + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + // Load the binary. + size_t binary_size; + scoped_array binary(loadBinaryFile(binary_file_name, &binary_size)); + if(binary == NULL) { + checkError(CL_INVALID_PROGRAM, "Failed to load binary file"); + } + + scoped_array binary_lengths(num_devices); + scoped_array binaries(num_devices); + for(unsigned i = 0; i < num_devices; ++i) { + binary_lengths[i] = binary_size; + binaries[i] = binary; + } + + cl_int status; + scoped_array binary_status(num_devices); + + cl_program program = clCreateProgramWithBinary(context, num_devices, devices, binary_lengths, + (const unsigned char **) binaries.get(), binary_status, &status); + checkError(status, "Failed to create program with binary"); + for(unsigned i = 0; i < num_devices; ++i) { + checkError(binary_status[i], "Failed to load binary for device"); + } + + return program; +} + +// Loads a file in binary form. +unsigned char *loadBinaryFile(const char *file_name, size_t *size) { + // Open the File + FILE* fp; +#ifdef _WIN32 + if(fopen_s(&fp, file_name, "rb") != 0) { + return NULL; + } +#else + fp = fopen(file_name, "rb"); + if(fp == 0) { + return NULL; + } +#endif + + // Get the size of the file + fseek(fp, 0, SEEK_END); + *size = ftell(fp); + + // Allocate space for the binary + unsigned char *binary = new unsigned char[*size]; + + // Go back to the file start + rewind(fp); + + // Read the file into the binary + if(fread((void*)binary, *size, 1, fp) == 0) { + delete[] binary; + fclose(fp); + return NULL; + } + + return binary; +} + +bool fileExists(const char *file_name) { +#ifdef _WIN32 // Windows + DWORD attrib = GetFileAttributesA(file_name); + return (attrib != INVALID_FILE_ATTRIBUTES && !(attrib & FILE_ATTRIBUTE_DIRECTORY)); +#else // Linux + return access(file_name, R_OK) != -1; +#endif +} + +std::string getBoardBinaryFile(const char *prefix, cl_device_id device) { + // First check if .aocx exists. Use it if it does. + std::string file_name = std::string(prefix) + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + + // Now get the name of the board. For Intel(R) FPGA SDK for OpenCL(TM) boards, + // the name of the device is presented as: + // : ... + std::string device_name = getDeviceName(device); + + // Now search for the " :" in the device name. + size_t end = device_name.find(" :"); + if(end != std::string::npos) { + std::string board_name(device_name, 0, end); + + // Look for a AOCX with the name __.aocx. + file_name = std::string(prefix) + "_" + board_name + "_" + VERSION_STR + ".aocx"; + if(fileExists(file_name.c_str())) { + return file_name; + } + } + + // At this point just use .aocx. This file doesn't exist + // and this should trigger an error later. + return std::string(prefix) + ".aocx"; +} + +// High-resolution timer. +double getCurrentTimestamp() { +#ifdef _WIN32 // Windows + // Use the high-resolution performance counter. + + static LARGE_INTEGER ticks_per_second = {}; + if(ticks_per_second.QuadPart == 0) { + // First call - get the frequency. + QueryPerformanceFrequency(&ticks_per_second); + } + + LARGE_INTEGER counter; + QueryPerformanceCounter(&counter); + + double seconds = double(counter.QuadPart) / double(ticks_per_second.QuadPart); + return seconds; +#else // Linux + timespec a; + clock_gettime(CLOCK_MONOTONIC, &a); + return (double(a.tv_nsec) * 1.0e-9) + double(a.tv_sec); +#endif +} + +cl_ulong getStartEndTime(cl_event event) { + cl_int status; + + cl_ulong start, end; + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + return end - start; +} + +cl_ulong getStartEndTime(cl_event *events, unsigned num_events) { + cl_int status; + + cl_ulong min_start = 0; + cl_ulong max_end = 0; + for(unsigned i = 0; i < num_events; ++i) { + cl_ulong start, end; + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); + checkError(status, "Failed to query event start time"); + status = clGetEventProfilingInfo(events[i], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); + checkError(status, "Failed to query event end time"); + + if(i == 0) { + min_start = start; + max_end = end; + } + else { + if(start < min_start) { + min_start = start; + } + if(end > max_end) { + max_end = end; + } + } + } + + return max_end - min_start; +} + +void waitMilliseconds(unsigned ms) { +#ifdef _WIN32 // Windows + Sleep(ms); +#else // Linux + timespec sleeptime = {0, 0}; + sleeptime.tv_sec = ms / 1000; + sleeptime.tv_nsec = long(ms % 1000) * 1000000L; // convert to nanoseconds + nanosleep(&sleeptime, NULL); +#endif +} + +void oclContextCallback(const char *errinfo, const void *, size_t, void *) { + printf("Context callback: %s\n", errinfo); +} + +} // ns aocl_utils + diff --git a/3rdparty/aoclutils/opencl.h b/3rdparty/aoclutils/opencl.h new file mode 100644 index 000000000000..4aa5348b67b1 --- /dev/null +++ b/3rdparty/aoclutils/opencl.h @@ -0,0 +1,122 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// OpenCL utility functions. + +#ifndef AOCL_UTILS_OPENCL_H +#define AOCL_UTILS_OPENCL_H + +#include +#include +#include +#include + +#include "CL/opencl.h" + +// This is assumed to be externally provided by the application. +extern void cleanup(); + +namespace aocl_utils { + +// Host allocation functions +void *alignedMalloc(size_t size); +void alignedFree(void *ptr); + +// Error functions +void printError(cl_int error); +void _checkError(int line, + const char *file, + cl_int error, + const char *msg, + ...); // does not return +#define checkError(status, ...) _checkError(__LINE__, __FILE__, status, __VA_ARGS__) + +// Sets the current working directory to the same directory that contains +// this executable. Returns true on success. +bool setCwdToExeDir(); + +// Find a platform that contains the search string in its name (case-insensitive match). +// Returns NULL if no match is found. +cl_platform_id findPlatform(const char *platform_name_search); + +// Returns the name of the platform. +std::string getPlatformName(cl_platform_id pid); + +// Returns the name of the device. +std::string getDeviceName(cl_device_id did); + +// Returns an array of device ids for the given platform and the +// device type. +// Return value must be freed with delete[]. +cl_device_id *getDevices(cl_platform_id pid, cl_device_type dev_type, cl_uint *num_devices); + +// Create a OpenCL program from a binary file. +// The program is created for all given devices associated with the context. The same +// binary is used for all devices. +cl_program createProgramFromBinary(cl_context context, const char *binary_file_name, const cl_device_id *devices, unsigned num_devices); + +// Load binary file. +// Return value must be freed with delete[]. +unsigned char *loadBinaryFile(const char *file_name, size_t *size); + +// Checks if a file exists. +bool fileExists(const char *file_name); + +// Returns the path to the AOCX file to use for the given device. +// This is special handling for examples for the Intel(R) FPGA SDK for OpenCL(TM). +// It uses the device name to get the board name and then looks for a +// corresponding AOCX file. Specifically, it gets the device name and +// extracts the board name assuming the device name has the following format: +// : ... +// +// Then the AOCX file is __.aocx. If this +// file does not exist, then the file name defaults to .aocx. +std::string getBoardBinaryFile(const char *prefix, cl_device_id device); + +// Returns the time from a high-resolution timer in seconds. This value +// can be used with a value returned previously to measure a high-resolution +// time difference. +double getCurrentTimestamp(); + +// Returns the difference between the CL_PROFILING_COMMAND_END and +// CL_PROFILING_COMMAND_START values of a cl_event object. +// This requires that the command queue associated with the event be created +// with the CL_QUEUE_PROFILING_ENABLE property. +// +// The return value is in nanoseconds. +cl_ulong getStartEndTime(cl_event event); + +// Returns the maximum time span for the given set of events. +// The time span starts at the earliest event start time. +// The time span ends at the latest event end time. +cl_ulong getStartEndTime(cl_event *events, unsigned num_events); + +// Wait for the specified number of milliseconds. +void waitMilliseconds(unsigned ms); + +// OpenCL context callback function that simply prints the error information +// to stdout (via printf). +void oclContextCallback(const char *errinfo, const void *, size_t, void *); + +} // ns aocl_utils + +#endif + diff --git a/3rdparty/aoclutils/options.cc b/3rdparty/aoclutils/options.cc new file mode 100644 index 000000000000..05d025b43faf --- /dev/null +++ b/3rdparty/aoclutils/options.cc @@ -0,0 +1,105 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +#include "aocl_utils.h" +#include +#include +#include +#include + +namespace aocl_utils { + +Options::Options() { +} + +Options::Options(int num, char *argv[]) { + addFromCommandLine(num, argv); +} + +bool Options::has(const std::string &name) const { + return m_options.find(name) != m_options.end(); +} + +std::string &Options::get(const std::string &name) { + return m_options[name]; +} + +const std::string &Options::get(const std::string &name) const { + OptionMap::const_iterator it = m_options.find(name); + if(it == m_options.end()) { + errorNonExistent(name); + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); + } + return it->second; +} + +void Options::addFromCommandLine(int num, char *argv[]) { + for(int i = 1; i < num; ++i) { + const std::string arg = argv[i]; + + // Look for the first '-'. + if(arg.size() > 1 && arg[0] == '-') { + size_t eq = arg.find('='); + size_t name_start = 1; + + // Check if there's a second '-'. + if(arg.size() > 2 && arg[1] == '-') { + name_start = 2; + } + + if(eq == std::string::npos) { + // No '='; treat as a boolean option. + set(arg.substr(name_start), true); + } + else if(eq == name_start) { + // No name?! + errorNameless(); + } + else { + set(arg.substr(name_start, eq - name_start), arg.substr(eq + 1)); + } + } + else { + // Not an option. + m_nonoptions.push_back(arg); + } + } +} + +void Options::errorNameless() const { + std::cerr << "No name provided for option.\n"; + exit(1); +} + +void Options::errorNonExistent(const std::string &name) const { + std::cerr << "Option '" << name << "' does not exist.\n"; + exit(1); +} + +void Options::errorWrongType(const std::string &name) const { + std::cerr << "Value for option '" << name << "' is not of the right type (value = '" + << get(name) << "').\n"; + exit(1); +} + +} // ns aocl_utils + diff --git a/3rdparty/aoclutils/options.h b/3rdparty/aoclutils/options.h new file mode 100644 index 000000000000..78d34605e60e --- /dev/null +++ b/3rdparty/aoclutils/options.h @@ -0,0 +1,137 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Declares a utility class used to parse command-line options. + +#ifndef AOCL_UTILS_OPTIONS_H +#define AOCL_UTILS_OPTIONS_H + +#include +#include +#include +#include + +namespace aocl_utils { + +class Options { +public: + typedef std::vector StringVec; + + Options(); + Options(int num, char *argv[]); + + bool has(const std::string &name) const; + std::string &get(const std::string &name); // will create an empty option if it does not exist + const std::string &get(const std::string &name) const; // error if option does not exist + + void set(const std::string &name, const std::string &value) { get(name) = value; } + + // Command line options must be of the following form: + // [-]-name (indicates option exists) + // [-]-name=value + // + // This function assumes that the values are from main(int, char *). + // This means that the argv[0] is skipped. + void addFromCommandLine(int num, char *argv[]); + + // This templated function converts the option value to the given type. + // An assert is raised if the conversion fails. + template + T get(const std::string &name) const; + + template + void set(const std::string &name, const T &value); + + // Non-options are arguments processed in addFromCommandLine + // that were not recognized as options. + const StringVec &getNonOptions() const { return m_nonoptions; } + size_t getNonOptionCount() const { return m_nonoptions.size(); } + const std::string &getNonOption(size_t i) const { return m_nonoptions[i]; } + +private: + typedef std::map OptionMap; + + // Displays an error message indicating that a nameless option + // was provided. + void errorNameless() const; + + // Displays an error message indicating that the given option + // has the wrong type and then exits with an error code. + void errorWrongType(const std::string &name) const; + + // Displays an error message indicating that the given option + // does not exist and then exits with an error code. + void errorNonExistent(const std::string &name) const; + + OptionMap m_options; + StringVec m_nonoptions; + + Options(const Options &); // not implemented + void operator =(const Options &); // not implemented +}; + +template +T Options::get(const std::string &name) const { + std::stringstream ss; + ss << get(name); + + T v; + ss >> v; + if(ss.fail() || !ss.eof()) { + // Failed to parse or did not consume the whole string value. + errorWrongType(name); + } + return v; +} + +// Specialization for bool. +template<> +inline bool Options::get(const std::string &name) const { + if(has(name)) { + const std::string &v = get(name); + if(v == "1") { + return true; + } + } + return false; +} + +// Specialization for std::string. Simply returns the option string. +// Requires specialization because using stringstream to read the string +// will stop at the first whitespace character (which is wrong). +template<> +inline std::string Options::get(const std::string &name) const { + return get(name); +} + +// This assumes the type T can be serialized to a string and back (when get +// is called). +template +void Options::set(const std::string &name, const T &value) { + std::stringstream ss; + ss << value; + set(name, ss.str()); +} + +} // ns aocl_utils + +#endif + diff --git a/3rdparty/aoclutils/scoped_ptrs.h b/3rdparty/aoclutils/scoped_ptrs.h new file mode 100644 index 000000000000..b11085c5226e --- /dev/null +++ b/3rdparty/aoclutils/scoped_ptrs.h @@ -0,0 +1,165 @@ +// Copyright (C) 2013-2018 Altera Corporation, San Jose, California, USA. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy of this +// software and associated documentation files (the "Software"), to deal in the Software +// without restriction, including without limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to +// whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of California and +// by the laws of the United States of America. + +// Scoped pointer definitions. + +#ifndef AOCL_UTILS_SCOPED_PTRS_H +#define AOCL_UTILS_SCOPED_PTRS_H + +namespace aocl_utils { + +// Interface is essentially the combination of std::auto_ptr and boost's smart pointers, +// along with some small extensions (auto conversion to T*). + +// scoped_ptr: assumes pointer was allocated with operator new; destroys with operator delete +template +class scoped_ptr { +public: + typedef scoped_ptr this_type; + + scoped_ptr() : m_ptr(NULL) {} + scoped_ptr(T *ptr) : m_ptr(ptr) {} + ~scoped_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { delete m_ptr; m_ptr = ptr; } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_ptr(const this_type &); + this_type &operator =(const this_type &); +}; + +// scoped_array: assumes pointer was allocated with operator new[]; destroys with operator delete[] +// Also supports allocation/reset with a number, which is the number of +// elements of type T. +template +class scoped_array { +public: + typedef scoped_array this_type; + + scoped_array() : m_ptr(NULL) {} + scoped_array(T *ptr) : m_ptr(NULL) { reset(ptr); } + explicit scoped_array(size_t n) : m_ptr(NULL) { reset(n); } + ~scoped_array() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { delete[] m_ptr; m_ptr = ptr; } + void reset(size_t n) { reset(new T[n]); } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_array(const this_type &); + this_type &operator =(const this_type &); +}; + +// scoped_aligned_ptr: assumes pointer was allocated with alignedMalloc; destroys with alignedFree +// Also supports allocation/reset with a number, which is the number of +// elements of type T +template +class scoped_aligned_ptr { +public: + typedef scoped_aligned_ptr this_type; + + scoped_aligned_ptr() : m_ptr(NULL) {} + scoped_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); } + explicit scoped_aligned_ptr(size_t n) : m_ptr(NULL) { reset(n); } + ~scoped_aligned_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { if(m_ptr) alignedFree(m_ptr); m_ptr = ptr; } + void reset(size_t n) { reset((T*) alignedMalloc(sizeof(T) * n)); } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + + // noncopyable + scoped_aligned_ptr(const this_type &); + this_type &operator =(const this_type &); +}; + +#if USE_SVM_API == 1 +// scoped_SVM_aligned_ptr: assumes pointer was allocated with clSVMAlloc; destroys with clSVMFree +// Also supports allocation/reset with a number, which is the number of +// elements of type T +template +class scoped_SVM_aligned_ptr { +public: + typedef scoped_SVM_aligned_ptr this_type; + + scoped_SVM_aligned_ptr() : m_ptr(NULL) {} + scoped_SVM_aligned_ptr(T *ptr) : m_ptr(NULL) { reset(ptr); } + explicit scoped_SVM_aligned_ptr(cl_context ctx, size_t n) : m_ptr(NULL) { reset(ctx, n); } + ~scoped_SVM_aligned_ptr() { reset(); } + + T *get() const { return m_ptr; } + operator T *() const { return m_ptr; } + T *operator ->() const { return m_ptr; } + T &operator *() const { return *m_ptr; } + T &operator [](int index) const { return m_ptr[index]; } + + this_type &operator =(T *ptr) { reset(ptr); return *this; } + + void reset(T *ptr = NULL) { if (m_ptr) clSVMFree(m_ctx, m_ptr); m_ptr = ptr; } + void reset(cl_context ctx, size_t n) { reset((T*)clSVMAlloc(ctx, 0, sizeof(T) * n, 0)); m_ctx = ctx; } + T *release() { T *ptr = m_ptr; m_ptr = NULL; return ptr; } + +private: + T *m_ptr; + cl_context m_ctx; + + // noncopyable + scoped_SVM_aligned_ptr(const this_type &); + this_type &operator =(const this_type &); +}; +#endif /* USE_SVM_API == 1 */ + +} // ns aocl_utils + +#endif + diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index db65157208ec..98860a2a31ec 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit db65157208ec8fabb7b548c94596211b9db04190 +Subproject commit 98860a2a31ecc4aaf7c3346daa750d26193847e4 diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 0117857783db..6c35b8df07d0 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -27,6 +27,9 @@ endif() message(STATUS "VTA build with VTA_HW_PATH=" ${VTA_HW_PATH}) +# enable picojson int type support +add_definitions(-DPICOJSON_USE_INT64) + if(MSVC) message(STATUS "VTA build is skipped in Windows..") elseif(PYTHON) @@ -99,6 +102,11 @@ elseif(PYTHON) find_library(__cma_lib NAMES cma PATH /usr/lib) elseif(${VTA_TARGET} STREQUAL "de10nano") # DE10-Nano rules file(GLOB FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/de10nano/*.cc ${VTA_HW_PATH}/src/*.cc) + elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules + file(GLOB IFOCL_SRC ${VTA_HW_PATH}/src/intelfocl/*.cc) + file(GLOB AOCLUTIL_SRC 3rdparty/aoclutils/*.cc) + list(APPEND FPGA_RUNTIME_SRCS ${IFOCL_SRC} ${AOCLUTIL_SRC}) + list(APPEND FPGA_RUNTIME_SRCS ${VTA_HW_PATH}/src/vmem/virtual_memory.cc ${VTA_HW_PATH}/src/vmem/virtual_memory.h) endif() # Target lib: vta add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) @@ -117,6 +125,11 @@ elseif(PYTHON) target_include_directories(vta PUBLIC 3rdparty) target_include_directories(vta PUBLIC "/usr/local/intelFPGA_lite/18.1/embedded/ds-5/sw/gcc/arm-linux-gnueabihf/include") + elseif(${VTA_TARGET} STREQUAL "intelfocl") # Intel OpenCL for FPGA rules + target_include_directories(vta PUBLIC 3rdparty) + target_include_directories(vta PUBLIC "/opt/intelFPGA_pro/19.3.0.222/hld/host/include") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") + target_link_libraries(vta -L/opt/intelFPGA_pro/19.3.0.222/hld/host/linux64/lib -lOpenCL) endif() endif() diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 9cef674d3db1..7b71b17f9886 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -32,6 +32,9 @@ import tempfile import numpy as np +import json +import sys +from importlib import import_module import tvm._ffi import tvm.ir.transform @@ -192,6 +195,16 @@ def __init__(self, timeout=10, n_parallel=None, number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1, check_correctness=False, enable_cpu_cache_flush=False): + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") + if static_tune: + if n_parallel is None or n_parallel > 1: + print("static tune only allows n_parallel == 1") + n_parallel = 1 + + if check_correctness == True: + print("static tune does not support check_correctness") + check_correctness = False + super(RPCRunner, self).__init__(timeout, n_parallel) self.key = key @@ -383,7 +396,15 @@ def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_opti measure_input.target.device_name == 'vta': # pylint: disable=import-outside-toplevel import vta - func = vta.build(s, args, target_host=task.target_host) + + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") + if static_tune: + debug_flag = 1 << 6 + else: + debug_flag = 0 + + with vta.build_config(debug_flag=debug_flag): + func = vta.build(s, args, target_host=task.target_host) else: with tvm.ir.transform.PassContext(config=opts): func = build(s, args, target_host=task.target_host) @@ -483,16 +504,10 @@ def run_through_rpc(measure_input, build_result, tic = time.time() errno = MeasureErrorNo.NO_ERROR + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") try: # upload built module remote = request_remote(*remote_args) - # Program the FPGA every single time when targeting VTA - if hasattr(measure_input.target, 'device_name') and \ - measure_input.target.device_name == 'vta': - # pylint: disable=import-outside-toplevel - from vta import program_fpga, reconfig_runtime - program_fpga(remote, None) - reconfig_runtime(remote) remote.upload(build_result.filename) func = remote.load_module(os.path.split(build_result.filename)[1]) ctx = remote.context(str(measure_input.target), 0) @@ -517,12 +532,31 @@ def run_through_rpc(measure_input, build_result, args = [nd.array(x, ctx=ctx) for x in args] ctx.sync() - costs = time_f(*args).results + if static_tune is None: + time_f = func.time_evaluator( + func.entry_name, ctx, number=number, repeat=repeat, min_repeat_ms=min_repeat_ms) + costs = time_f(*args).results - # clean up remote files - remote.remove(build_result.filename) - remote.remove(os.path.splitext(build_result.filename)[0] + '.so') - remote.remove('') + # clean up remote files + remote.remove(build_result.filename) + remote.remove(os.path.splitext(build_result.filename)[0] + '.so') + remote.remove('') + else: + func(*args) + cost = 0 + insn_dump = os.getenv('TVM_INSN_DUMP_FILE', "insn.json") + insn_cost_file = os.getenv('TVM_INSN_COST_FILE', "cost.py") + path, filename = os.path.split(insn_cost_file) + sys.path.append(path) + module_path = filename[:-3] # remove the .py suffix + module = import_module(module_path) + cal_cost = getattr(module, "cal_cost") + with open(insn_dump) as json_file: + insns = json.load(json_file) + for insn in insns: + cost += cal_cost(insn) + + costs = [cost] * repeat if len(costs) > 2: # remove largest and smallest value to reduce variance costs = list(costs) @@ -543,6 +577,10 @@ def run_through_rpc(measure_input, build_result, msg = msg[:msg.index("CUDA Source")] costs = (RuntimeError(msg[:1024]),) errno = MeasureErrorNo.RUNTIME_DEVICE + except Exception as exc: + costs = (exc,) + errno = MeasureErrorNo.UNKNOWN_ERROR + tstamp = time.time() time.sleep(cooldown_interval) return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp) @@ -570,6 +608,10 @@ def request_remote(device_key, host=None, port=None, priority=1, timeout=60): ------ session: RPCSession """ + static_tune = os.getenv("TVM_STATIC_TUNE_EXPERIMENTAL") + if static_tune: + return _rpc.LocalSession() + # connect to the tracker host = host or os.environ['TVM_TRACKER_HOST'] port = port or int(os.environ['TVM_TRACKER_PORT']) diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index fbf474fc4df7..53ed78a7570d 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -779,7 +779,7 @@ def _add_new_transform(self, space_class, name, axes, policy, **kwargs): return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))] def __len__(self): - if self._length is None: + if self._length is None or self._length <= 1: self._length = int(np.prod([len(x) for x in self.space_map.values()])) return self._length diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 59e77f7d0098..25d1156e2af8 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -215,7 +215,7 @@ def _decorate(topi_schedule): @_register_task_schedule(task_name) def wrapper(outs, *args, **kwargs): """wrapper function for topi schedule""" - workload = get_workload(outs) + workload = get_workload(outs, task_name) if workload is None: raise RuntimeError("Cannot find workload in attribute of this schedule") tgt = _target.Target.current() @@ -227,17 +227,21 @@ def wrapper(outs, *args, **kwargs): return _decorate -def get_workload(outs): +def get_workload(outs, task_name=None): """Retrieve the workload from outputs""" def traverse(tensors): """traverse all ops to find attached workload""" for t in tensors: op = t.op - if 'workload' in op.attrs: - return args_to_workload(op.attrs['workload']) wkl = traverse(op.input_tensors) if wkl: return wkl + + if 'workload' in op.attrs: + ret = args_to_workload(op.attrs['workload']) + if ret[0] == task_name: + return ret return None + outs = [outs] if isinstance(outs, tensor.Tensor) else outs return traverse(outs) diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index cfc1b2c38f85..6c53be582b40 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -137,7 +137,7 @@ def __del__(self): format_si_prefix(0, si_prefix) if logger.level < logging.DEBUG: # only print progress bar in non-debug mode - sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) ' + sys.stdout.write('\r%s Current/Best: %7.4f/%7.4f GFLOPS | Progress: (%d/%d) ' '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic)) sys.stdout.flush() @@ -153,7 +153,7 @@ def _callback(tuner, inputs, results): ctx.cur_flops = flops ctx.best_flops = tuner.best_flops - sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f %sFLOPS | Progress: (%d/%d) ' + sys.stdout.write('\r%s Current/Best: %7.4f/%7.4f %sFLOPS | Progress: (%d/%d) ' '| %.2f s' % (prefix, format_si_prefix(ctx.cur_flops, si_prefix), format_si_prefix(ctx.best_flops, si_prefix), si_prefix, diff --git a/python/tvm/contrib/util.py b/python/tvm/contrib/util.py index 8f6dfc7f28ec..474741fc1e35 100644 --- a/python/tvm/contrib/util.py +++ b/python/tvm/contrib/util.py @@ -19,6 +19,7 @@ import contextlib import datetime import os +import sys import tempfile import threading import shutil diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index feeec1fa89ec..a43d9d7913d5 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -87,6 +87,9 @@ register_broadcast_schedule("fast_exp") register_broadcast_schedule("fast_tanh") register_broadcast_schedule("fast_erf") +# a fake on_device schedule. +# this will not be used in actual computation as on_device will be removed during DeviceAnnotation pass +register_injective_schedule("on_device") # zeros diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 62c2948b51e2..eae4d5785211 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -33,6 +33,14 @@ def wrapper(attrs, outs, target): return topi_schedule(outs) return wrapper + +def wrap_topi_compute(topi_compute): + """Wrap TOPI compute which doesn't use attrs""" + def wrapper(attrs, inputs, out_type): + return [topi_compute(*inputs)] + return wrapper + + def get_conv2d_in_channels(data_shape, data_layout): """Get conv2d input channels""" data_shape = get_const_tuple(data_shape) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 952a86466300..f902a0abf80e 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -173,6 +173,28 @@ def conv2d_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) +@register_annotate_function("nn.conv2d_transpose") +def conv2d_transpose_rewrite(ref_call, new_args, ctx): + """Rewrite function for conv2d_transpose. Lhs of conv will be quantized to + input field, and rhs of conv will be quantized to weight field. + Output would be in activation field""" + if quantize_context().check_to_skip(ref_call): + return None + + lhs_expr, lhs_kind = _get_expr_kind(new_args[0]) + rhs_expr, rhs_kind = _get_expr_kind(new_args[1]) + + if lhs_kind is None or lhs_kind == QAnnotateKind.ACTIVATION: + lhs_expr = attach_simulated_quantize(lhs_expr, QAnnotateKind.INPUT) + + assert rhs_kind is None + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.WEIGHT) + + expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) + + return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) + + @register_annotate_function("nn.dense") def dense_rewrite(ref_call, new_args, ctx): """Rewrite function for dense. Lhs of dense will be quantized to input field, and rhs of @@ -281,6 +303,7 @@ def identity_rewrite(ref_call, new_args, ctx): return QAnnotateExpr(ret_expr, x_kind) +register_annotate_function("reshape", identity_rewrite) register_annotate_function("clip", identity_rewrite) register_annotate_function("nn.relu", identity_rewrite) register_annotate_function("strided_slice", identity_rewrite) diff --git a/python/tvm/relay/quantize/_partition.py b/python/tvm/relay/quantize/_partition.py index b72f51c4add5..c16672628cea 100644 --- a/python/tvm/relay/quantize/_partition.py +++ b/python/tvm/relay/quantize/_partition.py @@ -52,6 +52,19 @@ def conv2d_partition_function(ref_call, new_args, ctx): return QPartitionExpr(ret) +@register_partition_function("nn.conv2d_transpose") +def conv2d_partition_function(ref_call, new_args, ctx): + """Rewrite function for conv2d_transpose for partition""" + data_cond, data = partition_expr_check(new_args[0]) + kernel_cond, kernel = partition_expr_check(new_args[1]) + + assert not kernel_cond + if data_cond: + data = new_args[0].realize() + ret = _forward_op(ref_call, [data, kernel]) + return QPartitionExpr(ret) + + def identity_partition_function(ref_call, new_args, ctx): cond, expr = partition_expr_check(new_args[0]) if cond: @@ -81,7 +94,7 @@ def add_partition_generic(ref_call, new_args, ctx): # ... lhs = new_args[0].realize() rhs = new_args[1].realize() - return _forward_op(ref_call, [lhs, rhs]) + return QPartitionExpr(_forward_op(ref_call, [lhs, rhs])) if not lhs_cond and rhs_cond: # - introduced by residual connection in ResNet # ... @@ -128,6 +141,7 @@ def mul_partition_generic(ref_call, new_args, ctx): if lhs_cond: # introduced by bn: multiply(out, scale) + lhs = new_args[0].realize() return QPartitionExpr(_forward_op(ref_call, [lhs, rhs])) if not lhs_cond and not rhs_cond: @@ -153,3 +167,16 @@ def add_partition_function(ref_call, new_args, ctx): def multiply_partition_function(ref_call, new_args, ctx): """Rewrite function for ewise multiply for partition""" return mul_partition_generic(ref_call, new_args, ctx) + + +# add cast after the relu op to make it run on vta +@register_partition_function("nn.global_avg_pool2d") +def global_avg_pool2d_partition_function(ref_call, new_args, ctx): + cond, expr = partition_expr_check(new_args[0]) + if cond: + expr = new_args[0].realize() + return _forward_op(ref_call, [expr]) + else: + expr = QPartitionExpr(new_args[0]).realize() + return _forward_op(ref_call, [expr]) + return None diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 28ebf7f3032b..b7371a3c3068 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -209,10 +209,18 @@ def check_to_skip(self, ref_call): # check skip conv layers skipped_indices = [int(x) for x in current_qconfig().skip_conv_layers] if self._conv2d_counter in skipped_indices: - if ref_call.op.name == 'nn.conv2d': + if ref_call.op.name == 'nn.conv2d' or ref_call.op.name == 'nn.conv2d_transpose': self._conv2d_counter += 1 - return True - if ref_call.op.name == 'nn.conv2d': + return True + else: + # counter is 0 before visiting the first conv2d + # if the first conv2d is skipped, all ops before it will also be skipped + # otherwise, we don't skip until the counter become +1 + if self._conv2d_counter == 0: + return True + else: + return False + if ref_call.op.name == 'nn.conv2d' or ref_call.op.name == 'nn.conv2d_transpose': self._conv2d_counter += 1 return False diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc index f0634feac083..c9704e3fff4b 100644 --- a/src/arith/detect_linear_equation.cc +++ b/src/arith/detect_linear_equation.cc @@ -71,6 +71,16 @@ class LinearEqDetector : public ExprFunctora, op->a); + LinearEqEntry b = VisitExpr(op->b, op->b); + LinearEqEntry ret; + ret.base = FloorDivCombine(a.base, b.base); + ret.coeff = FloorDivCombine(a.coeff, b.coeff); + return ret; + } + LinearEqEntry VisitExpr_(const SubNode* op, const PrimExpr& e) final { if (fail_) return LinearEqEntry(); LinearEqEntry a = VisitExpr(op->a, op->a); @@ -138,6 +148,12 @@ class LinearEqDetector : public ExprFunctor DetectLinearEquation(const PrimExpr& e, const Array& vars) { diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 2aae8546248f..367849112543 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -230,7 +230,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> << "Two complicated op in a primitive function " << " master=" << master_op_ << " current=" << op; } - if (op_pattern >= master_op_pattern_) { + if (op_pattern > master_op_pattern_) { master_op_ = op; master_attrs_ = call_node->attrs; master_op_pattern_ = op_pattern; @@ -288,7 +288,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> tvm::Target target_; Op master_op_; Attrs master_attrs_; - int master_op_pattern_{0}; + int master_op_pattern_{-1}; OpImplementation master_implementation_; std::ostringstream readable_name_stream_; Array scalars_; diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 820e17f8a498..4a1bfd874b5c 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -309,6 +309,15 @@ class StorageAllocator : public StorageAllocaBaseVisitor { if (match_range_ == 0) { return this->Alloc(prototype, size); } + + // TODO(zhanghao): find a better way to do this + // We copy all the instructions of all layers in a single batch. + // To avoid overwrite shared storage, we do not re-use allocation + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); + if (sync_once) { + return this->Alloc(prototype, size); + } + auto begin = free_.lower_bound(size / match_range_); auto mid = free_.lower_bound(size); auto end = free_.upper_bound(size * match_range_); diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 6be9b0d4a3d5..4db3f930d3b5 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -54,7 +54,13 @@ RELAY_REGISTER_OP("on_device") .add_type_rel("Identity", IdentityRel) .set_attr("TOpPattern", kOpaque) .set_attr("TOpIsStateful", false) - .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); + .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("FTVMCompute", + [] (const Attrs& attrs, + const Array& inputs, + const Type& out_type) -> Array { + return {topi::identity(inputs[0])}; + }); Expr StopFusion(Expr data) { static const Op& op = Op::Get("annotation.stop_fusion"); diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index ace2c2473173..228413cdc8ff 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -234,6 +234,41 @@ Expr Conv2dRealize(const Call& ref_call, const Array& new_args, const Obje RELAY_REGISTER_OP("nn.conv2d").set_attr("FQRealizeRewrite", Conv2dRealize); +Expr Conv2dTransposeRealize(const Call& ref_call, + const Array& new_args, + const ObjectRef& ctx) { + const QConfig& cfg = QConfig::Current(); + CHECK_EQ(new_args.size(), 2); + if (!new_args[0]->IsInstance() || !new_args[1]->IsInstance()) { + return Expr(nullptr); + } + const auto* lhs = new_args[0].as(); + CHECK(lhs); + const auto* rhs = new_args[1].as(); + CHECK(rhs); + + Expr ldata = lhs->data; + if (lhs->dtype != cfg->dtype_input) { + ldata = Cast(ldata, cfg->dtype_input); + } + Expr rdata = Cast(rhs->data, cfg->dtype_weight); + + const auto ref_attrs = ref_call->attrs.as(); + auto attrs = make_object(); + *attrs = *ref_attrs; + DataType out_dtype = cfg->dtype_activation; + attrs->out_dtype = out_dtype; + + Expr ret = Call(ref_call->op, + {ldata, rdata}, Attrs(attrs), ref_call->type_args); + Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale); + Expr dom_scale = FoldConstantOpt(mul); + return QRealizeIntExpr(ret, dom_scale, out_dtype); +} + +RELAY_REGISTER_OP("nn.conv2d_transpose") +.set_attr("FQRealizeRewrite", Conv2dTransposeRealize); + Expr DenseRealize(const Call& ref_call, const Array& new_args, const ObjectRef& ctx) { const QConfig& cfg = QConfig::Current(); CHECK_EQ(new_args.size(), 2); @@ -309,7 +344,8 @@ float ChooseDomScale(const std::vector& nptrs) { /* \brief Unify the dom scale of arguments */ Array UnifyDTypeScale(const Array& ref_args, const Array& args, - DataType* dtype_ptr, Expr* scale_ptr) { + DataType* dtype_ptr, Expr* scale_ptr, + DataType dtype = DataType::Void()) { static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); @@ -324,13 +360,15 @@ Array UnifyDTypeScale(const Array& ref_args, const Array& args // unify the data type CHECK_EQ(ref_args.size(), args.size()); - DataType dtype; - if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { - dtype = cfg->dtype_input; - } else { - dtype = cfg->dtype_activation; + if (dtype.is_void()) { + if (ret.size() == 2 && nptrs[1]->dtype == cfg->dtype_input) { + dtype = cfg->dtype_input; + } else { + dtype = cfg->dtype_activation; + } } + for (size_t i = 0; i < ret.size(); ++i) { auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { @@ -361,7 +399,16 @@ Expr AddRealize(const Call& ref_call, const Array& new_args, const ObjectR if (new_args[0].as() && new_args[1].as()) { DataType dtype; Expr dom_scale; - Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale); + // execute the operation with activation data type. + const QConfig& cfg = QConfig::Current(); + Array ret_args = UnifyDTypeScale(ref_call->args, new_args, + &dtype, &dom_scale, cfg->dtype_activation); + for (size_t i = 0; i < ret_args.size(); ++i) { + // do not fuse float32 arg + if (new_args[i].as()->dtype == DataType::Float(32)) { + ret_args.Set(i, StopFusion(ret_args[i])); + } + } Expr ret = ForwardOp(ref_call, ret_args); return QRealizeIntExpr(ret, dom_scale, dtype); } @@ -430,6 +477,8 @@ Expr IdentityRealize(const Call& ref_call, const Array& new_args, const Ob RELAY_REGISTER_OP("nn.relu").set_attr("FQRealizeRewrite", IdentityRealize); +RELAY_REGISTER_OP("reshape").set_attr("FQRealizeRewrite", IdentityRealize); + RELAY_REGISTER_OP("strided_slice").set_attr("FQRealizeRewrite", IdentityRealize); RELAY_REGISTER_OP("nn.batch_flatten") diff --git a/src/relay/transforms/device_annotation.cc b/src/relay/transforms/device_annotation.cc index 39cf563f730a..fe3cfebf7fe3 100644 --- a/src/relay/transforms/device_annotation.cc +++ b/src/relay/transforms/device_annotation.cc @@ -386,22 +386,38 @@ class DeviceInfo { } void VisitExpr_(const ConstantNode* cn) final { - post_dfs_order_.push_back(std::make_pair(cn, has_copy_)); + device_tag_[cn] = dev_type_; } void VisitExpr_(const CallNode* call) final { // Skip annotation nodes. if (!IsOnDeviceNode(call)) { - if (GetDeviceCopyNode(call)) { + if (const auto* node = GetDeviceCopyNode(call)) { + CHECK(node->IsInstance()); + const auto* call_node = static_cast(node); + auto attrs = call_node->attrs.as(); + num_device_copy_ops_++; bool has_copy_prev = has_copy_; has_copy_ = true; - ExprVisitor::VisitExpr_(call); - post_dfs_order_.push_back(std::make_pair(call, has_copy_)); + dev_type_ = attrs->src_dev_type; + for (auto& arg : call->args) { + Visit(arg); + // restore the type for remaining arguments + dev_type_ = attrs->src_dev_type; + } + device_tag_[call] = attrs->dst_dev_type; + // update the out_dev_type_, which should be the dst_dev_type of last copy + out_dev_type_ = attrs->dst_dev_type; has_copy_ = has_copy_prev; } else { - ExprVisitor::VisitExpr_(call); - post_dfs_order_.push_back(std::make_pair(call, has_copy_)); + for (auto& arg : call->args) { + int cur_dev_type = dev_type_; + Visit(arg); + // restore the type for remaining arguments + dev_type_ = cur_dev_type; + } + device_tag_[call] = dev_type_; } } } @@ -414,22 +430,24 @@ class DeviceInfo { void VisitExpr_(const TupleGetItemNode* op) final { ExprVisitor::VisitExpr_(op); } void VisitExpr_(const VarNode* vn) final { - post_dfs_order_.push_back(std::make_pair(vn, has_copy_)); + device_tag_[vn] = dev_type_; } void VisitExpr_(const LetNode* ln) final { ExprVisitor::VisitExpr_(ln); - post_dfs_order_.push_back(std::make_pair(ln, has_copy_)); + device_tag_[ln] = dev_type_; } void VisitExpr_(const IfNode* in) final { ExprVisitor::VisitExpr_(in); - post_dfs_order_.push_back(std::make_pair(in, has_copy_)); + device_tag_[in] = dev_type_; } int num_device_copy_ops_{0}; bool has_copy_ = false; - std::vector> post_dfs_order_; + int dev_type_ = -1; + int out_dev_type_ = -1; + std::unordered_map device_tag_; friend DeviceInfo; }; @@ -455,39 +473,14 @@ class DeviceInfo { } void PropagateDeviceId() { - // Bottom-up propagation. - int out_dev_type = BottomUpPropagation(); - // propagation for remained nodes. - FillPropagation(out_dev_type); - } - - int BottomUpPropagation() { - const CallNode* last_copy_node = nullptr; - int cur_dev_type = -1; - int out_dev_type = -1; - for (auto it = post_visitor_.post_dfs_order_.crbegin(); - it != post_visitor_.post_dfs_order_.crend(); ++it) { - if (const auto* node = GetDeviceCopyNode(it->first)) { - CHECK(node->IsInstance()); - last_copy_node = static_cast(node); - const auto* attrs = last_copy_node->attrs.as(); - cur_dev_type = attrs->src_dev_type; - if (out_dev_type == -1) out_dev_type = attrs->dst_dev_type; - if (it->second) device_map_.Set(GetRef(it->first), attrs->dst_dev_type); - } else if (last_copy_node) { - Expr expr = GetRef(it->first); - CHECK_EQ(device_map_.count(expr), 0U); - if (it->second) device_map_.Set(expr, cur_dev_type); + int out_dev_type = post_visitor_.out_dev_type_; + for (auto& it : post_visitor_.device_tag_) { + if (it.second != -1) { + device_map_.Set(GetRef(it.first), it.second); + } else { + device_map_.Set(GetRef(it.first), out_dev_type); } } - return out_dev_type; - } - - void FillPropagation(int out_dev_type) { - for (const auto& it : post_visitor_.post_dfs_order_) { - Expr expr = GetRef(it.first); - if (!it.second) device_map_.Set(expr, out_dev_type); - } } PostDfsOrderVisitor post_visitor_; @@ -541,7 +534,9 @@ Expr RewriteAnnotatedOps(const Expr& expr, int fallback_device) { } } -Map CollectDeviceInfo(const Expr& expr) { return DeviceInfo::GetDeviceMap(expr); } +Map CollectDeviceInfo(const Expr& expr) { + return DeviceInfo::GetDeviceMap(expr); +} Map CollectDeviceAnnotationOps(const Expr& expr) { return AnnotatationVisitor::GetAnnotations(expr); diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index f07170489298..2991a0842b7a 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -87,16 +87,19 @@ class BuiltinLower : public StmtExprMutator { op = stmt.as(); // Get constant allocation bound. int64_t nbytes = GetVectorBytes(op->dtype); - if (device_type_.defined()) { - if (const auto* dev_type = device_type_.as()) { - if (dev_type->value == kDLCPU) { - int32_t constant_size = op->constant_allocation_size(); - if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { - return stmt; - } - } - } - } + // NOTE(zhanghao): remove special handling for kDLCPU + // otherwise, may cause LLVM parameters match error + // if in heterogenous targets + // if (device_type_.defined()) { + // if (arith::GetConst(device_type_, &dev_type)) { + // if (dev_type == kDLCPU) { + // int32_t constant_size = op->constant_allocation_size(); + // if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) { + // return stmt; + // } + // } + // } + // } PrimExpr total_bytes = make_const(op->extents[0].dtype(), nbytes); for (size_t i = 0; i < op->extents.size(); ++i) { total_bytes = total_bytes * op->extents[i]; diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index 08baaf7e29ad..1379a506d0b7 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -77,6 +77,8 @@ "tokens", # interface definition "idl", + # opencl file + "cl", } # List of file names allowed diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 3a18cf74bc1d..ebd44194f659 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -62,11 +62,13 @@ class DevContext(object): MEM_ID_INP = 2 MEM_ID_ACC = 3 MEM_ID_OUT = 4 + MEM_ID_ACC_8BIT = 5 # VTA ALU Opcodes ALU_OPCODE_MIN = 0 ALU_OPCODE_MAX = 1 ALU_OPCODE_ADD = 2 ALU_OPCODE_SHR = 3 + ALU_OPCODE_MUL = 4 # Task queue id (pipeline stage) QID_LOAD_INP = 1 QID_LOAD_WGT = 1 @@ -240,7 +242,7 @@ def target_host(self): return "llvm -mtriple=armv7-none-linux-gnueabihf" if self.TARGET == "ultra96": return "llvm -mtriple=aarch64-linux-gnu" - if self.TARGET in ["sim", "tsim"]: + if self.TARGET in ["sim", "tsim", "intelfocl"]: return "llvm" raise ValueError("Unknown target %s" % self.TARGET) @@ -294,7 +296,7 @@ def coproc_sync(op): return tvm.tir.call_extern( "int32", "VTASynchronize", get_env().dev.command_handle, - tvm.runtime.const(1<<31, dtype="uint32")) + tvm.runtime.const(1<<31, dtype="uint32"), True) diff --git a/vta/python/vta/program_bitstream.py b/vta/python/vta/program_bitstream.py index 62cb5f21d02a..9a48ba75378e 100644 --- a/vta/python/vta/program_bitstream.py +++ b/vta/python/vta/program_bitstream.py @@ -54,7 +54,13 @@ def de10nano_bitstream_program(bitstream_path): program = get_global_func("vta.de10nano.program") program(bitstream_path) -def bitstream_program(target, bitstream): +def intelfocl_bitstream_program(bitstream_path, mem_size=4*1024*1024*1024): + # pylint: disable=import-outside-toplevel + from tvm import get_global_func + program = get_global_func("vta.intelfocl.program") + program(bitstream_path, mem_size) + +def bitstream_program(target, bitstream, *args): if target in ['pynq', 'ultra96']: pynq_bitstream_program(bitstream) elif target in ['de10nano']: @@ -62,6 +68,8 @@ def bitstream_program(target, bitstream): elif target in ['sim', 'tsim']: # In simulation, bit stream programming is a no-op return + elif target in ['intelfocl']: + intelfocl_bitstream_program(bitstream, *args) else: raise RuntimeError("Unknown target {}".format(target)) diff --git a/vta/python/vta/rpc_client.py b/vta/python/vta/rpc_client.py index 097ea8e4a5cc..c76a8c77cb67 100644 --- a/vta/python/vta/rpc_client.py +++ b/vta/python/vta/rpc_client.py @@ -19,6 +19,8 @@ from .environment import get_env from .bitstream import download_bitstream, get_bitstream_path +from tvm import rpc +from vta import program_bitstream def reconfig_runtime(remote): """Reconfigure remote runtime based on current hardware spec. @@ -44,16 +46,20 @@ def program_fpga(remote, bitstream=None): bitstream : str, optional Path to a local bistream file. If unset, tries to download from cache server. """ + env = get_env() + if bitstream: assert os.path.isfile(bitstream) else: bitstream = get_bitstream_path() if not os.path.isfile(bitstream): - env = get_env() if env.TARGET == 'de10nano': return download_bitstream() - fprogram = remote.get_function("tvm.contrib.vta.init") - remote.upload(bitstream) - fprogram(os.path.basename(bitstream)) + if isinstance(remote, rpc.LocalSession): + program_bitstream.bitstream_program(env.TARGET, bitstream) + else: + fprogram = remote.get_function("tvm.contrib.vta.init") + remote.upload(bitstream) + fprogram(os.path.basename(bitstream)) diff --git a/vta/python/vta/testing/simulator.py b/vta/python/vta/testing/simulator.py index 16827c4ab079..5ac8c80fed8d 100644 --- a/vta/python/vta/testing/simulator.py +++ b/vta/python/vta/testing/simulator.py @@ -25,7 +25,7 @@ def _load_sw(): """Load hardware library for simulator.""" env = get_env() - lib_driver_name = "libvta_tsim" if env.TARGET == "tsim" else "libvta_fsim" + lib_driver_name = "libvta_tsim" if env.TARGET == "" else "libvta" if env.TARGET == "intelfocl" else "libvta_fsim" # Load driver library lib_driver = find_libvta(lib_driver_name, optional=True) diff --git a/vta/python/vta/testing/util.py b/vta/python/vta/testing/util.py index afbf00ddac8c..83da2e157164 100644 --- a/vta/python/vta/testing/util.py +++ b/vta/python/vta/testing/util.py @@ -32,7 +32,7 @@ def run(run_func): """ env = get_env() - if env.TARGET in ["sim", "tsim"]: + if env.TARGET in ["sim", "tsim", "intelfocl"]: # Talk to local RPC if necessary to debug RPC server. # Compile vta on your host with make at the root. # Make sure TARGET is set to "sim" in the config.json file. diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py index 633ef3f60c9b..255724afc809 100644 --- a/vta/python/vta/top/graphpack.py +++ b/vta/python/vta/top/graphpack.py @@ -93,7 +93,7 @@ def _weight_shape_match_transpose(data, dshape, channels, cfactor_out): if pad_width != 0: pad_width = cfactor_out - pad_width data = op.nn.pad(data, [[0, 0], [0, pad_width], [0, 0], [0, 0]]) - dshape = tuple(dshape[0], [dshape[1] + pad_width, dshape[2], dshape[3]]) + dshape = tuple([dshape[0]] + [dshape[1] + pad_width, dshape[2], dshape[3]]) if channels_pad != 0: channels = channels + (cfactor_out - channels_pad) @@ -174,6 +174,104 @@ def _operator_idx_inc(expr, count_meta, operator_current_idx): operator_current_idx = operator_current_idx + 1 return operator_current_idx + +class ExprDeviceAnnot(ExprMutator): + """Visitor to perform graph annotation on an AST. + + Parameters + ---------- + start: int + the start location to mark run on vta (inclusive) + end: int + the end location to mark run on vta (exclusive) + + Returns + --------- + None + """ + def __init__(self, start=-1, end=-1): + self.ext_ctx = tvm.context("ext_dev") + self.cpu_ctx = tvm.context("cpu") + self.cast = op.op.get("cast") + self.counter = -1 + self.start = start + self.end = end + super().__init__() + + def visit_call(self, call): + """ Visit the children. """ + # First visit the children. + oshape = _get_tensor_shape(call) + odtype = _get_tensor_type(call) + input_types = [arg.checked_type for arg in call.args] + args = [self.visit(arg) for arg in call.args] + + self.counter += 1 + if self.counter == self.start: + ret = relay.Call(call.op, args, call.attrs) + ret = relay.annotation.on_device(ret, self.ext_ctx) + return ret + elif self.counter == self.end: + ret = relay.Call(call.op, args, call.attrs) + ret = relay.annotation.on_device(ret, self.cpu_ctx) + return ret + elif self.counter > self.start and self.counter < self.end: + ret = relay.Call(call.op, args, call.attrs) + + # skip the float op, i.e., float->int cast + if self.is_float_op(call): + return ret + + return relay.annotation.on_device(ret, self.ext_ctx) + + return relay.Call(self.visit(call.op), args, call.attrs) + + def is_float_op(self, call): + """check if this op belongs to a float op + in general, float op's odtype is float; + a special case is float->int cast, which follow this op sequence: + multiply(float) -> round(float) -> clip(float) -> cast(int); + """ + args = call.args + odtype = _get_tensor_type(call) + op = call.op + + if odtype == "float32": + return True + elif op == self.cast: + idtype = _get_tensor_type(args[0]) + if idtype == "float32": + return True + + return False + + +class ExprLocater(ExprMutator): + """Visitor to locate op on an AST. + """ + def __init__(self): + self.counter = -1 + self.op2nodes = {} + super().__init__() + + def visit_call(self, call): + """ Visit the children. """ + # First visit the children. + args = [self.visit(arg) for arg in call.args] + + odtype = _get_tensor_type(call) + self.counter += 1 + if (call.op, odtype) in self.op2nodes: + self.op2nodes[(call.op, odtype)].append(self.counter) + else: + self.op2nodes[(call.op, odtype)] = [self.counter] + + return relay.Call( + self.visit(call.op), + args, + call.attrs) + + class ExprPack(ExprMutator): """Visitor to perform graph packing on an AST. """ @@ -317,7 +415,7 @@ def visit_call(self, call): elif self.start_pack and call.op == op.op.get('cast') and \ input_types[0].dtype == 'int32': cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) - return relay.Call(op.op.get('copy'), [cast]) + return cast elif call.op == self.pad: pad_width = call.attrs.pad_width if len(pad_width) == 6: @@ -412,7 +510,10 @@ def graph_pack(expr, stop_name="nn.global_avg_pool2d", start_name_idx=None, stop_name_idx=None, - count_meta=False): + count_meta=False, + device_annot=False, + annot_start_name="nn.conv2d", + annot_end_name="annotation.stop_fusion"): """Pack the graph into batch&channel packed format. Parameters @@ -449,13 +550,23 @@ def graph_pack(expr, 'expr.astext(show_meta_data=False)'. When count_meta is True, the operator increase logic would count the meta. + device_annot: boolean, optional + if we want to annoate the device_type + + annot_start_name: str, optional + device annotation start node, from which we mark the nodes as `ext_dev` + + annot_end_name: str, optional + device annotation end node, after which we mark the nodes as 'cpu' + Returns ------- expr : Expr The transformed expression. """ assert isinstance(expr, relay.Function) - assert ((start_name != stop_name) or (start_name_idx < stop_name_idx)) + assert ((start_name != stop_name) or (start_name_idx is None != stop_name_idx is None) or \ + (not (start_name_idx is None and stop_name_idx is None)) or (start_name_idx < stop_name_idx)) expr = get_subgraph(expr, start_name, stop_name, start_name_idx, stop_name_idx, count_meta) expr = run_opt_pass(expr, transform.InferType()) packer = ExprPack( @@ -463,4 +574,23 @@ def graph_pack(expr, weight_bits) expr = packer.visit(expr) assert not packer.start_pack - return run_opt_pass(expr, transform.InferType()) + expr = run_opt_pass(expr, transform.InferType()) + + if device_annot: + expr_locator = ExprLocater() + expr_locator.visit(expr) + + annot_start = op.op.get(annot_start_name) + start = expr_locator.op2nodes[(annot_start, "int32")][0] + + annot_end = op.op.get(annot_end_name) + # we mark the next op to the last stop_fusion on cpu device + end = expr_locator.op2nodes[(annot_end, "int8")][-1] + 1 + + device_annot = ExprDeviceAnnot(start=start, end=end) + expr = device_annot.visit(expr) + ret = run_opt_pass(expr, transform.InferType()) + + return ret + else: + return expr diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 2198ed4c191f..dca42de95ffc 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -20,6 +20,7 @@ import tvm from tvm import te +from tvm import autotvm import topi from tvm.relay.op import op as reg @@ -62,6 +63,141 @@ def clip_strategy_vta(attrs, inputs, out_type, target): reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta") + +@autotvm.register_topi_compute("add.vta") +def add_packed(cfg, lhs, rhs): + return topi.add(lhs, rhs) + + +@autotvm.register_topi_compute("multiply.vta") +def multiply_packed(cfg, lhs, rhs): + return topi.multiply(lhs, rhs) + + +def schedule_alu_packed(cfg, outs): + assert len(outs) == 1 + + def is_cast_op(op): + return op.name == 'T_cast' + + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + output = outs[0] + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) + + env = get_env() + # other target does not support alu-only ops + if not (env.TARGET in ["sim", "tsim", "intelfocl"]): + return s + + # only put the int-related ops to vta + if "int" in output.dtype and len(output.shape) == 6: + ewise_inputs = [] + ewise_ops = [] + const_ops = [] + + def _traverse(op): + if topi.tag.is_broadcast(op.tag): + if not op.same_as(output.op): + if not op.axis: + const_ops.append(op) + elif not is_cast_op(op): + ewise_ops.append(op) + + for tensor in op.input_tensors: + if isinstance(tensor.op, tvm.te.PlaceholderOp): + ewise_inputs.append((op, tensor)) + elif is_cast_op(tensor.op) and not op.same_as(output.op): + ewise_inputs.append((op, tensor)) + else: + _traverse(tensor.op) + else: + for tensor in op.input_tensors: + if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) \ + and (not is_cast_op(tensor.op)): + _traverse(tensor.op) + + op = output.op + _traverse(op) + for _, t in ewise_inputs: + if t.dtype == 'float32': + return s + + x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis + + cfg.define_split('tile_co', x_co, num_outputs=2) + cfg.define_split('tile_h', x_i, num_outputs=2) + cfg.define_split('tile_w', x_j, num_outputs=2) + + x_co_max = topi.util.get_const_int(x_bo.dom.extent) + x_i_max = topi.util.get_const_int(x_i.dom.extent) + x_j_max = topi.util.get_const_int(x_j.dom.extent) + + x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) + x_i0, x_i1 = cfg['tile_h'].apply(s, output, x_i) + x_j0, x_j1 = cfg['tile_w'].apply(s, output, x_j) + s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci) + store_pt = x_j0 + + for eo in ewise_ops: + s[eo].set_scope(env.acc_scope) + s[eo].pragma(s[eo].op.axis[0], env.alu) + s[eo].compute_at(s[output], store_pt) + + # cache read input + cache_read_ewise = [] + for consumer, tensor in ewise_inputs: + cache_read_ewise.append( + s.cache_read(tensor, env.acc_scope, [consumer])) + + for tensor in cache_read_ewise: + if s[tensor].op.axis: + s[tensor].pragma(s[tensor].op.axis[0], env.dma_copy) + s[tensor].compute_at(s[output], store_pt) + + for op in const_ops: + s[op].compute_inline() + + s[output].pragma(x_co1, env.dma_copy) + + return s + + +@autotvm.register_topi_schedule("add.vta") +def schedule_add_packed(cfg, outs): + return schedule_alu_packed(cfg, outs) + + +@autotvm.register_topi_schedule("multiply.vta") +def schedule_multiply_packed(cfg, outs): + return schedule_alu_packed(cfg, outs) + + +def add_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_topi_compute(add_packed), + _strategy.wrap_topi_schedule(schedule_add_packed), + name="add.vta") + return strategy + + +def multiply_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_topi_compute(multiply_packed), + _strategy.wrap_topi_schedule(schedule_multiply_packed), + name="multiply.vta") + return strategy + + +env = get_env() +# other target does not support alu-only ops +if env.TARGET in ["sim", "tsim", "intelfocl"]: + reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta") + reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta") + + @_strategy.conv2d_strategy.register("vta") def conv2d_strategy_vta(attrs, inputs, out_type, target): """conv2d vta strategy""" diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py index ddfebc2cc8c1..2153531c72d0 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -81,6 +81,7 @@ def schedule_conv2d_transpose_packed(cfg, outs): """Schedule packed conv2d_transpose""" assert len(outs) == 1 output = outs[0] + const_ops = [] ewise_inputs = [] ewise_ops = [] conv2d_res = [] @@ -90,7 +91,10 @@ def schedule_conv2d_transpose_packed(cfg, outs): def _traverse(op): if topi.tag.is_broadcast(op.tag): if not op.same_as(output.op): - ewise_ops.append(op) + if not op.axis: + const_ops.append(op) + else: + ewise_ops.append(op) for tensor in op.input_tensors: if isinstance(tensor.op, tvm.te.PlaceholderOp): ewise_inputs.append((op, tensor)) @@ -146,6 +150,9 @@ def _traverse(op): s[op].set_scope(env.acc_scope) s[op].pragma(s[op].op.axis[0], env.alu) + for op in const_ops: + s[op].compute_inline() + # tile x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis x_co0, x_co1 = cfg['tile_co'].apply(s, output, x_co) diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index d9f47f1f71ec..c6ba1b95a8cb 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -382,9 +382,6 @@ def _fold_buffer_dim(buf, scope, elem_block): def _get_2d_pattern(buf, elem_width, elem_bytes, dtype, scope, allow_fold): elem_block = elem_bytes * 8 // elem_width - if buf.dtype != dtype: - raise RuntimeError("Expect buffer type to be %s instead of %s" % - (dtype, buf.dtype)) shape, strides = buf.shape, buf.strides if not util.equal_const_int(idxm(buf.elem_offset, elem_block), 0): raise RuntimeError("scope %s need to have block=%d" % (scope, elem_block)) @@ -553,6 +550,11 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): src, elem_width, elem_bytes, data_type, dst.scope, allow_fold=allow_fold) + if data_type != src.dtype: + assert(data_type == "int%d" % env.ACC_WIDTH and \ + src.dtype == "int%d" % env.INP_WIDTH) + mem_type = env.dev.MEM_ID_ACC_8BIT + irb = tvm.tir.ir_builder.create() irb.scope_attr(env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(task_qid)) diff --git a/vta/runtime/runtime.cc b/vta/runtime/runtime.cc index 49fe9c557336..df20a8e87ed7 100644 --- a/vta/runtime/runtime.cc +++ b/vta/runtime/runtime.cc @@ -36,6 +36,13 @@ #include #include #include +#include +#include +#include + +#include +#include +#include namespace vta { @@ -47,10 +54,105 @@ static const bool kBufferCoherent = VTA_COHERENT_ACCESSES; /*! \brief Always cache buffers (otherwise, write back to DRAM from CPU) */ static const bool kAlwaysCache = true; +template +class AlignmentAllocator { +public: + typedef T value_type; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + typedef T * pointer; + typedef const T * const_pointer; + + typedef T & reference; + typedef const T & const_reference; + + public: + inline AlignmentAllocator () throw () { } + + template + inline AlignmentAllocator (const AlignmentAllocator &) throw () { } + + inline ~AlignmentAllocator () throw () { } + + inline pointer adress (reference r) { + return &r; + } + + inline const_pointer adress (const_reference r) const { + return &r; + } + + inline pointer allocate (size_type n) { + return (pointer)memalign(N, n*sizeof(value_type)); + } + + inline void deallocate (pointer p, size_type) { + free(p); + } + + inline void construct (pointer p, const value_type & wert) { + new (p) value_type (wert); + } + + inline void destroy (pointer p) { + p->~value_type (); + } + + inline size_type max_size () const throw () { + return size_type (-1) / sizeof (value_type); + } + + template + struct rebind { + typedef AlignmentAllocator other; + }; + + bool operator!=(const AlignmentAllocator& other) const { + return !(*this == other); + } + + // Returns true if and only if storage allocated from *this + // can be deallocated from other, and vice versa. + // Always returns true for stateless allocators. + bool operator==(const AlignmentAllocator& other) const { + return true; + } +}; + +class DeviceAllocStat { + public: + void AddAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + allocated_.insert(ptr); + } + + bool CheckAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + return allocated_.count(ptr); + } + + void DelAlloc(const void* ptr) { + std::lock_guard lock(mtx_); + allocated_.erase(ptr); + } + + private: + std::set allocated_; + std::mutex mtx_; +}; + +// here we use a global variable to memorize the allocation stats +static std::shared_ptr alloc_stat(new DeviceAllocStat()); + /*! * \brief Data buffer represents data on CMA. */ struct DataBuffer { + DataBuffer() { + alloc_stat_ = alloc_stat; + } + /*! \return Virtual address of the data. */ void* virt_addr() const { return data_; } /*! \return Physical address of the data. */ @@ -101,6 +203,8 @@ struct DataBuffer { DataBuffer* buffer = new DataBuffer(); buffer->data_ = data; buffer->phy_addr_ = VTAMemGetPhyAddr(data); + + alloc_stat->AddAlloc(buffer); return buffer; } /*! @@ -108,6 +212,7 @@ struct DataBuffer { * \param buffer The buffer to be freed. */ static void Free(DataBuffer* buffer) { + alloc_stat->DelAlloc(buffer); VTAMemFree(buffer->data_); delete buffer; } @@ -117,7 +222,12 @@ struct DataBuffer { * \return The corresponding data buffer header. */ static DataBuffer* FromHandle(const void* buffer) { - return const_cast(reinterpret_cast(buffer)); + if (alloc_stat->CheckAlloc(buffer)) { + return const_cast( + reinterpret_cast(buffer)); + } else { + return nullptr; + } } private: @@ -125,6 +235,10 @@ struct DataBuffer { void* data_; /*! \brief The physical address of the buffer, excluding header. */ vta_phy_addr_t phy_addr_; + + // a copy of global shared_ptr instance + // to avoid the global instance is destructed before there are still some pending DataBuffers not destructed + std::shared_ptr alloc_stat_; }; /*! @@ -329,7 +443,7 @@ class BaseQueue { // End location of current SRAM write in FIFO mode uint32_t sram_end_{0}; // The buffer in DRAM - std::vector dram_buffer_; + std::vector> dram_buffer_; // FPGA accessible buffer void* fpga_buff_{NULL}; // Physical address of the FPGA buffer @@ -413,7 +527,6 @@ class UopQueue : public BaseQueue { kernel->sram_begin_ = 0; kernel->sram_end_ = 0; } - cache_.clear(); cache_idx_ = 0; BaseQueue::Reset(); @@ -429,14 +542,24 @@ class UopQueue : public BaseQueue { buff_size += cache_[i]->size() * kElemBytes; } CHECK(buff_size <= kMaxBytes); - // Move kernel contents to FPGA readable buffer + + // merge all the cache entries and do CopyFromHost once + uint32_t total_size = 0; + for (uint32_t i = 0; i < cache_.size(); ++i) { + uint32_t ksize = cache_[i]->size() * kElemBytes; + total_size += ksize; + } + + char *lbuf = (char*)memalign(64, total_size); uint32_t offset = 0; for (uint32_t i = 0; i < cache_.size(); ++i) { uint32_t ksize = cache_[i]->size() * kElemBytes; - VTAMemCopyFromHost(static_cast(fpga_buff_) + offset, cache_[i]->data(), ksize); - // Update offset + memcpy(lbuf + offset, cache_[i]->data(), ksize); offset += ksize; } + VTAMemCopyFromHost(static_cast(fpga_buff_), lbuf, total_size); + free(lbuf); + // Flush if we're using a shared memory system // and if interface is non-coherent if (!coherent_ && always_cache_) { @@ -609,34 +732,107 @@ class InsnQueue : public BaseQueue { CommitPendingPop(kComputeStage); } // Helper function: Get Opcode string - const char* getOpcodeString(int opcode, bool use_imm) { + std::string getOpcodeString(int opcode, bool use_imm, int64_t imm) { // The string name if (opcode == VTA_ALU_OPCODE_MIN) { - if (use_imm) { - return "min imm"; - } else { - return "min"; - } + if (use_imm) { + return std::string("min imm ") + std::to_string(imm); + } else { + return "min"; + } } else if (opcode == VTA_ALU_OPCODE_MAX) { - if (use_imm) { - return "max imm"; - } else { - return "max"; - } + if (use_imm) { + return (std::string("max imm ") + std::to_string(imm)); + } else { + return "max"; + } } else if (opcode == VTA_ALU_OPCODE_ADD) { - if (use_imm) { - return "add imm"; - } else { - return "add"; - } + if (use_imm) { + return (std::string("add imm ") + std::to_string(imm)); + } else { + return "add"; + } } else if (opcode == VTA_ALU_OPCODE_SHR) { - return "shr"; + return (std::string("shr ") + std::to_string(imm)); + } else if (opcode == VTA_ALU_OPCODE_MUL) { + return "mul"; } return "unknown op"; } + + std::string GetOpName(const union VTAInsn& c) { + switch (c.mem.opcode) { + case VTA_OPCODE_LOAD: + if (c.mem.x_size == 0) { + if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { + return "NOP-COMPUTE-STAGE"; + } else { + return "NOP-MEMORY-STAGE"; + } + } else { + if (c.mem.memory_type == VTA_MEM_ID_UOP) { + return "LOAD UOP"; + } else if (c.mem.memory_type == VTA_MEM_ID_WGT) { + return "LOAD WGT"; + } else if (c.mem.memory_type == VTA_MEM_ID_INP) { + return "LOAD INP"; + } else if (c.mem.memory_type == VTA_MEM_ID_ACC) { + return "LOAD ACC"; + } else if (c.mem.memory_type == VTA_MEM_ID_ACC_8BIT) { + return "LOAD ACC 8BIT"; + } else { + return "LOAD"; + } + } + case VTA_OPCODE_STORE: + if (c.mem.x_size == 0) { + return "NOP-STORE-STAGE"; + } else { + return "STORE"; + } + case VTA_OPCODE_GEMM: + return "GEMM"; + case VTA_OPCODE_ALU: + return "ALU - " + getOpcodeString(c.alu.alu_opcode, c.alu.use_imm, c.alu.imm); + case VTA_OPCODE_FINISH: + return "FINISH"; + default: + return "Not recogonized"; + } + } + + std::string GetOpcodeName(const union VTAInsn& c) { + switch (c.mem.opcode) { + case VTA_OPCODE_LOAD: + if (c.mem.x_size == 0) { + return "NOP"; + } else { + return "LOAD"; + } + case VTA_OPCODE_STORE: + if (c.mem.x_size == 0) { + return "NOP"; + } else { + return "STORE"; + } + case VTA_OPCODE_GEMM: + return "GEMM"; + case VTA_OPCODE_ALU: + if (c.alu.use_imm) { + return "ALU IMM"; + } else { + return "ALU"; + } + case VTA_OPCODE_FINISH: + return "NOP"; + default: + return "Unknown"; + } + } + // Dump instructions in the queue - void DumpInsn() { + void DumpInsn(FILE* out = stderr, bool json = false) { // Keep tabs on dependence queues int l2g_queue = 0; int g2l_queue = 0; @@ -647,97 +843,136 @@ class InsnQueue : public BaseQueue { // Iterate over all instructions int insn_count = count(); const VTAGenericInsn* insn = data(); - printf("There are %u instructions\n", insn_count); + picojson::array jarr; + + if (!json) { + fprintf(out, "There are %u instructions\n", insn_count); + } + for (int i = 0; i < insn_count; ++i) { // Fetch instruction and decode opcode c.generic = insn[i]; - printf("INSTRUCTION %u: ", i); + picojson::object kv; + if (json) { + kv["name"] = picojson::value(GetOpName(c).c_str()); + kv["type"] = picojson::value(GetOpcodeName(c).c_str()); + kv["pop_prev"] = picojson::value(static_cast(c.mem.pop_prev_dep)); + kv["pop_next"] = picojson::value(static_cast(c.mem.pop_next_dep)); + kv["push_prev"] = picojson::value(static_cast(c.mem.push_prev_dep)); + kv["push_next"] = picojson::value(static_cast(c.mem.push_next_dep)); + } else { + fprintf(out, "INSTRUCTION %u: ", i); + fprintf(out, "%s\n", GetOpName(c).c_str()); + + fprintf(out, "\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", + static_cast(c.mem.pop_prev_dep), + static_cast(c.mem.pop_next_dep), + static_cast(c.mem.push_prev_dep), + static_cast(c.mem.push_next_dep)); + } + if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) { - if (c.mem.x_size == 0) { - if (c.mem.opcode == VTA_OPCODE_STORE) { - printf("NOP-STORE-STAGE\n"); - } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) { - printf("NOP-COMPUTE-STAGE\n"); - } else { - printf("NOP-MEMORY-STAGE\n"); - } - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - // Count status in queues - if (c.mem.opcode == VTA_OPCODE_STORE) { - CHECK(c.mem.pop_next_dep == false); - CHECK(c.mem.push_next_dep == false); - if (c.mem.pop_prev_dep) g2s_queue--; - if (c.mem.push_prev_dep) s2g_queue++; - } else if (c.mem.opcode == VTA_OPCODE_LOAD && - (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) { - CHECK(c.mem.pop_prev_dep == false); - CHECK(c.mem.push_prev_dep == false); - if (c.mem.pop_next_dep) g2l_queue--; - if (c.mem.push_next_dep) l2g_queue++; - } else { - if (c.mem.pop_prev_dep) l2g_queue--; - if (c.mem.push_prev_dep) g2l_queue++; - if (c.mem.pop_next_dep) s2g_queue--; - if (c.mem.push_next_dep) g2s_queue++; - } - printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); - printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); - continue; - } - // Print instruction field information - if (c.mem.opcode == VTA_OPCODE_LOAD) { - printf("LOAD "); - if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n"); - if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n"); - if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n"); - if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n"); - } - if (c.mem.opcode == VTA_OPCODE_STORE) { - printf("STORE:\n"); + if (json) { + kv["dram"] = picojson::value(static_cast(c.mem.dram_base)); + kv["sram"] = picojson::value(static_cast(c.mem.sram_base)); + + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.mem.y_size))); + arr.push_back(picojson::value(static_cast(c.mem.y_pad_0))); + arr.push_back(picojson::value(static_cast(c.mem.y_pad_1))); + kv["y"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.mem.x_size))); + arr.push_back(picojson::value(static_cast(c.mem.x_pad_0))); + arr.push_back(picojson::value(static_cast(c.mem.x_pad_1))); + arr.push_back(picojson::value(static_cast(c.mem.x_stride))); + kv["x"] = picojson::value(arr); + } else { + fprintf(out, "\tDRAM: 0x%08x, SRAM:0x%04x\n", + static_cast(c.mem.dram_base), + static_cast(c.mem.sram_base)); + fprintf(out, "\ty: size=%d, pad=[%d, %d]\n", + static_cast(c.mem.y_size), + static_cast(c.mem.y_pad_0), + static_cast(c.mem.y_pad_1)); + fprintf(out, "\tx: size=%d, stride=%d, pad=[%d, %d]\n", + static_cast(c.mem.x_size), + static_cast(c.mem.x_stride), + static_cast(c.mem.x_pad_0), + static_cast(c.mem.x_pad_1)); } - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", static_cast(c.mem.dram_base), - static_cast(c.mem.sram_base)); - printf("\ty: size=%d, pad=[%d, %d]\n", static_cast(c.mem.y_size), - static_cast(c.mem.y_pad_0), static_cast(c.mem.y_pad_1)); - printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", static_cast(c.mem.x_size), - static_cast(c.mem.x_stride), static_cast(c.mem.x_pad_0), - static_cast(c.mem.x_pad_1)); } else if (c.mem.opcode == VTA_OPCODE_GEMM) { - // Print instruction field information - printf("GEMM\n"); - - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\treset_out: %d\n", static_cast(c.gemm.reset_reg)); - printf("\trange (%d, %d)\n", static_cast(c.gemm.uop_bgn), - static_cast(c.gemm.uop_end)); - printf("\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", - static_cast(c.gemm.iter_out), static_cast(c.gemm.wgt_factor_out), - static_cast(c.gemm.src_factor_out), static_cast(c.gemm.dst_factor_out)); - printf("\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", - static_cast(c.gemm.iter_in), static_cast(c.gemm.wgt_factor_in), - static_cast(c.gemm.src_factor_in), static_cast(c.gemm.dst_factor_in)); + if (json) { + kv["reset_out"] = picojson::value(static_cast(c.gemm.reset_reg)); + + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.gemm.uop_bgn))); + arr.push_back(picojson::value(static_cast(c.gemm.uop_end))); + kv["range"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.gemm.iter_out))); + arr.push_back(picojson::value(static_cast(c.gemm.wgt_factor_out))); + arr.push_back(picojson::value(static_cast(c.gemm.src_factor_out))); + arr.push_back(picojson::value(static_cast(c.gemm.dst_factor_out))); + kv["outer_loop"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.gemm.iter_in))); + arr.push_back(picojson::value(static_cast(c.gemm.wgt_factor_in))); + arr.push_back(picojson::value(static_cast(c.gemm.src_factor_in))); + arr.push_back(picojson::value(static_cast(c.gemm.dst_factor_in))); + kv["inner_loop"] = picojson::value(arr); + } else { + fprintf(out, "\treset_out: %d\n", static_cast(c.gemm.reset_reg)); + fprintf(out, "\trange (%d, %d)\n", + static_cast(c.gemm.uop_bgn), + static_cast(c.gemm.uop_end)); + fprintf(out, "\touter loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", + static_cast(c.gemm.iter_out), + static_cast(c.gemm.wgt_factor_out), + static_cast(c.gemm.src_factor_out), + static_cast(c.gemm.dst_factor_out)); + fprintf(out, "\tinner loop - iter: %d, wgt: %d, inp: %d, acc: %d\n", + static_cast(c.gemm.iter_in), + static_cast(c.gemm.wgt_factor_in), + static_cast(c.gemm.src_factor_in), + static_cast(c.gemm.dst_factor_in)); + } } else if (c.mem.opcode == VTA_OPCODE_ALU) { - // Print instruction field information - printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm)); - printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n", - static_cast(c.mem.pop_prev_dep), static_cast(c.mem.pop_next_dep), - static_cast(c.mem.push_prev_dep), static_cast(c.mem.push_next_dep)); - printf("\treset_out: %d\n", static_cast(c.alu.reset_reg)); - printf("\trange (%d, %d)\n", static_cast(c.alu.uop_bgn), - static_cast(c.alu.uop_end)); - printf("\touter loop - iter: %d, dst: %d, src: %d\n", static_cast(c.alu.iter_out), - static_cast(c.alu.dst_factor_out), static_cast(c.alu.src_factor_out)); - printf("\tinner loop - iter: %d, dst: %d, src: %d\n", static_cast(c.alu.iter_in), - static_cast(c.alu.dst_factor_in), static_cast(c.alu.src_factor_in)); - } else if (c.mem.opcode == VTA_OPCODE_FINISH) { - printf("FINISH\n"); + if (json) { + kv["reset_out"] = picojson::value(static_cast(c.alu.reset_reg)); + picojson::array arr; + arr.push_back(picojson::value(static_cast(c.alu.uop_bgn))); + arr.push_back(picojson::value(static_cast(c.alu.uop_end))); + kv["range"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.alu.iter_out))); + arr.push_back(picojson::value(static_cast(c.alu.dst_factor_out))); + arr.push_back(picojson::value(static_cast(c.alu.src_factor_out))); + kv["outer_loop"] = picojson::value(arr); + + arr.clear(); + arr.push_back(picojson::value(static_cast(c.alu.iter_in))); + arr.push_back(picojson::value(static_cast(c.alu.dst_factor_in))); + arr.push_back(picojson::value(static_cast(c.alu.src_factor_in))); + kv["inner_loop"] = picojson::value(arr); + } else { + fprintf(out, "\treset_out: %d\n", static_cast(c.alu.reset_reg)); + fprintf(out, "\trange (%d, %d)\n", + static_cast(c.alu.uop_bgn), + static_cast(c.alu.uop_end)); + fprintf(out, "\touter loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_out), + static_cast(c.alu.dst_factor_out), + static_cast(c.alu.src_factor_out)); + fprintf(out, "\tinner loop - iter: %d, dst: %d, src: %d\n", + static_cast(c.alu.iter_in), + static_cast(c.alu.dst_factor_in), + static_cast(c.alu.src_factor_in)); + } } // Count status in queues @@ -766,8 +1001,22 @@ class InsnQueue : public BaseQueue { if (c.gemm.pop_next_dep) s2g_queue--; if (c.gemm.push_next_dep) g2s_queue++; } - printf("\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); - printf("\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); + if (json) { + kv["l2g_queue"] = picojson::value(static_cast(l2g_queue)); + kv["g2l_queue"] = picojson::value(static_cast(g2l_queue)); + kv["s2g_queue"] = picojson::value(static_cast(s2g_queue)); + kv["g2s_queue"] = picojson::value(static_cast(g2s_queue)); + + jarr.push_back(picojson::value(kv)); + } else { + fprintf(out, "\tl2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue); + fprintf(out, "\ts2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue); + } + } + + if (json) { + auto str = picojson::value(jarr).serialize(); + fwrite(str.c_str(), 1, str.size(), out); } } // Commit all pending pop of corresponding stage @@ -799,6 +1048,7 @@ class InsnQueue : public BaseQueue { CHECK(fpga_buff_ != nullptr); CHECK(fpga_buff_phy_); uint32_t buff_size = dram_buffer_.size() * elem_bytes_; + CHECK(buff_size <= kMaxBytes); // Copy contents of DRAM buffer to FPGA buff VTAMemCopyFromHost(fpga_buff_, dram_buffer_.data(), buff_size); @@ -830,7 +1080,7 @@ class InsnQueue : public BaseQueue { } // Get stage of the memory static PipelineStage GetMemPipelineStage(int memory_type) { - if (memory_type == VTA_MEM_ID_ACC) return kComputeStage; + if (memory_type == VTA_MEM_ID_ACC || memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage; if (memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -840,7 +1090,7 @@ class InsnQueue : public BaseQueue { if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage; if (insn->opcode == VTA_OPCODE_LOAD) { if (insn->x_size == 0) return kNoneStage; - if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage; + if (insn->memory_type == VTA_MEM_ID_ACC || insn->memory_type == VTA_MEM_ID_ACC_8BIT) return kComputeStage; if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage; return kLoadStage; } @@ -888,6 +1138,8 @@ class InsnQueue : public BaseQueue { int pending_pop_next_[4]; static constexpr int kElemBytes = sizeof(VTAGenericInsn); static constexpr int kMaxElems = kMaxBytes / kElemBytes; + + friend class CommandQueue; }; /*! @@ -923,6 +1175,9 @@ class CommandQueue { case VTA_MEM_ID_OUT: elem_bytes = VTA_OUT_ELEM_BYTES; break; + case VTA_MEM_ID_ACC_8BIT: + elem_bytes = VTA_ACC_ELEM_BYTES / 4; + break; default: LOG(FATAL) << "Memory id not recognized:" << memory_id; break; @@ -992,7 +1247,35 @@ class CommandQueue { } } - void Synchronize(uint32_t wait_cycles) { + void Synchronize(uint32_t wait_cycles, bool skip=true) { + if (debug_flag_ & VTA_DEBUG_LOG_INSN) { + const char* insn_file = std::getenv("TVM_INSN_DUMP_FILE"); + if (insn_file == nullptr) { + insn_file = "insn.json"; + } + FILE* out = fopen(insn_file, "w+"); + if (out) { + insn_queue_.DumpInsn(out, true); + fclose(out); + } else { + LOG(ERROR) << insn_file << " open failed"; + } + return; + } + + // FIXME(zhanghao): It is required to use force_serial + // by using skip and sync at the final layer. + // By doing this, we can avoid do DeviceCopy every time. + // TODO: Consider to make it as a flag when mature + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); + if (sync_once && skip) { + if (!(debug_flag_ & VTA_DEBUG_FORCE_SERIAL)) { + LOG(ERROR) << + "Synchronizing all in one round requires to use force_serial to make things right"; + } + return; + } + // Insert dependences to force serialization if (debug_flag_ & VTA_DEBUG_FORCE_SERIAL) { insn_queue_.RewriteForceSerial(); @@ -1022,7 +1305,7 @@ class CommandQueue { VTA_OPCODE_FINISH); // Make sure that we don't exceed contiguous physical memory limits - CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) < VTA_MAX_XFER); + CHECK(insn_queue_.count() * sizeof(VTAGenericInsn) <= VTA_MAX_XFER); int timeout = VTADeviceRun(device_, insn_queue_.dram_phy_addr(), insn_queue_.count(), wait_cycles); CHECK_EQ(timeout, 0); @@ -1170,13 +1453,13 @@ class CommandQueue { void CheckInsnOverFlow() { // At each API call, we can at most commit: - // one pending store, one pending load, and one uop - if ((insn_queue_.count() + 4) * sizeof(VTAGenericInsn) >= VTA_MAX_XFER) { + // at most: 2 NOP-COMPUTE-STAGE -> 2 NOP-MEMORY-STAGE -> 1 NOP-COMPUTE-STAGE -> 1 FINISH + if ((insn_queue_.count() + 6) * sizeof(VTAGenericInsn) > VTA_MAX_XFER) { this->AutoSync(); } } // Auto sync when instruction overflow - void AutoSync() { this->Synchronize(1 << 31); } + void AutoSync() { this->Synchronize(1 << 31, false); } // Internal debug flag int debug_flag_{0}; @@ -1212,6 +1495,9 @@ void VTABufferCopy(const void* from, size_t from_offset, void* to, size_t to_off if (from_buffer) { // This is an FPGA to host mem transfer + // NOTE: Issue synchronize manually as we delay the copy until we do it synchronously and explicitly + const char* sync_once = std::getenv("VTA_SYNC_ONCE_EXPERIMENTAL"); + if (sync_once) VTASynchronize(VTATLSCommandHandle(), 1<<31, false); from_buffer->InvalidateCache(from_offset, size); from_buffer->MemCopyToHost(static_cast(to) + to_offset, static_cast(from) + from_offset, size); @@ -1232,7 +1518,12 @@ void VTASetDebugMode(VTACommandHandle cmd, int debug_flag) { } void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer) { - return vta::DataBuffer::FromHandle(buffer)->virt_addr(); + auto data_buf = vta::DataBuffer::FromHandle(buffer); + if (data_buf) { + return data_buf->virt_addr(); + } else { // it is a raw ptr allocated by CPU + return buffer; + } } void VTAWriteBarrier(VTACommandHandle cmd, void* buffer, uint32_t elem_bits, uint32_t start, @@ -1295,6 +1586,5 @@ int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid) { return 0; } -void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles) { - static_cast(cmd)->Synchronize(wait_cycles); -} +void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip) { + static_cast(cmd)->Synchronize(wait_cycles, skip); } diff --git a/vta/runtime/runtime.h b/vta/runtime/runtime.h index 24ebb8e1247b..a61906e98ff6 100644 --- a/vta/runtime/runtime.h +++ b/vta/runtime/runtime.h @@ -41,6 +41,7 @@ extern "C" { #define VTA_DEBUG_SKIP_READ_BARRIER (1 << 3) #define VTA_DEBUG_SKIP_WRITE_BARRIER (1 << 4) #define VTA_DEBUG_FORCE_SERIAL (1 << 5) +#define VTA_DEBUG_LOG_INSN (1 << 6) /*! * \brief Allocate data buffer. @@ -251,7 +252,7 @@ TVM_DLL int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid); * \param wait_cycles The limit of poll cycles. * */ -TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles); +TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles, bool skip=true); #ifdef __cplusplus } diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index b3c36e85d56b..1d940c2ac9be 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -240,7 +240,7 @@ def test_conv2d(device): def _run(env, remote): if device == "vta": target = env.target - if env.TARGET not in ["sim", "tsim"]: + if env.TARGET not in ["sim", "tsim", "intelfocl"]: assert tvm.runtime.enabled("rpc") program_fpga(remote, bitstream=None) reconfig_runtime(remote) diff --git a/vta/tutorials/autotvm/tune_alu_vta.py b/vta/tutorials/autotvm/tune_alu_vta.py new file mode 100644 index 000000000000..cf4922450ce5 --- /dev/null +++ b/vta/tutorials/autotvm/tune_alu_vta.py @@ -0,0 +1,318 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-tuning a ALU fused op on VTA +""" + +import os +from mxnet.gluon.model_zoo import vision +import numpy as np +from PIL import Image + +import topi +import tvm +from tvm import te +from tvm import rpc, autotvm, relay +from tvm.contrib import graph_runtime, util, download +from tvm.autotvm.measure.measure_methods import request_remote +from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner +from tvm.autotvm import record + +import vta +from vta.testing import simulator +from vta.top import graph_pack +import copy + + +################################################################# +# Compile network +# --------------- +# Perform vta-specific compilation with Relay from a Gluon model +def compile_network(env, target, model, start_pack, stop_pack, device_annot=False): + + # Populate the shape and data type dictionary + dtype_dict = {"data": 'float32'} + shape_dict = {"data": (env.BATCH, 3, 224, 224)} + + # Get off the shelf gluon model, and convert to relay + gluon_model = vision.get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) + + # Update shape and type dictionary + shape_dict.update({k: v.shape for k, v in params.items()}) + dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) + + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): + mod = relay.quantize.quantize(mod, params=params) + + # Perform graph packing and constant folding for VTA target + if target.device_name == "vta": + assert env.BLOCK_IN == env.BLOCK_OUT + relay_prog = graph_pack(mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=start_pack, + stop_name=stop_pack, + device_annot=device_annot) + + return relay_prog, params + + +########################################### +# Set Tuning Options +# ------------------ +# Before tuning, we should apply some configurations. +# Here we use an Pynq-Z1 board as an example. + +# Tracker host and port can be set by your environment +tracker_host = os.environ.get("TVM_TRACKER_HOST", '0.0.0.0') +tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190)) + +# Load VTA parameters from the vta/config/vta_config.json file +env = vta.get_env() + +# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device. +# Set ``device=arm_cpu`` to run inference on the CPU +# or ``device=vta`` to run inference on the FPGA. +device = "vta" +target = env.target if device == "vta" else env.target_vta_cpu + +# Name of Gluon model to compile +# The ``start_pack`` and ``stop_pack`` labels indicate where +# to start and end the graph packing relay pass: in other words +# where to start and finish offloading to VTA. +network = "resnet50_v2" +start_pack = "nn.max_pool2d" +stop_pack = "nn.global_avg_pool2d" + +# Tuning option +log_file = "%s.alu.%s.log" % (device, network) +tuning_option = { + 'log_filename': log_file, + + 'tuner': 'random', + 'n_trial': 1000, + 'early_stopping': None, + + 'measure_option': autotvm.measure_option( + builder=autotvm.LocalBuilder(n_parallel=1), + runner=autotvm.RPCRunner(env.TARGET, + host=tracker_host, + port=tracker_port, + number=5, + timeout=60, + check_correctness=True), + ), +} + + +def log_to_file(file_out, protocol='json'): + """Log the tuning records into file. + The rows of the log are stored in the format of autotvm.record.encode. + for lhs == rhs, we add an extra rhs = [] record + + Parameters + ---------- + file_out : str + The file to log to. + protocol: str, optional + The log protocol. Can be 'json' or 'pickle' + + Returns + ------- + callback : callable + Callback function to do the logging. + """ + def _callback(_, inputs, results): + with open(file_out, "a") as f: + for inp, result in zip(inputs, results): + f.write(record.encode(inp, result, protocol) + "\n") + + # we only consider task with same lhs and rhs + if inp.task.args[0] == inp.task.args[1]: + args = list(inp.task.args) + args[1] = (args[0][0], (), args[0][2]) + inp_copy = copy.deepcopy(inp) + inp_copy.task.args = tuple(args) + f.write(record.encode(inp_copy, result, protocol) + "\n") + + return _callback + + +def tune_tasks(tasks, + measure_option, + tuner='xgb', + n_trial=10, + early_stopping=None, + log_filename='tuning.log', + use_transfer_learning=True): + + # create tmp log file + tmp_log_file = log_filename + ".tmp" + if os.path.exists(tmp_log_file): + os.remove(tmp_log_file) + + for i, tsk in enumerate(reversed(tasks)): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + + # create tuner + if tuner == 'xgb' or tuner == 'xgb-rank': + tuner_obj = XGBTuner(tsk, loss_type='rank') + elif tuner == 'xgb_knob': + tuner_obj = XGBTuner(tsk, loss_type='rank', feature_type='knob') + elif tuner == 'ga': + tuner_obj = GATuner(tsk, pop_size=50) + elif tuner == 'random': + tuner_obj = RandomTuner(tsk) + elif tuner == 'gridsearch': + tuner_obj = GridSearchTuner(tsk) + else: + raise ValueError("Invalid tuner: " + tuner) + + if use_transfer_learning: + if os.path.isfile(tmp_log_file): + tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) + + # do tuning + tsk_trial = min(n_trial, len(tsk.config_space)) + tuner_obj.tune(n_trial=tsk_trial, + early_stopping=early_stopping, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(tsk_trial, prefix=prefix), + log_to_file(tmp_log_file) + ]) + + # pick best records to a cache file + autotvm.record.pick_best(tmp_log_file, log_filename) + os.remove(tmp_log_file) + + +######################################################################## +# Register VTA-specific tuning tasks +def register_vta_tuning_tasks(): + from tvm.autotvm.task import TaskExtractEnv + + @tvm.te.tag_scope(tag=topi.tag.ELEMWISE) + def my_clip(x, a_min, a_max): + """Unlike topi's current clip, put min and max into two stages.""" + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") + return x + + # init autotvm env to register VTA operator + TaskExtractEnv() + + @autotvm.template("add.vta") + def _topi_add(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + A, B = args[:2] + + with tvm.target.vta(): + res = vta.top.op.add_packed(*args, **kwargs) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.Target.current().device_name == 'vta': + s = vta.top.op.schedule_add_packed([res]) + else: + s = te.create_schedule([res.op]) + return s, [A, B, res] + + @autotvm.template("multiply.vta") + def _topi_multiply(*args, **kwargs): + assert not kwargs, "Do not support kwargs in template function call" + A, B = args[:2] + + with tvm.target.vta(): + res = vta.top.op.multiply_packed(*args, **kwargs) + res = my_clip(res, 0, 127) + res = topi.cast(res, "int8") + + if tvm.target.Target.current().device_name == 'vta': + s = vta.top.op.schedule_multiply_packed([res]) + else: + s = te.create_schedule([res.op]) + return s, [A, B, res] + + +######################################################################## +# Finally, we launch tuning jobs and evaluate the end-to-end performance. +def tune_and_evaluate(tuning_opt): + + if env.TARGET != "sim": + # Get remote from fleet node + remote = autotvm.measure.request_remote(env.TARGET, + tracker_host, + tracker_port, + timeout=10000) + # Reconfigure the JIT runtime and FPGA. + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + else: + # In simulation mode, host the RPC server locally. + remote = rpc.LocalSession() + + # Register VTA tuning tasks + register_vta_tuning_tasks() + + # Perform task extraction on Relay program + print("Extract tasks...") + relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) + mod = tvm.IRModule.from_expr(relay_prog) + tasks = autotvm.task.extract_from_program(mod, + params=params, + ops=(relay.op.get("add"), relay.op.get("multiply"),), + target=target, + target_host=env.target_host) + + # filter out non-packed alu task + tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) + # filter out float alu task + tasks = list(filter(lambda t: t.args[0][2] != "float32", tasks)) + + # We should have extracted 10 convolution tasks + tasks_set = {} + print("Extracted {} alu tasks:".format(len(tasks))) + for tsk in tasks: + print("tsk = ", tsk) + + if len(tsk.args[1][1]) == 0: + args = list(tsk.args) + args[1] = args[0] + tsk.args = tuple(args) + + if (tsk.name, tsk.args) in tasks_set: + print("task {} already exists".format(tsk)) + tasks_set[(tsk.name, tsk.args)] = tsk + + tasks = list(tasks_set.values()) + print("After merged, final #tasks={}, tasks = {}".format(len(tasks), tasks)) + + # run tuning tasks + print("Tuning...") + tune_tasks(tasks, **tuning_opt) + + +# Run the tuning and evaluate the results +tune_and_evaluate(tuning_option) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index a92b1ee5d90b..9ae54cba0992 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -208,7 +208,7 @@ def compile_network(env, target, model, start_pack, stop_pack): 'early_stopping': None, 'measure_option': autotvm.measure_option( - builder=autotvm.LocalBuilder(), + builder=autotvm.LocalBuilder(n_parallel=1), runner=autotvm.RPCRunner(env.TARGET, host=tracker_host, port=tracker_port, diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index 3a367851ed25..a9676a0096e8 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ b/vta/tutorials/frontend/deploy_classification.py @@ -99,7 +99,7 @@ # When target is 'pynq', reconfigure FPGA and runtime. # Otherwise, if target is 'sim', execute locally. -if env.TARGET not in ["sim", "tsim"]: +if env.TARGET not in ["sim", "tsim", "intelfocl"]: # Get remote from tracker node if environment variable is set. # To set up the tracker, you'll need to follow the "Auto-tuning @@ -129,6 +129,10 @@ else: remote = rpc.LocalSession() + if env.TARGET in ["intelfocl"]: + # program intelfocl aocx + vta.program_fpga(remote, bitstream="vta_opencl.aocx") + # Get execution context from remote ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) @@ -183,7 +187,7 @@ env.BLOCK_OUT, env.WGT_WIDTH, start_name=pack_dict[model][0], - stop_name=pack_dict[model][1]) + stop_name=pack_dict[model][1], device_annot=(env.TARGET == "intelfocl" or env.TARGET == "sim")) else: relay_prog = mod["main"] @@ -194,6 +198,12 @@ relay_prog, target=target, params=params, target_host=env.target_host) else: + if env.TARGET == "intelfocl" or env.TARGET == "sim": + # multiple targets to run both on cpu and vta + target = { + "cpu": env.target_vta_cpu, + "ext_dev": target + } with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build( relay_prog, target=target, @@ -209,8 +219,13 @@ remote.upload(temp.relpath("graphlib.o")) lib = remote.load_module("graphlib.o") - # Graph runtime - m = graph_runtime.create(graph, lib, ctx) + + if env.TARGET == "intelfocl" or env.TARGET == "sim": + ctxes = [remote.ext_dev(0), remote.cpu(0)] + m = graph_runtime.create(graph, lib, ctxes) + else: + # Graph runtime + m = graph_runtime.create(graph, lib, ctx) ###################################################################### # Perform image classification inference diff --git a/vta/tutorials/vta_get_started.py b/vta/tutorials/vta_get_started.py index ab416874b71b..8ac7307f5a05 100644 --- a/vta/tutorials/vta_get_started.py +++ b/vta/tutorials/vta_get_started.py @@ -91,7 +91,7 @@ vta.program_fpga(remote, bitstream=None) # In simulation mode, host the RPC server locally. -elif env.TARGET == "sim": +elif env.TARGET in ("sim", "tsim", "intelfocl"): remote = rpc.LocalSession() ######################################################################