diff --git a/Makefile b/Makefile
index e7d8fd796383..6bd63e295daf 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ include $(config)
 # specify tensor path
 .PHONY: clean all test doc
 
-all: lib/libtvm.a lib/libtvm.so
+all: lib/libtvm.so lib/libtvm_runtime.so lib/libtvm.a
 
 LIB_HALIDE_IR = HalideIR/lib/libHalideIR.a
 
@@ -19,6 +19,11 @@ SRC = $(wildcard src/*.cc src/*/*.cc src/*/*/*.cc)
 ALL_OBJ = $(patsubst src/%.cc, build/%.o, $(SRC))
 ALL_DEP = $(ALL_OBJ) $(LIB_HALIDE_IR)
 
+RUNTIME_SRC = $(wildcard src/runtime/*.cc src/runtime/*/*.cc)
+RUNTIME_DEP = $(patsubst src/%.cc, build/%.o, $(RUNTIME_SRC))
+
+ALL_DEP = $(ALL_OBJ) $(LIB_HALIDE_IR)
+
 export LDFLAGS = -pthread -lm
 export CFLAGS = -std=c++11 -Wall -O2 -fno-rtti\
 	-Iinclude -Idmlc-core/include -IHalideIR/src -fPIC -DDMLC_ENABLE_RTTI=0
@@ -77,15 +82,18 @@ build/%.o: src/%.cc
 	$(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d
 	$(CXX) -c $(CFLAGS) -c $< -o $@
 
-lib/libtvm.a: $(ALL_DEP)
+lib/libtvm.so: $(ALL_DEP)
 	@mkdir -p $(@D)
-	ar crv $@ $(filter %.o, $?)
+	$(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
 
-lib/libtvm.so: $(ALL_DEP)
+lib/libtvm_runtime.so: $(RUNTIME_DEP)
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) $(FRAMEWORKS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
 
+lib/libtvm.a: $(ALL_DEP)
+	@mkdir -p $(@D)
+	ar crv $@ $(filter %.o, $?)
+
 $(LIB_HALIDE_IR): LIBHALIDEIR
 
 LIBHALIDEIR:
diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h
index c4c2a42ebac0..9c2b5194ebe9 100644
--- a/include/tvm/api_registry.h
+++ b/include/tvm/api_registry.h
@@ -1,76 +1,20 @@
 /*!
- *  Copyright (c) 2016 by Contributors
+ *  Copyright (c) 2017 by Contributors
  * \file api_registry.h
- * \brief This file defines the TVM API registry.
- *
- *  The API registry stores type-erased functions.
- *  Each registered function is automatically exposed
- *  to front-end language(e.g. python).
- *  Front-end can also pass callbacks as PackedFunc, or register
- *  then into the same global registry in C++.
- *  The goal is to mix the front-end language and the TVM back-end.
- *
- * \code
- *   // register the function as MyAPIFuncName
- *   TVM_REGISTER_API(MyAPIFuncName)
- *   .set_body([](TVMArgs args, TVMRetValue* rv) {
- *     // my code.
- *   });
- * \endcode
+ * \brief This file includes the necessary headers needed to
+ *  register a global API function.
 */
 #ifndef TVM_API_REGISTRY_H_
 #define TVM_API_REGISTRY_H_
 
-#include
-#include
 #include "./base.h"
-#include "./runtime/packed_func.h"
 #include "./packed_func_ext.h"
-
-namespace tvm {
-
-/*! \brief Utility to register API. */
-class APIRegistry {
- public:
-  /*!
-   * \brief set the body of the function to be f
-   * \param f The body of the function.
-   */
-  APIRegistry& set_body(PackedFunc f);  // NOLINT(*)
-  /*!
-   * \brief set the body of the function to be f
-   * \param f The body of the function.
-   */
-  APIRegistry& set_body(PackedFunc::FType f) {  // NOLINT(*)
-    return set_body(PackedFunc(f));
-  }
-  /*!
-   * \brief Register a function with given name
-   * \param name The name of the function.
-   */
-  static APIRegistry& __REGISTER__(const std::string& name);  // NOLINT(*)
-
- private:
-  /*! \brief name of the function */
-  std::string name_;
-};
+#include "./runtime/registry.h"
 
 /*!
- * \brief Get API function by name.
+ * \brief Register an API function globally.
+ *  It simply redirects to TVM_REGISTER_GLOBAL.
  *
- * \param name The name of the function.
- * \return the corresponding API function.
- * \note It is really PackedFunc::GetGlobal under the hood.
- */
-inline PackedFunc GetAPIFunc(const std::string& name) {
-  return PackedFunc::GetGlobal(name);
-}
-
-#define _TVM_REGISTER_VAR_DEF_                                   \
-  static DMLC_ATTRIBUTE_UNUSED ::tvm::APIRegistry& __make_TVMRegistry_
-
-/*!
- * \brief Register API function globally.
  * \code
  *   TVM_REGISTER_API(MyPrint)
  *   .set_body([](TVMArgs args, TVMRetValue* rv) {
@@ -78,8 +22,6 @@ inline PackedFunc GetAPIFunc(const std::string& name) {
  *   });
  * \endcode
 */
-#define TVM_REGISTER_API(OpName)                                 \
-  DMLC_STR_CONCAT(_TVM_REGISTER_VAR_DEF_, __COUNTER__) =         \
-  ::tvm::APIRegistry::__REGISTER__(#OpName)
-}  // namespace tvm
+#define TVM_REGISTER_API(OpName) TVM_REGISTER_GLOBAL(OpName)
+
 #endif  // TVM_API_REGISTRY_H_
diff --git a/include/tvm/codegen.h b/include/tvm/codegen.h
index 6865586a501e..e85d571afe8b 100644
--- a/include/tvm/codegen.h
+++ b/include/tvm/codegen.h
@@ -10,9 +10,9 @@
 #include "./base.h"
 #include "./expr.h"
 #include "./lowered_func.h"
+#include "./api_registry.h"
 #include "./runtime/packed_func.h"
 
-
 namespace tvm {
 /*! \brief namespace for lowlevel IR pass and codegen */
 namespace codegen {
@@ -22,41 +22,21 @@ using runtime::TVMArgs;
 using runtime::TVMRetValue;
 
 /*!
- * \brief Build a stack VM function.
- * \param func The LoweredFunc to be build
- * \param device_funcs The additional device functions
- * \return A packed function representing the func.
- */
-PackedFunc BuildStackVM(
-    LoweredFunc func,
-    const std::unordered_map<LoweredFunc, PackedFunc>& device_funcs);
-
-/*!
- * \brief Build a LLVM VM function, this is still beta
- * \param func The LoweredFunc to be build
- * \return A packed function representing the func.
- */
-PackedFunc BuildLLVM(LoweredFunc func);
-
-/*!
- * \brief Build a CUDA function with NVRTC
+ * \brief Build a module from an array of lowered functions.
+ * \param funcs The functions to be built.
+ * \param target The target to be built.
+ * \return The built module.
  *
- * \param fsplits The LoweredFuncs to be build (after SplitHostDevice)
- *  The first element is the host function, followed by device functions.
- * \param host_mode The host side compilation mode:
- *   - "stackvm": use stack vm to interpret host side code.
+ * \note Calls global API function "_codegen_build_" + target
 */
-PackedFunc BuildNVRTC(Array<LoweredFunc> fsplits, std::string host_mode);
+runtime::Module Build(const Array<LoweredFunc>& funcs,
+                      const std::string& target);
 
 /*!
- * \brief Build a OpenCL function.
- *
- * \param fsplits The LoweredFuncs to be build (after SplitHostDevice)
- *  The first element is the host function, followed by device functions.
- * \param host_mode The host side compilation mode:
- *   - "stackvm": use stack vm to interpret host side code.
+ * \brief Check whether a target is enabled.
+ * \param target The target to be queried.
+ * \return Whether target is enabled.
 */
-PackedFunc BuildOpenCL(Array<LoweredFunc> fsplits, std::string host_mode);
+bool TargetEnabled(const std::string& target);
 
 }  // namespace codegen
 }  // namespace tvm
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index 29b70e654832..f5ca7ad7b81e 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -120,23 +120,14 @@ constexpr const char* tvm_handle_is_null = "tvm_handle_is_null";
 /*!
  * \brief See pesudo code
  *
- *  int tvm_call_global(name, TVMValue* args) {
- *     PackedFunc f = PackedFunc::GetGlobal(name);
- *     f (args, type_code_of(args), len(args));
+ *  int tvm_call_packed(name, TVMValue* args) {
+ *     ModuleNode* env = GetCurrentEnv();
+ *     const PackedFunc* f = env->GetFuncFromEnv(name);
+ *     (*f)(args, type_code_of(args), len(args));
  *     return 0;
  *  }
 */
-constexpr const char* tvm_call_global = "tvm_call_global";
-/*!
- * \brief See pesudo code
- *
- *  int tvm_call_device(name, TVMValue* args) {
- *     PackedFunc df = CodeGenEnv->GetDevice(name);
- *     f (args, type_code_of(args), len(args));
- *     return 0;
- *  }
- */
-constexpr const char* tvm_call_device = "tvm_call_device";
+constexpr const char* tvm_call_packed = "tvm_call_packed";
 /*!
  * \brief See pesudo code
  *
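For context, the new entry point replaces the per-backend Build* functions with a single dispatching call. A minimal sketch of the intended usage, assuming `funcs` already went through SplitHostDevice (schedule and lowering steps are elided; the function name here is hypothetical):

    // Sketch only: `funcs` is assumed to be an Array<LoweredFunc> ready for codegen.
    #include <tvm/codegen.h>

    tvm::runtime::Module CompileDeviceCode(const tvm::Array<tvm::LoweredFunc>& funcs) {
      // Build dispatches to the registered global "_codegen_build_cuda".
      CHECK(tvm::codegen::TargetEnabled("cuda")) << "CUDA backend is not enabled";
      return tvm::codegen::Build(funcs, "cuda");
    }
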
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index ce67c9cd3a94..c3197b5c1bde 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -147,12 +147,15 @@ Stmt LiftAllocate(Stmt stmt);
 * \param body The body of the function.
 * \param name The name of the function.
 * \param api_args Arguments to the function, can be either Var, or Buffer
- * \param num_packed_args Number of arguments that are processed in packed form.
+ * \param num_unpacked_args Number of arguments that
+ *  are processed in plain form instead of packed form.
 * \return a LoweredFunc with the specified signiture.
 *
 * \note
 *  The function signiture have two cases
 *
+ *  let num_packed_args = len(api_args) - num_unpacked_args;
+ *
 *  if num_packed_args is zero:
 *     f(api_arg_0, api_arg_1, .., api_arg_n) where n == len(api_args)
 *
@@ -167,7 +170,7 @@ Stmt LiftAllocate(Stmt stmt);
LoweredFunc MakeAPI(Stmt body,
                    std::string name,
                    Array<NodeRef> api_args,
-                    int num_packed_args);
+                    int num_unpacked_args);
 
 /*!
 * \brief Count number of undefined vars in f.
diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index 2639db01b4b2..af05b8ab1aa4 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -72,6 +72,8 @@ class LoweredFuncNode : public FunctionBaseNode {
   *  constant Expr of given type is used.
   */
  Map<Var, Expr> handle_data_type;
+  /*! \brief Whether this function is a packed function */
+  bool is_packed_func{true};
  /*! \brief The body statment of the function */
  Stmt body;
  /*! \return name of the operation */
@@ -88,6 +90,7 @@ class LoweredFuncNode : public FunctionBaseNode {
    v->Visit("args", &args);
    v->Visit("thread_axis", &thread_axis);
    v->Visit("handle_data_type", &handle_data_type);
+    v->Visit("is_packed_func", &is_packed_func);
    v->Visit("body", &body);
  }
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 14b3974420a2..6faf4d21de94 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -51,9 +51,10 @@ typedef enum {
  kArrayHandle = 5U,
  kTVMType = 6U,
  kNodeHandle = 7U,
-  kFuncHandle = 8U,
-  kStr = 9U,
-  kBytes = 10U
+  kModuleHandle = 8U,
+  kFuncHandle = 9U,
+  kStr = 10U,
+  kBytes = 11U
} TVMTypeCode;
 
/*!
@@ -140,17 +141,19 @@ typedef struct {
  TVMContext ctx;
} TVMArray;
 
-/*!
- * \brief The stream that is specific to device
- * can be NULL, which indicates the default one.
- */
-typedef void* TVMStreamHandle;
+/*! \brief Handle to TVM runtime modules. */
+typedef void* TVMModuleHandle;
/*! \brief Handle to packed function handle. */
typedef void* TVMFunctionHandle;
/*! \brief Handle to hold return value. */
typedef void* TVMRetValueHandle;
/*! \brief the array handle */
typedef TVMArray* TVMArrayHandle;
+/*!
+ * \brief The stream that is specific to device
+ * can be NULL, which indicates the default one.
+ */
+typedef void* TVMStreamHandle;
 
/*!
 * \brief Used for implementing C API function.
@@ -169,74 +172,87 @@ TVM_DLL void TVMAPISetLastError(const char* msg);
 * \return error info
 */
TVM_DLL const char *TVMGetLastError(void);
-
/*!
- * \brief Initialize certain type of devices, this may
- *  not be necessary for all device types. But is needed for OpenCL.
+ * \brief Load module from file.
+ * \param file_name The file name to load the module from.
+ * \param format The format of the module.
+ * \param out The result module
 *
- * \param dev_mask The device mask of device type to be initialized
- * \param option_keys Additional option keys to pass.
- * \param option_vals Additional option values to pass
- * \param num_options Number of options to be passed into it.
- * \param out_code 1: success, 0: already initialized
 * \return 0 when success, -1 when failure happens
+ * \note The resulting module does not contain the import relations.
+ *  They can be reconstructed by TVMModImport.
 */
-TVM_DLL int TVMDeviceInit(int dev_mask,
-                          const char** option_keys,
-                          const char** option_vals,
-                          int num_options,
-                          int *out_code);
+TVM_DLL int TVMModLoadFromFile(const char* file_name,
+                               const char* format,
+                               TVMModuleHandle* out);
 
/*!
- * \brief Whether the specified context is enabled.
+ * \brief Add dep to mod's dependencies.
+ *  This allows functions in this module to use functions from dep.
 *
- * \param ctx The context to be checked.
- * \param out_enabled whether the ctx is enabled.
- * \return Whether the function is successful.
+ * \param mod The module handle.
+ * \param dep The dependent module to be imported.
+ * \return 0 when success, -1 when failure happens
 */
-TVM_DLL int TVMContextEnabled(TVMContext ctx,
-                              int* out_enabled);
+TVM_DLL int TVMModImport(TVMModuleHandle mod,
+                         TVMModuleHandle dep);
 
/*!
- * \brief Allocate a nd-array's memory,
- *  including space of shape, of given spec.
- *
- * \param shape The shape of the array, the data content will be copied to out
- * \param ndim The number of dimension of the array.
- * \param dtype The array data type.
- * \param ctx The ctx this array sits on.
- * \param out The output handle.
- * \return 0 when success, -1 when failure happens
+ * \brief Get function from the module.
+ * \param mod The module handle.
+ * \param func_name The name of the function.
+ * \param query_imports Whether to query imported modules
+ * \param out The result function, can be NULL if it is not available.
+ * \return 0 when no error is thrown, -1 when failure happens
 */
-TVM_DLL int TVMArrayAlloc(const tvm_index_t* shape,
-                          tvm_index_t ndim,
-                          TVMType dtype,
-                          TVMContext ctx,
-                          TVMArrayHandle* out);
+TVM_DLL int TVMModGetFunction(TVMModuleHandle mod,
+                              const char* func_name,
+                              int query_imports,
+                              TVMFunctionHandle *out);
+
/*!
- * \brief Free the TVM Array.
- * \param handle The array handle to be freed.
- * \return 0 when success, -1 when failure happens
+ * \brief Precompile the function under given context.
+ *  Many TVMFunctionHandles are initialized lazily;
+ *  this call eagerly prepares the resources under the given context.
+ *  Useful for benchmarking purposes.
+ *
+ * \param mod The module handle.
+ * \param func_name The name of the function.
+ * \param ctx The context to be precompiled on.
+ * \return 0 when no error is thrown, -1 when failure happens
 */
-TVM_DLL int TVMArrayFree(TVMArrayHandle handle);
+TVM_DLL int TVMModPreCompile(TVMModuleHandle mod,
+                             const char* func_name,
+                             TVMContext ctx);
 
/*!
- * \brief Copy the array, both from and to must be valid during the copy.
- * \param from The array to be copied from.
- * \param to The target space.
- * \param stream The stream where the copy happens, can be NULL.
- * \return 0 when success, -1 when failure happens
+ * \brief Backend function for modules to get a function
+ *  from its environment mod_node (its imports and global functions).
+ *
+ *  The user should not call TVMFuncFree on func.
+ *
+ * \note This API is supposed to be used by backend,
+ *  it is not supposed to be used by user.
+ *
+ * \param mod_node The module handle.
+ * \param func_name The name of the function.
+ * \param out The result function.
+ * \return 0 when no error is thrown, -1 when failure happens
 */
-TVM_DLL int TVMArrayCopyFromTo(TVMArrayHandle from,
-                               TVMArrayHandle to,
-                               TVMStreamHandle stream);
+TVM_DLL int TVMBackendGetFuncFromEnv(void* mod_node,
+                                     const char* func_name,
+                                     TVMFunctionHandle *out);
 
/*!
- * \brief Wait until all computations on stream completes.
- * \param ctx The ctx to be synchronized.
- * \param stream The stream to be synchronized.
- * \return 0 when success, -1 when failure happens
+ * \brief Free the Module
+ * \param mod The module to be freed.
+ *
+ * \note This may not free up the module's resources:
+ *  the module stays alive while there is an active TVMFunctionHandle
+ *  that uses it, or while it is imported by another active module.
+ *
+ *  All its functions remain valid until TVMFuncFree is called.
 */
-TVM_DLL int TVMSynchronize(TVMContext ctx, TVMStreamHandle stream);
+TVM_DLL int TVMModFree(TVMModuleHandle mod);
 
/*!
 * \brief Free the function when it is no longer needed.
@@ -355,6 +371,76 @@ TVM_DLL int TVMFuncGetGlobal(const char* name, TVMFunctionHandle* out);
 */
TVM_DLL int TVMFuncListGlobalNames(int *out_size,
                                   const char*** out_array);
+
+// Array related apis for quick prototyping
+/*!
+ * \brief Initialize certain type of devices, this may
+ *  not be necessary for all device types. But is needed for OpenCL.
+ *
+ * \param dev_mask The device mask of device type to be initialized
+ * \param option_keys Additional option keys to pass.
+ * \param option_vals Additional option values to pass
+ * \param num_options Number of options to be passed into it.
+ * \param out_code 1: success, 0: already initialized
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMDeviceInit(int dev_mask,
+                          const char** option_keys,
+                          const char** option_vals,
+                          int num_options,
+                          int *out_code);
+
+/*!
+ * \brief Whether the specified context is enabled.
+ *
+ * \param ctx The context to be checked.
+ * \param out_enabled whether the ctx is enabled.
+ * \return Whether the function is successful.
+ */
+TVM_DLL int TVMContextEnabled(TVMContext ctx,
+                              int* out_enabled);
+
+/*!
+ * \brief Allocate a nd-array's memory,
+ *  including space of shape, of given spec.
+ *
+ * \param shape The shape of the array, the data content will be copied to out
+ * \param ndim The number of dimension of the array.
+ * \param dtype The array data type.
+ * \param ctx The ctx this array sits on.
+ * \param out The output handle.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayAlloc(const tvm_index_t* shape,
+                          tvm_index_t ndim,
+                          TVMType dtype,
+                          TVMContext ctx,
+                          TVMArrayHandle* out);
+/*!
+ * \brief Free the TVM Array.
+ * \param handle The array handle to be freed.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayFree(TVMArrayHandle handle);
+
+/*!
+ * \brief Copy the array, both from and to must be valid during the copy.
+ * \param from The array to be copied from.
+ * \param to The target space.
+ * \param stream The stream where the copy happens, can be NULL.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayCopyFromTo(TVMArrayHandle from,
+                               TVMArrayHandle to,
+                               TVMStreamHandle stream);
+/*!
+ * \brief Wait until all computations on stream completes.
+ * \param ctx The ctx to be synchronized.
+ * \param stream The stream to be synchronized.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMSynchronize(TVMContext ctx, TVMStreamHandle stream);
}  // TVM_EXTERN_C
#endif  // TVM_RUNTIME_C_RUNTIME_API_H_
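Taken together, the module-related C APIs above follow the usual create/use/free pattern. A minimal sketch (error checks elided; "myadd.so" and "myadd" are hypothetical names):

    #include <tvm/runtime/c_runtime_api.h>

    void LoadAndFree() {
      TVMModuleHandle mod;
      TVMFunctionHandle fadd;
      TVMModLoadFromFile("myadd.so", "so", &mod);  // load host module from disk
      TVMModGetFunction(mod, "myadd", 0, &fadd);   // query_imports = 0
      // ... invoke fadd through TVMFuncCall, then release both handles:
      TVMFuncFree(fadd);
      TVMModFree(mod);
    }
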
diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
new file mode 100644
index 000000000000..82817e121ac3
--- /dev/null
+++ b/include/tvm/runtime/module.h
@@ -0,0 +1,163 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file module.h
+ * \brief Runtime container of the functions generated by TVM.
+ *  This is used to support dynamic linking, loading and saving of
+ *  functions from different conventions under a unified API.
+ */
+#ifndef TVM_RUNTIME_MODULE_H_
+#define TVM_RUNTIME_MODULE_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "./c_runtime_api.h"
+
+namespace tvm {
+namespace runtime {
+
+// The internal container of module.
+class ModuleNode;
+class PackedFunc;
+
+/*!
+ * \brief Module container of TVM.
+ */
+class Module {
+ public:
+  Module() {}
+  // constructor from container.
+  explicit Module(std::shared_ptr<ModuleNode> n)
+      : node_(n) {}
+  /*!
+   * \brief Get packed function from current module by name.
+   *
+   * \param name The name of the function.
+   * \param query_imports Whether also query dependency modules.
+   * \return The result function.
+   *  This function will return PackedFunc(nullptr) if the function does not exist.
+   */
+  PackedFunc GetFunction(const std::string& name, bool query_imports);
+  /*!
+   * \brief Import another module into this module.
+   * \param other The module to be imported.
+   *
+   * \note Cyclic dependencies are not allowed among modules;
+   *  an error will be thrown when a cyclic dependency is detected.
+   */
+  void Import(Module other);
+  /*!
+   * \brief Load a module from file.
+   * \param file_name The name of the host function module.
+   * \param format The format of the file.
+   * \note This function won't load the import relationships.
+   *  Re-create the import relationships by calling Import.
+   */
+  static Module LoadFromFile(const std::string& file_name,
+                             const std::string& format);
+  /*! \return internal container */
+  inline ModuleNode* operator->();
+
+ private:
+  std::shared_ptr<ModuleNode> node_;
+};
+
+/*!
+ * \brief Base node container of module.
+ *  Do not create this directly, instead use Module.
+ */
+class ModuleNode {
+ public:
+  /*! \brief virtual destructor */
+  virtual ~ModuleNode() {}
+  /*! \return The module type key */
+  virtual const char* type_key() const = 0;
+  /*!
+   * \brief Eagerly compile the function under certain context,
+   *  assuming that it is used by the current thread.
+   *
+   *  This is useful for benchmarking to eliminate lazy compilation
+   *  overhead during the first execution of the kernel.
+   *
+   * \param name The name of the function.
+   * \param ctx The context to be executed.
+   */
+  virtual void PreCompile(const std::string& name, TVMContext ctx) = 0;
+  /*!
+   * \brief Get a PackedFunc from module.
+   *
+   *  The PackedFunc may not be fully initialized,
+   *  there might still be first time running overhead when
+   *  executing the function on certain devices.
+   *  For benchmarking, use PreCompile to eliminate this overhead.
+   *
+   * \param name the name of the function.
+   * \param sptr_to_self The shared_ptr that points to this module node.
+   *
+   * \return PackedFunc(nullptr) when it is not available.
+   *
+   * \note The function will always remain valid.
+   *  If the function needs resources from the module (e.g. late linking),
+   *  it should capture sptr_to_self.
+   */
+  virtual PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) = 0;
+  /*!
+   * \brief Save the module to file.
+   * \param file_name The file to be saved to.
+   * \param format The format of the file.
+   */
+  virtual void SaveToFile(const std::string& file_name,
+                          const std::string& format) = 0;
+  /*!
+   * \brief Get the source code of module, when available.
+   * \param format Format of the source code, can be empty by default.
+   * \return Possible source code when available.
+   */
+  virtual std::string GetSource(
+      const std::string& format = "") = 0;
+
+  /*!
+   * \brief Get a function from current environment
+   *  The environment includes all the imports as well as Global functions.
+   *
+   * \param name name of the function.
+   * \return The corresponding function.
+   */
+  const PackedFunc* GetFuncFromEnv(const std::string& name);
+  /*! \return The modules it imports from */
+  const std::vector<Module>& imports() const {
+    return imports_;
+  }
+
+ private:
+  friend class Module;
+  /*! \brief The modules this module depends on */
+  std::vector<Module> imports_;
+  /*! \brief Cache used by GetFuncFromEnv */
+  std::unordered_map<std::string, std::shared_ptr<PackedFunc> > import_cache_;
+};
+
+/*! \brief namespace for constant symbols */
+namespace symbol {
+/*! \brief Global variable to store module context. */
+constexpr const char* tvm_module_ctx = "__tvm_module_ctx";
+/*! \brief Local function to set the device during API entry. */
+constexpr const char* tvm_entry_setdevice = "__tvm_entry_setdevice";
+/*! \brief Placeholder for the module's entry function. */
+constexpr const char* tvm_module_main = "__tvm_main__";
+}  // namespace symbol
+
+// implementations of inline functions.
+inline ModuleNode* Module::operator->() {
+  return node_.get();
+}
+
+}  // namespace runtime
+}  // namespace tvm
+
+#include "./packed_func.h"
+#endif  // TVM_RUNTIME_MODULE_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index bcc5fffc43b2..558dd9b9541e 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -1,7 +1,7 @@
 /*!
  *  Copyright (c) 2017 by Contributors
  * \file packed_func.h
- * \brief Runtime related c++ class.
+ * \brief Type-erased function used across TVM API.
 */
 #ifndef TVM_RUNTIME_PACKED_FUNC_H_
 #define TVM_RUNTIME_PACKED_FUNC_H_
@@ -15,6 +15,7 @@
 #include
 #include
 #include "./c_runtime_api.h"
+#include "./module.h"
 
 namespace Halide {
 // Forward declare type for extensions
@@ -97,30 +98,14 @@ class PackedFunc {
  inline void CallPacked(TVMArgs args, TVMRetValue* rv) const;
  /*! \return the internal body function */
  inline FType body() const;
-  /*!
-   * \brief Register f as into global function table
-   * \param name The name of the function.
-   * \param f The function to be registered.
-   * \return Reference to the registered function.
-   * \note The returned reference is valid until the end of the program
-   */
-  static const PackedFunc& RegisterGlobal(const std::string& name, PackedFunc f);
-  /*!
-   * \brief Get the global function by name.
-   * \param name The name of the function.
-   * \return reference to the registered function.
-   */
-  static const PackedFunc& GetGlobal(const std::string& name);
-  /*!
-   * \brief Whether the global function exist
-   * \param name The name of the function.
-   * \return Whetehr the global function exist.
-   */
-  static bool GlobalExist(const std::string& name);
-  /*!
-   * \brief Get the names of currently registered global function.
-   */
-  static std::vector<std::string> ListGlobalNames();
+  /*! \return Whether the packed function is nullptr */
+  bool operator==(std::nullptr_t null) const {
+    return body_ == nullptr;
+  }
+  /*! \return Whether the packed function is not nullptr */
+  bool operator!=(std::nullptr_t null) const {
+    return body_ != nullptr;
+  }
 
 private:
  /*! \brief internal container of packed function */
@@ -292,6 +277,10 @@ class TVMArgValue : public TVMPODValue_ {
    TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
    return *ptr<PackedFunc>();
  }
+  operator Module() const {
+    TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle);
+    return *ptr<Module>();
+  }
  const TVMValue& value() const {
    return value_;
  }
@@ -364,6 +353,10 @@ class TVMRetValue : public TVMPODValue_ {
    TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
    return *ptr<PackedFunc>();
  }
+  operator Module() const {
+    TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle);
+    return *ptr<Module>();
+  }
  // Assign operators
  TVMRetValue& operator=(TVMRetValue&& other) {
    this->Clear();
@@ -415,6 +408,10 @@ class TVMRetValue : public TVMPODValue_ {
    this->SwitchToClass(kFuncHandle, f);
    return *this;
  }
+  TVMRetValue& operator=(Module m) {
+    this->SwitchToClass(kModuleHandle, m);
+    return *this;
+  }
  TVMRetValue& operator=(const TVMRetValue& other) {  // NOLINT(*)
    this->Assign(other);
    return *this;
@@ -444,6 +441,7 @@ class TVMRetValue : public TVMPODValue_ {
  const TVMValue& value() const {
    CHECK(type_code_ != kNodeHandle &&
          type_code_ != kFuncHandle &&
+          type_code_ != kModuleHandle &&
          type_code_ != kStr) << "TVMRetValue.value can only be used for POD data";
    return value_;
  }
@@ -471,6 +469,10 @@ class TVMRetValue : public TVMPODValue_ {
        SwitchToClass<PackedFunc>(kFuncHandle, other);
        break;
      }
+      case kModuleHandle: {
+        SwitchToClass<Module>(kModuleHandle, other);
+        break;
+      }
      case kNodeHandle: {
        SwitchToClass<std::shared_ptr<Node> >(
            kNodeHandle, *other.template ptr<std::shared_ptr<Node> >());
        break;
      }
@@ -506,6 +508,7 @@ class TVMRetValue : public TVMPODValue_ {
    switch (type_code_) {
      case kStr: delete ptr<std::string>(); break;
      case kFuncHandle: delete ptr<PackedFunc>(); break;
+      case kModuleHandle: delete ptr<Module>(); break;
      case kNodeHandle: delete ptr<std::shared_ptr<Node> >(); break;
    }
    type_code_ = kNull;
@@ -518,12 +521,14 @@ inline const char* TypeCode2Str(int type_code) {
    case kInt: return "int";
    case kFloat: return "float";
    case kStr: return "str";
+    case kBytes: return "bytes";
    case kHandle: return "handle";
    case kNull: return "NULL";
    case kNodeHandle: return "NodeHandle";
    case kArrayHandle: return "ArrayHandle";
    case kTVMType: return "TVMType";
    case kFuncHandle: return "FunctionHandle";
+    case kModuleHandle: return "ModuleHandle";
    default: LOG(FATAL) << "unknown type_code="
             << static_cast<int>(type_code); return "";
  }
@@ -667,6 +672,10 @@ class TVMArgsSetter {
    values_[i].v_handle = &value;
    type_codes_[i] = kFuncHandle;
  }
+  void operator()(size_t i, Module& value) const {  // NOLINT(*)
+    values_[i].v_handle = &value;
+    type_codes_[i] = kModuleHandle;
+  }
  void operator()(size_t i, TVMRetValue& value) const {  // NOLINT(*)
    if (value.type_code() == kStr) {
      values_[i].v_str = value.ptr<std::string>()->c_str();
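With the kModuleHandle plumbing above, a Module can flow through PackedFunc arguments and return values like any other supported type. A minimal sketch:

    tvm::runtime::PackedFunc echo_module(
        [](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
          tvm::runtime::Module m = args[0];  // type-checked against kModuleHandle
          *rv = m;                           // stored back with kModuleHandle
        });
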
diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
new file mode 100644
index 000000000000..aeba7e8625e2
--- /dev/null
+++ b/include/tvm/runtime/registry.h
@@ -0,0 +1,108 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file registry.h
+ * \brief This file defines the TVM global function registry.
+ *
+ *  The registered functions will be made available to front-end
+ *  as well as backend users.
+ *
+ *  The registry stores type-erased functions.
+ *  Each registered function is automatically exposed
+ *  to front-end language(e.g. python).
+ *
+ *  Front-end can also pass callbacks as PackedFunc, or register
+ *  them into the same global registry in C++.
+ *  The goal is to mix the front-end language and the TVM back-end.
+ *
+ * \code
+ *   // register the function as MyAPIFuncName
+ *   TVM_REGISTER_GLOBAL(MyAPIFuncName)
+ *   .set_body([](TVMArgs args, TVMRetValue* rv) {
+ *     // my code.
+ *   });
+ * \endcode
+ */
+#ifndef TVM_RUNTIME_REGISTRY_H_
+#define TVM_RUNTIME_REGISTRY_H_
+
+#include <string>
+#include <vector>
+#include "./packed_func.h"
+
+namespace tvm {
+namespace runtime {
+
+/*! \brief Registry for global function */
+class Registry {
+ public:
+  /*!
+   * \brief set the body of the function to be f
+   * \param f The body of the function.
+   */
+  Registry& set_body(PackedFunc f);  // NOLINT(*)
+  /*!
+   * \brief set the body of the function to be f
+   * \param f The body of the function.
+   */
+  Registry& set_body(PackedFunc::FType f) {  // NOLINT(*)
+    return set_body(PackedFunc(f));
+  }
+  /*!
+   * \brief Register a function with given name
+   * \param name The name of the function.
+   */
+  static Registry& Register(const std::string& name);  // NOLINT(*)
+  /*!
+   * \brief Erase global function from registry, if exist.
+   * \param name The name of the function.
+   * \return Whether the function existed.
+   */
+  static bool Remove(const std::string& name);
+  /*!
+   * \brief Get the global function by name.
+   * \param name The name of the function.
+   * \return pointer to the registered function,
+   *   nullptr if it does not exist.
+   */
+  static const PackedFunc* Get(const std::string& name);  // NOLINT(*)
+  /*!
+   * \brief Get the names of currently registered global function.
+   * \return The names
+   */
+  static std::vector<std::string> ListNames();
+
+ private:
+  /*! \brief name of the function */
+  std::string name_;
+  /*! \brief internal packed function */
+  PackedFunc func_;
+  // Internal class.
+  struct Manager;
+  friend struct Manager;
+};
+
+
+/*! \brief helper macro to suppress unused warning */
+#if defined(__GNUC__)
+#define TVM_ATTRIBUTE_UNUSED __attribute__((unused))
+#else
+#define TVM_ATTRIBUTE_UNUSED
+#endif
+
+/*!
+ * \brief Register a function globally.
+ * \code
+ *   TVM_REGISTER_GLOBAL(MyPrint)
+ *   .set_body([](TVMArgs args, TVMRetValue* rv) {
+ *     // my code.
+ *   });
+ * \endcode
+ */
+#define TVM_REGISTER_GLOBAL(OpName)                              \
+  static TVM_ATTRIBUTE_UNUSED ::tvm::runtime::Registry&          \
+  __make_TVMRegistry_ ## OpName =                                \
+      ::tvm::runtime::Registry::Register(#OpName)
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_REGISTRY_H_
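A small end-to-end sketch of the registry (`my_add` is a hypothetical name; note that lookup via Registry::Get returns nullptr rather than throwing):

    #include <tvm/runtime/registry.h>

    TVM_REGISTER_GLOBAL(my_add)
    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
        int a = args[0];
        int b = args[1];
        *rv = a + b;
      });

    void CallIt() {
      const tvm::runtime::PackedFunc* f = tvm::runtime::Registry::Get("my_add");
      if (f != nullptr) {
        int sum = (*f)(1, 2);  // sum == 3
      }
    }
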
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 33c148f92d97..bd4ef614f585 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -12,6 +12,7 @@
 from . import codegen
 from . import collections
 from . import schedule
+from . import module
 from . import ndarray as nd
 from .ndarray import cpu, gpu, opencl, init_opencl, cl
diff --git a/python/tvm/_ctypes/_function.py b/python/tvm/_ctypes/_function.py
index d32583366b47..3150c2903bcf 100644
--- a/python/tvm/_ctypes/_function.py
+++ b/python/tvm/_ctypes/_function.py
@@ -1,6 +1,6 @@
 # coding: utf-8
-# pylint: disable=invalid-name, protected-access, too-many-branches
-"""Symbolic configuration API."""
+# pylint: disable=invalid-name, protected-access, too-many-branches, global-statement
+"""Function configuration API."""
 from __future__ import absolute_import
 
 import ctypes
@@ -16,6 +16,7 @@
 from ._ndarray import NDArrayBase
 
 FunctionHandle = ctypes.c_void_p
+ModuleHandle = ctypes.c_void_p
 TVMRetValueHandle = ctypes.c_void_p
 
 def _ctypes_free_resource(rhandle):
@@ -110,6 +111,9 @@ def _make_tvm_args(args, temp_args):
             values[i].v_handle = arg.handle
             type_codes[i] = TypeCode.NODE_HANDLE
             temp_args.append(arg)
+        elif isinstance(arg, ModuleBase):
+            values[i].v_handle = arg.handle
+            type_codes[i] = TypeCode.MODULE_HANDLE
         elif isinstance(arg, Function):
             values[i].v_handle = arg.handle
             type_codes[i] = TypeCode.FUNC_HANDLE
@@ -158,6 +162,102 @@ def __call__(self, *args):
         return RETURN_SWITCH[ret_tcode.value](ret_val)
 
 
+class ModuleBase(object):
+    """Base class for module"""
+    __slots__ = ["handle", "_entry"]
+    def __init__(self, handle):
+        self.handle = handle
+        self._entry = None
+
+    @property
+    def entry_func(self):
+        """Get the entry function
+
+        Returns
+        -------
+        f : Function
+            The entry function if it exists
+        """
+        if self._entry:
+            return self._entry
+        else:
+            self._entry = self.get_function("__tvm_main__")
+            return self._entry
+
+    def get_function(self, name, query_imports=False):
+        """Get function from the module.
+
+        Parameters
+        ----------
+        name : str
+            The name of the function
+
+        query_imports : bool
+            Whether also query modules imported by this module.
+
+        Returns
+        -------
+        f : Function
+            The result function.
+        """
+        ret_handle = FunctionHandle()
+        check_call(_LIB.TVMModGetFunction(
+            self.handle, c_str(name),
+            ctypes.c_int(query_imports),
+            ctypes.byref(ret_handle)))
+        if not ret_handle.value:
+            raise AttributeError(
+                "Module has no function '%s'" % name)
+        return Function(ret_handle)
+
+    def import_module(self, module):
+        """Add module to the import list of current one.
+
+        Parameters
+        ----------
+        module : Module
+            The other module.
+        """
+        check_call(_LIB.TVMModImport(self.handle, module.handle))
+
+    def precompile(self, func_name, ctx):
+        """Eagerly compile a function under the given context.
+
+        Parameters
+        ----------
+        func_name : str
+            The name of function to be precompiled.
+
+        ctx : Context
+            The context to be precompiled.
+        """
+        check_call(_LIB.TVMModPreCompile(
+            self.handle, c_str(func_name), ctx))
+
+    def __getitem__(self, name):
+        if not isinstance(name, string_types):
+            raise ValueError("Can only take string as function name")
+        return self.get_function(name)
+
+    def __del__(self):
+        check_call(_LIB.TVMModFree(self.handle))
+
+    def __call__(self, *args):
+        if self._entry:
+            return self._entry(*args)
+        else:
+            f = self.entry_func
+            return f(*args)
+
+_module_cls = None
+
+def _return_module(x):
+    """Return module"""
+    handle = x.v_handle
+    if not isinstance(handle, ModuleHandle):
+        handle = ModuleHandle(handle)
+    return _module_cls(handle)
+
 def _handle_return_func(x):
     """Return function"""
     handle = x.v_handle
@@ -167,6 +267,8 @@ def _handle_return_func(x):
 
 # setup return handle for function type
 RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func
+RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module
+
 
 def register_func(func_name, f=None):
     """Register global function
@@ -248,6 +350,7 @@ def _init_api_functions(root_namespace):
         "_arith_": sys.modules["%s.arith" % root_namespace],
         "_pass_": sys.modules["%s.ir_pass" % root_namespace],
         "_codegen_": sys.modules["%s.codegen" % root_namespace],
+        "_module_": sys.modules["%s.module" % root_namespace],
         "_schedule_": sys.modules["%s.schedule" % root_namespace]
     }
     for name in list_global_func_names():
@@ -259,3 +362,9 @@ def _init_api_functions(root_namespace):
             target_module = v
         f = get_global_func(name)
         setattr(target_module, fname, f)
+
+
+def _init_module_module(module_class):
+    """Initialize the module."""
+    global _module_cls
+    _module_cls = module_class
diff --git a/python/tvm/_ctypes/_types.py b/python/tvm/_ctypes/_types.py
index 3a832d2b664a..fcd6325f1839 100644
--- a/python/tvm/_ctypes/_types.py
+++ b/python/tvm/_ctypes/_types.py
@@ -18,9 +18,10 @@ class TypeCode(object):
     ARRAY_HANDLE = 5
     TVM_TYPE = 6
     NODE_HANDLE = 7
-    FUNC_HANDLE = 8
-    STR = 9
-    BYTES = 10
+    MODULE_HANDLE = 8
+    FUNC_HANDLE = 9
+    STR = 10
+    BYTES = 11
 
 def _api_type(code):
     """create a type accepted by API"""
diff --git a/python/tvm/addon/cc_compiler.py b/python/tvm/addon/cc_compiler.py
new file mode 100644
index 000000000000..71a32328da96
--- /dev/null
+++ b/python/tvm/addon/cc_compiler.py
@@ -0,0 +1,42 @@
+# pylint: disable=invalid-name
+"""Util to compile with C++ code"""
+from __future__ import absolute_import as _abs
+import sys
+import subprocess
+
+def create_shared(path_target, objects,
+                  options=None, cc="g++"):
+    """Create shared library.
+
+    Parameters
+    ----------
+    path_target : str
+        The target shared library.
+
+    objects : list
+        List of object files.
+
+    options : list, optional
+        The additional options.
+
+    cc : str, optional
+        The compiler command.
+    """
+    cmd = [cc]
+    cmd += ["-shared"]
+    cmd += ["-o", path_target]
+    cmd += objects
+    if options:
+        cmd += options
+    args = ' '.join(cmd)
+
+    proc = subprocess.Popen(
+        args, shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT)
+    (out, _) = proc.communicate()
+
+    if proc.returncode != 0:
+        sys.stderr.write("Compilation error:\n")
+        sys.stderr.write(out)
+        sys.stderr.flush()
diff --git a/python/tvm/addon/nvcc_compiler.py b/python/tvm/addon/nvcc_compiler.py
index 7895a2b98c56..81465faff387 100644
--- a/python/tvm/addon/nvcc_compiler.py
+++ b/python/tvm/addon/nvcc_compiler.py
@@ -1,10 +1,13 @@
+# pylint: disable=invalid-name, too-many-locals
 """Util to compile with NVCC"""
+from __future__ import absolute_import as _abs
 import os
 import sys
 import tempfile
 import subprocess
 
-def compile_source(code, target="cubin", options=None):
+def compile_source(code, target="ptx", arch=None,
+                   options=None, path_target=None):
     """Compile cuda code with NVCC from env.
 
     Parameters
@@ -15,9 +18,15 @@ def compile_source(code, target="cubin", options=None):
     target : str
         The target format
 
+    arch : str
+        The cuda architecture (e.g. sm_xy), required when target is cubin
+
     options : list, optional
         The additional options
 
+    path_target : str, optional
+        Output file.
+
     Returns
     -------
     cubin : bytearray
@@ -26,18 +35,23 @@ def compile_source(code, target="cubin", options=None):
     temp_dir = tempfile.mkdtemp()
     if target not in ["cubin", "ptx", "fatbin"]:
         raise ValueError("target must be in cubin, ptx, fatbin")
-    path_code = os.path.join(temp_dir, "my_kernel.cu")
-    path_target = os.path.join(temp_dir, "my_kernel.%s" % target)
+    temp_code = os.path.join(temp_dir, "my_kernel.cu")
+    temp_target = os.path.join(temp_dir, "my_kernel.%s" % target)
 
-    with open(path_code, "w") as out_file:
+    with open(temp_code, "w") as out_file:
         out_file.write(code)
+    if target == "cubin" and arch is None:
+        raise ValueError("arch(sm_xy) must be passed for generating cubin")
 
+    file_target = path_target if path_target else temp_target
     cmd = ["nvcc"]
     cmd += ["--%s" % target, "-O3"]
-    cmd += ["-o", path_target]
+    cmd += ["-arch", arch] if arch else []
+    cmd += ["-o", file_target]
+
     if options:
         cmd += options
-    cmd += [path_code]
+    cmd += [temp_code]
     args = ' '.join(cmd)
 
     proc = subprocess.Popen(
@@ -52,9 +66,9 @@
         sys.stderr.flush()
         cubin = None
     else:
-        cubin = bytearray(open(path_target, "rb").read())
-    os.remove(path_code)
-    if os.path.exists(path_target):
-        os.remove(path_target)
+        cubin = bytearray(open(file_target, "rb").read())
+    os.remove(temp_code)
+    if os.path.exists(temp_target):
+        os.remove(temp_target)
     os.rmdir(temp_dir)
     return cubin
diff --git a/python/tvm/arith.py b/python/tvm/arith.py
index a18e05d00ba1..c3fad6670749 100644
--- a/python/tvm/arith.py
+++ b/python/tvm/arith.py
@@ -1,6 +1,7 @@
 # pylint: disable=protected-access, no-member
 """Arithmetic data structure and utility"""
 from __future__ import absolute_import as _abs
+
 from ._ctypes._node import NodeBase, register_node
 from . import _api_internal
 
@@ -15,6 +16,7 @@ def is_everything(self):
         """Whether the set represent everything"""
         return _api_internal._IntSetIsEverything(self)
 
+
 @register_node
 class IntervalSet(IntSet):
     """Represent set of continuous interval"""
@@ -26,8 +28,8 @@ def max(self):
         """get the maximum value"""
         return _api_internal._IntervalSetGetMax(self)
 
+
 @register_node
 class StrideSet(IntSet):
     """Represent set of strided integers"""
     pass
-
+ """ + check_call(_LIB.TVMModPreCompile( + self.handle, c_str(func_name), ctx)) + + def __getitem__(self, name): + if not isinstance(name, string_types): + raise ValueError("Can only take string as function name") + return self.get_function(name) + + def __del__(self): + check_call(_LIB.TVMModFree(self.handle)) + + def __call__(self, *args): + if self._entry: + return self._entry(*args) + else: + f = self.entry_func + return f(*args) + +_module_cls = None + +def _return_module(x): + """Return function""" + handle = x.v_handle + if not isinstance(handle, ModuleHandle): + handle = ModuleHandle(handle) + return _module_cls(handle) + def _handle_return_func(x): """Return function""" handle = x.v_handle @@ -167,6 +267,8 @@ def _handle_return_func(x): # setup return handle for function type RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func +RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module + def register_func(func_name, f=None): """Register global function @@ -248,6 +350,7 @@ def _init_api_functions(root_namespace): "_arith_": sys.modules["%s.arith" % root_namespace], "_pass_": sys.modules["%s.ir_pass" % root_namespace], "_codegen_": sys.modules["%s.codegen" % root_namespace], + "_module_": sys.modules["%s.module" % root_namespace], "_schedule_": sys.modules["%s.schedule" % root_namespace] } for name in list_global_func_names(): @@ -259,3 +362,9 @@ def _init_api_functions(root_namespace): target_module = v f = get_global_func(name) setattr(target_module, fname, f) + + +def _init_module_module(module_class): + """Initialize the module.""" + global _module_cls + _module_cls = module_class diff --git a/python/tvm/_ctypes/_types.py b/python/tvm/_ctypes/_types.py index 3a832d2b664a..fcd6325f1839 100644 --- a/python/tvm/_ctypes/_types.py +++ b/python/tvm/_ctypes/_types.py @@ -18,9 +18,10 @@ class TypeCode(object): ARRAY_HANDLE = 5 TVM_TYPE = 6 NODE_HANDLE = 7 - FUNC_HANDLE = 8 - STR = 9 - BYTES = 10 + MODULE_HANDLE = 8 + FUNC_HANDLE = 9 + STR = 10 + BYTES = 11 def _api_type(code): """create a type accepted by API""" diff --git a/python/tvm/addon/cc_compiler.py b/python/tvm/addon/cc_compiler.py new file mode 100644 index 000000000000..71a32328da96 --- /dev/null +++ b/python/tvm/addon/cc_compiler.py @@ -0,0 +1,42 @@ +# pylint: disable=invalid-name +"""Util to compile with C++ code""" +from __future__ import absolute_import as _abs +import sys +import subprocess + +def create_shared(path_target, objects, + options=None, cc="g++"): + """Create shared library. + + Parameters + ---------- + path_target : str + The target shared library. + + objects : list + List of object files. + + options : str + The additional options. + + cc : str + The compile string. 
+ """ + cmd = [cc] + cmd += ["-shared"] + cmd += ["-o", path_target] + cmd += objects + if options: + cmd += options + args = ' '.join(cmd) + + proc = subprocess.Popen( + args, shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = proc.communicate() + + if proc.returncode != 0: + sys.stderr.write("Compilation error:\n") + sys.stderr.write(out) + sys.stderr.flush() diff --git a/python/tvm/addon/nvcc_compiler.py b/python/tvm/addon/nvcc_compiler.py index 7895a2b98c56..81465faff387 100644 --- a/python/tvm/addon/nvcc_compiler.py +++ b/python/tvm/addon/nvcc_compiler.py @@ -1,10 +1,13 @@ +# pylint: disable=invalid-name, too-many-locals """Util to compile with NVCC""" +from __future__ import absolute_import as _abs import os import sys import tempfile import subprocess -def compile_source(code, target="cubin", options=None): +def compile_source(code, target="ptx", arch=None, + options=None, path_target=None): """Compile cuda code with NVCC from env. Parameters @@ -15,9 +18,15 @@ def compile_source(code, target="cubin", options=None): target : str The target format + arch : str + The architecture + options : str The additional options + path_target : str, optional + Output file. + Return ------ cubin : bytearray @@ -26,18 +35,23 @@ def compile_source(code, target="cubin", options=None): temp_dir = tempfile.mkdtemp() if target not in ["cubin", "ptx", "fatbin"]: raise ValueError("target must be in cubin, ptx, fatbin") - path_code = os.path.join(temp_dir, "my_kernel.cu") - path_target = os.path.join(temp_dir, "my_kernel.%s" % target) + temp_code = os.path.join(temp_dir, "my_kernel.cu") + temp_target = os.path.join(temp_dir, "my_kernel.%s" % target) - with open(path_code, "w") as out_file: + with open(temp_code, "w") as out_file: out_file.write(code) + if target == "cubin" and arch is None: + raise ValueError("arch(sm_xy) must be passed for generating cubin") + file_target = path_target if path_target else temp_target cmd = ["nvcc"] cmd += ["--%s" % target, "-O3"] - cmd += ["-o", path_target] + cmd += ["-arch", arch] + cmd += ["-o", file_target] + if options: cmd += options - cmd += [path_code] + cmd += [temp_code] args = ' '.join(cmd) proc = subprocess.Popen( @@ -52,9 +66,9 @@ def compile_source(code, target="cubin", options=None): sys.stderr.flush() cubin = None else: - cubin = bytearray(open(path_target, "rb").read()) - os.remove(path_code) - if os.path.exists(path_target): - os.remove(path_target) + cubin = bytearray(open(file_target, "rb").read()) + os.remove(temp_code) + if os.path.exists(temp_target): + os.remove(temp_target) os.rmdir(temp_dir) return cubin diff --git a/python/tvm/arith.py b/python/tvm/arith.py index a18e05d00ba1..c3fad6670749 100644 --- a/python/tvm/arith.py +++ b/python/tvm/arith.py @@ -1,6 +1,7 @@ # pylint: disable=protected-access, no-member """Arithmetic data structure and utility""" from __future__ import absolute_import as _abs + from ._ctypes._node import NodeBase, register_node from . 
diff --git a/src/api/api_codegen.cc b/src/api/api_codegen.cc
index f6018e6adeb1..4f267038a738 100644
--- a/src/api/api_codegen.cc
+++ b/src/api/api_codegen.cc
@@ -6,51 +6,24 @@
 #include
 #include
 #include
+#include
 #include
-#include "../codegen/codegen_c.h"
-#include "../codegen/codegen_cuda.h"
-#include "../codegen/codegen_opencl.h"
 
 namespace tvm {
 namespace codegen {
 
-TVM_REGISTER_API(_codegen_CompileToC)
+TVM_REGISTER_API(_codegen_build)
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    std::string mode = "c";
-    if (args.size() > 2) {
-      mode = args[2].operator std::string();
-    }
-    if (mode == "c") {
-      *ret = CodeGenC().Compile(args[0], args[1]);
-    } else if (mode == "cuda") {
-      *ret = CodeGenCUDA().Compile(args[0], args[1]);
-    } else if (mode == "opencl") {
-      *ret = CodeGenOpenCL().Compile(args[0], args[1]);
+    if (args[0].IsNodeType<LoweredFunc>()) {
+      *ret = Build({args[0]}, args[1]);
     } else {
-      LOG(FATAL) << "cannot recognize mode";
+      *ret = Build(args[0], args[1]);
     }
  });
 
-TVM_REGISTER_API(_codegen_BuildStackVM)
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = BuildStackVM(args[0],
-                        std::unordered_map<LoweredFunc, PackedFunc>());
-  });
-
-TVM_REGISTER_API(_codegen_BuildLLVM)
+TVM_REGISTER_API(_codegen_target_enabled)
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = BuildLLVM(args[0]);
+    *ret = TargetEnabled(args[0]);
  });
-
-TVM_REGISTER_API(_codegen_BuildNVRTC)
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = BuildNVRTC(args[0], args[1]);
-  });
-
-TVM_REGISTER_API(_codegen_BuildOpenCL)
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    *ret = BuildOpenCL(args[0], args[1]);
-  });
-
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/api/api_registry.cc b/src/api/api_registry.cc
deleted file mode 100644
index d4ab90282ea6..000000000000
--- a/src/api/api_registry.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/*!
- *  Copyright (c) 2017 by Contributors
- * \file api_registry.cc
- */
-#include
-#include
-#include
-#include
-
-namespace tvm {
-
-struct APIManager {
-  std::unordered_map<std::string, std::unique_ptr<APIRegistry> > fmap;
-
-  static APIManager* Global() {
-    static APIManager inst;
-    return &inst;
-  }
-};
-
-APIRegistry& APIRegistry::__REGISTER__(const std::string& name) {  // NOLINT(*)
-  APIManager* m = APIManager::Global();
-  CHECK(!m->fmap.count(name))
-      << "API function " << name << " has already been registered";
-  std::unique_ptr<APIRegistry> p(new APIRegistry());
-  p->name_ = name;
-  m->fmap[name] = std::move(p);
-  return *(m->fmap[name]);
-}
-
-APIRegistry& APIRegistry::set_body(PackedFunc f) {  // NOLINT(*)
-  PackedFunc::RegisterGlobal(name_, f);
-  return *this;
-}
-}  // namespace tvm
+ """ + return _LoadFromFile(path, fmt) + +_init_module_module(Module) diff --git a/src/api/api_codegen.cc b/src/api/api_codegen.cc index f6018e6adeb1..4f267038a738 100644 --- a/src/api/api_codegen.cc +++ b/src/api/api_codegen.cc @@ -6,51 +6,24 @@ #include #include #include +#include #include -#include "../codegen/codegen_c.h" -#include "../codegen/codegen_cuda.h" -#include "../codegen/codegen_opencl.h" namespace tvm { namespace codegen { -TVM_REGISTER_API(_codegen_CompileToC) +TVM_REGISTER_API(_codegen_build) .set_body([](TVMArgs args, TVMRetValue *ret) { - std::string mode = "c"; - if (args.size() > 2) { - mode = args[2].operator std::string(); - } - if (mode == "c") { - *ret = CodeGenC().Compile(args[0], args[1]); - } else if (mode == "cuda") { - *ret = CodeGenCUDA().Compile(args[0], args[1]); - } else if (mode == "opencl") { - *ret = CodeGenOpenCL().Compile(args[0], args[1]); + if (args[0].IsNodeType()) { + *ret = Build({args[0]}, args[1]); } else { - LOG(FATAL) << "cannot recognize mode"; + *ret = Build(args[0], args[1]); } }); -TVM_REGISTER_API(_codegen_BuildStackVM) -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = BuildStackVM(args[0], - std::unordered_map()); - }); - -TVM_REGISTER_API(_codegen_BuildLLVM) +TVM_REGISTER_API(_codegen_target_enabled) .set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = BuildLLVM(args[0]); + *ret = TargetEnabled(args[0]); }); - -TVM_REGISTER_API(_codegen_BuildNVRTC) -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = BuildNVRTC(args[0], args[1]); - }); - -TVM_REGISTER_API(_codegen_BuildOpenCL) -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = BuildOpenCL(args[0], args[1]); - }); - } // namespace codegen } // namespace tvm diff --git a/src/api/api_registry.cc b/src/api/api_registry.cc deleted file mode 100644 index d4ab90282ea6..000000000000 --- a/src/api/api_registry.cc +++ /dev/null @@ -1,35 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * \file api_registry.cc - */ -#include -#include -#include -#include - -namespace tvm { - -struct APIManager { - std::unordered_map > fmap; - - static APIManager* Global() { - static APIManager inst; - return &inst; - } -}; - -APIRegistry& APIRegistry::__REGISTER__(const std::string& name) { // NOLINT(*) - APIManager* m = APIManager::Global(); - CHECK(!m->fmap.count(name)) - << "API function " << name << " has already been registered"; - std::unique_ptr p(new APIRegistry()); - p->name_ = name; - m->fmap[name] = std::move(p); - return *(m->fmap[name]); -} - -APIRegistry& APIRegistry::set_body(PackedFunc f) { // NOLINT(*) - PackedFunc::RegisterGlobal(name_, f); - return *this; -} -} // namespace tvm diff --git a/src/codegen/build_cuda.cc b/src/codegen/build_cuda.cc new file mode 100644 index 000000000000..17dd4d0b1834 --- /dev/null +++ b/src/codegen/build_cuda.cc @@ -0,0 +1,92 @@ +/*! + * Copyright (c) 2017 by Contributors + * Build cuda modules from source. 
diff --git a/src/codegen/build_opencl.cc b/src/codegen/build_opencl.cc
new file mode 100644
index 000000000000..cae7c5463ed7
--- /dev/null
+++ b/src/codegen/build_opencl.cc
@@ -0,0 +1,50 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ *  Build opencl modules from source.
+ * \file build_opencl.cc
+ */
+#include
+#include
+#include "./codegen_opencl.h"
+
+#if TVM_OPENCL_RUNTIME
+
+#include "../runtime/meta_data.h"
+#include "../runtime/opencl/opencl_common.h"
+#include "../runtime/opencl/opencl_module.h"
+
+namespace tvm {
+namespace codegen {
+
+runtime::Module BuildOpenCL(Array<LoweredFunc> funcs) {
+  std::ostringstream os;
+  bool output_ssa = false;
+  CodeGenOpenCL cg;
+  cg.Init(output_ssa);
+
+  for (LoweredFunc f : funcs) {
+    cg.AddFunction(f);
+  }
+  std::string code = cg.Finish();
+
+  std::unordered_map<std::string, runtime::FunctionInfo> fmap;
+  for (LoweredFunc f : funcs) {
+    runtime::FunctionInfo info;
+    for (size_t i = 0; i < f->args.size(); ++i) {
+      info.arg_types.push_back(Type2TVMType(f->args[i].type()));
+    }
+    for (size_t i = 0; i < f->thread_axis.size(); ++i) {
+      info.thread_axis_tags.push_back(f->thread_axis[i]->thread_tag);
+    }
+    fmap[f->name] = info;
+  }
+  return OpenCLModuleCreate(code, "cl", fmap);
+}
+
+TVM_REGISTER_API(_codegen_build_opencl)
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildOpenCL(args[0]);
+  });
+}  // namespace codegen
+}  // namespace tvm
+#endif  // TVM_OPENCL_RUNTIME
diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc
new file mode 100644
index 000000000000..636ef9c95824
--- /dev/null
+++ b/src/codegen/codegen.cc
@@ -0,0 +1,35 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen.cc
+ * \brief Common utilities for code generation.
+ */
+#include
+#include
+#include
+
+namespace tvm {
+namespace codegen {
+
+runtime::Module Build(const Array<LoweredFunc>& funcs,
+                      const std::string& target) {
+  std::string mode = target;
+  size_t pos = mode.find("-");
+  if (pos != std::string::npos) {
+    mode = mode.substr(0, pos);
+  }
+  std::string build_f_name = "_codegen_build_" + mode;
+
+  const PackedFunc* bf = runtime::Registry::Get(build_f_name);
+  CHECK(bf != nullptr)
+      << "Target " << target << " is not enabled";
+  runtime::Module m = (*bf)(funcs, target);
+  return m;
+}
+
+bool TargetEnabled(const std::string& target) {
+  std::string build_f_name = "_codegen_build_" + target;
+  return runtime::Registry::Get(build_f_name) != nullptr;
+}
+
+}  // namespace codegen
+}  // namespace tvm
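Because Build resolves "_codegen_build_" + mode through the registry (where mode is the target name up to the first "-"), adding a backend needs no change to this file; registering the conventionally named global is enough. A sketch, in which `mytarget` and `BuildMyTarget` are hypothetical:

    TVM_REGISTER_GLOBAL(_codegen_build_mytarget)
    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
        tvm::Array<tvm::LoweredFunc> funcs = args[0];
        *rv = BuildMyTarget(funcs);  // hypothetical: returns tvm::runtime::Module
      });
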
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index c89faeb6c209..4dbc8efef4f5 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -12,9 +12,22 @@ namespace codegen {
 
 using namespace ir;
 
-std::string CodeGenC::Compile(LoweredFunc f,
-                              bool output_ssa) {
+void CodeGenC::Init(bool output_ssa) {
   print_ssa_form_ = output_ssa;
+}
+
+void CodeGenC::InitFuncState(LoweredFunc f) {
+  alloc_storage_scope_.clear();
+  name_alloc_map_.clear();
+  ssa_assign_map_.clear();
+  var_idmap_.clear();
+  handle_data_type_.clear();
+  scope_mark_.clear();
+}
+
+void CodeGenC::AddFunction(LoweredFunc f) {
+  // clear previous generated state.
+  this->InitFuncState(f);
   // skip the first underscore, so SSA variable starts from _1
   GetUniqueName("_");
   // add to alloc buffer type.
@@ -47,7 +60,10 @@ std::string CodeGenC::Compile(LoweredFunc f,
   this->PrintStmt(f->body);
   this->EndScope(func_scope);
   this->PrintIndent();
-  this->stream << "}\n";
+  this->stream << "}\n\n";
+}
+
+std::string CodeGenC::Finish() {
   return stream.str();
 }
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 04c22b3d2a0e..1c87c9164034 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -24,14 +24,20 @@ namespace codegen {
 class CodeGenC {
  public:
   /*!
-   * \brief Generate the C code of statement
-   * \param f The function to be compiled
-   * \param output_ssa Whether output ssa form.
-   * \note Only call compile once,
-   *  create a new codegen object each time.
+   * \brief Initialize the code generator.
+   * \param output_ssa Whether output SSA.
   */
-  std::string Compile(LoweredFunc f,
-                      bool output_ssa);
+  void Init(bool output_ssa);
+  /*!
+   * \brief Add the function to the generated module.
+   * \param f The function to be compiled.
+   */
+  void AddFunction(LoweredFunc f);
+  /*!
+   * \brief Finalize the compilation and return the code.
+   * \return The code.
+   */
+  std::string Finish();
  /*!
   * \brief Print the Stmt n to CodeGenC->stream
   * \param n The statement to be printed.
@@ -73,6 +79,11 @@ class CodeGenC {
   */
  std::string GetVarID(const Variable* v) const;
  // The following parts are overloadable print operations.
+  /*!
+   * \brief Initialize codegen state for generating f.
+   * \param f The function to be compiled.
+   */
+  virtual void InitFuncState(LoweredFunc f);
  /*!
   * Print Type represetnation of type t.
   * \param t The type representation.
@@ -182,7 +193,6 @@ class CodeGenC {
   * \return The storage scope.
   */
  std::string GetStorageScope(const Variable* buf_var) const;
-
  /*! \brief the storage scope of allocation */
  std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index c526ec8d0587..8112ee7b1d8c 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -4,23 +4,19 @@
 */
 #include
 #include
+#include
 #include
 #include
 #include
 #include "./codegen_cuda.h"
-#include "./codegen_stack_vm.h"
 #include "../arithmetic/compute_expr.h"
-#include "../runtime/cuda/cuda_common.h"
-#include "../runtime/cuda/cuda_module.h"
 
 namespace tvm {
 namespace codegen {
 
-std::string CodeGenCUDA::Compile(
-    LoweredFunc f,
-    bool output_ssa) {
+void CodeGenCUDA::AddFunction(LoweredFunc f) {
   this->stream << "extern \"C\" __global__ ";
-  return CodeGenC::Compile(f, output_ssa);
+  CodeGenC::AddFunction(f);
 }
 
 void CodeGenCUDA::PrintStmt(const ir::For* op) {
@@ -152,70 +148,5 @@ void CodeGenCUDA::PrintStorageScope(
     os << "__shared__ ";
   }
 }
-
-#if TVM_CUDA_RUNTIME
-std::unordered_map<LoweredFunc, PackedFunc>
-MakeNVRTC(Array<LoweredFunc> funcs) {
-  std::ostringstream os;
-  bool output_ssa = false;
-  for (LoweredFunc f : funcs) {
-    os << CodeGenCUDA().Compile(f, output_ssa);
-    os << '\n';
-  }
-  std::string code = os.str();
-
-  if (PackedFunc::GlobalExist("tvm_callback_cuda_postproc")) {
-    const auto& f = PackedFunc::GetGlobal("tvm_callback_cuda_postproc");
-    code = f(code).operator std::string();
-  }
-
-  std::string ptx;
-  if (PackedFunc::GlobalExist("tvm_callback_cuda_compile")) {
-    const auto& f = PackedFunc::GetGlobal("tvm_callback_cuda_compile");
-    ptx = f(code).operator std::string();
-  } else {
-    ptx = runtime::NVRTCCompile(os.str());
-  }
-  std::unordered_map<LoweredFunc, PackedFunc> ret;
-
-  runtime::CUDAModule m = runtime::CUDAModule::Create(ptx);
-  for (LoweredFunc f : funcs) {
-    std::vector<TVMType> arg_types(f->args.size());
-    std::vector<std::string> thread_axis_tags(f->thread_axis.size());
-
-    for (size_t i = 0; i < f->args.size(); ++i) {
-      arg_types[i] = Type2TVMType(f->args[i].type());
-    }
-    for (size_t i = 0; i < f->thread_axis.size(); ++i) {
-      thread_axis_tags[i] = f->thread_axis[i]->thread_tag;
-    }
-    ret[f] = m.GetPackedFunc(f->name, arg_types, thread_axis_tags);
-  }
-
-  return ret;
-}
-
-PackedFunc BuildNVRTC(Array<LoweredFunc> fsplits, std::string host_mode) {
-  Array<LoweredFunc> device_list(fsplits.begin() + 1, fsplits.end());
-  std::unordered_map<LoweredFunc, PackedFunc> device_funcs = MakeNVRTC(device_list);
-
-  if (host_mode == "stackvm") {
-    StackVM vm = codegen::CodeGenStackVM().Compile(fsplits[0], device_funcs);
-    auto f = [vm](TVMArgs args, TVMRetValue* rv) {
-      runtime::AutoSetCUDADevice(args);
-      vm(args);
-    };
-    return PackedFunc(f);
-  } else {
-    LOG(FATAL) << "unknown host mode " << host_mode;
-    return PackedFunc();
-  }
-}
-#else
-// dummy function when cuda is not available
-PackedFunc BuildNVRTC(Array<LoweredFunc> func, std::string host_mode) {
-  LOG(FATAL) << "CUDA is not enabled";
-  return PackedFunc();
-}
-#endif  // TVM_CUDA_RUNTIME
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index 641c28f95ee7..4519e779fa8b 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -16,16 +16,7 @@ namespace codegen {
 
 class CodeGenCUDA : public CodeGenC {
  public:
-  /*!
-   * \brief Generate the C code of statement
-   * \param f The function to be compiled
-   * \param output_ssa Whether output ssa form.
-   * \note Only call compile once,
-   *  create a new codegen object each time.
-   */
-  std::string Compile(LoweredFunc f,
-                      bool output_ssa);
-
+  void AddFunction(LoweredFunc f);
  // override behavior
  void PrintStmt(const ir::For* op) final;
  void PrintStorageSync(const std::string& sync) final;
"unknown host mode " << host_mode; - return PackedFunc(); - } -} -#else -// dummy function when opencl is not available -PackedFunc BuildOpenCL(Array func, std::string host_mode) { - LOG(FATAL) << "OpenCL is not enabled"; - return PackedFunc(); -} -#endif // TVM_OPENCL_RUNTIME } // namespace codegen } // namespace tvm diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h index 58147f8b9d79..55168fdfea2c 100644 --- a/src/codegen/codegen_opencl.h +++ b/src/codegen/codegen_opencl.h @@ -16,16 +16,9 @@ namespace codegen { class CodeGenOpenCL : public CodeGenC { public: - /*! - * \brief Generate the OpenCL code of statement - * \param f The function to be compiled - * \param output_ssa Whether output ssa form. - * \note Only call compile once, - * create a new codegen object each time. - */ - std::string Compile(LoweredFunc f, - bool output_ssa); + void AddFunction(LoweredFunc f); // override print thread tag. + void InitFuncState(LoweredFunc f) final; void PrintThreadIndexExpr( std::string tag, std::ostream& os) final; // NOLINT(*) void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index 31b6b1c5d0d5..232ef4923aae 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -12,6 +12,7 @@ namespace tvm { namespace codegen { void CodeGenLLVM::Init(const std::string& module_name, + const std::string& target_triple, llvm::LLVMContext* ctx) { InitializeLLVM(); static_assert(sizeof(TVMValue) == sizeof(double), "invariant"); @@ -60,16 +61,37 @@ void CodeGenLLVM::Init(const std::string& module_name, t_tvm_value_->getPointerTo(), t_int_->getPointerTo()}, false), llvm::Function::ExternalLinkage, "TVMFuncCall", module_.get()); - f_tvm_func_get_global_ = llvm::Function::Create( + f_tvm_get_func_from_env_ = llvm::Function::Create( llvm::FunctionType::get(t_int_, { + t_void_p_, t_char_->getPointerTo(), t_tvm_func_handle_->getPointerTo()}, false), - llvm::Function::ExternalLinkage, "TVMFuncGetGlobal", module_.get()); + llvm::Function::ExternalLinkage, "TVMBackendGetFuncFromEnv", module_.get()); f_tvm_api_set_last_error_ = llvm::Function::Create( llvm::FunctionType::get(t_void_, {t_char_->getPointerTo()}, false), llvm::Function::ExternalLinkage, "TVMAPISetLastError", module_.get()); + + this->InitTarget(target_triple); // initialize builder builder_.reset(new IRBuilder(*ctx)); + this->InitGlobalContext(); +} + +void CodeGenLLVM::InitTarget(const std::string& target) { + llvm::TargetMachine* tm; + std::string target_triple; + std::tie(tm, target_triple) = LLVMGetTarget(target); + module_->setTargetTriple(target_triple); + module_->setDataLayout(tm->createDataLayout()); + data_layout_.reset(new llvm::DataLayout(module_.get())); +} + +void CodeGenLLVM::InitGlobalContext() { + gv_mod_ctx_ = new llvm::GlobalVariable( + *module_, t_void_p_, false, + llvm::GlobalValue::LinkOnceODRLinkage, 0, "__tvm_module_ctx"); + gv_mod_ctx_->setAlignment(data_layout_->getTypeAllocSize(t_void_p_)); + gv_mod_ctx_->setInitializer(llvm::Constant::getNullValue(t_void_p_)); } void CodeGenLLVM::AddFunction(const LoweredFunc& f) { @@ -104,6 +126,24 @@ void CodeGenLLVM::AddFunction(const LoweredFunc& f) { builder_->CreateRet(ConstInt32(0)); } +void CodeGenLLVM::AddMainFunction(const std::string& entry_func_name) { + llvm::Function* f = module_->getFunction(entry_func_name); + CHECK(f) << "Function " << entry_func_name << "does not in module"; + 
CHECK(!module_->getFunction(runtime::symbol::tvm_module_main)); + llvm::FunctionType* ftype = f->getFunctionType(); + function_ = llvm::cast( + module_->getOrInsertFunction(runtime::symbol::tvm_module_main, ftype)); + function_->setCallingConv(llvm::CallingConv::C); + std::vector args; + for (auto it = function_->arg_begin(); + it != function_->arg_end(); ++it) { + args.push_back(&(*it)); + } + llvm::BasicBlock* block = llvm::BasicBlock::Create(*ctx_, "entry", function_); + builder_->SetInsertPoint(block); + builder_->CreateRet(builder_->CreateCall(f, args)); +} + class FPassManager : public llvm::legacy::FunctionPassManager { public: explicit FPassManager(llvm::Module* m) @@ -364,14 +404,12 @@ void CodeGenLLVM::Visit_(const Load* op) { CHECK(!t.is_vector()); if (t.is_scalar()) { - llvm::DataLayout layout(module_.get()); - uint64_t valign = layout.getTypeAllocSize(LLVMType(t)); llvm::LoadInst* inst = builder_->CreateAlignedLoad( CreateBufferPtr( t, GetVarValue(op->buffer_var.get()), MakeValue(op->index)), - valign); + data_layout_->getTypeAllocSize(LLVMType(t))); AddAliasInfo(inst, op->buffer_var.get(), op->index); value_ = inst; } else { @@ -384,15 +422,13 @@ void CodeGenLLVM::Visit_(const Store* op) { Type t = op->value.type(); CHECK(!t.is_vector()); if (t.is_scalar()) { - llvm::DataLayout layout(module_.get()); - uint64_t valign = layout.getTypeAllocSize(value->getType()); llvm::StoreInst* inst = builder_->CreateAlignedStore( value, CreateBufferPtr( t, GetVarValue(op->buffer_var.get()), MakeValue(op->index)), - valign); + data_layout_->getTypeAllocSize(value->getType())); AddAliasInfo(inst, op->buffer_var.get(), op->index); } else { LOG(FATAL) << "not yet supported"; @@ -400,8 +436,7 @@ void CodeGenLLVM::Visit_(const Store* op) { } void CodeGenLLVM::Visit_(const Call* op) { - if (op->is_intrinsic(intrinsic::tvm_call_global) || - op->is_intrinsic(intrinsic::tvm_call_device)) { + if (op->is_intrinsic(intrinsic::tvm_call_packed)) { value_ = CreateCallPacked(op); } else if (op->call_type == Call::Intrinsic || op->call_type == Call::PureIntrinsic) { @@ -734,14 +769,14 @@ llvm::Value* CodeGenLLVM::CreateCast(Type from, Type to, llvm::Value* value) { } } -llvm::Value* CodeGenLLVM::GetPackedFuncHandle( - const std::string& fname, bool global) { +llvm::Value* CodeGenLLVM::GetPackedFuncHandle(const std::string& fname) { using llvm::BasicBlock; // We will store the packed function handle in global space. // Initialize it during the first call. 
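/* [Editorial example] The handle-caching IR built in GetPackedFuncHandle below
 * is easier to read as the C it is morally equivalent to. __tvm_module_ctx and
 * TVMBackendGetFuncFromEnv are symbols introduced by this patch; "fname"
 * stands for the callee of a tvm_call_packed:
 *
 *   static void* fname_handle = NULL;   // the private ".tvm_func" global
 *   if (fname_handle == NULL) {
 *     // First-call initialization. There is no lock, so two threads may race
 *     // here, but both would store the same resolved handle.
 *     if (TVMBackendGetFuncFromEnv(__tvm_module_ctx, "fname", &fname_handle) != 0) {
 *       return -1;   // propagate the packed-call error code
 *     }
 *   }
 *   // ... followed by the TVMFuncCall emitted in CreateCallPacked ...
 */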
llvm::DataLayout layout(module_.get()); - uint64_t halign = layout.getTypeAllocSize(t_tvm_func_handle_); + uint64_t align = layout.getTypeAllocSize(t_tvm_func_handle_); auto it = func_handle_map_.find(fname); + llvm::GlobalVariable* hptr; if (it == func_handle_map_.end()) { // create global location for the handle @@ -749,7 +784,7 @@ llvm::Value* CodeGenLLVM::GetPackedFuncHandle( hptr = new llvm::GlobalVariable( *module_, t_tvm_func_handle_, false, llvm::GlobalValue::PrivateLinkage, 0, ".tvm_func"); - hptr->setAlignment(halign); + hptr->setAlignment(align); hptr->setInitializer(llvm::Constant::getNullValue(t_tvm_func_handle_)); func_handle_map_[fname] = hptr; } else { @@ -761,26 +796,19 @@ llvm::Value* CodeGenLLVM::GetPackedFuncHandle( *ctx_, "handle_init", function_); BasicBlock* end_block = BasicBlock::Create( *ctx_, "handle_init_end", function_); - llvm::Value* handle = builder_->CreateAlignedLoad(hptr, halign); + llvm::Value* handle = builder_->CreateAlignedLoad(hptr, align); llvm::Value* handle_not_null = builder_->CreateICmpNE( handle, llvm::Constant::getNullValue(t_tvm_func_handle_)); builder_->CreateCondBr( handle_not_null, end_block, init_block, md_very_likely_branch_); - // loaded handle, if created by call. - llvm::Value* loaded_handle = nullptr; - // Then block. - // We do not do lock here, so unlike static variable initialization - // This clause might be executed multiple times, but it is safe to do so. + // Initialize the handle if needed. builder_->SetInsertPoint(init_block); - if (global) { - llvm::Value* out = builder_->CreateAlloca(t_tvm_func_handle_); - llvm::Value* retcode = builder_->CreateCall( - f_tvm_func_get_global_, {GetConstString(fname), out}); - init_block = CheckPackedCallSuccess(retcode); - loaded_handle = builder_->CreateAlignedLoad(out, halign); - } else { - LOG(FATAL) << "not yet supported"; - } + llvm::Value* out = builder_->CreateAlloca(t_tvm_func_handle_); + llvm::Value* ctx = builder_->CreateLoad(gv_mod_ctx_); + llvm::Value* retcode = builder_->CreateCall( + f_tvm_get_func_from_env_, {ctx, GetConstString(fname), out}); + init_block = CheckPackedCallSuccess(retcode); + llvm::Value* loaded_handle = builder_->CreateAlignedLoad(out, align); builder_->CreateBr(end_block); // end block builder_->SetInsertPoint(end_block); @@ -793,11 +821,7 @@ llvm::Value* CodeGenLLVM::GetPackedFuncHandle( llvm::Value* CodeGenLLVM::CreateCallPacked(const Call* op) { CHECK_GE(op->args.size(), 1U); std::string func_name = op->args[0].as()->value; - CHECK(!op->is_intrinsic(intrinsic::tvm_call_device)) - << "not implemented for now"; - llvm::Value* handle = GetPackedFuncHandle( - func_name, op->is_intrinsic(intrinsic::tvm_call_global)); - + llvm::Value* handle = GetPackedFuncHandle(func_name); // call the function unsigned nargs = static_cast(op->args.size() - 1); llvm::Value* targs = builder_->CreateAlloca( diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h index a6d9e1bfa57a..36fa0389ad45 100644 --- a/src/codegen/llvm/codegen_llvm.h +++ b/src/codegen/llvm/codegen_llvm.h @@ -28,14 +28,22 @@ class CodeGenLLVM : public IRVisitor { /*! * \brief Initialize the code generator with given context * \param module_name The name of the module. + * \param target_triple The target triple, can be empty. * \param ctx The context. */ - void Init(const std::string& module_name, llvm::LLVMContext* ctx); + void Init(const std::string& module_name, + const std::string& target_triple, + llvm::LLVMContext* ctx); /*! 
* \brief Compile and add function f to the current module. * \param f The function to be added. */ void AddFunction(const LoweredFunc& f); + /*! + * \brief Add main function as the entry name + * \param entry_func_name The name of entry function to be added. + */ + void AddMainFunction(const std::string& entry_func_name); /*! * \brief Finish current pass of codegen, get the module. * \return the created module. @@ -119,6 +127,7 @@ class CodeGenLLVM : public IRVisitor { std::unique_ptr builder_; // The module to be returned; std::unique_ptr module_; + std::unique_ptr data_layout_; // Internal metabuilder std::unique_ptr md_builder_; // llvm context @@ -145,7 +154,7 @@ class CodeGenLLVM : public IRVisitor { llvm::StructType* t_tvm_value_{nullptr}; // tvm api functions llvm::Function* f_tvm_func_call_{nullptr}; - llvm::Function* f_tvm_func_get_global_{nullptr}; + llvm::Function* f_tvm_get_func_from_env_{nullptr}; llvm::Function* f_tvm_api_set_last_error_{nullptr}; // The acting body llvm::BasicBlock* block_{nullptr}; @@ -166,17 +175,23 @@ class CodeGenLLVM : public IRVisitor { llvm::Value* GetConstString(const std::string& str); llvm::Value* CreateBufferPtr(Type t, llvm::Value* buffer, llvm::Value* index); llvm::Value* CreateCast(Type from, Type to, llvm::Value* value); - llvm::Value* GetPackedFuncHandle(const std::string& str, bool global); + llvm::Value* GetPackedFuncHandle(const std::string& str); // Check if the call to packed function is successful // if not directly finalize function and pass on return code. // return the end block after the check llvm::BasicBlock* CheckPackedCallSuccess(llvm::Value* retcode); + // Initialize target + void InitTarget(const std::string& target); + // Add a function to set global module context + void InitGlobalContext(); // add alias information. void AddAliasInfo(llvm::Instruction* load, const Variable* buffer, Expr index); // The definition of local variable. 
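/* [Editorial note] `gv_mod_ctx_`, declared among the members below, is the
 * generated module's back-pointer to its owning runtime ModuleNode. The
 * handshake in this patch has two halves: InitGlobalContext() emits a null
 * void* global named __tvm_module_ctx with link-once linkage, and each loader
 * patches it after loading. Loader-side sketch, mirroring DSOModuleNode::Init
 * and LLVMModuleNode::LazyInitJIT from this patch (`lib_handle` hypothetical):
 *
 *   void** ctx_addr = reinterpret_cast<void**>(
 *       dlsym(lib_handle, runtime::symbol::tvm_module_ctx));
 *   if (ctx_addr != nullptr) {
 *     *ctx_addr = this;  // generated code can now call TVMBackendGetFuncFromEnv
 *   }
 */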
std::unordered_map var_map_; // global strings std::unordered_map str_map_; + // The local module_context + llvm::GlobalVariable* gv_mod_ctx_{nullptr}; // global to packed function handle std::unordered_map func_handle_map_; }; diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc index 7dbcf13906a1..3e0c6e10c5b3 100644 --- a/src/codegen/llvm/llvm_common.cc +++ b/src/codegen/llvm/llvm_common.cc @@ -13,7 +13,7 @@ namespace codegen { struct LLVMEnv { std::mutex mu; - bool native_initialized{false}; + bool all_initialized{false}; static LLVMEnv* Global() { static LLVMEnv inst; @@ -23,17 +23,47 @@ struct LLVMEnv { void InitializeLLVM() { LLVMEnv* e = LLVMEnv::Global(); - if (!e->native_initialized) { + if (!e->all_initialized) { std::lock_guard(e->mu); - if (!e->native_initialized) { - e->native_initialized = true; - llvm::InitializeNativeTarget(); - llvm::InitializeNativeTargetAsmPrinter(); - llvm::InitializeNativeTargetAsmParser(); + if (!e->all_initialized) { + e->all_initialized = true; + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); } } } +std::pair +LLVMGetTarget(const std::string& target_str) { + // setup target triple + std::string target_triple; + CHECK_EQ(target_str.substr(0, 4), "llvm"); + if (target_str.length() > 4) { + target_triple = target_str.substr(5, target_str.length() - 5); + } else { + target_triple = ""; + } + if (target_triple.length() == 0 || + target_triple == "default") { + target_triple = llvm::sys::getDefaultTargetTriple(); + } + + std::string err; + const llvm::Target* target = + llvm::TargetRegistry::lookupTarget(target_triple, err); + CHECK(target) << err << " target_triple=" << target_triple; + std::string cpu = "generic"; + std::string features = ""; + llvm::TargetOptions opt; + auto rmodel = llvm::Reloc::PIC_; + llvm::TargetMachine* tm = + target->createTargetMachine(target_triple, cpu, features, opt, rmodel); + return {tm, target_triple}; +} + } // namespace codegen } // namespace tvm #endif // TVM_LLVM_VERSION diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h index e10e8430e14c..7a24799974a7 100644 --- a/src/codegen/llvm/llvm_common.h +++ b/src/codegen/llvm/llvm_common.h @@ -9,6 +9,8 @@ #include +#include + #include #include #include @@ -27,15 +29,16 @@ #include #include +#include #include +#include #include +#include +#include + +#include +#include -extern "C" { -// Function signature for LLVM generated packed function. -typedef int (*LLVMPackedCFunc)(void* args, - int* type_codes, - int num_args); -} // extern "C" namespace tvm { namespace codegen { @@ -46,6 +49,14 @@ namespace codegen { */ void InitializeLLVM(); +/*! + * \brief Get target machine from target_str string. + * \param target_str Target triple string, can have llvm- prefix, can be empty. + * \return Pair of target machine and target triple. + */ +std::pair +LLVMGetTarget(const std::string& target_str); + } // namespace codegen } // namespace tvm #endif // TVM_LLVM_VERSION diff --git a/src/codegen/llvm/llvm_exec_engine.cc b/src/codegen/llvm/llvm_exec_engine.cc deleted file mode 100644 index b31505253900..000000000000 --- a/src/codegen/llvm/llvm_exec_engine.cc +++ /dev/null @@ -1,77 +0,0 @@ -/*! 
- * Copyright (c) 2017 by Contributors - * \file llvm_exec_engine.cc - */ -#include -#include -#include "./llvm_common.h" -#include "./codegen_llvm.h" - -namespace tvm { -namespace codegen { - -using runtime::TVMArgs; -using runtime::TVMRetValue; -using runtime::PackedFunc; - -#ifdef TVM_LLVM_VERSION -// Environment to keep jit resources alive. -struct LLVMJITEnv { - std::shared_ptr ctx; - llvm::ExecutionEngine* ee{nullptr}; - // constructor - LLVMJITEnv(std::shared_ptr ctx, - llvm::ExecutionEngine* ee) - : ctx(ctx), ee(ee) { - } - // destructor - ~LLVMJITEnv() { - if (ee != nullptr) { - ee->runStaticConstructorsDestructors(true); - delete ee; - } - } -}; - - -PackedFunc JITCompile(std::unique_ptr module, - std::shared_ptr ctx, - const std::string& func_name) { - llvm::EngineBuilder builder(std::move(module)); - builder.setEngineKind(llvm::EngineKind::JIT); - builder.setOptLevel(llvm::CodeGenOpt::Aggressive); - std::shared_ptr env = std::make_shared( - ctx, builder.create()); - CHECK(env->ee != nullptr); - auto* faddr = reinterpret_cast( - env->ee->getFunctionAddress(func_name)); - env->ee->runStaticConstructorsDestructors(false); - return PackedFunc([env, faddr](TVMArgs args, TVMRetValue* rv) { - int ret = (*faddr)( - (void*)args.values, // NOLINT(*) - (int*)args.type_codes, // NOLINT(*) - args.num_args); - CHECK(ret == 0) << TVMGetLastError(); - }); -} - -PackedFunc BuildLLVM(LoweredFunc func) { - InitializeLLVM(); - // use one context per function. - std::shared_ptr ctx = - std::make_shared(); - CodeGenLLVM cg; - cg.Init(func->name, ctx.get()); - cg.AddFunction(func); - std::unique_ptr m = cg.Finish(); - return JITCompile(std::move(m), ctx, func->name); -} - -#else -PackedFunc BuildLLVM(LoweredFunc func) { - LOG(FATAL) << "LLVM is not enabled"; - return PackedFunc(); -} -#endif // TVM_LLVM_VERSION -} // namespace codegen -} // namespace tvm diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc new file mode 100644 index 000000000000..74abbd7318cd --- /dev/null +++ b/src/codegen/llvm/llvm_module.cc @@ -0,0 +1,160 @@ +/*! 
+ * Copyright (c) 2017 by Contributors + * \file llvm_module.cc + * \brief LLVM runtime module for TVM + */ +#ifdef TVM_LLVM_VERSION +#include +#include +#include "./llvm_common.h" +#include "./codegen_llvm.h" +#include "../../runtime/file_util.h" +#include "../../runtime/meta_data.h" + +namespace tvm { +namespace codegen { + +using runtime::TVMArgs; +using runtime::TVMRetValue; +using runtime::PackedFunc; + +class LLVMModuleNode : public runtime::ModuleNode { + public: + ~LLVMModuleNode() { + module_.reset(); + if (ee_ != nullptr) { + ee_->runStaticConstructorsDestructors(true); + delete ee_; + } + } + + const char* type_key() const { + return "llvm"; + } + + void PreCompile(const std::string& name, TVMContext ctx) final { + if (ee_ == nullptr) LazyInitJIT(); + std::lock_guard lock(mutex_); + BackendPackedCFunc faddr = + reinterpret_cast(ee_->getFunctionAddress(name)); + CHECK(faddr != nullptr) + << "Failed to Precompile function " << name; + } + + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final { + if (ee_ == nullptr) LazyInitJIT(); + std::lock_guard lock(mutex_); + BackendPackedCFunc faddr = + reinterpret_cast(ee_->getFunctionAddress(name)); + if (faddr == nullptr) return PackedFunc(); + return PackedFunc([faddr, sptr_to_self](TVMArgs args, TVMRetValue* rv) { + int ret = (*faddr)( + (void*)args.values, // NOLINT(*) + (int*)args.type_codes, // NOLINT(*) + args.num_args); + CHECK_EQ(ret, 0) << TVMGetLastError(); + }); + } + + void SaveToFile(const std::string& file_name, + const std::string& format) final { + std::string fmt = runtime::GetFileFormat(file_name, format); + std::error_code ecode; + llvm::raw_fd_ostream dest(file_name, ecode, llvm::sys::fs::F_None); + CHECK_EQ(ecode.value(), 0) << "Cannot open file: " << file_name + << " " << ecode.message(); + if (fmt == "o" || fmt == "obj") { + llvm::legacy::PassManager pass; + CHECK(tm_); + CHECK(tm_->addPassesToEmitFile( + pass, dest, llvm::TargetMachine::CGFT_ObjectFile) == 0) + << "Cannot emit target CGFT_ObjectFile"; + pass.run(*mptr_); + } else if (fmt == "ll") { + mptr_->print(dest, nullptr); + } else if (fmt == "bc") { + llvm::WriteBitcodeToFile(mptr_, dest); + } else { + LOG(FATAL) << "Do not know how to save file " + << file_name << " with format=\'"<< format << "\'"; + } + dest.close(); + } + + std::string GetSource(const std::string& format) final { + std::string type_str; + llvm::raw_string_ostream rso(type_str); + CHECK(mptr_ != nullptr); + mptr_->print(rso, nullptr); + return rso.str(); + } + + void Init(const Array& funcs, std::string target) { + InitializeLLVM(); + std::tie(tm_, target_triple_) = LLVMGetTarget(target); + CHECK_NE(funcs.size(), 0U); + ctx_ = std::make_shared(); + CodeGenLLVM cg; + cg.Init(funcs[0]->name, target, ctx_.get()); + for (LoweredFunc f : funcs) { + cg.AddFunction(f); + } + cg.AddMainFunction(funcs[0]->name); + module_ = cg.Finish(); + mptr_ = module_.get(); + } + + private: + void LazyInitJIT() { + CHECK(ee_ == nullptr); + std::lock_guard lock(mutex_); + std::string target_triple = mptr_->getTargetTriple(); + llvm::EngineBuilder builder(std::move(module_)); + builder.setEngineKind(llvm::EngineKind::JIT); + builder.setOptLevel(llvm::CodeGenOpt::Aggressive); + llvm::TargetMachine *tm = builder.selectTarget(); + llvm::DataLayout layout(tm->createDataLayout()); + CHECK(layout == mptr_->getDataLayout()) + << "Data layout mismatch between module(" + << mptr_->getDataLayout().getStringRepresentation() << ")" + << " and ExecutionEngine (" + << 
        layout.getStringRepresentation() << ")";
+    ee_ = builder.create(tm);
+    CHECK(ee_ != nullptr)
+        << "Failed to initialize JIT engine for " << target_triple;
+    ee_->runStaticConstructorsDestructors(false);
+    // setup context address.
+    void** ctx_addr =
+        reinterpret_cast(
+            ee_->getGlobalValueAddress(runtime::symbol::tvm_module_ctx));
+    if (ctx_addr != nullptr) {
+      *ctx_addr = this;
+    }
+  }
+  // The target configuration string
+  std::string target_triple_;
+  // JIT lock
+  std::mutex mutex_;
+  // execution engine
+  llvm::ExecutionEngine *ee_{nullptr};
+  // The raw pointer to the module.
+  llvm::Module* mptr_{nullptr};
+  // The target machine
+  llvm::TargetMachine* tm_{nullptr};
+  // The module, can be moved to ee if JIT is enabled.
+  std::unique_ptr module_;
+  // the context.
+  std::shared_ptr ctx_;
+};
+
+TVM_REGISTER_API(_codegen_build_llvm)
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    std::shared_ptr n = std::make_shared();
+    n->Init(args[0], args[1]);
+    *rv = runtime::Module(n);
+  });
+}  // namespace codegen
+}  // namespace tvm
+#endif  // TVM_LLVM_VERSION
diff --git a/src/codegen/codegen_stack_vm.cc b/src/codegen/stack_vm/codegen_stack_vm.cc
similarity index 90%
rename from src/codegen/codegen_stack_vm.cc
rename to src/codegen/stack_vm/codegen_stack_vm.cc
index ad979f3b328e..e905d7c192d7 100644
--- a/src/codegen/codegen_stack_vm.cc
+++ b/src/codegen/stack_vm/codegen_stack_vm.cc
@@ -2,6 +2,7 @@
  * Copyright (c) 2017 by Contributors
  * \file codegen_stack_vm.cc
  */
+#include
 #include
 #include
 #include "./codegen_stack_vm.h"
@@ -11,34 +12,16 @@ namespace codegen {

 using namespace ir;

-PackedFunc BuildStackVM(
-    LoweredFunc func,
-    const std::unordered_map& device_funcs) {
-  StackVM vm = codegen::CodeGenStackVM().Compile(func, device_funcs);
-  auto f = [vm](TVMArgs args, TVMRetValue* rv) {
-    vm(args);
-  };
-  return PackedFunc(f);
-}
-
 CodeGenStackVM::FType& CodeGenStackVM::vtable() {  // NOLINT(*)
   static FType inst;
   return inst;
 }

-StackVM CodeGenStackVM::Compile(
-    LoweredFunc f,
-    const std::unordered_map& device_funcs) {
+StackVM CodeGenStackVM::Compile(LoweredFunc f) {
   for (size_t i = 0; i < f->args.size(); ++i) {
     Var v = f->args[i];
     int vid = AllocVarID(v.get());
     CHECK_EQ(static_cast(vid), i);
   }
-  // setup device function map
-  for (const auto& kv : device_funcs) {
-    int fid = static_cast(vm_.packed_func.size());
-    vm_.packed_func.push_back(kv.second);
-    device_fun_idmap_[kv.first->name] = fid;
-  }
   this->Push(f->body);
   return std::move(vm_);
 }
@@ -194,7 +177,7 @@ void CodeGenStackVM::Push_(const ir::Call* op) {
       case intrinsic::kTypeLanes: PushOp(StackVM::TVM_ARRAY_GET_TYPE_LANES); break;
       default: LOG(FATAL) << "unknown field code";
     }
-  } else if (op->is_intrinsic(intrinsic::tvm_call_global)) {
+  } else if (op->is_intrinsic(intrinsic::tvm_call_packed)) {
     CHECK_GE(op->args.size(), 1U);
     const StringImm* s = op->args[0].as();
     CHECK(s != nullptr) << "tvm_call_packed expects first argument as function name";
@@ -203,15 +186,14 @@ void CodeGenStackVM::Push_(const ir::Call* op) {
   }
   // find the function id.
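/* [Editorial note] This is the key behavioral change for the stack VM: the old
 * code resolved the callee at codegen time via PackedFunc::GetGlobal and baked
 * the PackedFunc into vm_.packed_func. Now only the *name* is recorded, and
 * resolution happens lazily at run time through the module context, condensed
 * from StackVM::GetExtern later in this patch:
 *
 *   const PackedFunc& StackVM::GetExtern(State* s, int fid) const {
 *     PackedFunc& f = s->extern_func[fid];   // per-run cache slot
 *     if (f == nullptr) {
 *       f = *mod_ctx->GetFuncFromEnv(extern_func_name[fid]);
 *     }
 *     return f;
 *   }
 *
 * The same bytecode can therefore run against whatever modules are imported.
 */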
const std::string& func_name = s->value; - auto it = global_fun_idmap_.find(func_name); + auto it = extern_fun_idmap_.find(func_name); int fid; - if (it != global_fun_idmap_.end()) { + if (it != extern_fun_idmap_.end()) { fid = it->second; } else { - fid = static_cast(vm_.packed_func.size()); - PackedFunc f = PackedFunc::GetGlobal(func_name); - vm_.packed_func.push_back(f); - global_fun_idmap_[func_name] = fid; + fid = static_cast(vm_.extern_func_name.size()); + vm_.extern_func_name.push_back(func_name); + extern_fun_idmap_[func_name] = fid; } // get the argument type code. std::vector arg_type_codes; @@ -228,21 +210,6 @@ void CodeGenStackVM::Push_(const ir::Call* op) { this->Push(op->args[0]); this->PushOp(StackVM::PUSH_I64, 0); this->PushOp(StackVM::EQ_I64); - } else if (op->is_intrinsic(intrinsic::tvm_call_device)) { - std::string func_name = op->args[0].as()->value; - auto it = device_fun_idmap_.find(func_name); - CHECK(it != device_fun_idmap_.end()) - << "Cannot find device function " << func_name; - const int fid = it->second; - std::vector arg_type_codes; - for (size_t i = 1; i < op->args.size(); ++i) { - this->Push(op->args[i]); - Type t = op->args[i].type(); - int lanes = t.lanes(); - CHECK_EQ(lanes, 1); - arg_type_codes.push_back(t.code()); - } - this->PushCallPacked(fid, arg_type_codes); } else { this->HandleUnknownCall(op); } diff --git a/src/codegen/codegen_stack_vm.h b/src/codegen/stack_vm/codegen_stack_vm.h similarity index 87% rename from src/codegen/codegen_stack_vm.h rename to src/codegen/stack_vm/codegen_stack_vm.h index bf640d0b5ef6..11919bd40955 100644 --- a/src/codegen/codegen_stack_vm.h +++ b/src/codegen/stack_vm/codegen_stack_vm.h @@ -3,8 +3,8 @@ * \file codegen_stack_vm.h * \brief Codegen into Simple Stack VM. */ -#ifndef TVM_CODEGEN_CODEGEN_STACK_VM_H_ -#define TVM_CODEGEN_CODEGEN_STACK_VM_H_ +#ifndef TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_ +#define TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_ #include #include @@ -13,13 +13,11 @@ #include #include -#include "../runtime/stack_vm/stack_vm.h" +#include "./stack_vm.h" namespace tvm { namespace codegen { -using runtime::StackVM; - /*! * \brief A base class to generate a stack VM. * This module is used to generate host wrapper @@ -34,9 +32,7 @@ class CodeGenStackVM { * \note Only call compile once, * create a new codegen object each time. */ - StackVM Compile( - LoweredFunc f, - const std::unordered_map& device_funcs); + StackVM Compile(LoweredFunc f); /*! \brief Push stmt to generate new code */ void Push(const Stmt& n); /*! \brief Push expr to generate new code */ @@ -108,11 +104,9 @@ class CodeGenStackVM { /*! \brief id of each string */ std::unordered_map str_idmap_; /*! \brief id of each global function */ - std::unordered_map global_fun_idmap_; - /*! 
\brief id of device function */
-  std::unordered_map device_fun_idmap_;
+  std::unordered_map extern_fun_idmap_;
 };
 }  // namespace codegen
 }  // namespace tvm
-#endif  // TVM_CODEGEN_CODEGEN_STACK_VM_H_
+#endif  // TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
diff --git a/src/runtime/stack_vm/stack_vm.cc b/src/codegen/stack_vm/stack_vm.cc
similarity index 96%
rename from src/runtime/stack_vm/stack_vm.cc
rename to src/codegen/stack_vm/stack_vm.cc
index f631dab0130c..9df2afe8487c 100644
--- a/src/runtime/stack_vm/stack_vm.cc
+++ b/src/codegen/stack_vm/stack_vm.cc
@@ -7,7 +7,7 @@
 #include "./stack_vm.h"

 namespace tvm {
-namespace runtime {
+namespace codegen {

 typedef dmlc::ThreadLocalStore StackVMStateStore;

@@ -182,6 +182,7 @@ void StackVM::operator()(const runtime::TVMArgs& args) const {
   if (s->heap.size() < this->heap_size) {
     s->heap.resize(this->heap_size);
   }
+
   s->heap[0].v_handle = (void*)args.values;  // NOLINT(*)
   s->heap[1].v_handle = (void*)args.type_codes;  // NOLINT(*)
   s->heap[2].v_int64 = args.num_args;
@@ -193,7 +194,8 @@ void StackVM::Run(State* s) const {
   int64_t pc = s->pc;
   std::vector& stack = s->stack;
   std::vector& heap = s->heap;
-
+  s->extern_func.clear();
+  s->extern_func.resize(extern_func_name.size());
   if (stack.size() < stack_size) {
     stack.resize(stack_size);
   }
@@ -287,7 +289,7 @@ void StackVM::Run(State* s) const {
             alignof(Code) == alignof(int), "assumption");
         const int* type_codes = &(code[pc].v_int) + 3;
         runtime::TVMRetValue rv;
-        packed_func[call_fid].CallPacked(
+        GetExtern(s, call_fid).CallPacked(
             runtime::TVMArgs(&stack[sp + 1 - num_args], type_codes, num_args), &rv);
         sp = sp + 1 - num_args;
         stack[sp] = rv.value();
@@ -364,5 +366,18 @@ void StackVM::Run(State* s) const {
   }
 }

-}  // namespace runtime
+const PackedFunc& StackVM::GetExtern(State* s, int fid) const {
+  PackedFunc& f = s->extern_func[fid];
+  if (f == nullptr) {
+    CHECK(mod_ctx != nullptr)
+        << "No local context is set in stackvm";
+    const PackedFunc* pf = mod_ctx->GetFuncFromEnv(extern_func_name[fid]);
+    CHECK(pf != nullptr);
+    f = *pf;
+    CHECK(f != nullptr);
+  }
+  return f;
+}
+
+}  // namespace codegen
 }  // namespace tvm
diff --git a/src/runtime/stack_vm/stack_vm.h b/src/codegen/stack_vm/stack_vm.h
similarity index 90%
rename from src/runtime/stack_vm/stack_vm.h
rename to src/codegen/stack_vm/stack_vm.h
index d45f857eedc9..982076ce2675 100644
--- a/src/runtime/stack_vm/stack_vm.h
+++ b/src/codegen/stack_vm/stack_vm.h
@@ -7,17 +7,20 @@
  *  to set up calls into device functions
  *  when only runtime compilation for device is available (via NVRTC or OpenCL).
  */
-#ifndef TVM_RUNTIME_STACK_VM_STACK_VM_H_
-#define TVM_RUNTIME_STACK_VM_STACK_VM_H_
+#ifndef TVM_CODEGEN_STACK_VM_STACK_VM_H_
+#define TVM_CODEGEN_STACK_VM_STACK_VM_H_

 #include
 #include
+#include
+#include
 #include
 #include

 namespace tvm {
-namespace runtime {
+namespace codegen {
+
+using runtime::operator<<;
 /*!
  * \brief A simple stack-based virtual machine.
  */
@@ -209,11 +212,19 @@ class StackVM {
     std::vector stack;
     /*! \brief The global heap space */
     std::vector heap;
+    /*! \brief extern functions */
+    std::vector extern_func;
     /*! \brief stack pointer */
     int64_t sp{0};
     /*! \brief program counter */
     int64_t pc{0};
   };
+  /*! \brief The external function entries. */
+  struct ExternFuncEntry {
+    std::string name;
+    runtime::PackedFunc func;
+  };
+
   /*! \brief execute the stack vm with given state */
   void Run(State* state) const;
   /*!
@@ -229,9 +240,11 @@ class StackVM {
   std::vector code;
   /*! \brief constant error messages */
   std::vector str_data;
-  /*!
\brief Extern functions in packed func format */
-  std::vector packed_func;
-  /*! \brief name of each heap id*/
+  /*! \brief The current module context of stackvm */
+  runtime::ModuleNode* mod_ctx{nullptr};
+  /*! \brief Extern functions */
+  std::vector extern_func_name;
+  /*! \brief name of each heap id */
   std::vector heap_id_name;
   /*! \brief The memory size needed */
   size_t heap_size{0};
@@ -296,8 +309,12 @@ class StackVM {
     return ADDR_LOAD_FP64;
   }
   friend std::ostream& operator<<(std::ostream& os, const StackVM& vm);  // NOLINT(*)
+
+ private:
+  // get extern function.
+  const PackedFunc& GetExtern(State* s, int fid) const;
 };

-}  // namespace runtime
+}  // namespace codegen
 }  // namespace tvm
-#endif  // TVM_RUNTIME_STACK_VM_STACK_VM_H_
+#endif  // TVM_CODEGEN_STACK_VM_STACK_VM_H_
diff --git a/src/codegen/stack_vm/stack_vm_module.cc b/src/codegen/stack_vm/stack_vm_module.cc
new file mode 100644
index 000000000000..f2a361bf0693
--- /dev/null
+++ b/src/codegen/stack_vm/stack_vm_module.cc
@@ -0,0 +1,78 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file stack_vm_module.cc
+ */
+#include
+#include
+#include
+#include "./codegen_stack_vm.h"
+
+namespace tvm {
+namespace codegen {
+
+class StackVMModuleNode : public runtime::ModuleNode {
+ public:
+  const char* type_key() const {
+    return "stackvm";
+  }
+
+  void PreCompile(const std::string& name, TVMContext ctx) final {}
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr& sptr_to_self) final {
+    if (name == runtime::symbol::tvm_module_main) {
+      return GetFunction(entry_func_, sptr_to_self);
+    }
+    auto it = fmap_.find(name);
+    if (it == fmap_.end()) return PackedFunc();
+    const StackVM& vm = it->second;
+    // capture sptr_to_self to keep module node alive.
+    return PackedFunc([vm, sptr_to_self](TVMArgs args, TVMRetValue* rv) {
+        vm(args);
+      });
+  }
+
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final {
+    LOG(FATAL) << "StackVM does not support SaveToFile";
+  }
+
+  std::string GetSource(const std::string& format) final {
+    std::ostringstream os;
+    for (const auto& kv : fmap_) {
+      os << "Function: " << kv.first << '\n';
+      os << kv.second;
+    }
+    return os.str();
+  }
+
+  static runtime::Module Build(const Array& funcs) {
+    CHECK_NE(funcs.size(), 0U);
+    std::shared_ptr n =
+        std::make_shared();
+    for (LoweredFunc f : funcs) {
+      StackVM vm = codegen::CodeGenStackVM().Compile(f);
+      CHECK(!n->fmap_.count(f->name))
+          << "Function name " << f->name << " already exists in the list";
+      vm.mod_ctx = n.get();
+      n->fmap_[f->name] = std::move(vm);
+    }
+    n->entry_func_ = funcs[0]->name;
+    return runtime::Module(n);
+  }
+
+ private:
+  // entry function.
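/* [Editorial note] `entry_func_` (next member) backs the reserved entry name:
 * GetFunction() above redirects runtime::symbol::tvm_module_main to the first
 * lowered function, mirroring CodeGenLLVM::AddMainFunction on the LLVM path.
 * Callers can thus always reach a module's entry point by one fixed name:
 *
 *   runtime::PackedFunc entry =
 *       mod.GetFunction(runtime::symbol::tvm_module_main, false);
 */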
+ std::string entry_func_; + // internal function map + std::unordered_map fmap_; +}; + +TVM_REGISTER_API(_codegen_build_stackvm) +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = StackVMModuleNode::Build(args[0]); + }); + +} // namespace codegen +} // namespace tvm diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc index de35b54cc75d..a9bff236b163 100644 --- a/src/pass/make_api.cc +++ b/src/pass/make_api.cc @@ -35,9 +35,12 @@ inline Stmt MakeAssertEQ(Expr lhs, Expr rhs, std::string msg) { LoweredFunc MakeAPI(Stmt body, std::string name, Array api_args, - int num_packed_args) { + int num_unpacked_args) { const Type tvm_index_type = UInt(32); const Stmt nop = Evaluate::make(0); + int num_args = static_cast(api_args.size()); + CHECK_LE(num_unpacked_args, num_args); + int num_packed_args = num_args - num_unpacked_args; // Data field definitions // The packed fields Var v_packed_args("args", Handle()); @@ -182,6 +185,7 @@ LoweredFunc MakeAPI(Stmt body, n->name = name; n->args = args; n->handle_data_type = handle_data_type; + n->is_packed_func = num_unpacked_args == 0; n->body = MergeNest({seq_init, seq_check}, body); LoweredFunc f(n); Array undefined = UndefinedVars(f); diff --git a/src/pass/split_host_device.cc b/src/pass/split_host_device.cc index 0a26493803bc..c832b726ffce 100644 --- a/src/pass/split_host_device.cc +++ b/src/pass/split_host_device.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace tvm { @@ -154,6 +155,19 @@ class HostDeviceSplitter : public IRMutator { std::make_shared(*f.operator->()); n->body = this->Mutate(f->body); + if (f->is_packed_func && device_funcs_.size() != 0) { + // insert auto set device from device function. + Array args = {StringImm::make(runtime::symbol::tvm_entry_setdevice)}; + for (Var arg : f->args) { + args.push_back(arg); + } + n->body = Block::make( + Evaluate::make(Call::make( + Int(32), intrinsic::tvm_call_packed, + args, Call::Intrinsic)), + n->body); + } + Array ret{LoweredFunc(n)}; for (LoweredFunc x : device_funcs_) { ret.push_back(x); @@ -194,7 +208,7 @@ class HostDeviceSplitter : public IRMutator { } device_funcs_.emplace_back(f_device); return Evaluate::make(Call::make( - Int(32), intrinsic::tvm_call_device, + Int(32), intrinsic::tvm_call_packed, call_args, Call::Intrinsic)); } diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index aea44946a4fc..925d1243da7a 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include "./runtime_base.h" @@ -69,10 +70,136 @@ using namespace tvm::runtime; struct TVMRuntimeEntry { std::string ret_str; + std::string last_error; }; typedef dmlc::ThreadLocalStore TVMAPIRuntimeStore; +const char *TVMGetLastError() { + return TVMAPIRuntimeStore::Get()->last_error.c_str(); +} + +void TVMAPISetLastError(const char* msg) { + TVMAPIRuntimeStore::Get()->last_error = msg; +} + +int TVMModLoadFromFile(const char* file_name, + const char* format, + TVMModuleHandle* out) { + API_BEGIN(); + Module m = Module::LoadFromFile(file_name, format); + *out = new Module(m); + API_END(); +} + +int TVMModImport(TVMModuleHandle mod, + TVMModuleHandle dep) { + API_BEGIN(); + static_cast(mod)->Import( + *static_cast(dep)); + API_END(); +} + +int TVMModGetFunction(TVMModuleHandle mod, + const char* func_name, + int query_imports, + TVMFunctionHandle *func) { + API_BEGIN(); + PackedFunc pf = static_cast(mod)->GetFunction( + func_name, query_imports); + if (pf != nullptr) { + *func = new 
PackedFunc(pf); + } else { + *func = nullptr; + } + API_END(); +} + +int TVMModPreCompile(TVMModuleHandle mod, + const char* func_name, + TVMContext ctx) { + API_BEGIN(); + (*static_cast(mod))->PreCompile(func_name, ctx); + API_END(); +} + +int TVMBackendGetFuncFromEnv(void* mod_node, + const char* func_name, + TVMFunctionHandle *func) { + API_BEGIN(); + *func = (TVMFunctionHandle)( + static_cast(mod_node)->GetFuncFromEnv(func_name)); + API_END(); +} + +int TVMModFree(TVMModuleHandle mod) { + API_BEGIN(); + delete static_cast(mod); + API_END(); +} + +int TVMFuncFree(TVMFunctionHandle func) { + API_BEGIN(); + delete static_cast(func); + API_END(); +} + +int TVMFuncCall(TVMFunctionHandle func, + TVMValue* args, + int* arg_type_codes, + int num_args, + TVMValue* ret_val, + int* ret_type_code) { + API_BEGIN(); + TVMRetValue rv; + (*static_cast(func)).CallPacked( + TVMArgs(args, arg_type_codes, num_args), &rv); + // handle return string. + if (rv.type_code() == kStr || + rv.type_code() == kTVMType) { + TVMRuntimeEntry* e = TVMAPIRuntimeStore::Get(); + e->ret_str = rv.operator std::string(); + *ret_type_code = kStr; + ret_val->v_str = e->ret_str.c_str(); + } else { + rv.MoveToCHost(ret_val, ret_type_code); + } + API_END(); +} + +int TVMCFuncSetReturn(TVMRetValueHandle ret, + TVMValue value, + int type_code) { + API_BEGIN(); + TVMRetValue* rv = static_cast(ret); + *rv = TVMArgValue(value, type_code); + API_END(); +} + +int TVMFuncCreateFromCFunc(TVMPackedCFunc func, + void* resource_handle, + TVMPackedCFuncFinalizer fin, + TVMFunctionHandle *out) { + API_BEGIN(); + if (fin == nullptr) { + *out = new PackedFunc( + [func, resource_handle](TVMArgs args, TVMRetValue* rv) { + func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) + args.num_args, rv, resource_handle); + }); + } else { + // wrap it in a shared_ptr, with fin as deleter. + // so fin will be called when the lambda went out of scope. + std::shared_ptr rpack(resource_handle, fin); + *out = new PackedFunc( + [func, rpack](TVMArgs args, TVMRetValue* rv) { + func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) + args.num_args, rv, rpack.get()); + }); + } + API_END(); +} + int TVMDeviceInit(int dev_mask, const char** option_keys, const char** option_vals, @@ -175,65 +302,3 @@ int TVMSynchronize(TVMContext ctx, TVMStreamHandle stream) { }); API_END(); } - -int TVMFuncFree(TVMFunctionHandle func) { - API_BEGIN(); - delete static_cast(func); - API_END(); -} - -int TVMFuncCall(TVMFunctionHandle func, - TVMValue* args, - int* arg_type_codes, - int num_args, - TVMValue* ret_val, - int* ret_type_code) { - API_BEGIN(); - TVMRetValue rv; - (*static_cast(func)).CallPacked( - TVMArgs(args, arg_type_codes, num_args), &rv); - // handle return string. 
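/* [Editorial example] End-to-end sketch of the module C API assembled above
 * (the remainder of this hunk only deletes the old copies of
 * TVMFuncFree/TVMFuncCall that moved up). Everything called here is introduced
 * or kept by this patch; "mylib.so", "myadd", and the use of kInt from the
 * runtime's type-code enum are hypothetical:
 *
 *   #include <tvm/runtime/c_runtime_api.h>
 *   #include <stdio.h>
 *
 *   int RunExported(void) {
 *     TVMModuleHandle mod;
 *     TVMFunctionHandle fn;
 *     TVMValue args[2], ret;
 *     int type_codes[2], ret_type_code;
 *     if (TVMModLoadFromFile("mylib.so", "so", &mod) != 0) return -1;
 *     if (TVMModGetFunction(mod, "myadd", 0, &fn) != 0 || fn == NULL) return -1;
 *     args[0].v_int64 = 1; type_codes[0] = kInt;
 *     args[1].v_int64 = 2; type_codes[1] = kInt;
 *     if (TVMFuncCall(fn, args, type_codes, 2, &ret, &ret_type_code) != 0) {
 *       fprintf(stderr, "%s\n", TVMGetLastError());
 *       return -1;
 *     }
 *     TVMFuncFree(fn);
 *     TVMModFree(mod);
 *     return 0;
 *   }
 */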
- if (rv.type_code() == kStr || - rv.type_code() == kTVMType) { - TVMRuntimeEntry* e = TVMAPIRuntimeStore::Get(); - e->ret_str = rv.operator std::string(); - *ret_type_code = kStr; - ret_val->v_str = e->ret_str.c_str(); - } else { - rv.MoveToCHost(ret_val, ret_type_code); - } - API_END(); -} - -int TVMCFuncSetReturn(TVMRetValueHandle ret, - TVMValue value, - int type_code) { - API_BEGIN(); - TVMRetValue* rv = static_cast(ret); - *rv = TVMArgValue(value, type_code); - API_END(); -} - -int TVMFuncCreateFromCFunc(TVMPackedCFunc func, - void* resource_handle, - TVMPackedCFuncFinalizer fin, - TVMFunctionHandle *out) { - API_BEGIN(); - if (fin == nullptr) { - *out = new PackedFunc( - [func, resource_handle](TVMArgs args, TVMRetValue* rv) { - func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) - args.num_args, rv, resource_handle); - }); - } else { - // wrap it in a shared_ptr, with fin as deleter. - // so fin will be called when the lambda went out of scope. - std::shared_ptr rpack(resource_handle, fin); - *out = new PackedFunc( - [func, rpack](TVMArgs args, TVMRetValue* rv) { - func((TVMValue*)args.values, (int*)args.type_codes, // NOLINT(*) - args.num_args, rv, rpack.get()); - }); - } - API_END(); -} diff --git a/src/runtime/cuda/cuda_common.h b/src/runtime/cuda/cuda_common.h index b735064e01c4..dd7e0a14d33d 100644 --- a/src/runtime/cuda/cuda_common.h +++ b/src/runtime/cuda/cuda_common.h @@ -41,29 +41,6 @@ namespace runtime { * \return The PTX code. */ std::string NVRTCCompile(const std::string& code); - -/*! - * \brief Automatically detect and set cuda device. - * \param args The arguments. - */ -inline void AutoSetCUDADevice(const TVMArgs& args) { - int dev_id = -1; - for (int i = 0; i < args.size(); ++i) { - if (args.type_codes[i] == kArrayHandle) { - TVMContext ctx = static_cast( - args.values[i].v_handle)->ctx; - CHECK_EQ(ctx.dev_mask, kGPU) - << "All operands need to be GPU"; - if (dev_id == -1) { - dev_id = ctx.dev_id; - } else { - CHECK_EQ(dev_id, ctx.dev_id) - << "Operands comes from different devices "; - } - } - } - CUDA_CALL(cudaSetDevice(dev_id)); -} } // namespace runtime } // namespace tvm #endif // TVM_CUDA_RUNTIME diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index 962254dd76d0..8b3a79c5a75d 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -5,6 +5,7 @@ #include "./cuda_module.h" #if TVM_CUDA_RUNTIME +#include #include #include #include @@ -14,20 +15,64 @@ #include "./cuda_common.h" #include "../void_addr_args.h" #include "../thread_storage_scope.h" +#include "../meta_data.h" +#include "../file_util.h" namespace tvm { namespace runtime { -/*! - * \brief Internal data structure to support multi-gpu execution. - * Try to use CUDA runtime's primary context. - */ -class CUDAModule::Internal { +// Module to support thread-safe multi-GPU execution. 
+// cuModule is a per-GPU module +// The runtime will contain a per-device module table +// The modules will be lazily loaded +class CUDAModuleNode : public runtime::ModuleNode { public: - explicit Internal(std::string data) - : data_(data) { + explicit CUDAModuleNode(std::string data, + std::string fmt, + std::unordered_map fmap, + std::string cuda_source) + : data_(data), fmt_(fmt), fmap_(fmap), cuda_source_(cuda_source) { std::fill(module_.begin(), module_.end(), nullptr); } + // destructor + ~CUDAModuleNode() { + for (size_t i = 0; i < module_.size(); ++i) { + if (module_[i] != nullptr) { + CUDA_CALL(cudaSetDevice(i)); + CUDA_DRIVER_CALL(cuModuleUnload(module_[i])); + } + } + } + + const char* type_key() const final { + return "cuda"; + } + + void PreCompile(const std::string& name, TVMContext ctx) final { + CUDA_CALL(cudaSetDevice(ctx.dev_id)); + cudaFree(nullptr); + this->GetFunc(ctx.dev_id, name); + } + + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final; + + void SaveToFile(const std::string& file_name, + const std::string& format) final { + LOG(FATAL) << "Not implemented"; + } + + std::string GetSource(const std::string& format) final { + if (format == fmt_) return data_; + if (cuda_source_.length() != 0) { + return cuda_source_; + } else { + if (fmt_ == "ptx") return data_; + return ""; + } + } + // get a CUfunction from primary context in dev_id CUfunction GetFunc(int dev_id, const std::string& func_name) { std::lock_guard lock(mutex_); @@ -46,21 +91,18 @@ class CUDAModule::Internal { } return func; } - // destructor - ~Internal() { - for (size_t i = 0; i < module_.size(); ++i) { - if (module_[i] != nullptr) { - CUDA_CALL(cudaSetDevice(i)); - CUDA_DRIVER_CALL(cuModuleUnload(module_[i])); - } - } - } private: // the binary data std::string data_; + // The format + std::string fmt_; + // function information table. + std::unordered_map fmap_; + // The cuda source. + std::string cuda_source_; // the internal modules per GPU, to be lazily initialized. - std::array module_; + std::array module_; // internal mutex when updating the module std::mutex mutex_; }; @@ -69,11 +111,13 @@ class CUDAModule::Internal { class CUDAWrappedFunc { public: // initialize the CUDA function. - void Init(std::shared_ptr m, + void Init(CUDAModuleNode* m, + std::shared_ptr sptr, const std::string& func_name, size_t num_void_args, - const std::vector& thread_axis_tags) { + const std::vector& thread_axis_tags) { m_ = m; + sptr_ = sptr; func_name_ = func_name; std::fill(fcache_.begin(), fcache_.end(), nullptr); thread_axis_cfg_.Init(num_void_args, thread_axis_tags); @@ -101,32 +145,87 @@ class CUDAWrappedFunc { private: // internal module - std::shared_ptr m_; + CUDAModuleNode* m_; + // the resource holder + std::shared_ptr sptr_; // The name of the function. std::string func_name_; // Device function cache per device. 
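/* [Editorial note] CUDAWrappedFunc layers two kinds of laziness: `sptr_` pins
 * the owning CUDAModuleNode, so a live function handle keeps the module and
 * its cubin loaded, while `fcache_` (documented next) caches one CUfunction
 * per device, filled on first launch. The per-launch fast path is roughly:
 *
 *   if (fcache_[dev_id] == nullptr) {
 *     fcache_[dev_id] = m_->GetFunc(dev_id, func_name_);  // lazy per-device load
 *   }
 *
 * which is what makes a single wrapped function usable across multiple GPUs.
 */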
 // mark as mutable, to enable lazy initialization
-  mutable std::array fcache_;
+  mutable std::array fcache_;
   // thread axis configuration
   ThreadAxisConfig thread_axis_cfg_;
 };

-PackedFunc CUDAModule::GetPackedFunc(
-    const std::string& func_name,
-    const std::vector arg_types,
-    const std::vector thread_axis_tags) const {
+void AutoSetCUDADevice(const TVMArgs& args, TVMRetValue* rv) {
+  CHECK_EQ(args.size(), 3);
+  TVMValue* values = static_cast(args[0].operator void*());
+  int* type_codes = static_cast(args[1].operator void*());
+  int num_args = args[2].operator int();
+
+  int dev_id = -1;
+  for (int i = 0; i < num_args; ++i) {
+    if (type_codes[i] == kArrayHandle) {
+      TVMContext ctx = static_cast(values[i].v_handle)->ctx;
+      CHECK_EQ(ctx.dev_mask, kGPU)
+          << "All operands need to be GPU";
+      if (dev_id == -1) {
+        dev_id = ctx.dev_id;
+      } else {
+        CHECK_EQ(dev_id, ctx.dev_id)
+            << "Operands come from different devices ";
+      }
+    }
+  }
+  CHECK_NE(dev_id, -1)
+      << "Cannot detect device id from list";
+  CUDA_CALL(cudaSetDevice(dev_id));
+}
+
+PackedFunc CUDAModuleNode::GetFunction(
+    const std::string& name,
+    const std::shared_ptr& sptr_to_self) {
+  CHECK_EQ(sptr_to_self.get(), this);
+  CHECK_NE(name, symbol::tvm_module_main)
+      << "Device module does not have a main function";
+  if (name == symbol::tvm_entry_setdevice) {
+    return PackedFunc(AutoSetCUDADevice);
+  }
+  auto it = fmap_.find(name);
+  if (it == fmap_.end()) return PackedFunc();
+  const FunctionInfo& info = it->second;
   CUDAWrappedFunc f;
-  f.Init(ptr_, func_name, arg_types.size(), thread_axis_tags);
-  return PackFromVoidAddrArgs(f, arg_types);
+  f.Init(this, sptr_to_self, name, info.arg_types.size(), info.thread_axis_tags);
+  return PackFromVoidAddrArgs(f, info.arg_types);
 }

-CUDAModule CUDAModule::Create(std::string ptx) {
-  // call a runtime API to make sure the context is created.
-  CUDAModule m;
-  m.ptr_ = std::make_shared(ptx);
-  return m;
+Module CUDAModuleCreate(
+    std::string data,
+    std::string fmt,
+    std::unordered_map fmap,
+    std::string cuda_source) {
+  std::shared_ptr n =
+      std::make_shared(data, fmt, fmap, cuda_source);
+  return Module(n);
 }
+
+// Load module from file.
+Module CUDAModuleLoad(const std::string& file_name,
+                      const std::string& format) {
+  std::string fmt = GetFileFormat(file_name, format);
+  std::string data = LoadBinaryFile(file_name);
+  return CUDAModuleCreate(data, fmt, {{}}, std::string());
+}
+
+TVM_REGISTER_GLOBAL(_module_loadfile_cubin)
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = CUDAModuleLoad(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL(_module_loadfile_ptx)
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = CUDAModuleLoad(args[0], args[1]);
+  });
 }  // namespace runtime
 }  // namespace tvm
-
 #endif  // TVM_CUDA_RUNTIME
diff --git a/src/runtime/cuda/cuda_module.h b/src/runtime/cuda/cuda_module.h
index 38c805eea9d8..6c7095734aaa 100644
--- a/src/runtime/cuda/cuda_module.h
+++ b/src/runtime/cuda/cuda_module.h
@@ -7,44 +7,31 @@
 #define TVM_RUNTIME_CUDA_CUDA_MODULE_H_

 #include
-#include
+#include
 #include
 #include
 #include
+#include "../meta_data.h"

 namespace tvm {
 namespace runtime {

+/*! \brief Maximum number of GPUs supported in CUDAModule */
+static constexpr const int kMaxNumGPUs = 32;
+
 /*!
- * \brief Handle execution of CUDA kernels as PackedFunc.
- *  It wraps around driver API to work with CUDA runtime API.
+ * \brief create a cuda module from data.
+ *
+ * \param data The module data, can be ptx, cubin
+ * \param fmt The format of the data, can be "ptx", "cubin"
+ * \param fmap The function information map of each function.
+ * \param cuda_source Optional, cuda source file
  */
-class CUDAModule {
- public:
-  /*!
-   * \brief Get CUDA Kernel launch wrapped as PackedFunc
-   * \param func_name The name of the function.
-   * \param arg_types The type of each argument in the function.
-   * \param thread_axis_tags The tag sequence of the thread axis.
-   */
-  PackedFunc GetPackedFunc(
-      const std::string& func_name,
-      const std::vector arg_types,
-      const std::vector thread_axis_tags) const;
-  /*!
-   * \brief create a cuda module from data.
-   * \param data The module data.
-   */
-  static CUDAModule Create(std::string data);
-  /*! \brief hidden internal data structure. */
-  class Internal;
-  /*! \brief Maximum number of GPU supported in CUDAModule */
-  static constexpr const int kMaxNumGPUs = 32;
-
- private:
-  std::shared_ptr ptr_;
-};
-
+Module CUDAModuleCreate(
+    std::string data,
+    std::string fmt,
+    std::unordered_map fmap,
+    std::string cuda_source);
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_CUDA_CUDA_MODULE_H_
diff --git a/src/runtime/cuda/nvrtc.cc b/src/runtime/cuda/nvrtc.cc
deleted file mode 100644
index 38c13e22e7fc..000000000000
--- a/src/runtime/cuda/nvrtc.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/*!
- * Copyright (c) 2017 by Contributors
- * \file nvrtc.cc
- */
-#include "./cuda_common.h"
-
-#if TVM_CUDA_RUNTIME
-
-#include
-
-namespace tvm {
-namespace runtime {
-
-#define NVRTC_CALL(x) \
-  { \
-    nvrtcResult result = x; \
-    if (result != NVRTC_SUCCESS) { \
-      LOG(FATAL) \
-          << "NvrtcError: " #x " failed with error: " \
-          << nvrtcGetErrorString(result); \
-    } \
-  }
-
-std::string NVRTCCompile(const std::string& code) {
-  nvrtcProgram prog;
-  NVRTC_CALL(nvrtcCreateProgram(
-      &prog, code.c_str(), nullptr, 0, nullptr, nullptr));
-  nvrtcResult compile_res = nvrtcCompileProgram(prog, 0, nullptr);
-  size_t log_size;
-  NVRTC_CALL(nvrtcGetProgramLogSize(prog, &log_size));
-  std::string log; log.resize(log_size);
-  NVRTC_CALL(nvrtcGetProgramLog(prog, &log[0]));
-  CHECK_EQ(compile_res, NVRTC_SUCCESS) << log;
-  size_t ptx_size;
-  NVRTC_CALL(nvrtcGetPTXSize(prog, &ptx_size));
-
-  std::string ptx;
-  ptx.resize(ptx_size);
-  NVRTC_CALL(nvrtcGetPTX(prog, &ptx[0]));
-  NVRTC_CALL(nvrtcDestroyProgram(&prog));
-  return ptx;
-}
-
-}  // namespace runtime
-}  // namespace tvm
-#endif  // TVM_CUDA_RUNTIME
diff --git a/src/runtime/dso_module.cc b/src/runtime/dso_module.cc
new file mode 100644
index 000000000000..502f8216b16f
--- /dev/null
+++ b/src/runtime/dso_module.cc
@@ -0,0 +1,118 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file dso_module.cc
+ * \brief Module to load from dynamic shared library.
+ */
+#include
+#include
+#include
+#include "./meta_data.h"
+
+#if defined(_WIN32)
+#include
+#else
+#include
+#endif
+
+namespace tvm {
+namespace runtime {
+
+// Module to load from dynamic shared library.
+// This is the default module TVM used for hostside AOT +class DSOModuleNode : public ModuleNode { + public: + ~DSOModuleNode() { + if (lib_handle_) Unload(); + } + + const char* type_key() const { + return "dso"; + } + + void PreCompile(const std::string& name, TVMContext ctx) final { + GetFuncPtr(name); + } + + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final { + BackendPackedCFunc faddr = GetFuncPtr(name); + if (faddr == nullptr) return PackedFunc(); + return PackedFunc([faddr, sptr_to_self](TVMArgs args, TVMRetValue* rv) { + int ret = (*faddr)( + (void*)args.values, // NOLINT(*) + (int*)args.type_codes, // NOLINT(*) + args.num_args); + CHECK_EQ(ret, 0) << TVMGetLastError(); + }); + } + + void SaveToFile(const std::string& file_name, + const std::string& format) final { + LOG(FATAL) << "Cannot save dso to another file"; + } + + std::string GetSource(const std::string& format) final { + return ""; + } + + void Init(const std::string& name) { + Load(name); + CHECK(lib_handle_ != nullptr) + << "Failed to load dynamic shared library " << name; + void** ctx_addr = + reinterpret_cast( + GetGlobalVPtr(runtime::symbol::tvm_module_ctx)); + if (ctx_addr != nullptr) { + *ctx_addr = this; + } + } + + private: + // Platform dependent handling. +#if defined(_WIN32) + // library handle + HMODULE lib_handle_{nullptr}; + // Load the library + void Load(const std::string& name) { + lib_handle_ = LoadLibrary(name.c_str()); + } + BackendPackedCFunc GetFuncPtr(const std::string& name) { + return reinterpret_cast( + GetProcAddress(lib_handle_, name.c_str())); // NOLINT(*) + } + void* GetGlobalVPtr(const std::string& name) { + return reinterpret_cast( + GetProcAddress(lib_handle_, name.c_str())); // NOLINT(*) + } + void Unload() { + FreeLibrary(lib_handle_); + } +#else + // Library handle + void* lib_handle_{nullptr}; + // load the library + void Load(const std::string& name) { + lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); + } + BackendPackedCFunc GetFuncPtr(const std::string& name) { + return reinterpret_cast( + dlsym(lib_handle_, name.c_str())); + } + void* GetGlobalVPtr(const std::string& name) { + return dlsym(lib_handle_, name.c_str()); + } + void Unload() { + dlclose(lib_handle_); + } +#endif +}; + +TVM_REGISTER_GLOBAL(_module_loadfile_so) +.set_body([](TVMArgs args, TVMRetValue* rv) { + std::shared_ptr n = std::make_shared(); + n->Init(args[0]); + *rv = runtime::Module(n); + }); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/error_handle.cc b/src/runtime/error_handle.cc deleted file mode 100644 index 2a6c0bf75a7c..000000000000 --- a/src/runtime/error_handle.cc +++ /dev/null @@ -1,22 +0,0 @@ -/*! - * Copyright (c) 2016 by Contributors - * Implementation of error handling API - * \file error_handle.cc - */ -#include -#include -#include "./runtime_base.h" - -struct TVMErrorEntry { - std::string last_error; -}; - -typedef dmlc::ThreadLocalStore TVMAPIErrorStore; - -const char *TVMGetLastError() { - return TVMAPIErrorStore::Get()->last_error.c_str(); -} - -void TVMAPISetLastError(const char* msg) { - TVMAPIErrorStore::Get()->last_error = msg; -} diff --git a/src/runtime/file_util.h b/src/runtime/file_util.h new file mode 100644 index 000000000000..15759130682d --- /dev/null +++ b/src/runtime/file_util.h @@ -0,0 +1,55 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file file_util.h + * \brief Minimum file manipulation util for runtime. 
+ */
+#ifndef TVM_RUNTIME_FILE_UTIL_H_
+#define TVM_RUNTIME_FILE_UTIL_H_
+
+#include
+#include
+#include
+
+namespace tvm {
+namespace runtime {
+/*!
+ * \brief Get file format from given file name or format argument.
+ * \param file_name The name of the file.
+ * \param format The format of the file.
+ */
+inline std::string GetFileFormat(const std::string& file_name,
+                                 const std::string& format) {
+  std::string fmt = format;
+  if (fmt.length() == 0) {
+    size_t pos = file_name.find_last_of(".");
+    if (pos != std::string::npos) {
+      return file_name.substr(pos + 1, file_name.length() - pos - 1);
+    } else {
+      return "";
+    }
+  } else {
+    return format;
+  }
+}
+
+/*!
+ * \brief Load binary file into an in-memory buffer.
+ * \param file_name The name of the file.
+ */
+inline std::string LoadBinaryFile(const std::string& file_name) {
+  std::ifstream fs(file_name, std::ios::in | std::ios::binary);
+  CHECK(!fs.fail())
+      << "Cannot open " << file_name;
+  // get its size:
+  fs.seekg(0, std::ios::end);
+  size_t size = fs.tellg();
+  fs.seekg(0, std::ios::beg);
+  std::string data;
+  data.resize(size);
+  fs.read(&data[0], size);
+  return data;
+}
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_FILE_UTIL_H_
diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
new file mode 100644
index 000000000000..3e8a269351d0
--- /dev/null
+++ b/src/runtime/meta_data.h
@@ -0,0 +1,58 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file meta_data.h
+ * \brief Meta data related utilities
+ */
+#ifndef TVM_RUNTIME_META_DATA_H_
+#define TVM_RUNTIME_META_DATA_H_
+
+#include
+#include
+#include
+#include "./runtime_base.h"
+
+extern "C" {
+// Function signature for generated packed function in shared library
+typedef int (*BackendPackedCFunc)(void* args,
+                                  int* type_codes,
+                                  int num_args);
+}  // extern "C"
+
+namespace tvm {
+namespace runtime {
+
+/*! \brief function information needed by device */
+struct FunctionInfo {
+  std::string name;
+  std::vector arg_types;
+  std::vector thread_axis_tags;
+
+  void Save(dmlc::JSONWriter *writer) const {
+    std::vector sarg_types(arg_types.size());
+    for (size_t i = 0; i < arg_types.size(); ++i) {
+      sarg_types[i] = TVMType2String(arg_types[i]);
+    }
+    writer->BeginObject();
+    writer->WriteObjectKeyValue("name", name);
+    writer->WriteObjectKeyValue("arg_types", sarg_types);
+    writer->WriteObjectKeyValue("thread_axis_tags", thread_axis_tags);
+    writer->EndObject();
+  }
+
+  void Load(dmlc::JSONReader *reader) {
+    dmlc::JSONObjectReadHelper helper;
+    std::vector sarg_types;
+    helper.DeclareField("name", &name);
+    helper.DeclareField("arg_types", &sarg_types);
+    helper.DeclareField("thread_axis_tags", &thread_axis_tags);
+    helper.ReadAllFields(reader);
+    arg_types.resize(sarg_types.size());
+    for (size_t i = 0; i < arg_types.size(); ++i) {
+      arg_types[i] = String2TVMType(sarg_types[i]);
+    }
+  }
+};
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_META_DATA_H_
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
new file mode 100644
index 000000000000..4b4ffea94d11
--- /dev/null
+++ b/src/runtime/module.cc
@@ -0,0 +1,119 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file module.cc
+ * \brief Implementation of the runtime module system.
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
new file mode 100644
index 000000000000..4b4ffea94d11
--- /dev/null
+++ b/src/runtime/module.cc
@@ -0,0 +1,119 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file module.cc
+ * \brief Implementation of the runtime module system.
+ */
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/packed_func.h>
+#include <unordered_set>
+#include "./file_util.h"
+#include "./meta_data.h"
+
+namespace tvm {
+namespace runtime {
+
+PackedFunc Module::GetFunction(
+    const std::string& name, bool query_imports) {
+  PackedFunc pf = node_->GetFunction(name, node_);
+  if (pf != nullptr) return pf;
+  if (query_imports) {
+    for (const Module& m : node_->imports_) {
+      pf = m.node_->GetFunction(name, m.node_);
+      if (pf != nullptr) return pf;
+    }
+  }
+  return pf;
+}
+
+void Module::Import(Module other) {
+  // cycle detection.
+  std::unordered_set<const ModuleNode*> visited{other.node_.get()};
+  std::vector<const ModuleNode*> stack{other.node_.get()};
+  while (!stack.empty()) {
+    const ModuleNode* n = stack.back();
+    stack.pop_back();
+    for (const Module& m : n->imports_) {
+      const ModuleNode* next = m.node_.get();
+      if (visited.count(next)) continue;
+      visited.insert(next);
+      stack.push_back(next);
+    }
+  }
+  CHECK(!visited.count(node_.get()))
+      << "Cyclic dependency detected during import";
+  node_->imports_.emplace_back(std::move(other));
+}
+
+Module Module::LoadFromFile(const std::string& file_name,
+                            const std::string& format) {
+  std::string fmt = GetFileFormat(file_name, format);
+  CHECK(fmt.length() != 0)
+      << "Cannot deduce format of file " << file_name;
+  if (fmt == "dll" || fmt == "dylib" || fmt == "dso") {
+    fmt = "so";
+  }
+  std::string load_f_name = "_module_loadfile_" + fmt;
+  const PackedFunc* f = Registry::Get(load_f_name);
+  CHECK(f != nullptr)
+      << "Loader for " << fmt << " ("
+      << load_f_name << ") is not present.";
+  Module m = (*f)(file_name, format);
+  return m;
+}
+
+const PackedFunc* ModuleNode::GetFuncFromEnv(const std::string& name) {
+  auto it = import_cache_.find(name);
+  if (it != import_cache_.end()) return it->second.get();
+  PackedFunc pf;
+  for (Module& m : this->imports_) {
+    pf = m.GetFunction(name, false);
+    if (pf != nullptr) break;
+  }
+  if (pf == nullptr) {
+    const PackedFunc* f = Registry::Get(name);
+    CHECK(f != nullptr)
+        << "Cannot find function " << name
+        << " in the imported modules or the global registry";
+    return f;
+  } else {
+    std::unique_ptr<PackedFunc> f(new PackedFunc(pf));
+    import_cache_[name] = std::move(f);
+    return import_cache_.at(name).get();
+  }
+}
+
+TVM_REGISTER_GLOBAL(_module__GetSource)
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = args[0].operator Module()->GetSource(args[1]);
+  });
+
+TVM_REGISTER_GLOBAL(_module__ImportsSize)
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = static_cast<int64_t>(
+        args[0].operator Module()->imports().size());
+  });
+
+TVM_REGISTER_GLOBAL(_module__GetImport)
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = args[0].operator Module()->
+        imports().at(args[1].operator int());
+  });
+
+TVM_REGISTER_GLOBAL(_module__GetTypeKey)
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = args[0].operator Module()->type_key();
+  });
+
+TVM_REGISTER_GLOBAL(_module__LoadFromFile)
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = Module::LoadFromFile(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL(_module__SaveToFile)
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    args[0].operator Module()->
+        SaveToFile(args[1], args[2]);
+  });
+}  // namespace runtime
+}  // namespace tvm
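To make the lookup rules concrete: GetFunction searches the module itself and, on request, its imports, while GetFuncFromEnv is what generated host code calls, falling back to the global registry. A hedged usage sketch; the file names are illustrative and assume matching loaders are registered:

    #include <tvm/runtime/module.h>
    #include <tvm/runtime/packed_func.h>

    void ImportSketch() {
      tvm::runtime::Module host =
          tvm::runtime::Module::LoadFromFile("host.so", "");
      tvm::runtime::Module dev =
          tvm::runtime::Module::LoadFromFile("dev.so", "");
      host.Import(dev);
      // Searches host first, then its imports.
      tvm::runtime::PackedFunc f = host.GetFunction("mykernel", true);
    }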
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index e22bc3403389..db8e5e2871aa 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -114,6 +114,10 @@ class OpenCLWorkspace {
   // Number of registered kernels
   // Used to register kernel into the workspace.
   size_t num_registered_kernels{0};
+  // The version counter, used to recognize stale thread-local kernel entries.
+  size_t timestamp{0};
+  // Ids that are freed by kernels.
+  std::vector<size_t> free_kernel_ids;
   // the mutex for initialization
   std::mutex mu;
   // destructor
@@ -139,13 +143,21 @@ class OpenCLWorkspace {
   static OpenCLWorkspace* Global();
 };
 
+
 /*! \brief Thread local workspace */
 class OpenCLThreadEntry {
  public:
+  // The kernel entry and version.
+  struct KTEntry {
+    // The kernel handle.
+    cl_kernel kernel{nullptr};
+    // Timestamp used to recognize a stale kernel.
+    size_t version{0};
+  };
   /*! \brief The current context */
   TVMContext context;
   /*! \brief The thread-local kernel table */
-  std::vector<cl_kernel> kernel_table;
+  std::vector<KTEntry> kernel_table;
 
   OpenCLThreadEntry() {
     context.dev_id = 0;
@@ -155,29 +167,6 @@ class OpenCLThreadEntry {
   static OpenCLThreadEntry* ThreadLocal();
 };
 }  // namespace cl
-/*!
- * \brief Automatically detect and set cuda device.
- * \param args The arguments.
- */
-inline void AutoSetOpenCLContext(const TVMArgs& args) {
-  // TODO(tqchen): merge this with CUDA logic.
-  int dev_id = -1;
-  for (int i = 0; i < args.size(); ++i) {
-    if (args.type_codes[i] == kArrayHandle) {
-      TVMContext ctx = static_cast<TVMArray*>(
-          args.values[i].v_handle)->ctx;
-      CHECK_EQ(ctx.dev_mask, kOpenCL)
-          << "All operands need to be GPU";
-      if (dev_id == -1) {
-        dev_id = ctx.dev_id;
-      } else {
-        CHECK_EQ(dev_id, ctx.dev_id)
-            << "Operands comes from different devices ";
-      }
-    }
-  }
-  cl::OpenCLThreadEntry::ThreadLocal()->context.dev_id = dev_id;
-}
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_OPENCL_RUNTIME
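The timestamp and free_kernel_ids fields implement slot recycling for the per-thread kernel tables: a dying module returns its kernel ids to the free list, and the version stamp lets a thread notice that a cached cl_kernel in a recycled slot belongs to a dead module. A simplified restatement of that check (the helper name NeedsReinstall is made up; the real logic lives in OpenCLWrappedFunc below):

    #include "./opencl/opencl_common.h"

    // e is this thread's cached entry; expected_version comes from the module.
    bool NeedsReinstall(const tvm::runtime::cl::OpenCLThreadEntry::KTEntry& e,
                        size_t expected_version) {
      // nullptr: the kernel was never installed on this thread.
      // version mismatch: the slot was recycled by a newer module.
      return e.kernel == nullptr || e.version != expected_version;
    }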
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index 98e889311ca2..6c0b4619bd1d 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -16,47 +16,164 @@
 namespace tvm {
 namespace runtime {
 
-using namespace detail;
-
-/*!
- * \brief Internal data structure to support multi-gpu execution.
- *  Try to use OpenCL runtime's primary context.
- */
-class OpenCLModule::Internal {
+// Module to support thread-safe multi-device execution.
+// The OpenCL runtime is a bit tricky because clSetKernelArg is not thread-safe.
+// To make the call thread-safe, we create a thread-local kernel table
+// and lazily install new kernels into the kernel table when a kernel is called.
+// The kernels are recycled when the module gets destructed.
+class OpenCLModuleNode : public ModuleNode {
  public:
-  // the binary data
-  cl_program program;
-  // kernel id cache
-  std::unordered_map<std::string, size_t> kid_map;
+  // Kernel table reference entry.
+  struct KTRefEntry {
+    size_t kernel_id;
+    size_t version;
+  };
+  explicit OpenCLModuleNode(std::string data,
+                            std::string fmt,
+                            std::unordered_map<std::string, FunctionInfo> fmap)
+      : data_(data), fmt_(fmt), fmap_(fmap) {}
+  // destructor
+  ~OpenCLModuleNode() {
+    {
+      // free the kernel ids in the global table.
+      cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
+      std::lock_guard<std::mutex> lock(w->mu);
+      for (auto& kv : kid_map_) {
+        w->free_kernel_ids.push_back(kv.second.kernel_id);
+      }
+    }
+    // free the kernels
+    for (cl_kernel k : kernels_) {
+      OPENCL_CALL(clReleaseKernel(k));
+    }
+    if (program_) {
+      OPENCL_CALL(clReleaseProgram(program_));
+    }
+  }
 
-  explicit Internal(cl_program program)
-      : program(program) {
+  const char* type_key() const final {
+    return "opencl";
   }
-  // destructor
-  ~Internal() {
-    OPENCL_CALL(clReleaseProgram(program));
+
+  void PreCompile(const std::string& name, TVMContext ctx) final {
+    InstallKernel(cl::OpenCLWorkspace::Global(),
+                  cl::OpenCLThreadEntry::ThreadLocal(),
+                  name, kid_map_.at(name));
+  }
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final;
+
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final {
+    LOG(FATAL) << "Not implemented";
   }
-  // get kernel id given key(function name.
-  size_t GetKernelID(const std::string& key) {
+
+  std::string GetSource(const std::string& format) final {
+    if (format == fmt_) return data_;
+    if (fmt_ == "cl") {
+      return data_;
+    } else {
+      return "";
+    }
+  }
+
+  // Initialize the program.
+  void InitProgram() {
     cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
+    CHECK(w->initialized());
+    if (fmt_ == "cl") {
+      const char* s = data_.c_str();
+      size_t len = data_.length();
+      cl_int err;
+      program_ = clCreateProgramWithSource(
+          w->context, 1, &s, &len, &err);
+      OPENCL_CHECK_ERROR(err);
+    } else {
+      LOG(FATAL) << "Unknown OpenCL format " << fmt_;
+    }
+    device_built_flag_.resize(w->devices.size(), false);
+    // initialize the kernel ids; need to lock the global table.
     std::lock_guard<std::mutex> lock(w->mu);
-    if (kid_map.count(key)) return kid_map.at(key);
-    size_t kid = w->num_registered_kernels++;
-    kid_map[key] = kid;
-    return kid;
+    for (const auto& kv : fmap_) {
+      const std::string& key = kv.first;
+      KTRefEntry e;
+      if (w->free_kernel_ids.size() != 0) {
+        e.kernel_id = w->free_kernel_ids.back();
+        w->free_kernel_ids.pop_back();
+      } else {
+        e.kernel_id = w->num_registered_kernels++;
+      }
+      e.version = w->timestamp++;
+      kid_map_[key] = e;
+    }
+  }
+  // Install a new kernel into the thread-local entry.
+  cl_kernel InstallKernel(cl::OpenCLWorkspace* w,
+                          cl::OpenCLThreadEntry* t,
+                          const std::string& func_name,
+                          const KTRefEntry& e) {
+    std::lock_guard<std::mutex> lock(build_lock_);
+    int dev_id = t->context.dev_id;
+    if (!device_built_flag_[dev_id]) {
+      // build program
+      cl_int err;
+      cl_device_id dev = w->devices[dev_id];
+      err = clBuildProgram(program_, 1, &dev, nullptr, nullptr, nullptr);
+      if (err != CL_SUCCESS) {
+        size_t len;
+        std::string log;
+        clGetProgramBuildInfo(
+            program_, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len);
+        log.resize(len);
+        clGetProgramBuildInfo(
+            program_, dev, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr);
+        LOG(FATAL) << "OpenCL build error for device=" << dev << log;
+      }
+      device_built_flag_[dev_id] = true;
+    }
+    // build kernel
+    cl_int err;
+    cl_kernel kernel = clCreateKernel(program_, func_name.c_str(), &err);
+    OPENCL_CHECK_ERROR(err);
+    t->kernel_table[e.kernel_id].kernel = kernel;
+    t->kernel_table[e.kernel_id].version = e.version;
+    kernels_.push_back(kernel);
+    return kernel;
   }
+
+ private:
+  // the kernel source or binary data
+  std::string data_;
+  // the format
+  std::string fmt_;
+  // function information table.
+  std::unordered_map<std::string, FunctionInfo> fmap_;
+  // module-local mutex
+  std::mutex build_lock_;
+  // the OpenCL program
+  cl_program program_{nullptr};
+  // build info
+  std::vector<bool> device_built_flag_;
+  // kernel id cache
+  std::unordered_map<std::string, KTRefEntry> kid_map_;
+  // kernels built so far.
+  std::vector<cl_kernel> kernels_;
 };
 
 class OpenCLWrappedFunc {
  public:
   // initialize the OpenCL function.
-  void Init(std::shared_ptr<Internal> m,
-            size_t kernel_id,
+  void Init(OpenCLModuleNode* m,
+            std::shared_ptr<ModuleNode> sptr,
+            OpenCLModuleNode::KTRefEntry entry,
             std::string func_name,
             std::vector<size_t> arg_size,
             const std::vector<std::string>& thread_axis_tags) {
     m_ = m;
-    kernel_id_ = kernel_id;
+    sptr_ = sptr;
+    entry_ = entry;
     func_name_ = func_name;
     arg_size_ = arg_size;
     thread_axis_cfg_.Init(arg_size.size(), thread_axis_tags);
@@ -69,15 +186,13 @@ class OpenCLWrappedFunc {
     cl::OpenCLThreadEntry* t = cl::OpenCLThreadEntry::ThreadLocal();
     CHECK(w->initialized());
     // get the kernel from the thread-local kernel table.
-    if (kernel_id_ >= t->kernel_table.size()) {
-      t->kernel_table.resize(kernel_id_ + 1, nullptr);
+    if (entry_.kernel_id >= t->kernel_table.size()) {
+      t->kernel_table.resize(entry_.kernel_id + 1);
     }
-    cl_kernel kernel = t->kernel_table[kernel_id_];
-    if (kernel == nullptr) {
-      cl_int err;
-      kernel = clCreateKernel(m_->program, func_name_.c_str(), &err);
-      OPENCL_CHECK_ERROR(err);
-      t->kernel_table[kernel_id_] = kernel;
+    const auto& e = t->kernel_table[entry_.kernel_id];
+    cl_kernel kernel = e.kernel;
+    if (kernel == nullptr || e.version != entry_.version) {
+      kernel = m_->InstallKernel(w, t, func_name_, entry_);
     }
     // setup arguments.
     for (cl_uint i = 0; i < arg_size_.size(); ++i) {
@@ -98,10 +213,12 @@ class OpenCLWrappedFunc {
   }
 
  private:
-  // modulex
-  std::shared_ptr<Internal> m_;
+  // the module
+  OpenCLModuleNode* m_;
+  // resource handle that keeps the module alive
+  std::shared_ptr<ModuleNode> sptr_;
   // global kernel id in the kernel table.
-  size_t kernel_id_;
+  OpenCLModuleNode::KTRefEntry entry_;
   // The name of the function.
   std::string func_name_;
   // convert code for void argument
@@ -110,52 +227,71 @@ class OpenCLWrappedFunc {
   ThreadAxisConfig thread_axis_cfg_;
 };
 
-PackedFunc OpenCLModule::GetPackedFunc(
-    const std::string& func_name,
-    const std::vector<TVMType> arg_types,
-    const std::vector<std::string> thread_axis_tags) const {
+/*!
+ * \brief Automatically detect and set the OpenCL device.
+ * \param args The arguments.
+ */
+void AutoSetOpenCLDevice(const TVMArgs& args, TVMRetValue* rv) {
+  CHECK_EQ(args.size(), 3);
+  TVMValue* values = static_cast<TVMValue*>(args[0].operator void*());
+  int* type_codes = static_cast<int*>(args[1].operator void*());
+  int num_args = args[2].operator int();
+
+  // TODO(tqchen): merge this with CUDA logic.
+  int dev_id = -1;
+  for (int i = 0; i < num_args; ++i) {
+    if (type_codes[i] == kArrayHandle) {
+      TVMContext ctx = static_cast<TVMArray*>(values[i].v_handle)->ctx;
+      CHECK_EQ(ctx.dev_mask, kOpenCL)
+          << "All operands need to be OpenCL";
+      if (dev_id == -1) {
+        dev_id = ctx.dev_id;
+      } else {
+        CHECK_EQ(dev_id, ctx.dev_id)
+            << "Operands come from different devices";
+      }
+    }
+  }
+  CHECK_NE(dev_id, -1)
+      << "Cannot detect device id from list";
+  cl::OpenCLThreadEntry::ThreadLocal()->context.dev_id = dev_id;
+}
+
+PackedFunc OpenCLModuleNode::GetFunction(
+    const std::string& name,
+    const std::shared_ptr<ModuleNode>& sptr_to_self) {
+  CHECK_EQ(sptr_to_self.get(), this);
+  CHECK_NE(name, symbol::tvm_module_main)
+      << "Device functions do not have main";
+  if (name == symbol::tvm_entry_setdevice) {
+    return PackedFunc(AutoSetOpenCLDevice);
+  }
+  auto it = fmap_.find(name);
+  if (it == fmap_.end()) return PackedFunc();
+  const FunctionInfo& info = it->second;
   OpenCLWrappedFunc f;
-  // get the kernel id.
-  size_t kid = ptr_->GetKernelID(func_name);
-  std::vector<size_t> arg_size(arg_types.size());
-  for (size_t i = 0; i < arg_types.size(); ++i) {
-    TVMType t = arg_types[i];
+  std::vector<size_t> arg_size(info.arg_types.size());
+  for (size_t i = 0; i < info.arg_types.size(); ++i) {
+    TVMType t = info.arg_types[i];
     CHECK_EQ(t.lanes, 1U);
     uint32_t bits = t.bits;
     CHECK_EQ(bits % 8, 0U);
     arg_size[i] = bits / 8;
   }
   // initialize the wrapped func.
-  f.Init(ptr_, kid, func_name, arg_size, thread_axis_tags);
-  return PackFromVoidAddrArgs(f, arg_types);
+  f.Init(this, sptr_to_self, kid_map_.at(name),
+         name, arg_size, info.thread_axis_tags);
+  return PackFromVoidAddrArgs(f, info.arg_types);
 }
 
-OpenCLModule OpenCLModule::CreateWithSource(std::string source) {
-  cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
-  CHECK(w->initialized());
-  const char* s = source.c_str();
-  size_t len = source.length();
-  cl_int err;
-  cl_program prog = clCreateProgramWithSource(
-      w->context, 1, &s, &len, &err);
-  OPENCL_CHECK_ERROR(err);
-
-  for (cl_device_id dev_id : w->devices) {
-    err = clBuildProgram(prog, 1, &dev_id, nullptr, nullptr, nullptr);
-    if (err != CL_SUCCESS) {
-      size_t len;
-      std::string log;
-      clGetProgramBuildInfo(
-          prog, dev_id, CL_PROGRAM_BUILD_LOG, 0, nullptr, &len);
-      log.resize(len);
-      clGetProgramBuildInfo(
-          prog, dev_id, CL_PROGRAM_BUILD_LOG, len, &log[0], nullptr);
-      LOG(FATAL) << "OpenCL build error for device=" << dev_id << log;
-    }
-  }
-  OpenCLModule m;
-  m.ptr_ = std::make_shared<Internal>(prog);
-  return m;
+Module OpenCLModuleCreate(
+    std::string data,
+    std::string fmt,
+    std::unordered_map<std::string, FunctionInfo> fmap) {
+  std::shared_ptr<OpenCLModuleNode> n =
+      std::make_shared<OpenCLModuleNode>(data, fmt, fmap);
+  n->InitProgram();
+  return Module(n);
 }
 }  // namespace runtime
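Codegen is expected to hand OpenCLModuleCreate the generated source plus per-kernel metadata; a sketch of a caller, with made-up kernel source and FunctionInfo contents:

    #include <string>
    #include <unordered_map>
    #include "./opencl/opencl_module.h"

    tvm::runtime::Module MakeExampleModule() {
      using tvm::runtime::FunctionInfo;
      // Hypothetical one-kernel program.
      std::string src =
          "__kernel void myadd(__global float* a) {"
          "  a[get_global_id(0)] += 1.0f; }";
      FunctionInfo info;
      info.name = "myadd";
      info.arg_types.push_back(tvm::runtime::String2TVMType("handle"));
      info.thread_axis_tags.push_back("threadIdx.x");
      std::unordered_map<std::string, FunctionInfo> fmap;
      fmap["myadd"] = info;
      return tvm::runtime::OpenCLModuleCreate(src, "cl", fmap);
    }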
diff --git a/src/runtime/opencl/opencl_module.h b/src/runtime/opencl/opencl_module.h
index e9d6fb5da395..8acced9f5ecb 100644
--- a/src/runtime/opencl/opencl_module.h
+++ b/src/runtime/opencl/opencl_module.h
@@ -11,38 +11,21 @@
 #include <tvm/runtime/config.h>
 #include <tvm/runtime/packed_func.h>
 #include <memory>
+#include "../meta_data.h"
 
 namespace tvm {
 namespace runtime {
-
 /*!
- * \brief Handle execution of OPENCL kernels as PackedFunc.
- *  It wraps around driver API to work with OPENCL runtime API.
+ * \brief Create an OpenCL module from data.
+ *
+ * \param data The module data, can be "cl" source or a "clbin" binary.
+ * \param fmt The format of the data, can be "clbin", "cl".
+ * \param fmap The function information map of each function.
  */
-class OpenCLModule {
- public:
-  /*!
-   * \brief Get OpenCL Kernel launch wrapped as PackedFunc
-   * \param func_name The name of the function.
-   * \param arg_types The type of each argument in the function.
-   * \param thread_axis_tags The tag sequence of the thread axis.
-   */
-  PackedFunc GetPackedFunc(
-      const std::string& func_name,
-      const std::vector<TVMType> arg_types,
-      const std::vector<std::string> thread_axis_tags) const;
-  /*!
-   * \brief create a OpenCL module from data.
-   * \param source The module data.
-   */
-  static OpenCLModule CreateWithSource(std::string source);
-  /*! \brief hidden internal data structure. */
-  class Internal;
-
- private:
-  std::shared_ptr<Internal> ptr_;
-};
-
+Module OpenCLModuleCreate(
+    std::string data,
+    std::string fmt,
+    std::unordered_map<std::string, FunctionInfo> fmap);
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_OPENCL_OPENCL_MODULE_H_
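The rename below completes the move from static methods on PackedFunc to a dedicated Registry class: register once at static-initialization time, look up by name afterwards. A small sketch with an illustrative function name:

    #include <dmlc/logging.h>
    #include <tvm/runtime/registry.h>

    TVM_REGISTER_GLOBAL(my_plus_one)
    .set_body([](tvm::runtime::TVMArgs args, tvm::runtime::TVMRetValue* rv) {
        *rv = args[0].operator int() + 1;
      });

    void UseIt() {
      const tvm::runtime::PackedFunc* f =
          tvm::runtime::Registry::Get("my_plus_one");
      CHECK(f != nullptr);
      int r = (*f)(41);  // r == 42
    }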
diff --git a/src/runtime/packed_func_registry.cc b/src/runtime/registry.cc
similarity index 53%
rename from src/runtime/packed_func_registry.cc
rename to src/runtime/registry.cc
index b3f2ff371a5a..9f3908519799 100644
--- a/src/runtime/packed_func_registry.cc
+++ b/src/runtime/registry.cc
@@ -1,62 +1,74 @@
 /*!
  * Copyright (c) 2017 by Contributors
- * \file packed_func_registry.cc
+ * \file registry.cc
  * \brief The global registry of packed function.
  */
 #include <dmlc/logging.h>
 #include <dmlc/thread_local.h>
-#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/registry.h>
 #include <unordered_map>
+#include <mutex>
 #include <vector>
 #include "./runtime_base.h"
 
 namespace tvm {
 namespace runtime {
 
-struct PackedFuncRegistry {
+struct Registry::Manager {
   // map storing the functions.
   // We deliberately use raw pointers here.
   // This is because PackedFunc can contain callbacks into the host language (python)
   // and the resource can become invalid because of the non-deterministic order of destruction.
   // The resources will only be recycled during program exit.
-  std::unordered_map<std::string, PackedFunc*> fmap;
+  std::unordered_map<std::string, Registry*> fmap;
+  std::mutex mutex;
 
-  static PackedFuncRegistry* Global() {
-    static PackedFuncRegistry inst;
+  static Manager* Global() {
+    static Manager inst;
     return &inst;
   }
 };
 
-const PackedFunc& PackedFunc::RegisterGlobal(
-    const std::string& name, PackedFunc f) {
-  PackedFuncRegistry* r = PackedFuncRegistry::Global();
-  auto it = r->fmap.find(name);
-  CHECK(it == r->fmap.end())
+Registry& Registry::set_body(PackedFunc f) {  // NOLINT(*)
+  func_ = f;
+  return *this;
+}
+
+Registry& Registry::Register(const std::string& name) {  // NOLINT(*)
+  Manager* m = Manager::Global();
+  std::lock_guard<std::mutex> lock(m->mutex);
+  auto it = m->fmap.find(name);
+  CHECK(it == m->fmap.end())
     << "Global PackedFunc " << name << " is already registered";
-  PackedFunc* fp = new PackedFunc(f);
-  r->fmap[name] = fp;
-  return *fp;
+  Registry* r = new Registry();
+  r->name_ = name;
+  m->fmap[name] = r;
+  return *r;
 }
 
-const PackedFunc& PackedFunc::GetGlobal(const std::string& name) {
-  PackedFuncRegistry* r = PackedFuncRegistry::Global();
-  auto it = r->fmap.find(name);
-  CHECK(it != r->fmap.end())
-    << "Global PackedFunc " << name << " is not registered";
-  return *(it->second);
+bool Registry::Remove(const std::string& name) {
+  Manager* m = Manager::Global();
+  std::lock_guard<std::mutex> lock(m->mutex);
+  auto it = m->fmap.find(name);
+  if (it == m->fmap.end()) return false;
+  m->fmap.erase(it);
+  return true;
 }
 
-bool PackedFunc::GlobalExist(const std::string& name) {
-  PackedFuncRegistry* r = PackedFuncRegistry::Global();
-  auto it = r->fmap.find(name);
-  return it != r->fmap.end();
+const PackedFunc* Registry::Get(const std::string& name) {
+  Manager* m = Manager::Global();
+  std::lock_guard<std::mutex> lock(m->mutex);
+  auto it = m->fmap.find(name);
+  if (it == m->fmap.end()) return nullptr;
+  return &(it->second->func_);
 }
 
-std::vector<std::string> PackedFunc::ListGlobalNames() {
-  PackedFuncRegistry* r = PackedFuncRegistry::Global();
+std::vector<std::string> Registry::ListNames() {
+  Manager* m = Manager::Global();
+  std::lock_guard<std::mutex> lock(m->mutex);
   std::vector<std::string> keys;
-  keys.reserve(r->fmap.size());
-  for (const auto &kv : r->fmap) {
+  keys.reserve(m->fmap.size());
+  for (const auto &kv : m->fmap) {
     keys.push_back(kv.first);
   }
   return keys;
@@ -78,26 +90,27 @@ typedef dmlc::ThreadLocalStore<TVMFuncThreadLocalEntry> TVMFuncThreadLocalStore;
 
 int TVMFuncRegisterGlobal(
     const char* name, TVMFunctionHandle f) {
-  using tvm::runtime::PackedFunc;
   API_BEGIN();
-  PackedFunc::RegisterGlobal(name, *static_cast<PackedFunc*>(f));
+  tvm::runtime::Registry::Register(name)
+  .set_body(*static_cast<tvm::runtime::PackedFunc*>(f));
   API_END();
 }
 
 int TVMFuncGetGlobal(const char* name, TVMFunctionHandle* out) {
-  using tvm::runtime::PackedFunc;
   API_BEGIN();
-  const PackedFunc& f = PackedFunc::GetGlobal(name);
-  *out = (TVMFunctionHandle)(&f);  // NOLINT(*)
+  const tvm::runtime::PackedFunc* fp =
+      tvm::runtime::Registry::Get(name);
+  CHECK(fp != nullptr)
+      << "Cannot find global function " << name;
+  *out = (TVMFunctionHandle)(fp);  // NOLINT(*)
   API_END();
 }
 
 int TVMFuncListGlobalNames(int *out_size,
                            const char*** out_array) {
-  using tvm::runtime::PackedFunc;
   API_BEGIN();
   TVMFuncThreadLocalEntry *ret = TVMFuncThreadLocalStore::Get();
-  ret->ret_vec_str = PackedFunc::ListGlobalNames();
+  ret->ret_vec_str = tvm::runtime::Registry::ListNames();
   ret->ret_vec_charp.clear();
   for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) {
     ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str());
diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py
index 57c0d60ed9fe..b87000f3f5d9 100644
--- a/tests/python/integration/test_ewise.py
+++ b/tests/python/integration/test_ewise.py
@@ -19,20 +19,15 @@ def test_add():
     s[C].vectorize(x)
 
     # one line to build the function.
-    def check_device(target):
-        codes = []
-        fadd = tvm.build(s, [A, B, C],
-                         target, record_codes=codes,
-                         name="myadd")
-        if target == "cuda":
-            ctx = tvm.gpu(0)
-        else:
-            ctx = tvm.cl(0)
-        if not ctx.enabled:
+    def check_device(device, host="stackvm"):
+        if not tvm.codegen.target_enabled(host):
             return
-
-        for c in codes[1:]:
-            print(c)
+        if not tvm.codegen.target_enabled(device):
+            return
+        fadd = tvm.build(s, [A, B, C],
+                         device, host,
+                         name="myadd")
+        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
         # launch the kernel.
         n = 1024
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
@@ -43,7 +38,7 @@ def check_device(target):
             c.asnumpy(), a.asnumpy() + b.asnumpy())
 
     tvm.init_opencl()
-    check_device("cuda")
+    check_device("cuda", "llvm")
     check_device("opencl")
 
diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py
index a1233d5e9ad7..9beaa5b516c9 100644
--- a/tests/python/integration/test_gemm.py
+++ b/tests/python/integration/test_gemm.py
@@ -52,16 +52,16 @@ def test_gemm():
     # lowering test
     s.normalize()
 
-    def check_device(target):
-        codes = []
-        f = tvm.build(s, [A, B, C], target, record_codes=codes,
-                      max_auto_unroll_step=max_auto_unroll_step)
-        if target == "cuda":
-            ctx = tvm.gpu(0)
-        else:
-            ctx = tvm.cl(0)
-        if not ctx.enabled:
+    # one line to build the function.
+    def check_device(device, host="stackvm"):
+        if not tvm.codegen.target_enabled(host):
+            return
+        if not tvm.codegen.target_enabled(device):
             return
+
+        f = tvm.build(s, [A, B, C], device, host,
+                      max_auto_unroll_step=max_auto_unroll_step)
+        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
         # launch the kernel.
         n = nn
         m = n
diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py
index 30646cf6b9e6..499a4606b9d2 100644
--- a/tests/python/integration/test_reduce.py
+++ b/tests/python/integration/test_reduce.py
@@ -17,29 +17,29 @@ def test_sum():
     _, x = s[B].split(B.op.axis[0], factor=num_thread, outer=block_x)
     _, x = s[B].split(x, outer=thread_x)
 
-    tvm.init_opencl()
-    codes = []
-    fsum = tvm.build(s,
-                     args=[A, B],
-                     target="opencl", name="myadd",
-                     record_codes=codes)
-    for c in codes:
-        print(c)
-    num_device = 1
-    for i in range(num_device):
-        ctx = tvm.opencl(i)
-        if not ctx.enabled:
-            continue
+    # one line to build the function.
+    def check_device(device, host="stackvm"):
+        if not tvm.codegen.target_enabled(host):
+            return
+        if not tvm.codegen.target_enabled(device):
+            return
+        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        fsum = tvm.build(s,
+                         args=[A, B],
+                         target=device, target_host=host,
+                         name="mysum")
         # launch the kernel.
         n = 1028
         m = 129
-        #a = tvm.nd.array(np.zeros((n, m)).astype(A.dtype), ctx)
         a = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
         fsum(a, b)
         np.testing.assert_allclose(
             b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)
 
+    tvm.init_opencl()
+    check_device("cuda")
+    check_device("opencl")
 
 if __name__ == "__main__":
     test_sum()
diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py
index 08adab491639..d194c9a62d98 100644
--- a/tests/python/integration/test_scan.py
+++ b/tests/python/integration/test_scan.py
@@ -22,20 +22,15 @@ def test_scan():
     _, x = s[s_update].split(x, outer=thread_x)
 
     # one line to build the function.
-    def check_device(target):
-        codes = []
+    def check_device(device, host="stackvm"):
+        if not tvm.codegen.target_enabled(host):
+            return
+        if not tvm.codegen.target_enabled(device):
+            return
         fscan = tvm.build(s, [X, res],
-                          target, record_codes=codes,
+                          device, host,
                           name="myscan")
-        if target == "cuda":
-            ctx = tvm.gpu(0)
-        else:
-            ctx = tvm.cl(0)
-        if not ctx.enabled:
-            return
-
-        for c in codes[1:]:
-            print(c)
+        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
         # launch the kernel.
         n = 1024
         m = 10
@@ -48,6 +43,7 @@ def check_device(target):
 
     tvm.init_opencl()
     check_device("cuda")
+    check_device("opencl")
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_codegen_device.py b/tests/python/unittest/test_codegen_device.py
index b22cbfad4a59..171cc082677d 100644
--- a/tests/python/unittest/test_codegen_device.py
+++ b/tests/python/unittest/test_codegen_device.py
@@ -23,59 +23,33 @@ def test_add_pipeline():
     Cb = tvm.Buffer(C.shape, C.dtype, name='C')
     stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, C:Cb})
     stmt = tvm.ir_pass.Simplify(stmt)
-    fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 3)
+    fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 0)
     fsplits = tvm.ir_pass.SplitHostDevice(fapi)
 
-    def check_cuda():
-        output_ssa = False
-        for f in fsplits[1:]:
-            print(tvm.codegen.CompileToC(f, output_ssa, "cuda"))
+    def check_target(device, host="stackvm"):
+        if not tvm.codegen.target_enabled(host):
+            return
+        if not tvm.codegen.target_enabled(device):
+            return
+        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        mhost = tvm.codegen.build(fsplits[0], host)
+        mdev = tvm.codegen.build(fsplits[1:], device)
+        mhost.import_module(mdev)
+        code = mdev.get_source()
+        f = mhost.entry_func
+
+        # launch the kernel.
+        n = 1027
+        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
+        f(a, b, c)
+        np.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy() + b.asnumpy())
+
+    check_target("cuda", host="stackvm")
+    check_target("cuda", host="llvm")
 
-    # build and invoke the kernel.
-    fcuda = tvm.codegen.BuildNVRTC(fsplits, "stackvm")
-    num_device = 1
-    for i in range(num_device):
-        ctx = tvm.gpu(i)
-        if not ctx.enabled:
-            continue
-        # launch the kernel.
-        n = 1027
-        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
-        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
-        fcuda(a, b, c)
-        np.testing.assert_allclose(
-            c.asnumpy(), a.asnumpy() + b.asnumpy())
-
-    def check_opencl():
-        output_ssa = False
-        for f in fsplits[1:]:
-            print(tvm.codegen.CompileToC(f, output_ssa, "opencl"))
-
-        # build and invoke the kernel.
-        fcl = tvm.codegen.BuildOpenCL(fsplits, "stackvm")
-        # Disable OpenCL runtime test for now,
-        # since the local worksize on CPU might be too large.
-        num_device = 0
-        for i in range(num_device):
-            ctx = tvm.cl(i)
-            if not ctx.enabled:
-                continue
-            # launch the kernel.
-            n = 1027
-            a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
-            b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
-            c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
-            fcl(a, b, c)
-            np.testing.assert_allclose(
-                c.asnumpy(), a.asnumpy() + b.asnumpy())
-
-    tvm.init_opencl()
-    if tvm.cl(0).enabled:
-        check_opencl()
-
-    if tvm.gpu(0).enabled:
-        check_cuda()
 
 if __name__ == "__main__":
     test_add_pipeline()
diff --git a/tests/python/unittest/test_codegen_stack_llvm.py b/tests/python/unittest/test_codegen_stack_llvm.py
index addf2e699f29..019f5fbf4e8e 100644
--- a/tests/python/unittest/test_codegen_stack_llvm.py
+++ b/tests/python/unittest/test_codegen_stack_llvm.py
@@ -1,21 +1,20 @@
 import tvm
 import numpy as np
 
-def tvm_call_global(*args):
+def tvm_call_packed(*args):
     args = tvm.convert(args)
-    return tvm.make.Call("int32", "tvm_call_global", args, 4, None, 0)
+    return tvm.make.Call("int32", "tvm_call_packed", args, 4, None, 0)
 
 
 def run_jit(fapi, check):
-    for target in ["stackvm"]:
-        if target == "llvm":
-            f = tvm.codegen.BuildLLVM(fapi)
-        else:
-            f = tvm.codegen.BuildStackVM(fapi)
+    for target in ["llvm", "stackvm"]:
+        if not tvm.codegen.target_enabled(target):
+            continue
+        f = tvm.codegen.build(fapi, target)
+        s = f.get_source()
         check(f)
 
-
 def test_stack_vm_basic():
     a = tvm.nd.array(np.zeros(10, dtype='float32'))
     @tvm.register_func
@@ -25,8 +24,8 @@ def tvm_call_back_get_shape(shape0):
 
     n = tvm.Var('n')
     Ab = tvm.Buffer((n, ), tvm.float32)
-    stmt = tvm.make.Evaluate(tvm_call_global("tvm_call_back_get_shape", Ab.shape[0]))
-    fapi = tvm.ir_pass.MakeAPI(stmt, "print_shape", [Ab], 1)
+    stmt = tvm.make.Evaluate(tvm_call_packed("tvm_call_back_get_shape", Ab.shape[0]))
+    fapi = tvm.ir_pass.MakeAPI(stmt, "print_shape", [Ab], 0)
     run_jit(fapi, lambda f: f(a))
 
 
@@ -47,9 +46,8 @@ def test_stack_vm_loop():
                 tvm.make.Store(Ab.data,
                                tvm.make.Load(dtype, Ab.data, i) + 1,
                                i + 1),
-                tvm.make.Evaluate(tvm_call_global("tvm_stack_vm_print", i))))
-    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 1)
-    f = tvm.codegen.BuildStackVM(fapi)
+                tvm.make.Evaluate(tvm_call_packed("tvm_stack_vm_print", i))))
+    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0)
     a = tvm.nd.array(np.zeros(10, dtype=dtype))
     def check(f):
         f(a)
@@ -71,7 +69,7 @@ def test_stack_vm_cond():
                            tvm.make.Load(dtype, Ab.data, i) + 1,
                            i + 1),
             tvm.make.Store(Ab.data,
                            tvm.make.Load(dtype, Ab.data, i) + 2,
                            i + 1)))
-    fapi = tvm.ir_pass.MakeAPI(stmt, "test", [Ab], 1)
+    fapi = tvm.ir_pass.MakeAPI(stmt, "test", [Ab], 0)
     def check(f):
         a = tvm.nd.array(np.zeros(10, dtype=dtype))
         f(a)
@@ -94,11 +92,13 @@ def test_llvm_add_pipeline():
     Cb = tvm.Buffer(C.shape, C.dtype, name='C')
     stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, C:Cb})
     stmt = tvm.ir_pass.Simplify(stmt)
-    fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 3)
+    fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 0)
 
     def check_llvm():
+        if not tvm.codegen.target_enabled("llvm"):
+            return
         # build and invoke the kernel.
-        f = tvm.codegen.BuildLLVM(fapi)
+        f = tvm.codegen.build(fapi, "llvm")
         ctx = tvm.cpu(0)
         # launch the kernel.
         n = 1027
@@ -108,10 +108,10 @@ def check_llvm():
         f(a, b, c)
         np.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
-    #check_llvm()
+    check_llvm()
 
 
 if __name__ == "__main__":
-    test_stack_vm_cond()
     test_stack_vm_basic()
+    test_stack_vm_cond()
     test_stack_vm_loop()
     test_llvm_add_pipeline()
diff --git a/tests/python/unittest/test_module_load.py b/tests/python/unittest/test_module_load.py
new file mode 100644
index 000000000000..89e015c28da6
--- /dev/null
+++ b/tests/python/unittest/test_module_load.py
@@ -0,0 +1,54 @@
+import tvm
+from tvm.addon import cc_compiler as cc
+import os
+import tempfile
+import numpy as np
+
+def test_dso_module_load():
+    if not tvm.codegen.target_enabled("llvm"):
+        return
+    dtype = 'int64'
+    temp_dir = tempfile.mkdtemp()
+
+    def save_object(names):
+        n = tvm.Var('n')
+        Ab = tvm.Buffer((n, ), dtype)
+        i = tvm.Var('i')
+        # for i in 0 to n-1:
+        stmt = tvm.make.For(
+            i, 0, n - 1, 0, 0,
+            tvm.make.Store(Ab.data,
+                           tvm.make.Load(dtype, Ab.data, i) + 1,
+                           i + 1))
+        fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0)
+        m = tvm.codegen.build(fapi, "llvm")
+        for name in names:
+            m.save(name)
+
+    path_obj = "%s/test.o" % temp_dir
+    path_ll = "%s/test.ll" % temp_dir
+    path_bc = "%s/test.bc" % temp_dir
+    path_dso = "%s/test.so" % temp_dir
+    save_object([path_obj, path_ll, path_bc])
+    cc.create_shared(path_dso, [path_obj])
+
+    f1 = tvm.module.load(path_dso)
+    f2 = tvm.module.load(path_dso)
+
+    a = tvm.nd.array(np.zeros(10, dtype=dtype))
+    f1(a)
+    np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))
+    a = tvm.nd.array(np.zeros(10, dtype=dtype))
+    f2(a)
+    np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))
+    files = [path_obj, path_ll, path_bc, path_dso]
+    for f in files:
+        os.remove(f)
+    os.rmdir(temp_dir)
+
+
+def test_cuda_module_load():
+    pass
+
+if __name__ == "__main__":
+    test_dso_module_load()
diff --git a/tests/python/unittest/test_pass_makeapi.py b/tests/python/unittest/test_pass_makeapi.py
index a862eca1ca59..5b31d0cd2f49 100644
--- a/tests/python/unittest/test_pass_makeapi.py
+++ b/tests/python/unittest/test_pass_makeapi.py
@@ -17,8 +17,9 @@ def test_makeapi():
     Cb = tvm.Buffer(C.shape, C.dtype, name='C')
     stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, C:Cb})
 
-    num_packed_args = 2
-    f = tvm.ir_pass.MakeAPI(stmt, "myadd", [n, Ab, Bb, Cb], num_packed_args)
+    num_unpacked_args = 2
+    f = tvm.ir_pass.MakeAPI(
+        stmt, "myadd", [n, Ab, Bb, Cb], num_unpacked_args)
     assert(f.handle_data_type[Ab.data].dtype == Ab.dtype)
     assert(len(f.args) == 5)
     output_ssa = False
diff --git a/tests/python/unittest/test_pass_storage_sync.py b/tests/python/unittest/test_pass_storage_sync.py
index 53b0b66af228..aa4f85f1018b 100644
--- a/tests/python/unittest/test_pass_storage_sync.py
+++ b/tests/python/unittest/test_pass_storage_sync.py
@@ -21,7 +21,7 @@ def test_storage_sync():
     Ab = tvm.Buffer(A.shape, A.dtype, name='A')
     A2b = tvm.Buffer(A2.shape, A2.dtype, name='A2')
     stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b})
-    f = tvm.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 2)
+    f = tvm.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 0)
     flist = tvm.ir_pass.SplitHostDevice(f)
     f = flist[1]
     f = tvm.ir_pass.StorageSync(f, "shared")