From e74663c7ed432c7bffada9b466148cacfa808bbe Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 2 Dec 2020 18:08:45 -0800 Subject: [PATCH] Revert "Make context handling in GPU runtimes more consistent and robust. (#5474)" This reverts commit f47c5c99deac86c6d1f16cfcb1743a0e9e79317d. --- Makefile | 26 --- src/runtime/cuda.cpp | 163 ++++++++++---- src/runtime/d3d12compute.cpp | 78 ++++--- src/runtime/destructors.cpp | 1 - src/runtime/gpu_context_common.h | 197 ----------------- src/runtime/metal.cpp | 68 ++++-- src/runtime/opencl.cpp | 208 ++++++++++-------- test/common/gpu_context.h | 134 ----------- test/common/gpu_object_lifetime_tracker.h | 7 +- test/correctness/CMakeLists.txt | 1 - test/correctness/gpu_many_kernels.cpp | 92 -------- test/generator/CMakeLists.txt | 13 -- test/generator/acquire_release_aottest.cpp | 159 ++++++++++--- .../gpu_multi_context_threaded_aottest.cpp | 193 ---------------- .../gpu_multi_context_threaded_generator.cpp | 48 ---- 15 files changed, 465 insertions(+), 923 deletions(-) delete mode 100644 src/runtime/gpu_context_common.h delete mode 100644 test/common/gpu_context.h delete mode 100644 test/correctness/gpu_many_kernels.cpp delete mode 100644 test/generator/gpu_multi_context_threaded_aottest.cpp delete mode 100644 test/generator/gpu_multi_context_threaded_generator.cpp diff --git a/Makefile b/Makefile index 6ce06a94b568..ba9fcd3f9b4d 100644 --- a/Makefile +++ b/Makefile @@ -1527,12 +1527,6 @@ $(FILTERS_DIR)/nested_externs_%.a: $(BIN_DIR)/nested_externs.generator @mkdir -p $(@D) $(CURDIR)/$< -g nested_externs_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime -# Similarly, gpu_multi needs two different kernels to test compilation caching. -# Also requies user-context. -$(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_threaded.generator - @mkdir -p $(@D) - $(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context - GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS) @@ -1628,31 +1622,11 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget HL_MULTITARGET_TEST_USE_NOBOUNDSQUERY_FEATURE=1 $(CURDIR)/$< @-echo -# gpu_multi_context_threaded has additional deps to link in -$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \ - $(FILTERS_DIR)/gpu_multi_context_threaded_add.a \ - $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \ - $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a - @mkdir -p $(@D) - $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@ - -$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \ - $(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp \ - $(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp \ - $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a - @mkdir -p $(@D) - $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@ - # nested externs doesn't actually contain a generator named # "nested_externs", and has no internal tests in any case. test_generator_nested_externs: @echo "Skipping" -# gpu_multi actually contain a generator named -# "gpu_multi", and has no internal tests in any case. -test_generator_gpu_multi: - @echo "Skipping" - $(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h @mkdir -p $(@D) $(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@ diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 1dc1cf27086e..7c423e179d85 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1,7 +1,6 @@ #include "HalideRuntimeCuda.h" #include "device_buffer_utils.h" #include "device_interface.h" -#include "gpu_context_common.h" #include "mini_cuda.h" #include "printer.h" #include "scoped_mutex_lock.h" @@ -240,7 +239,43 @@ class Context { } }; -WEAK Halide::Internal::GPUCompilationCache compilation_cache; +// Halide allocates a device API controlled pointer slot as part of +// each compiled module. The slot is used to store information to +// avoid having to reload/recompile kernel code on each call into a +// Halide filter. The cuda runtime uses this pointer to maintain a +// linked list of contexts into which the module has been loaded. +// +// A global list of all registered filters is also kept so all modules +// loaded on a given context can be unloaded and removed from the list +// when halide_device_release is called on a specific context. +// +// The registered_filters struct is not freed as it is pointed to by the +// Halide generated code. The module_state structs are freed. + +struct module_state { + CUcontext context; + CUmodule module; + module_state *next; +}; + +struct registered_filters { + module_state *modules; + registered_filters *next; +}; +WEAK registered_filters *filters_list = nullptr; +// This spinlock protects the above filters_list. +WEAK halide_mutex filters_list_lock; + +WEAK module_state *find_module_for_context(const registered_filters *filters, CUcontext ctx) { + module_state *modules = filters->modules; + while (modules != nullptr) { + if (modules->context == ctx) { + return modules; + } + modules = modules->next; + } + return nullptr; +} WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) { // Initialize CUDA @@ -470,33 +505,6 @@ WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size #endif } -WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size) { - debug(user_context) << "CUDA: compile_kernel cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> "; - - CUjit_option options[] = {CU_JIT_MAX_REGISTERS}; - unsigned int max_regs_per_thread = 64; - - // A hack to enable control over max register count for - // testing. This should be surfaced in the schedule somehow - // instead. - char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS"); - if (regs) { - max_regs_per_thread = atoi(regs); - } - void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread}; - CUmodule loaded_module; - CUresult err = cuModuleLoadDataEx(&loaded_module, ptx_src, 1, options, optionValues); - - if (err != CUDA_SUCCESS) { - error(user_context) << "CUDA: cuModuleLoadData failed: " - << get_error_name(err); - return nullptr; - } else { - debug(user_context) << (void *)(loaded_module) << "\n"; - } - return loaded_module; -} - } // namespace Cuda } // namespace Internal } // namespace Runtime @@ -518,12 +526,54 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co uint64_t t_before = halide_current_time_ns(user_context); #endif - CUmodule loaded_module; - if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, loaded_module, - compile_kernel, user_context, ptx_src, size)) { - return halide_error_code_generic_error; - } - halide_assert(user_context, loaded_module != nullptr); + halide_assert(user_context, &filters_list_lock != nullptr); + { + ScopedMutexLock spinlock(&filters_list_lock); + + // Create the state object if necessary. This only happens once, regardless + // of how many times halide_initialize_kernels/halide_release is called. + // halide_release traverses this list and releases the module objects, but + // it does not modify the list nodes created/inserted here. + registered_filters **filters = (registered_filters **)state_ptr; + if (!(*filters)) { + *filters = (registered_filters *)malloc(sizeof(registered_filters)); + (*filters)->modules = nullptr; + (*filters)->next = filters_list; + filters_list = *filters; + } + + // Create the module itself if necessary. + module_state *loaded_module = find_module_for_context(*filters, ctx.context); + if (loaded_module == nullptr) { + loaded_module = (module_state *)malloc(sizeof(module_state)); + debug(user_context) << " cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> "; + + CUjit_option options[] = {CU_JIT_MAX_REGISTERS}; + unsigned int max_regs_per_thread = 64; + + // A hack to enable control over max register count for + // testing. This should be surfaced in the schedule somehow + // instead. + char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS"); + if (regs) { + max_regs_per_thread = atoi(regs); + } + void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread}; + CUresult err = cuModuleLoadDataEx(&loaded_module->module, ptx_src, 1, options, optionValues); + + if (err != CUDA_SUCCESS) { + free(loaded_module); + error(user_context) << "CUDA: cuModuleLoadData failed: " + << get_error_name(err); + return err; + } else { + debug(user_context) << (void *)(loaded_module->module) << "\n"; + } + loaded_module->context = ctx.context; + loaded_module->next = (*filters)->modules; + (*filters)->modules = loaded_module; + } + } // spinlock #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -654,7 +704,7 @@ WEAK int halide_cuda_device_release(void *user_context) { << "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n"; // If we haven't even loaded libcuda, don't load it just to quit. - if (!cuInit) { + if (!lib_cuda) { return 0; } @@ -678,7 +728,34 @@ WEAK int halide_cuda_device_release(void *user_context) { // Dump the contents of the free list, ignoring errors. halide_cuda_release_unused_device_allocations(user_context); - compilation_cache.delete_context(user_context, ctx, cuModuleUnload); + { + ScopedMutexLock spinlock(&filters_list_lock); + + // Unload the modules attached to this context. Note that the list + // nodes themselves are not freed, only the module objects are + // released. Subsequent calls to halide_init_kernels might re-create + // the program object using the same list node to store the module + // object. + registered_filters *filters = filters_list; + while (filters) { + module_state **prev_ptr = &filters->modules; + module_state *loaded_module = filters->modules; + while (loaded_module != nullptr) { + if (loaded_module->context == ctx) { + debug(user_context) << " cuModuleUnload " << loaded_module->module << "\n"; + err = cuModuleUnload(loaded_module->module); + halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED); + *prev_ptr = loaded_module->next; + free(loaded_module); + loaded_module = *prev_ptr; + } else { + loaded_module = loaded_module->next; + prev_ptr = &loaded_module->next; + } + } + filters = filters->next; + } + } // spinlock CUcontext old_ctx; cuCtxPopCurrent(&old_ctx); @@ -842,15 +919,12 @@ WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c, << (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n"; if (!from_host && to_host) { debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n"; - copy_name = "cuMemcpyDtoH"; err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size); } else if (from_host && !to_host) { debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n"; - copy_name = "cuMemcpyHtoD"; err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size); } else if (!from_host && !to_host) { debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n"; - copy_name = "cuMemcpyDtoD"; err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size); } else if (dst != src) { debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n"; @@ -1059,9 +1133,9 @@ WEAK int halide_cuda_run(void *user_context, #endif halide_assert(user_context, state_ptr); - CUmodule mod = nullptr; - bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod); - halide_assert(user_context, found_module && mod != nullptr); + module_state *loaded_module = find_module_for_context((registered_filters *)state_ptr, ctx.context); + halide_assert(user_context, loaded_module != nullptr); + CUmodule mod = loaded_module->module; debug(user_context) << "Got module " << mod << "\n"; halide_assert(user_context, mod); CUfunction f; @@ -1190,7 +1264,7 @@ WEAK const halide_device_interface_t *halide_cuda_device_interface() { } WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) { - if (!lib_cuda && !cuInit) { + if (!lib_cuda) { // If cuda can't be found, we want to return 0, 0 and it's not // considered an error. So we should be very careful about // looking for libcuda without tripping any errors in the rest @@ -1239,7 +1313,6 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min namespace { WEAK __attribute__((destructor)) void halide_cuda_cleanup() { - compilation_cache.release_all(nullptr, cuModuleUnload); halide_cuda_device_release(nullptr); } } // namespace diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 3174c33f52a4..49ef68a0118a 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -45,7 +45,6 @@ #include "HalideRuntimeD3D12Compute.h" #include "device_buffer_utils.h" #include "device_interface.h" -#include "gpu_context_common.h" #include "printer.h" #include "scoped_spin_lock.h" @@ -2438,24 +2437,16 @@ static void *buffer_contents(d3d12_buffer *buffer) { volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0; -WEAK Halide::Internal::GPUCompilationCache compilation_cache; - -WEAK d3d12_library *compile_kernel(void *user_context, const char *source, int source_size, int *error_ret) { - D3D12ContextHolder d3d12_context(user_context, true); - if (d3d12_context.error != 0) { - *error_ret = d3d12_context.error; - return nullptr; - } - - d3d12_library *library = new_library_with_source(d3d12_context.device, source, source_size); - if (library == nullptr) { - TRACEFATAL("D3D12Compute: new_library_with_source failed."); - *error_ret = halide_error_code_out_of_memory; - return nullptr; - } - - return library; -} +// Structure to hold the state of a module attached to the context. +// Also used as a linked-list to keep track of all the different +// modules that are attached to a context in order to release them all +// when then context is released. +struct module_state { + d3d12_library *library; + module_state *next; +}; +D3D12TYPENAME(module_state) +WEAK module_state *state_list = nullptr; } // namespace D3D12Compute } // namespace Internal @@ -2763,14 +2754,29 @@ WEAK int halide_d3d12compute_device_free(void *user_context, halide_buffer_t *bu WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) { TRACELOG; + // Create the state object if necessary. This only happens once, regardless + // of how many times halide_initialize_kernels/halide_release is called. + // halide_release traverses this list and releases the module objects, but + // it does not modify the list nodes created/inserted here. + module_state *&state = *(module_state **)state_ptr; + if (!state) { + state = malloct(); + state->library = nullptr; + state->next = state_list; + state_list = state; + } + D3D12ContextHolder d3d12_context(user_context, true); + if (d3d12_context.error != 0) { + return d3d12_context.error; + } - int error = halide_error_code_generic_error; - d3d12_library *library; - if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device, - library, compile_kernel, user_context, - source, source_size, &error)) { - return error; + if (state->library == nullptr) { + state->library = new_library_with_source(d3d12_context.device, source, source_size); + if (state->library == nullptr) { + TRACEFATAL("D3D12Compute: new_library_with_source failed."); + return halide_error_code_out_of_memory; + } } return 0; @@ -2833,7 +2839,19 @@ WEAK int halide_d3d12compute_device_release(void *user_context) { release_object(buffer); } - compilation_cache.delete_context(user_context, device, release_object); + // Unload the modules attached to this device. Note that the list + // nodes themselves are not freed, only the program objects are + // released. Subsequent calls to halide_init_kernels might re-create + // the program object using the same list node to store the program + // object. + module_state *state = state_list; + while (state) { + if (state->library) { + release_object(state->library); + state->library = nullptr; + } + state = state->next; + } // Release the device itself, if we created it. if (acquired_device == device) { @@ -3005,9 +3023,8 @@ WEAK int halide_d3d12compute_run(void *user_context, StartCapturingGPUActivity(); #endif - d3d12_library *library = nullptr; - bool found_module = compilation_cache.lookup(device, state_ptr, library); - halide_assert(user_context, found_module && library != nullptr); + halide_assert(user_context, state_ptr); + module_state *state = (module_state *)state_ptr; d3d12_frame *frame = acquire_frame(device); d3d12_compute_command_list *cmdList = frame->cmd_list; @@ -3019,7 +3036,7 @@ WEAK int halide_d3d12compute_run(void *user_context, d3d12_compute_pipeline_state *pipeline_state = nullptr; { TRACE_SCOPE("kernel shader selection"); - function = new_function_with_name(device, library, entry_name, strlen(entry_name), + function = new_function_with_name(device, state->library, entry_name, strlen(entry_name), shared_mem_bytes, threadsX, threadsY, threadsZ); halide_assert(user_context, function); pipeline_state = function->pipeline_state; @@ -3514,7 +3531,6 @@ WEAK const struct halide_device_interface_t *halide_d3d12compute_device_interfac namespace { WEAK __attribute__((destructor)) void halide_d3d12compute_cleanup() { TRACELOG; - compilation_cache.release_all(nullptr, release_object); halide_d3d12compute_device_release(nullptr); } } // namespace diff --git a/src/runtime/destructors.cpp b/src/runtime/destructors.cpp index b7187c6c4d86..1fa7228a86e2 100644 --- a/src/runtime/destructors.cpp +++ b/src/runtime/destructors.cpp @@ -1,5 +1,4 @@ #include "HalideRuntime.h" -#include "printer.h" extern "C" { diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h deleted file mode 100644 index 9ef2662026a8..000000000000 --- a/src/runtime/gpu_context_common.h +++ /dev/null @@ -1,197 +0,0 @@ -#include "printer.h" -#include "scoped_mutex_lock.h" - -namespace Halide { -namespace Internal { - -template -class GPUCompilationCache { - struct CachedCompilation { - ContextT context{}; - ModuleStateT module_state{}; - uint32_t kernel_id{}; - }; - - halide_mutex mutex; - - static constexpr float kLoadFactor{.5f}; - static constexpr int kInitialTableBits{7}; - int log2_compilations_size{0}; // number of bits in index into compilations table. - CachedCompilation *compilations{nullptr}; - int count{0}; - - static constexpr uint32_t kInvalidId{0}; - static constexpr uint32_t kDeletedId{1}; - - uint32_t unique_id{2}; // zero is an invalid id - -public: - static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) { - uintptr_t addr = (uintptr_t)context + id; - // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39... - // in hexadecimal. - if (sizeof(uintptr_t) >= 8) { - return (addr * (uintptr_t)0x9E3779B97F4A7C15) >> (64 - bits); - } else { - return (addr * (uintptr_t)0x9E3779B9) >> (32 - bits); - } - } - - HALIDE_MUST_USE_RESULT bool insert(ContextT context, uint32_t id, ModuleStateT module_state) { - if (log2_compilations_size == 0) { - if (!resize_table(kInitialTableBits)) { - return false; - } - } - if ((count + 1) > (1 << log2_compilations_size) * kLoadFactor) { - if (!resize_table(log2_compilations_size + 1)) { - return false; - } - } - count += 1; - uintptr_t index = kernel_hash(context, id, log2_compilations_size); - for (int i = 0; i < (1 << log2_compilations_size); i++) { - uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1); - if (compilations[effective_index].kernel_id <= kDeletedId) { - compilations[effective_index].context = context; - compilations[effective_index].module_state = module_state; - compilations[effective_index].kernel_id = id; - return true; - } - } - // This is a logic error that should never occur. It means the table is - // full, but it should have been resized. - halide_assert(nullptr, false); - return false; - } - - HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) { - if (log2_compilations_size == 0) { - return false; - } - uintptr_t index = kernel_hash(context, id, log2_compilations_size); - for (int i = 0; i < (1 << log2_compilations_size); i++) { - uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1); - - if (compilations[effective_index].kernel_id == kInvalidId) { - return false; - } - if (compilations[effective_index].context == context && - compilations[effective_index].kernel_id == id) { - module_state = &compilations[effective_index].module_state; - return true; - } - } - return false; - } - - HALIDE_MUST_USE_RESULT bool lookup(ContextT context, void *state_ptr, ModuleStateT &module_state) { - ScopedMutexLock lock_guard(&mutex); - uint32_t id = (uint32_t)(uintptr_t)state_ptr; - ModuleStateT *mod_ptr; - if (find_internal(context, id, mod_ptr)) { - module_state = *mod_ptr; - return true; - } - return false; - } - - HALIDE_MUST_USE_RESULT bool resize_table(int size_bits) { - if (size_bits != log2_compilations_size) { - int new_size = (1 << size_bits); - int old_size = (1 << log2_compilations_size); - CachedCompilation *new_table = (CachedCompilation *)malloc(new_size * sizeof(CachedCompilation)); - if (new_table == nullptr) { - // signal error. - return false; - } - memset(new_table, 0, new_size * sizeof(CachedCompilation)); - CachedCompilation *old_table = compilations; - compilations = new_table; - log2_compilations_size = size_bits; - - if (count > 0) { // Mainly to catch empty initial table case - for (int32_t i = 0; i < old_size; i++) { - if (old_table[i].kernel_id != kInvalidId && - old_table[i].kernel_id != kDeletedId) { - bool result = insert(old_table[i].context, old_table[i].kernel_id, - old_table[i].module_state); - halide_assert(nullptr, result); // Resizing the table while resizing the table is a logic error. - } - } - } - free(old_table); - } - return true; - } - - template - void release_context(void *user_context, bool all, ContextT context, FreeModuleT &f) { - if (count == 0) { - return; - } - - for (int i = 0; i < (1 << log2_compilations_size); i++) { - if (compilations[i].kernel_id > kInvalidId && - (all || (compilations[i].context == context))) { - f(compilations[i].module_state); - compilations[i].module_state = nullptr; - compilations[i].kernel_id = kDeletedId; - count--; - } - } - } - - template - void delete_context(void *user_context, ContextT context, FreeModuleT &f) { - ScopedMutexLock lock_guard(&mutex); - - release_context(user_context, false, context, f); - } - - template - void release_all(void *user_context, FreeModuleT &f) { - ScopedMutexLock lock_guard(&mutex); - - release_context(user_context, true, nullptr, f); - free(compilations); - compilations = nullptr; - log2_compilations_size = 0; - } - - template - HALIDE_MUST_USE_RESULT bool kernel_state_setup(void *user_context, void **state_ptr, - ContextT context, ModuleStateT &result, - CompileModuleT f, - Args... args) { - ScopedMutexLock lock_guard(&mutex); - - uint32_t *id_ptr = (uint32_t *)state_ptr; - if (*id_ptr == 0) { - *id_ptr = unique_id++; - } - - ModuleStateT *mod; - if (find_internal(context, *id_ptr, mod)) { - result = *mod; - return true; - } - - // TODO(zvookin): figure out the calling signature here... - ModuleStateT compiled_module = f(args...); - debug(user_context) << "Caching compiled kernel: " << compiled_module << " id " << *id_ptr << " context " << context << "\n"; - if (compiled_module == nullptr) { - return false; - } - - if (!insert(context, *id_ptr, compiled_module)) { - return false; - } - result = compiled_module; - - return true; - } -}; - -} // namespace Internal -} // namespace Halide diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index 25fe29f3feda..1a2ba0ce52b4 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -1,7 +1,6 @@ #include "HalideRuntimeMetal.h" #include "device_buffer_utils.h" #include "device_interface.h" -#include "gpu_context_common.h" #include "printer.h" #include "scoped_spin_lock.h" @@ -283,7 +282,15 @@ struct device_handle { uint64_t offset; }; -WEAK Halide::Internal::GPUCompilationCache compilation_cache; +// Structure to hold the state of a module attached to the context. +// Also used as a linked-list to keep track of all the different +// modules that are attached to a context in order to release them all +// when then context is released. +struct module_state { + mtl_library *library; + module_state *next; +}; +WEAK module_state *state_list = nullptr; // API Capabilities. If more capabilities need to be checked, // this can be refactored to something more robust/general. @@ -537,6 +544,18 @@ WEAK int halide_metal_device_free(void *user_context, halide_buffer_t *buf) { } WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) { + // Create the state object if necessary. This only happens once, regardless + // of how many times halide_initialize_kernels/halide_release is called. + // halide_release traverses this list and releases the module objects, but + // it does not modify the list nodes created/inserted here. + module_state **state = (module_state **)state_ptr; + if (!(*state)) { + *state = (module_state *)malloc(sizeof(module_state)); + (*state)->library = nullptr; + (*state)->next = state_list; + state_list = *state; + } + MetalContextHolder metal_context(user_context, true); if (metal_context.error != 0) { return metal_context.error; @@ -546,13 +565,23 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c uint64_t t_before = halide_current_time_ns(user_context); #endif - mtl_library *library; - if (!compilation_cache.kernel_state_setup(user_context, state_ptr, metal_context.device, library, - new_library_with_source, metal_context.device, - source, source_size)) { - return halide_error_code_generic_error; + if ((*state)->library == nullptr) { +#ifdef DEBUG_RUNTIME + uint64_t t_before_compile = halide_current_time_ns(user_context); +#endif + + debug(user_context) << "Metal - Allocating: new_library_with_source " << (*state)->library << "\n"; + (*state)->library = new_library_with_source(metal_context.device, source, source_size); + if ((*state)->library == nullptr) { + error(user_context) << "Metal: new_library_with_source failed.\n"; + return -1; + } + +#ifdef DEBUG_RUNTIME + uint64_t t_after_compile = halide_current_time_ns(user_context); + debug(user_context) << "Time for halide_metal_initialize_kernels compilation: " << (t_after_compile - t_before_compile) / 1.0e6 << " ms\n"; +#endif } - halide_assert(user_context, library != nullptr); #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -615,7 +644,20 @@ WEAK int halide_metal_device_release(void *user_context) { if (device) { halide_metal_device_sync_internal(queue, nullptr); - compilation_cache.delete_context(user_context, device, release_ns_object); + // Unload the modules attached to this device. Note that the list + // nodes themselves are not freed, only the program objects are + // released. Subsequent calls to halide_init_kernels might re-create + // the program object using the same list node to store the program + // object. + module_state *state = state_list; + while (state) { + if (state->library) { + debug(user_context) << "Metal - Releasing: new_library_with_source " << state->library << "\n"; + release_ns_object(state->library); + state->library = nullptr; + } + state = state->next; + } // Release the device itself, if we created it. if (acquired_device == device) { @@ -740,11 +782,10 @@ WEAK int halide_metal_run(void *user_context, return -1; } - mtl_library *library; - bool found_library = compilation_cache.lookup(metal_context.device, state_ptr, library); - halide_assert(user_context, found_library && library != nullptr); + halide_assert(user_context, state_ptr); + module_state *state = (module_state *)state_ptr; - mtl_function *function = new_function_with_name(library, entry_name, strlen(entry_name)); + mtl_function *function = new_function_with_name(state->library, entry_name, strlen(entry_name)); if (function == nullptr) { error(user_context) << "Metal: Could not get function " << entry_name << "from Metal library.\n"; return -1; @@ -1106,7 +1147,6 @@ WEAK const struct halide_device_interface_t *halide_metal_device_interface() { namespace { WEAK __attribute__((destructor)) void halide_metal_cleanup() { - compilation_cache.release_all(nullptr, release_ns_object); halide_metal_device_release(nullptr); } } // namespace diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 7c7815b620d7..7c10033bde31 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -1,7 +1,6 @@ #include "HalideRuntimeOpenCL.h" #include "device_buffer_utils.h" #include "device_interface.h" -#include "gpu_context_common.h" #include "printer.h" #include "scoped_spin_lock.h" @@ -287,7 +286,15 @@ struct device_handle { cl_mem mem; }; -WEAK Halide::Internal::GPUCompilationCache compilation_cache; +// Structure to hold the state of a module attached to the context. +// Also used as a linked-list to keep track of all the different +// modules that are attached to a context in order to release them all +// when then context is released. +struct module_state { + cl_program program; + module_state *next; +}; +WEAK module_state *state_list = nullptr; WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) { if (buf->device == 0) { @@ -549,83 +556,6 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q return err; } -WEAK cl_program compile_kernel(void *user_context, cl_context ctx, const char *src, int size) { - cl_int err = 0; - cl_device_id dev; - - err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr); - if (err != CL_SUCCESS) { - error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: " - << get_opencl_error_name(err); - return nullptr; - } - - cl_device_id devices[] = {dev}; - - // Get the max constant buffer size supported by this OpenCL implementation. - cl_ulong max_constant_buffer_size = 0; - err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr); - if (err != CL_SUCCESS) { - error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: " - << get_opencl_error_name(err); - return nullptr; - } - // Get the max number of constant arguments supported by this OpenCL implementation. - cl_uint max_constant_args = 0; - err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr); - if (err != CL_SUCCESS) { - error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: " - << get_opencl_error_name(err); - return nullptr; - } - - // Build the compile argument options. - stringstream options(user_context); - options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size - << " -D MAX_CONSTANT_ARGS=" << max_constant_args; - - const char *extra_options = halide_opencl_get_build_options(user_context); - options << " " << extra_options; - - const char *sources[] = {src}; - debug(user_context) << " clCreateProgramWithSource -> "; - cl_program program = clCreateProgramWithSource(ctx, 1, &sources[0], nullptr, &err); - if (err != CL_SUCCESS) { - debug(user_context) << get_opencl_error_name(err) << "\n"; - error(user_context) << "CL: clCreateProgramWithSource failed: " - << get_opencl_error_name(err); - return nullptr; - } else { - debug(user_context) << (void *)program << "\n"; - } - - debug(user_context) << " clBuildProgram " << (void *)program - << " " << options.str() << "\n"; - err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr); - if (err != CL_SUCCESS) { - - { - // Allocate an appropriately sized buffer for the build log. - Printer p(user_context); - - p << "CL: clBuildProgram failed: " - << get_opencl_error_name(err) - << "\nBuild Log:\n"; - - // Get build log - if (clGetProgramBuildInfo(program, dev, - CL_PROGRAM_BUILD_LOG, - p.capacity() - p.size() - 1, p.dst, - nullptr) != CL_SUCCESS) { - p << "clGetProgramBuildInfo failed (Printer buffer too small?)"; - } - } - - return nullptr; - } - return program; -} - } // namespace OpenCL } // namespace Internal } // namespace Runtime @@ -745,13 +675,97 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, uint64_t t_before = halide_current_time_ns(user_context); #endif - debug(user_context) << "halide_cuda_initialize_kernels got compilation_cache mutex.\n"; - cl_program program; - if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, program, - compile_kernel, user_context, ctx.context, src, size)) { - return halide_error_code_generic_error; + // Create the state object if necessary. This only happens once, regardless + // of how many times halide_init_kernels/halide_release is called. + // halide_release traverses this list and releases the program objects, but + // it does not modify the list nodes created/inserted here. + module_state **state = (module_state **)state_ptr; + if (!(*state)) { + *state = (module_state *)malloc(sizeof(module_state)); + (*state)->program = nullptr; + (*state)->next = state_list; + state_list = *state; + } + + // Create the program if necessary. TODO: The program object needs to not + // only already exist, but be created for the same context/device as the + // calling context/device. + if (!(*state && (*state)->program) && size > 1) { + cl_int err = 0; + cl_device_id dev; + + err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr); + if (err != CL_SUCCESS) { + error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: " + << get_opencl_error_name(err); + return err; + } + + cl_device_id devices[] = {dev}; + + // Get the max constant buffer size supported by this OpenCL implementation. + cl_ulong max_constant_buffer_size = 0; + err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr); + if (err != CL_SUCCESS) { + error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: " + << get_opencl_error_name(err); + return err; + } + // Get the max number of constant arguments supported by this OpenCL implementation. + cl_uint max_constant_args = 0; + err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr); + if (err != CL_SUCCESS) { + error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: " + << get_opencl_error_name(err); + return err; + } + + // Build the compile argument options. + stringstream options(user_context); + options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size + << " -D MAX_CONSTANT_ARGS=" << max_constant_args; + + const char *extra_options = halide_opencl_get_build_options(user_context); + options << " " << extra_options; + + const char *sources[] = {src}; + debug(user_context) << " clCreateProgramWithSource -> "; + cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], nullptr, &err); + if (err != CL_SUCCESS) { + debug(user_context) << get_opencl_error_name(err) << "\n"; + error(user_context) << "CL: clCreateProgramWithSource failed: " + << get_opencl_error_name(err); + return err; + } else { + debug(user_context) << (void *)program << "\n"; + } + + (*state)->program = program; + debug(user_context) << " clBuildProgram " << (void *)program + << " " << options.str() << "\n"; + err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr); + if (err != CL_SUCCESS) { + + { + // Allocate an appropriately sized buffer for the build log. + Printer p(user_context); + + p << "CL: clBuildProgram failed: " + << get_opencl_error_name(err) + << "\nBuild Log:\n"; + + // Get build log + if (clGetProgramBuildInfo(program, dev, + CL_PROGRAM_BUILD_LOG, + p.capacity() - p.size() - 1, p.dst, + nullptr) != CL_SUCCESS) { + p << "clGetProgramBuildInfo failed (Printer buffer too small?)"; + } + } + + return err; + } } - halide_assert(user_context, program != nullptr); #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -806,7 +820,21 @@ WEAK int halide_opencl_device_release(void *user_context) { err = clFinish(q); halide_assert(user_context, err == CL_SUCCESS); - compilation_cache.delete_context(user_context, ctx, clReleaseProgram); + // Unload the modules attached to this context. Note that the list + // nodes themselves are not freed, only the program objects are + // released. Subsequent calls to halide_init_kernels might re-create + // the program object using the same list node to store the program + // object. + module_state *state = state_list; + while (state) { + if (state->program) { + debug(user_context) << " clReleaseProgram " << state->program << "\n"; + err = clReleaseProgram(state->program); + halide_assert(user_context, err == CL_SUCCESS); + state->program = nullptr; + } + state = state->next; + } // Release the context itself, if we created it. if (ctx == context) { @@ -1049,10 +1077,9 @@ WEAK int halide_opencl_run(void *user_context, // Create kernel object for entry_name from the program for this module. halide_assert(user_context, state_ptr); - cl_program program; - bool found_program = compilation_cache.lookup(ctx.context, state_ptr, program); + cl_program program = ((module_state *)state_ptr)->program; - halide_assert(user_context, found_program && program != nullptr); + halide_assert(user_context, program); debug(user_context) << " clCreateKernel " << entry_name << " -> "; cl_kernel f = clCreateKernel(program, entry_name, &err); if (err != CL_SUCCESS) { @@ -1339,7 +1366,6 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() { namespace { WEAK __attribute__((destructor)) void halide_opencl_cleanup() { - compilation_cache.release_all(nullptr, clReleaseProgram); halide_opencl_device_release(nullptr); } } // namespace @@ -1914,4 +1940,4 @@ extern "C" { WEAK const struct halide_device_interface_t *halide_opencl_image_device_interface() { return &opencl_image_device_interface; } -} +} \ No newline at end of file diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h deleted file mode 100644 index 4816ad205866..000000000000 --- a/test/common/gpu_context.h +++ /dev/null @@ -1,134 +0,0 @@ -#if defined(TEST_OPENCL) -// Implement OpenCL custom context. - -#define CL_TARGET_OPENCL_VERSION 120 -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#ifdef __APPLE__ -#include -#else -#include -#endif - -// Just use a global context and queue, created and destroyed by main. -cl_context cl_ctx = nullptr; -cl_command_queue cl_q = nullptr; - -// Create the global context. This is just a helper function not called by Halide. -bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) { - cl_int err = 0; - - const cl_uint maxPlatforms = 4; - cl_platform_id platforms[maxPlatforms]; - cl_uint platformCount = 0; - - err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount); - if (err != CL_SUCCESS) { - printf("clGetPlatformIDs failed (%d)\n", err); - return false; - } - - cl_platform_id platform = nullptr; - - if (platformCount > 0) { - platform = platforms[0]; - } - if (platform == nullptr) { - printf("Failed to get platform\n"); - return false; - } - - cl_device_type device_type = CL_DEVICE_TYPE_ALL; - - // Make sure we have a device - const cl_uint maxDevices = 4; - cl_device_id devices[maxDevices]; - cl_uint deviceCount = 0; - err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount); - if (err != CL_SUCCESS) { - printf("clGetDeviceIDs failed (%d)\n", err); - return false; - } - if (deviceCount == 0) { - printf("Failed to get device\n"); - return false; - } - - cl_device_id dev = devices[deviceCount - 1]; - - // Create context and command queue. - cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, - 0}; - cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err); - if (err != CL_SUCCESS) { - printf("clCreateContext failed (%d)\n", err); - return false; - } - - cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err); - if (err != CL_SUCCESS) { - printf("clCreateCommandQueue failed (%d)\n", err); - return false; - } - return true; -} - -void destroy_opencl_context(cl_context cl_ctx, cl_command_queue cl_q) { - clReleaseCommandQueue(cl_q); - clReleaseContext(cl_ctx); -} - -#elif defined(TEST_CUDA) -// Implement CUDA custom context. -#include - -bool create_cuda_context(CUcontext &cuda_ctx) { - // Initialize CUDA - CUresult err = cuInit(0); - if (err != CUDA_SUCCESS) { - printf("cuInit failed (%d)\n", err); - return false; - } - - // Make sure we have a device - int deviceCount = 0; - err = cuDeviceGetCount(&deviceCount); - if (err != CUDA_SUCCESS) { - printf("cuGetDeviceCount failed (%d)\n", err); - return false; - } - if (deviceCount <= 0) { - printf("No CUDA devices available\n"); - return false; - } - - CUdevice dev; - // Get device - CUresult status; - // Try to get a device >0 first, since 0 should be our display device - // For now, don't try devices > 2 to maintain compatibility with previous behavior. - if (deviceCount > 2) deviceCount = 2; - for (int id = deviceCount - 1; id >= 0; id--) { - status = cuDeviceGet(&dev, id); - if (status == CUDA_SUCCESS) break; - } - - if (status != CUDA_SUCCESS) { - printf("Failed to get CUDA device\n"); - return status; - } - - // Create context - err = cuCtxCreate(&cuda_ctx, 0, dev); - if (err != CUDA_SUCCESS) { - printf("cuCtxCreate failed (%d)\n", err); - return false; - } - - return true; -} - -void destroy_cuda_context(CUcontext cuda_ctx) { - cuCtxDestroy(cuda_ctx); -} - -#endif diff --git a/test/common/gpu_object_lifetime_tracker.h b/test/common/gpu_object_lifetime_tracker.h index 1734a59f2f00..b17dd9118413 100644 --- a/test/common/gpu_object_lifetime_tracker.h +++ b/test/common/gpu_object_lifetime_tracker.h @@ -22,24 +22,25 @@ class GpuObjectLifetimeTracker { } }; - std::array object_types = {{ - {"Caching compiled kernel:", "Releasing cached compilation:"}, - + std::array object_types = {{ // OpenCL objects {"clCreateContext", "clReleaseContext", true}, {"clCreateCommandQueue", "clReleaseCommandQueue", true}, // This handles both "clCreateProgramWithSource" and // "clCreateProgramWithBinary". + {"clCreateProgram", "clReleaseProgram"}, {"clCreateBuffer", "clReleaseMemObject"}, {"clCreateKernel", "clReleaseKernel"}, // CUDA objects {"cuCtxCreate", "cuCtxDestroy", true}, + {"cuModuleLoad", "cuModuleUnload"}, {"cuMemAlloc", "cuMemFree"}, // Metal objects {"Allocating: MTLCreateSystemDefaultDevice", "Releasing: MTLCreateSystemDefaultDevice", true}, {"Allocating: new_command_queue", "Releasing: new_command_queue"}, + {"Allocating: new_library_with_source", "Releasing: new_library_with_source"}, // Hexagon objects {"halide_remote_load_library", "halide_remote_release_library"}, diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 68692f12f1ba..e1ba9d8e4eb3 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -134,7 +134,6 @@ tests(GROUPS correctness gpu_give_input_buffers_device_allocations.cpp gpu_jit_explicit_copy_to_device.cpp gpu_large_alloc.cpp - gpu_many_kernels.cpp gpu_mixed_dimensionality.cpp gpu_mixed_shared_mem_types.cpp gpu_multi_kernel.cpp diff --git a/test/correctness/gpu_many_kernels.cpp b/test/correctness/gpu_many_kernels.cpp deleted file mode 100644 index d9572b38c18c..000000000000 --- a/test/correctness/gpu_many_kernels.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include "Halide.h" -#include - -#include "halide_benchmark.h" - -// This test makes sure GPU runtimes can handle many different small -// kernels and can handle releasing a device context and making a new -// one and still have many kernels work. This is needed due to kernel -// compilation caching mechanisms in the GPU runtimes. - -using namespace Halide; - -constexpr size_t kNumKernels = 70; - -int main(int argc, char **argv) { - Var x, y, xi, yi; - Func adders[kNumKernels]; - ImageParam input(Int(32), 2); - - Target target = get_jit_target_from_environment(); - int i = 1; - for (Func &f : adders) { - f(x, y) = input(x, y) + i; - if (target.has_gpu_feature()) { - f.compute_root().gpu_tile(x, y, xi, yi, 16, 16); - } else { - f.compute_root().vectorize(x, target.natural_vector_size()); - } - i += 1; - } - - auto start = Halide::Tools::benchmark_now(); - - Buffer buf_a_store(32, 32); - Buffer buf_b_store(32, 32); - Buffer *buf_in = &buf_a_store; - Buffer *buf_out = &buf_b_store; - buf_in->fill(0); - for (Func &f : adders) { - input.set(*buf_in); - f.realize(*buf_out); - std::swap(buf_in, buf_out); - } - buf_in->copy_to_host(); - - auto end = Halide::Tools::benchmark_now(); - double initial_runtime = Halide::Tools::benchmark_duration_seconds(start, end); - - buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); }); - - start = Halide::Tools::benchmark_now(); - - buf_in->fill(0); - for (Func &f : adders) { - input.set(*buf_in); - f.realize(*buf_out); - std::swap(buf_in, buf_out); - } - buf_in->copy_to_host(); - - end = Halide::Tools::benchmark_now(); - double precompiled_runtime = Halide::Tools::benchmark_duration_seconds(start, end); - - buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); }); - - buf_a_store.device_free(); - buf_b_store.device_free(); - const halide_device_interface_t *device = get_device_interface_for_device_api(DeviceAPI::Default_GPU, target); - if (device != nullptr) { - device->device_release(nullptr, device); - } - - start = Halide::Tools::benchmark_now(); - - buf_in->fill(0); - for (Func &f : adders) { - input.set(*buf_in); - f.realize(*buf_out); - std::swap(buf_in, buf_out); - } - buf_in->copy_to_host(); - - end = Halide::Tools::benchmark_now(); - double second_runtime = Halide::Tools::benchmark_duration_seconds(start, end); - - buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); }); - - printf("Initial runtime %f, precompiled runtime %f, second runtime %f.\n", initial_runtime, precompiled_runtime, second_runtime); - - printf("Success!\n"); - return 0; -} diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index 130af49b7f93..2db1d3bc4f20 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -257,19 +257,6 @@ halide_define_aot_test(external_code GEN_DEPS external_code_generator_deps) # float16_t_generator.cpp halide_define_aot_test(float16_t) -# gpu_multi_context_threaded_aottest.cpp -# gpu_multi_context_threaded_generator.cpp -halide_define_aot_test(gpu_multi_context_threaded - OMIT_DEFAULT_GENERATOR - EXTRA_LIBS - gpu_multi_context_threaded_add - gpu_multi_context_threaded_mul) - -add_halide_library(gpu_multi_context_threaded_add FROM gpu_multi_context_threaded.generator - FEATURES user_context) -add_halide_library(gpu_multi_context_threaded_mul FROM gpu_multi_context_threaded.generator - FEATURES user_context) - # gpu_object_lifetime_aottest.cpp # gpu_object_lifetime_generator.cpp halide_define_aot_test(gpu_object_lifetime FEATURES debug) diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp index 9e4fba70afe6..f0fd8ef9cd04 100644 --- a/test/generator/acquire_release_aottest.cpp +++ b/test/generator/acquire_release_aottest.cpp @@ -14,25 +14,89 @@ int main(int argc, char **argv) { #include #include "acquire_release.h" -#include "gpu_context.h" using namespace Halide::Runtime; const int W = 256, H = 256; #if defined(TEST_OPENCL) +// Implement OpenCL custom context. + +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#ifdef __APPLE__ +#include +#else +#include +#endif // Just use a global context and queue, created and destroyed by main. cl_context cl_ctx = nullptr; cl_command_queue cl_q = nullptr; // Create the global context. This is just a helper function not called by Halide. -bool init_context() { - return create_opencl_context(cl_ctx, cl_q); +int init_context() { + cl_int err = 0; + + const cl_uint maxPlatforms = 4; + cl_platform_id platforms[maxPlatforms]; + cl_uint platformCount = 0; + + err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount); + if (err != CL_SUCCESS) { + printf("clGetPlatformIDs failed (%d)\n", err); + return err; + } + + cl_platform_id platform = nullptr; + + if (platformCount > 0) { + platform = platforms[0]; + } + if (platform == nullptr) { + printf("Failed to get platform\n"); + return CL_INVALID_PLATFORM; + } + + cl_device_type device_type = CL_DEVICE_TYPE_ALL; + + // Make sure we have a device + const cl_uint maxDevices = 4; + cl_device_id devices[maxDevices]; + cl_uint deviceCount = 0; + err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount); + if (err != CL_SUCCESS) { + printf("clGetDeviceIDs failed (%d)\n", err); + return err; + } + if (deviceCount == 0) { + printf("Failed to get device\n"); + return CL_DEVICE_NOT_FOUND; + } + + cl_device_id dev = devices[deviceCount - 1]; + + // Create context and command queue. + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, + 0}; + cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err); + if (err != CL_SUCCESS) { + printf("clCreateContext failed (%d)\n", err); + return err; + } + + cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err); + if (err != CL_SUCCESS) { + printf("clCreateCommandQueue failed (%d)\n", err); + return err; + } + printf("Created CL context %p\n", cl_ctx); + return 0; } void destroy_context() { - destroy_opencl_context(cl_ctx, cl_q); + printf("Destroying CL context %p\n", cl_ctx); + clReleaseCommandQueue(cl_q); + clReleaseContext(cl_ctx); cl_q = nullptr; cl_ctx = nullptr; } @@ -52,14 +116,61 @@ extern "C" int halide_release_cl_context(void *user_context) { return 0; } #elif defined(TEST_CUDA) +// Implement CUDA custom context. +#include + CUcontext cuda_ctx = nullptr; -bool init_context() { - return create_cuda_context(cuda_ctx); +int init_context() { + // Initialize CUDA + CUresult err = cuInit(0); + if (err != CUDA_SUCCESS) { + printf("cuInit failed (%d)\n", err); + return err; + } + + // Make sure we have a device + int deviceCount = 0; + err = cuDeviceGetCount(&deviceCount); + if (err != CUDA_SUCCESS) { + printf("cuGetDeviceCount failed (%d)\n", err); + return err; + } + if (deviceCount <= 0) { + printf("No CUDA devices available\n"); + return CUDA_ERROR_NO_DEVICE; + } + + CUdevice dev; + // Get device + CUresult status; + // Try to get a device >0 first, since 0 should be our display device + // For now, don't try devices > 2 to maintain compatibility with previous behavior. + if (deviceCount > 2) deviceCount = 2; + for (int id = deviceCount - 1; id >= 0; id--) { + status = cuDeviceGet(&dev, id); + if (status == CUDA_SUCCESS) break; + } + + if (status != CUDA_SUCCESS) { + printf("Failed to get CUDA device\n"); + return status; + } + + // Create context + err = cuCtxCreate(&cuda_ctx, 0, dev); + if (err != CUDA_SUCCESS) { + printf("cuCtxCreate failed (%d)\n", err); + return err; + } + printf("Created CUDA context %p\n", cuda_ctx); + + return 0; } void destroy_context() { - destroy_cuda_context(cuda_ctx); + printf("Destroying CUDA context %p\n", cuda_ctx); + cuCtxDestroy(cuda_ctx); cuda_ctx = nullptr; } @@ -78,18 +189,19 @@ extern "C" int halide_cuda_release_context(void *user_context) { } #else // Just use the default implementation of acquire/release. -bool init_context() { +int init_context() { printf("Using default implementation of acquire/release\n"); - return true; + return 0; } void destroy_context() { } #endif -bool run_test() { +int main(int argc, char **argv) { // Initialize the runtime specific GPU context. - if (!init_context()) { - return false; + int ret = init_context(); + if (ret != 0) { + return ret; } // Everything else is a normal Halide program. The GPU runtime will call @@ -115,40 +227,19 @@ bool run_test() { if (input(x, y) * 2.0f + 1.0f != output(x, y)) { printf("Error at (%d, %d): %f != %f\n", x, y, input(x, y) * 2.0f + 1.0f, output(x, y)); - return false; + return -1; } } } - const halide_device_interface_t *interface = output.raw_buffer()->device_interface; - // We need to free our GPU buffers before destroying the context. input.device_free(); output.device_free(); - if (interface != nullptr) { - halide_device_release(nullptr, interface); - } else { - printf("Device interface is nullptr.\n"); - return false; - } - // Free the context we created. destroy_context(); printf("Success!\n"); - return true; -} - -int main(int argc, char **argv) { - if (!run_test()) { - return -1; - } - - if (!run_test()) { - return -1; - } - return 0; } diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp deleted file mode 100644 index 05852909ba57..000000000000 --- a/test/generator/gpu_multi_context_threaded_aottest.cpp +++ /dev/null @@ -1,193 +0,0 @@ -#include - -// This test demonstrates how to use more than one GPU context with -// Halide generated GPU support, specifically in a multithreaded -// program. It of course also tests that this works correctly with the -// Halide GPU runtimes. - -#ifdef _WIN32 -int main(int argc, char **argv) { - printf("[SKIP] Test requires weak linkage, which is not available on Windows.\n"); - return 0; -} -#else - -#include "HalideBuffer.h" -#include "HalideRuntime.h" -#include -#include -#include -#include - -#include "gpu_context.h" - -#include "gpu_multi_context_threaded_add.h" -#include "gpu_multi_context_threaded_mul.h" - -using namespace Halide::Runtime; - -const int W = 32, H = 32; - -#if defined(TEST_OPENCL) - -struct gpu_context { - cl_context cl_ctx; - cl_command_queue cl_q; -}; - -// Create the global context. This is just a helper function not called by Halide. -bool init_context(gpu_context &context) { - return create_opencl_context(context.cl_ctx, context.cl_q); -} - -void destroy_context(gpu_context &context) { - destroy_opencl_context(context.cl_ctx, context.cl_q); - cl_q = nullptr; - cl_ctx = nullptr; -} - -// These functions replace the acquire/release implementation in src/runtime/opencl.cpp. -// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization -// in our implementation of these functions. -extern "C" int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl_command_queue *q, bool create) { - if (user_context == nullptr) { - assert(!create); - *ctx = nullptr; - *q = nullptr; - } else { - const gpu_context *context = (const gpu_context *)user_context; - *ctx = context->cl_ctx; - *q = context->cl_q; - } - return 0; -} - -extern "C" int halide_release_cl_context(void *user_context) { - return 0; -} - -#define HAS_MULTIPLE_CONTEXTS true -#elif defined(TEST_CUDA) - -typedef CUcontext gpu_context; - -bool init_context(CUcontext &cuda_ctx) { - return create_cuda_context(cuda_ctx); -} - -void destroy_context(CUcontext &cuda_ctx) { - destroy_cuda_context(cuda_ctx); - cuda_ctx = nullptr; -} - -// These functions replace the acquire/release implementation in src/runtime/cuda.cpp. -// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization -// in our implementation of these functions. -extern "C" int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create) { - if (user_context == nullptr) { - assert(!create); - *ctx = nullptr; - } else { - *ctx = *(CUcontext *)user_context; - } - return 0; -} - -extern "C" int halide_cuda_release_context(void *user_context) { - return 0; -} - -#define HAS_MULTIPLE_CONTEXTS true -#else -typedef int gpu_context; - -// Just use the default implementation of acquire/release. -bool init_context(int &context) { - printf("Using default implementation of acquire/release\n"); - context = 0; - return true; -} -void destroy_context(int & /* context */) { - -#define HAS_MULTIPLE_CONTEXTS false -#endif - -void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) { - gpu_context context2; - - Buffer buf1_in(W, H); - Buffer buf1_result(W, H); - buf1_in.fill(0); - - const halide_device_interface_t *device_interface; - - int val = 0; - for (int i = 0; i < 10; i++) { - init_context(context2); - - Buffer buf2_in(W, H); - Buffer buf2_result(W, H); - buf2_in.fill(0); - - gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result); - gpu_multi_context_threaded_mul(&context1, buf1_result, buf1_in); - gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result); - - gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result); - gpu_multi_context_threaded_mul(&context2, buf2_result, buf2_in); - gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result); - - buf1_result.copy_to_host(&context1); - buf2_result.copy_to_host(&context2); - - val += 2; - val *= 2; - assert(buf1_result.all_equal(val + 2)); - assert(buf2_result.all_equal(6)); - - device_interface = buf1_result.raw_buffer()->device_interface; - - // About to destroy context, so ensure allocations are freed first. - buf2_in.device_free(&context2); - buf2_result.device_free(&context2); - - if (device_interface != nullptr) { - halide_device_release(&context2, device_interface); - } - destroy_context(context2); - } - - // About to destroy context, so ensure allocations are freed first. - buf1_in.device_free(&context1); - buf1_result.device_free(&context1); - - if (destroy_when_done && device_interface != nullptr) { - halide_device_release(&context1, device_interface); - destroy_context(context1); - } -} - -int main(int argc, char **argv) { - gpu_context contexta; - init_context(contexta); - - gpu_context contextb; - init_context(contextb); - - std::thread thread1(run_kernels_on_thread, contexta, false); - std::thread thread2(run_kernels_on_thread, contextb, false); - - thread1.join(); - thread2.join(); - - // Make sure using the same context on different threads works. - std::thread thread3(run_kernels_on_thread, contexta, HAS_MULTIPLE_CONTEXTS); - std::thread thread4(run_kernels_on_thread, contextb, HAS_MULTIPLE_CONTEXTS); - - thread3.join(); - thread4.join(); - - printf("Success!\n"); - return 0; -} -#endif // !WIN32 diff --git a/test/generator/gpu_multi_context_threaded_generator.cpp b/test/generator/gpu_multi_context_threaded_generator.cpp deleted file mode 100644 index 42f278b6379b..000000000000 --- a/test/generator/gpu_multi_context_threaded_generator.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include "Halide.h" - -namespace { - -class GpuAdd : public Halide::Generator { -public: - Input> input{"input", 2}; - - Output> output{"output", 2}; - - void generate() { - Var x("x"), y("y"); - - // Create a simple pipeline that scales pixel values by 2. - output(x, y) = input(x, y) + 2; - - Target target = get_target(); - if (target.has_gpu_feature()) { - Var xo, yo, xi, yi; - output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16); - } - } -}; - -class GpuMul : public Halide::Generator { -public: - Input> input{"input", 2}; - - Output> output{"output", 2}; - - void generate() { - Var x("x"), y("y"); - - // Create a simple pipeline that scales pixel values by 2. - output(x, y) = input(x, y) * 2; - - Target target = get_target(); - if (target.has_gpu_feature()) { - Var xo, yo, xi, yi; - output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16); - } - } -}; - -} // namespace - -HALIDE_REGISTER_GENERATOR(GpuAdd, gpu_multi_context_threaded_add) -HALIDE_REGISTER_GENERATOR(GpuMul, gpu_multi_context_threaded_mul)