From d6f60538c67cddf830890694c97c1385f7c91032 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 7 Dec 2020 13:01:04 -0800 Subject: [PATCH 01/24] Revert "Revert "Make context handling in GPU runtimes more consistent and robust. (#5474)" (#5515)" This reverts commit 2ddd0b0fe70c709813079c89ec6bdc52d27c9093. --- Makefile | 26 +++ src/runtime/cuda.cpp | 163 ++++---------- src/runtime/d3d12compute.cpp | 78 +++---- src/runtime/destructors.cpp | 1 + src/runtime/gpu_context_common.h | 197 +++++++++++++++++ src/runtime/metal.cpp | 68 ++---- src/runtime/opencl.cpp | 208 ++++++++---------- test/common/gpu_context.h | 134 +++++++++++ test/common/gpu_object_lifetime_tracker.h | 7 +- test/correctness/CMakeLists.txt | 1 + test/correctness/gpu_many_kernels.cpp | 92 ++++++++ test/generator/CMakeLists.txt | 13 ++ test/generator/acquire_release_aottest.cpp | 159 +++---------- .../gpu_multi_context_threaded_aottest.cpp | 193 ++++++++++++++++ .../gpu_multi_context_threaded_generator.cpp | 48 ++++ 15 files changed, 923 insertions(+), 465 deletions(-) create mode 100644 src/runtime/gpu_context_common.h create mode 100644 test/common/gpu_context.h create mode 100644 test/correctness/gpu_many_kernels.cpp create mode 100644 test/generator/gpu_multi_context_threaded_aottest.cpp create mode 100644 test/generator/gpu_multi_context_threaded_generator.cpp diff --git a/Makefile b/Makefile index ba9fcd3f9b4d..6ce06a94b568 100644 --- a/Makefile +++ b/Makefile @@ -1527,6 +1527,12 @@ $(FILTERS_DIR)/nested_externs_%.a: $(BIN_DIR)/nested_externs.generator @mkdir -p $(@D) $(CURDIR)/$< -g nested_externs_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime +# Similarly, gpu_multi needs two different kernels to test compilation caching. +# Also requies user-context. +$(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_threaded.generator + @mkdir -p $(@D) + $(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context + GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS) @@ -1622,11 +1628,31 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget HL_MULTITARGET_TEST_USE_NOBOUNDSQUERY_FEATURE=1 $(CURDIR)/$< @-echo +# gpu_multi_context_threaded has additional deps to link in +$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \ + $(FILTERS_DIR)/gpu_multi_context_threaded_add.a \ + $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \ + $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a + @mkdir -p $(@D) + $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@ + +$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \ + $(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp \ + $(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp \ + $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a + @mkdir -p $(@D) + $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@ + # nested externs doesn't actually contain a generator named # "nested_externs", and has no internal tests in any case. test_generator_nested_externs: @echo "Skipping" +# gpu_multi actually contain a generator named +# "gpu_multi", and has no internal tests in any case. +test_generator_gpu_multi: + @echo "Skipping" + $(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h @mkdir -p $(@D) $(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@ diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 7c423e179d85..1dc1cf27086e 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1,6 +1,7 @@ #include "HalideRuntimeCuda.h" #include "device_buffer_utils.h" #include "device_interface.h" +#include "gpu_context_common.h" #include "mini_cuda.h" #include "printer.h" #include "scoped_mutex_lock.h" @@ -239,43 +240,7 @@ class Context { } }; -// Halide allocates a device API controlled pointer slot as part of -// each compiled module. The slot is used to store information to -// avoid having to reload/recompile kernel code on each call into a -// Halide filter. The cuda runtime uses this pointer to maintain a -// linked list of contexts into which the module has been loaded. -// -// A global list of all registered filters is also kept so all modules -// loaded on a given context can be unloaded and removed from the list -// when halide_device_release is called on a specific context. -// -// The registered_filters struct is not freed as it is pointed to by the -// Halide generated code. The module_state structs are freed. - -struct module_state { - CUcontext context; - CUmodule module; - module_state *next; -}; - -struct registered_filters { - module_state *modules; - registered_filters *next; -}; -WEAK registered_filters *filters_list = nullptr; -// This spinlock protects the above filters_list. -WEAK halide_mutex filters_list_lock; - -WEAK module_state *find_module_for_context(const registered_filters *filters, CUcontext ctx) { - module_state *modules = filters->modules; - while (modules != nullptr) { - if (modules->context == ctx) { - return modules; - } - modules = modules->next; - } - return nullptr; -} +WEAK Halide::Internal::GPUCompilationCache compilation_cache; WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) { // Initialize CUDA @@ -505,6 +470,33 @@ WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size #endif } +WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size) { + debug(user_context) << "CUDA: compile_kernel cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> "; + + CUjit_option options[] = {CU_JIT_MAX_REGISTERS}; + unsigned int max_regs_per_thread = 64; + + // A hack to enable control over max register count for + // testing. This should be surfaced in the schedule somehow + // instead. + char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS"); + if (regs) { + max_regs_per_thread = atoi(regs); + } + void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread}; + CUmodule loaded_module; + CUresult err = cuModuleLoadDataEx(&loaded_module, ptx_src, 1, options, optionValues); + + if (err != CUDA_SUCCESS) { + error(user_context) << "CUDA: cuModuleLoadData failed: " + << get_error_name(err); + return nullptr; + } else { + debug(user_context) << (void *)(loaded_module) << "\n"; + } + return loaded_module; +} + } // namespace Cuda } // namespace Internal } // namespace Runtime @@ -526,54 +518,12 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co uint64_t t_before = halide_current_time_ns(user_context); #endif - halide_assert(user_context, &filters_list_lock != nullptr); - { - ScopedMutexLock spinlock(&filters_list_lock); - - // Create the state object if necessary. This only happens once, regardless - // of how many times halide_initialize_kernels/halide_release is called. - // halide_release traverses this list and releases the module objects, but - // it does not modify the list nodes created/inserted here. - registered_filters **filters = (registered_filters **)state_ptr; - if (!(*filters)) { - *filters = (registered_filters *)malloc(sizeof(registered_filters)); - (*filters)->modules = nullptr; - (*filters)->next = filters_list; - filters_list = *filters; - } - - // Create the module itself if necessary. - module_state *loaded_module = find_module_for_context(*filters, ctx.context); - if (loaded_module == nullptr) { - loaded_module = (module_state *)malloc(sizeof(module_state)); - debug(user_context) << " cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> "; - - CUjit_option options[] = {CU_JIT_MAX_REGISTERS}; - unsigned int max_regs_per_thread = 64; - - // A hack to enable control over max register count for - // testing. This should be surfaced in the schedule somehow - // instead. - char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS"); - if (regs) { - max_regs_per_thread = atoi(regs); - } - void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread}; - CUresult err = cuModuleLoadDataEx(&loaded_module->module, ptx_src, 1, options, optionValues); - - if (err != CUDA_SUCCESS) { - free(loaded_module); - error(user_context) << "CUDA: cuModuleLoadData failed: " - << get_error_name(err); - return err; - } else { - debug(user_context) << (void *)(loaded_module->module) << "\n"; - } - loaded_module->context = ctx.context; - loaded_module->next = (*filters)->modules; - (*filters)->modules = loaded_module; - } - } // spinlock + CUmodule loaded_module; + if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, loaded_module, + compile_kernel, user_context, ptx_src, size)) { + return halide_error_code_generic_error; + } + halide_assert(user_context, loaded_module != nullptr); #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -704,7 +654,7 @@ WEAK int halide_cuda_device_release(void *user_context) { << "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n"; // If we haven't even loaded libcuda, don't load it just to quit. - if (!lib_cuda) { + if (!cuInit) { return 0; } @@ -728,34 +678,7 @@ WEAK int halide_cuda_device_release(void *user_context) { // Dump the contents of the free list, ignoring errors. halide_cuda_release_unused_device_allocations(user_context); - { - ScopedMutexLock spinlock(&filters_list_lock); - - // Unload the modules attached to this context. Note that the list - // nodes themselves are not freed, only the module objects are - // released. Subsequent calls to halide_init_kernels might re-create - // the program object using the same list node to store the module - // object. - registered_filters *filters = filters_list; - while (filters) { - module_state **prev_ptr = &filters->modules; - module_state *loaded_module = filters->modules; - while (loaded_module != nullptr) { - if (loaded_module->context == ctx) { - debug(user_context) << " cuModuleUnload " << loaded_module->module << "\n"; - err = cuModuleUnload(loaded_module->module); - halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED); - *prev_ptr = loaded_module->next; - free(loaded_module); - loaded_module = *prev_ptr; - } else { - loaded_module = loaded_module->next; - prev_ptr = &loaded_module->next; - } - } - filters = filters->next; - } - } // spinlock + compilation_cache.delete_context(user_context, ctx, cuModuleUnload); CUcontext old_ctx; cuCtxPopCurrent(&old_ctx); @@ -919,12 +842,15 @@ WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c, << (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n"; if (!from_host && to_host) { debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n"; + copy_name = "cuMemcpyDtoH"; err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size); } else if (from_host && !to_host) { debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n"; + copy_name = "cuMemcpyHtoD"; err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size); } else if (!from_host && !to_host) { debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n"; + copy_name = "cuMemcpyDtoD"; err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size); } else if (dst != src) { debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n"; @@ -1133,9 +1059,9 @@ WEAK int halide_cuda_run(void *user_context, #endif halide_assert(user_context, state_ptr); - module_state *loaded_module = find_module_for_context((registered_filters *)state_ptr, ctx.context); - halide_assert(user_context, loaded_module != nullptr); - CUmodule mod = loaded_module->module; + CUmodule mod = nullptr; + bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod); + halide_assert(user_context, found_module && mod != nullptr); debug(user_context) << "Got module " << mod << "\n"; halide_assert(user_context, mod); CUfunction f; @@ -1264,7 +1190,7 @@ WEAK const halide_device_interface_t *halide_cuda_device_interface() { } WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) { - if (!lib_cuda) { + if (!lib_cuda && !cuInit) { // If cuda can't be found, we want to return 0, 0 and it's not // considered an error. So we should be very careful about // looking for libcuda without tripping any errors in the rest @@ -1313,6 +1239,7 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min namespace { WEAK __attribute__((destructor)) void halide_cuda_cleanup() { + compilation_cache.release_all(nullptr, cuModuleUnload); halide_cuda_device_release(nullptr); } } // namespace diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 49ef68a0118a..3174c33f52a4 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -45,6 +45,7 @@ #include "HalideRuntimeD3D12Compute.h" #include "device_buffer_utils.h" #include "device_interface.h" +#include "gpu_context_common.h" #include "printer.h" #include "scoped_spin_lock.h" @@ -2437,16 +2438,24 @@ static void *buffer_contents(d3d12_buffer *buffer) { volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0; -// Structure to hold the state of a module attached to the context. -// Also used as a linked-list to keep track of all the different -// modules that are attached to a context in order to release them all -// when then context is released. -struct module_state { - d3d12_library *library; - module_state *next; -}; -D3D12TYPENAME(module_state) -WEAK module_state *state_list = nullptr; +WEAK Halide::Internal::GPUCompilationCache compilation_cache; + +WEAK d3d12_library *compile_kernel(void *user_context, const char *source, int source_size, int *error_ret) { + D3D12ContextHolder d3d12_context(user_context, true); + if (d3d12_context.error != 0) { + *error_ret = d3d12_context.error; + return nullptr; + } + + d3d12_library *library = new_library_with_source(d3d12_context.device, source, source_size); + if (library == nullptr) { + TRACEFATAL("D3D12Compute: new_library_with_source failed."); + *error_ret = halide_error_code_out_of_memory; + return nullptr; + } + + return library; +} } // namespace D3D12Compute } // namespace Internal @@ -2754,29 +2763,14 @@ WEAK int halide_d3d12compute_device_free(void *user_context, halide_buffer_t *bu WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) { TRACELOG; - // Create the state object if necessary. This only happens once, regardless - // of how many times halide_initialize_kernels/halide_release is called. - // halide_release traverses this list and releases the module objects, but - // it does not modify the list nodes created/inserted here. - module_state *&state = *(module_state **)state_ptr; - if (!state) { - state = malloct(); - state->library = nullptr; - state->next = state_list; - state_list = state; - } - D3D12ContextHolder d3d12_context(user_context, true); - if (d3d12_context.error != 0) { - return d3d12_context.error; - } - if (state->library == nullptr) { - state->library = new_library_with_source(d3d12_context.device, source, source_size); - if (state->library == nullptr) { - TRACEFATAL("D3D12Compute: new_library_with_source failed."); - return halide_error_code_out_of_memory; - } + int error = halide_error_code_generic_error; + d3d12_library *library; + if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device, + library, compile_kernel, user_context, + source, source_size, &error)) { + return error; } return 0; @@ -2839,19 +2833,7 @@ WEAK int halide_d3d12compute_device_release(void *user_context) { release_object(buffer); } - // Unload the modules attached to this device. Note that the list - // nodes themselves are not freed, only the program objects are - // released. Subsequent calls to halide_init_kernels might re-create - // the program object using the same list node to store the program - // object. - module_state *state = state_list; - while (state) { - if (state->library) { - release_object(state->library); - state->library = nullptr; - } - state = state->next; - } + compilation_cache.delete_context(user_context, device, release_object); // Release the device itself, if we created it. if (acquired_device == device) { @@ -3023,8 +3005,9 @@ WEAK int halide_d3d12compute_run(void *user_context, StartCapturingGPUActivity(); #endif - halide_assert(user_context, state_ptr); - module_state *state = (module_state *)state_ptr; + d3d12_library *library = nullptr; + bool found_module = compilation_cache.lookup(device, state_ptr, library); + halide_assert(user_context, found_module && library != nullptr); d3d12_frame *frame = acquire_frame(device); d3d12_compute_command_list *cmdList = frame->cmd_list; @@ -3036,7 +3019,7 @@ WEAK int halide_d3d12compute_run(void *user_context, d3d12_compute_pipeline_state *pipeline_state = nullptr; { TRACE_SCOPE("kernel shader selection"); - function = new_function_with_name(device, state->library, entry_name, strlen(entry_name), + function = new_function_with_name(device, library, entry_name, strlen(entry_name), shared_mem_bytes, threadsX, threadsY, threadsZ); halide_assert(user_context, function); pipeline_state = function->pipeline_state; @@ -3531,6 +3514,7 @@ WEAK const struct halide_device_interface_t *halide_d3d12compute_device_interfac namespace { WEAK __attribute__((destructor)) void halide_d3d12compute_cleanup() { TRACELOG; + compilation_cache.release_all(nullptr, release_object); halide_d3d12compute_device_release(nullptr); } } // namespace diff --git a/src/runtime/destructors.cpp b/src/runtime/destructors.cpp index 1fa7228a86e2..b7187c6c4d86 100644 --- a/src/runtime/destructors.cpp +++ b/src/runtime/destructors.cpp @@ -1,4 +1,5 @@ #include "HalideRuntime.h" +#include "printer.h" extern "C" { diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h new file mode 100644 index 000000000000..9ef2662026a8 --- /dev/null +++ b/src/runtime/gpu_context_common.h @@ -0,0 +1,197 @@ +#include "printer.h" +#include "scoped_mutex_lock.h" + +namespace Halide { +namespace Internal { + +template +class GPUCompilationCache { + struct CachedCompilation { + ContextT context{}; + ModuleStateT module_state{}; + uint32_t kernel_id{}; + }; + + halide_mutex mutex; + + static constexpr float kLoadFactor{.5f}; + static constexpr int kInitialTableBits{7}; + int log2_compilations_size{0}; // number of bits in index into compilations table. + CachedCompilation *compilations{nullptr}; + int count{0}; + + static constexpr uint32_t kInvalidId{0}; + static constexpr uint32_t kDeletedId{1}; + + uint32_t unique_id{2}; // zero is an invalid id + +public: + static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) { + uintptr_t addr = (uintptr_t)context + id; + // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39... + // in hexadecimal. + if (sizeof(uintptr_t) >= 8) { + return (addr * (uintptr_t)0x9E3779B97F4A7C15) >> (64 - bits); + } else { + return (addr * (uintptr_t)0x9E3779B9) >> (32 - bits); + } + } + + HALIDE_MUST_USE_RESULT bool insert(ContextT context, uint32_t id, ModuleStateT module_state) { + if (log2_compilations_size == 0) { + if (!resize_table(kInitialTableBits)) { + return false; + } + } + if ((count + 1) > (1 << log2_compilations_size) * kLoadFactor) { + if (!resize_table(log2_compilations_size + 1)) { + return false; + } + } + count += 1; + uintptr_t index = kernel_hash(context, id, log2_compilations_size); + for (int i = 0; i < (1 << log2_compilations_size); i++) { + uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1); + if (compilations[effective_index].kernel_id <= kDeletedId) { + compilations[effective_index].context = context; + compilations[effective_index].module_state = module_state; + compilations[effective_index].kernel_id = id; + return true; + } + } + // This is a logic error that should never occur. It means the table is + // full, but it should have been resized. + halide_assert(nullptr, false); + return false; + } + + HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) { + if (log2_compilations_size == 0) { + return false; + } + uintptr_t index = kernel_hash(context, id, log2_compilations_size); + for (int i = 0; i < (1 << log2_compilations_size); i++) { + uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1); + + if (compilations[effective_index].kernel_id == kInvalidId) { + return false; + } + if (compilations[effective_index].context == context && + compilations[effective_index].kernel_id == id) { + module_state = &compilations[effective_index].module_state; + return true; + } + } + return false; + } + + HALIDE_MUST_USE_RESULT bool lookup(ContextT context, void *state_ptr, ModuleStateT &module_state) { + ScopedMutexLock lock_guard(&mutex); + uint32_t id = (uint32_t)(uintptr_t)state_ptr; + ModuleStateT *mod_ptr; + if (find_internal(context, id, mod_ptr)) { + module_state = *mod_ptr; + return true; + } + return false; + } + + HALIDE_MUST_USE_RESULT bool resize_table(int size_bits) { + if (size_bits != log2_compilations_size) { + int new_size = (1 << size_bits); + int old_size = (1 << log2_compilations_size); + CachedCompilation *new_table = (CachedCompilation *)malloc(new_size * sizeof(CachedCompilation)); + if (new_table == nullptr) { + // signal error. + return false; + } + memset(new_table, 0, new_size * sizeof(CachedCompilation)); + CachedCompilation *old_table = compilations; + compilations = new_table; + log2_compilations_size = size_bits; + + if (count > 0) { // Mainly to catch empty initial table case + for (int32_t i = 0; i < old_size; i++) { + if (old_table[i].kernel_id != kInvalidId && + old_table[i].kernel_id != kDeletedId) { + bool result = insert(old_table[i].context, old_table[i].kernel_id, + old_table[i].module_state); + halide_assert(nullptr, result); // Resizing the table while resizing the table is a logic error. + } + } + } + free(old_table); + } + return true; + } + + template + void release_context(void *user_context, bool all, ContextT context, FreeModuleT &f) { + if (count == 0) { + return; + } + + for (int i = 0; i < (1 << log2_compilations_size); i++) { + if (compilations[i].kernel_id > kInvalidId && + (all || (compilations[i].context == context))) { + f(compilations[i].module_state); + compilations[i].module_state = nullptr; + compilations[i].kernel_id = kDeletedId; + count--; + } + } + } + + template + void delete_context(void *user_context, ContextT context, FreeModuleT &f) { + ScopedMutexLock lock_guard(&mutex); + + release_context(user_context, false, context, f); + } + + template + void release_all(void *user_context, FreeModuleT &f) { + ScopedMutexLock lock_guard(&mutex); + + release_context(user_context, true, nullptr, f); + free(compilations); + compilations = nullptr; + log2_compilations_size = 0; + } + + template + HALIDE_MUST_USE_RESULT bool kernel_state_setup(void *user_context, void **state_ptr, + ContextT context, ModuleStateT &result, + CompileModuleT f, + Args... args) { + ScopedMutexLock lock_guard(&mutex); + + uint32_t *id_ptr = (uint32_t *)state_ptr; + if (*id_ptr == 0) { + *id_ptr = unique_id++; + } + + ModuleStateT *mod; + if (find_internal(context, *id_ptr, mod)) { + result = *mod; + return true; + } + + // TODO(zvookin): figure out the calling signature here... + ModuleStateT compiled_module = f(args...); + debug(user_context) << "Caching compiled kernel: " << compiled_module << " id " << *id_ptr << " context " << context << "\n"; + if (compiled_module == nullptr) { + return false; + } + + if (!insert(context, *id_ptr, compiled_module)) { + return false; + } + result = compiled_module; + + return true; + } +}; + +} // namespace Internal +} // namespace Halide diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index 1a2ba0ce52b4..25fe29f3feda 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -1,6 +1,7 @@ #include "HalideRuntimeMetal.h" #include "device_buffer_utils.h" #include "device_interface.h" +#include "gpu_context_common.h" #include "printer.h" #include "scoped_spin_lock.h" @@ -282,15 +283,7 @@ struct device_handle { uint64_t offset; }; -// Structure to hold the state of a module attached to the context. -// Also used as a linked-list to keep track of all the different -// modules that are attached to a context in order to release them all -// when then context is released. -struct module_state { - mtl_library *library; - module_state *next; -}; -WEAK module_state *state_list = nullptr; +WEAK Halide::Internal::GPUCompilationCache compilation_cache; // API Capabilities. If more capabilities need to be checked, // this can be refactored to something more robust/general. @@ -544,18 +537,6 @@ WEAK int halide_metal_device_free(void *user_context, halide_buffer_t *buf) { } WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) { - // Create the state object if necessary. This only happens once, regardless - // of how many times halide_initialize_kernels/halide_release is called. - // halide_release traverses this list and releases the module objects, but - // it does not modify the list nodes created/inserted here. - module_state **state = (module_state **)state_ptr; - if (!(*state)) { - *state = (module_state *)malloc(sizeof(module_state)); - (*state)->library = nullptr; - (*state)->next = state_list; - state_list = *state; - } - MetalContextHolder metal_context(user_context, true); if (metal_context.error != 0) { return metal_context.error; @@ -565,23 +546,13 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c uint64_t t_before = halide_current_time_ns(user_context); #endif - if ((*state)->library == nullptr) { -#ifdef DEBUG_RUNTIME - uint64_t t_before_compile = halide_current_time_ns(user_context); -#endif - - debug(user_context) << "Metal - Allocating: new_library_with_source " << (*state)->library << "\n"; - (*state)->library = new_library_with_source(metal_context.device, source, source_size); - if ((*state)->library == nullptr) { - error(user_context) << "Metal: new_library_with_source failed.\n"; - return -1; - } - -#ifdef DEBUG_RUNTIME - uint64_t t_after_compile = halide_current_time_ns(user_context); - debug(user_context) << "Time for halide_metal_initialize_kernels compilation: " << (t_after_compile - t_before_compile) / 1.0e6 << " ms\n"; -#endif + mtl_library *library; + if (!compilation_cache.kernel_state_setup(user_context, state_ptr, metal_context.device, library, + new_library_with_source, metal_context.device, + source, source_size)) { + return halide_error_code_generic_error; } + halide_assert(user_context, library != nullptr); #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -644,20 +615,7 @@ WEAK int halide_metal_device_release(void *user_context) { if (device) { halide_metal_device_sync_internal(queue, nullptr); - // Unload the modules attached to this device. Note that the list - // nodes themselves are not freed, only the program objects are - // released. Subsequent calls to halide_init_kernels might re-create - // the program object using the same list node to store the program - // object. - module_state *state = state_list; - while (state) { - if (state->library) { - debug(user_context) << "Metal - Releasing: new_library_with_source " << state->library << "\n"; - release_ns_object(state->library); - state->library = nullptr; - } - state = state->next; - } + compilation_cache.delete_context(user_context, device, release_ns_object); // Release the device itself, if we created it. if (acquired_device == device) { @@ -782,10 +740,11 @@ WEAK int halide_metal_run(void *user_context, return -1; } - halide_assert(user_context, state_ptr); - module_state *state = (module_state *)state_ptr; + mtl_library *library; + bool found_library = compilation_cache.lookup(metal_context.device, state_ptr, library); + halide_assert(user_context, found_library && library != nullptr); - mtl_function *function = new_function_with_name(state->library, entry_name, strlen(entry_name)); + mtl_function *function = new_function_with_name(library, entry_name, strlen(entry_name)); if (function == nullptr) { error(user_context) << "Metal: Could not get function " << entry_name << "from Metal library.\n"; return -1; @@ -1147,6 +1106,7 @@ WEAK const struct halide_device_interface_t *halide_metal_device_interface() { namespace { WEAK __attribute__((destructor)) void halide_metal_cleanup() { + compilation_cache.release_all(nullptr, release_ns_object); halide_metal_device_release(nullptr); } } // namespace diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 7c10033bde31..7c7815b620d7 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -1,6 +1,7 @@ #include "HalideRuntimeOpenCL.h" #include "device_buffer_utils.h" #include "device_interface.h" +#include "gpu_context_common.h" #include "printer.h" #include "scoped_spin_lock.h" @@ -286,15 +287,7 @@ struct device_handle { cl_mem mem; }; -// Structure to hold the state of a module attached to the context. -// Also used as a linked-list to keep track of all the different -// modules that are attached to a context in order to release them all -// when then context is released. -struct module_state { - cl_program program; - module_state *next; -}; -WEAK module_state *state_list = nullptr; +WEAK Halide::Internal::GPUCompilationCache compilation_cache; WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) { if (buf->device == 0) { @@ -556,6 +549,83 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q return err; } +WEAK cl_program compile_kernel(void *user_context, cl_context ctx, const char *src, int size) { + cl_int err = 0; + cl_device_id dev; + + err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr); + if (err != CL_SUCCESS) { + error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: " + << get_opencl_error_name(err); + return nullptr; + } + + cl_device_id devices[] = {dev}; + + // Get the max constant buffer size supported by this OpenCL implementation. + cl_ulong max_constant_buffer_size = 0; + err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr); + if (err != CL_SUCCESS) { + error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: " + << get_opencl_error_name(err); + return nullptr; + } + // Get the max number of constant arguments supported by this OpenCL implementation. + cl_uint max_constant_args = 0; + err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr); + if (err != CL_SUCCESS) { + error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: " + << get_opencl_error_name(err); + return nullptr; + } + + // Build the compile argument options. + stringstream options(user_context); + options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size + << " -D MAX_CONSTANT_ARGS=" << max_constant_args; + + const char *extra_options = halide_opencl_get_build_options(user_context); + options << " " << extra_options; + + const char *sources[] = {src}; + debug(user_context) << " clCreateProgramWithSource -> "; + cl_program program = clCreateProgramWithSource(ctx, 1, &sources[0], nullptr, &err); + if (err != CL_SUCCESS) { + debug(user_context) << get_opencl_error_name(err) << "\n"; + error(user_context) << "CL: clCreateProgramWithSource failed: " + << get_opencl_error_name(err); + return nullptr; + } else { + debug(user_context) << (void *)program << "\n"; + } + + debug(user_context) << " clBuildProgram " << (void *)program + << " " << options.str() << "\n"; + err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr); + if (err != CL_SUCCESS) { + + { + // Allocate an appropriately sized buffer for the build log. + Printer p(user_context); + + p << "CL: clBuildProgram failed: " + << get_opencl_error_name(err) + << "\nBuild Log:\n"; + + // Get build log + if (clGetProgramBuildInfo(program, dev, + CL_PROGRAM_BUILD_LOG, + p.capacity() - p.size() - 1, p.dst, + nullptr) != CL_SUCCESS) { + p << "clGetProgramBuildInfo failed (Printer buffer too small?)"; + } + } + + return nullptr; + } + return program; +} + } // namespace OpenCL } // namespace Internal } // namespace Runtime @@ -675,97 +745,13 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, uint64_t t_before = halide_current_time_ns(user_context); #endif - // Create the state object if necessary. This only happens once, regardless - // of how many times halide_init_kernels/halide_release is called. - // halide_release traverses this list and releases the program objects, but - // it does not modify the list nodes created/inserted here. - module_state **state = (module_state **)state_ptr; - if (!(*state)) { - *state = (module_state *)malloc(sizeof(module_state)); - (*state)->program = nullptr; - (*state)->next = state_list; - state_list = *state; - } - - // Create the program if necessary. TODO: The program object needs to not - // only already exist, but be created for the same context/device as the - // calling context/device. - if (!(*state && (*state)->program) && size > 1) { - cl_int err = 0; - cl_device_id dev; - - err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr); - if (err != CL_SUCCESS) { - error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: " - << get_opencl_error_name(err); - return err; - } - - cl_device_id devices[] = {dev}; - - // Get the max constant buffer size supported by this OpenCL implementation. - cl_ulong max_constant_buffer_size = 0; - err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr); - if (err != CL_SUCCESS) { - error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: " - << get_opencl_error_name(err); - return err; - } - // Get the max number of constant arguments supported by this OpenCL implementation. - cl_uint max_constant_args = 0; - err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr); - if (err != CL_SUCCESS) { - error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: " - << get_opencl_error_name(err); - return err; - } - - // Build the compile argument options. - stringstream options(user_context); - options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size - << " -D MAX_CONSTANT_ARGS=" << max_constant_args; - - const char *extra_options = halide_opencl_get_build_options(user_context); - options << " " << extra_options; - - const char *sources[] = {src}; - debug(user_context) << " clCreateProgramWithSource -> "; - cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], nullptr, &err); - if (err != CL_SUCCESS) { - debug(user_context) << get_opencl_error_name(err) << "\n"; - error(user_context) << "CL: clCreateProgramWithSource failed: " - << get_opencl_error_name(err); - return err; - } else { - debug(user_context) << (void *)program << "\n"; - } - - (*state)->program = program; - debug(user_context) << " clBuildProgram " << (void *)program - << " " << options.str() << "\n"; - err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr); - if (err != CL_SUCCESS) { - - { - // Allocate an appropriately sized buffer for the build log. - Printer p(user_context); - - p << "CL: clBuildProgram failed: " - << get_opencl_error_name(err) - << "\nBuild Log:\n"; - - // Get build log - if (clGetProgramBuildInfo(program, dev, - CL_PROGRAM_BUILD_LOG, - p.capacity() - p.size() - 1, p.dst, - nullptr) != CL_SUCCESS) { - p << "clGetProgramBuildInfo failed (Printer buffer too small?)"; - } - } - - return err; - } + debug(user_context) << "halide_cuda_initialize_kernels got compilation_cache mutex.\n"; + cl_program program; + if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, program, + compile_kernel, user_context, ctx.context, src, size)) { + return halide_error_code_generic_error; } + halide_assert(user_context, program != nullptr); #ifdef DEBUG_RUNTIME uint64_t t_after = halide_current_time_ns(user_context); @@ -820,21 +806,7 @@ WEAK int halide_opencl_device_release(void *user_context) { err = clFinish(q); halide_assert(user_context, err == CL_SUCCESS); - // Unload the modules attached to this context. Note that the list - // nodes themselves are not freed, only the program objects are - // released. Subsequent calls to halide_init_kernels might re-create - // the program object using the same list node to store the program - // object. - module_state *state = state_list; - while (state) { - if (state->program) { - debug(user_context) << " clReleaseProgram " << state->program << "\n"; - err = clReleaseProgram(state->program); - halide_assert(user_context, err == CL_SUCCESS); - state->program = nullptr; - } - state = state->next; - } + compilation_cache.delete_context(user_context, ctx, clReleaseProgram); // Release the context itself, if we created it. if (ctx == context) { @@ -1077,9 +1049,10 @@ WEAK int halide_opencl_run(void *user_context, // Create kernel object for entry_name from the program for this module. halide_assert(user_context, state_ptr); - cl_program program = ((module_state *)state_ptr)->program; + cl_program program; + bool found_program = compilation_cache.lookup(ctx.context, state_ptr, program); - halide_assert(user_context, program); + halide_assert(user_context, found_program && program != nullptr); debug(user_context) << " clCreateKernel " << entry_name << " -> "; cl_kernel f = clCreateKernel(program, entry_name, &err); if (err != CL_SUCCESS) { @@ -1366,6 +1339,7 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() { namespace { WEAK __attribute__((destructor)) void halide_opencl_cleanup() { + compilation_cache.release_all(nullptr, clReleaseProgram); halide_opencl_device_release(nullptr); } } // namespace @@ -1940,4 +1914,4 @@ extern "C" { WEAK const struct halide_device_interface_t *halide_opencl_image_device_interface() { return &opencl_image_device_interface; } -} \ No newline at end of file +} diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h new file mode 100644 index 000000000000..4816ad205866 --- /dev/null +++ b/test/common/gpu_context.h @@ -0,0 +1,134 @@ +#if defined(TEST_OPENCL) +// Implement OpenCL custom context. + +#define CL_TARGET_OPENCL_VERSION 120 +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#ifdef __APPLE__ +#include +#else +#include +#endif + +// Just use a global context and queue, created and destroyed by main. +cl_context cl_ctx = nullptr; +cl_command_queue cl_q = nullptr; + +// Create the global context. This is just a helper function not called by Halide. +bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) { + cl_int err = 0; + + const cl_uint maxPlatforms = 4; + cl_platform_id platforms[maxPlatforms]; + cl_uint platformCount = 0; + + err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount); + if (err != CL_SUCCESS) { + printf("clGetPlatformIDs failed (%d)\n", err); + return false; + } + + cl_platform_id platform = nullptr; + + if (platformCount > 0) { + platform = platforms[0]; + } + if (platform == nullptr) { + printf("Failed to get platform\n"); + return false; + } + + cl_device_type device_type = CL_DEVICE_TYPE_ALL; + + // Make sure we have a device + const cl_uint maxDevices = 4; + cl_device_id devices[maxDevices]; + cl_uint deviceCount = 0; + err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount); + if (err != CL_SUCCESS) { + printf("clGetDeviceIDs failed (%d)\n", err); + return false; + } + if (deviceCount == 0) { + printf("Failed to get device\n"); + return false; + } + + cl_device_id dev = devices[deviceCount - 1]; + + // Create context and command queue. + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, + 0}; + cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err); + if (err != CL_SUCCESS) { + printf("clCreateContext failed (%d)\n", err); + return false; + } + + cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err); + if (err != CL_SUCCESS) { + printf("clCreateCommandQueue failed (%d)\n", err); + return false; + } + return true; +} + +void destroy_opencl_context(cl_context cl_ctx, cl_command_queue cl_q) { + clReleaseCommandQueue(cl_q); + clReleaseContext(cl_ctx); +} + +#elif defined(TEST_CUDA) +// Implement CUDA custom context. +#include + +bool create_cuda_context(CUcontext &cuda_ctx) { + // Initialize CUDA + CUresult err = cuInit(0); + if (err != CUDA_SUCCESS) { + printf("cuInit failed (%d)\n", err); + return false; + } + + // Make sure we have a device + int deviceCount = 0; + err = cuDeviceGetCount(&deviceCount); + if (err != CUDA_SUCCESS) { + printf("cuGetDeviceCount failed (%d)\n", err); + return false; + } + if (deviceCount <= 0) { + printf("No CUDA devices available\n"); + return false; + } + + CUdevice dev; + // Get device + CUresult status; + // Try to get a device >0 first, since 0 should be our display device + // For now, don't try devices > 2 to maintain compatibility with previous behavior. + if (deviceCount > 2) deviceCount = 2; + for (int id = deviceCount - 1; id >= 0; id--) { + status = cuDeviceGet(&dev, id); + if (status == CUDA_SUCCESS) break; + } + + if (status != CUDA_SUCCESS) { + printf("Failed to get CUDA device\n"); + return status; + } + + // Create context + err = cuCtxCreate(&cuda_ctx, 0, dev); + if (err != CUDA_SUCCESS) { + printf("cuCtxCreate failed (%d)\n", err); + return false; + } + + return true; +} + +void destroy_cuda_context(CUcontext cuda_ctx) { + cuCtxDestroy(cuda_ctx); +} + +#endif diff --git a/test/common/gpu_object_lifetime_tracker.h b/test/common/gpu_object_lifetime_tracker.h index b17dd9118413..1734a59f2f00 100644 --- a/test/common/gpu_object_lifetime_tracker.h +++ b/test/common/gpu_object_lifetime_tracker.h @@ -22,25 +22,24 @@ class GpuObjectLifetimeTracker { } }; - std::array object_types = {{ + std::array object_types = {{ + {"Caching compiled kernel:", "Releasing cached compilation:"}, + // OpenCL objects {"clCreateContext", "clReleaseContext", true}, {"clCreateCommandQueue", "clReleaseCommandQueue", true}, // This handles both "clCreateProgramWithSource" and // "clCreateProgramWithBinary". - {"clCreateProgram", "clReleaseProgram"}, {"clCreateBuffer", "clReleaseMemObject"}, {"clCreateKernel", "clReleaseKernel"}, // CUDA objects {"cuCtxCreate", "cuCtxDestroy", true}, - {"cuModuleLoad", "cuModuleUnload"}, {"cuMemAlloc", "cuMemFree"}, // Metal objects {"Allocating: MTLCreateSystemDefaultDevice", "Releasing: MTLCreateSystemDefaultDevice", true}, {"Allocating: new_command_queue", "Releasing: new_command_queue"}, - {"Allocating: new_library_with_source", "Releasing: new_library_with_source"}, // Hexagon objects {"halide_remote_load_library", "halide_remote_release_library"}, diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index e1ba9d8e4eb3..68692f12f1ba 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -134,6 +134,7 @@ tests(GROUPS correctness gpu_give_input_buffers_device_allocations.cpp gpu_jit_explicit_copy_to_device.cpp gpu_large_alloc.cpp + gpu_many_kernels.cpp gpu_mixed_dimensionality.cpp gpu_mixed_shared_mem_types.cpp gpu_multi_kernel.cpp diff --git a/test/correctness/gpu_many_kernels.cpp b/test/correctness/gpu_many_kernels.cpp new file mode 100644 index 000000000000..d9572b38c18c --- /dev/null +++ b/test/correctness/gpu_many_kernels.cpp @@ -0,0 +1,92 @@ +#include "Halide.h" +#include + +#include "halide_benchmark.h" + +// This test makes sure GPU runtimes can handle many different small +// kernels and can handle releasing a device context and making a new +// one and still have many kernels work. This is needed due to kernel +// compilation caching mechanisms in the GPU runtimes. + +using namespace Halide; + +constexpr size_t kNumKernels = 70; + +int main(int argc, char **argv) { + Var x, y, xi, yi; + Func adders[kNumKernels]; + ImageParam input(Int(32), 2); + + Target target = get_jit_target_from_environment(); + int i = 1; + for (Func &f : adders) { + f(x, y) = input(x, y) + i; + if (target.has_gpu_feature()) { + f.compute_root().gpu_tile(x, y, xi, yi, 16, 16); + } else { + f.compute_root().vectorize(x, target.natural_vector_size()); + } + i += 1; + } + + auto start = Halide::Tools::benchmark_now(); + + Buffer buf_a_store(32, 32); + Buffer buf_b_store(32, 32); + Buffer *buf_in = &buf_a_store; + Buffer *buf_out = &buf_b_store; + buf_in->fill(0); + for (Func &f : adders) { + input.set(*buf_in); + f.realize(*buf_out); + std::swap(buf_in, buf_out); + } + buf_in->copy_to_host(); + + auto end = Halide::Tools::benchmark_now(); + double initial_runtime = Halide::Tools::benchmark_duration_seconds(start, end); + + buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); }); + + start = Halide::Tools::benchmark_now(); + + buf_in->fill(0); + for (Func &f : adders) { + input.set(*buf_in); + f.realize(*buf_out); + std::swap(buf_in, buf_out); + } + buf_in->copy_to_host(); + + end = Halide::Tools::benchmark_now(); + double precompiled_runtime = Halide::Tools::benchmark_duration_seconds(start, end); + + buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); }); + + buf_a_store.device_free(); + buf_b_store.device_free(); + const halide_device_interface_t *device = get_device_interface_for_device_api(DeviceAPI::Default_GPU, target); + if (device != nullptr) { + device->device_release(nullptr, device); + } + + start = Halide::Tools::benchmark_now(); + + buf_in->fill(0); + for (Func &f : adders) { + input.set(*buf_in); + f.realize(*buf_out); + std::swap(buf_in, buf_out); + } + buf_in->copy_to_host(); + + end = Halide::Tools::benchmark_now(); + double second_runtime = Halide::Tools::benchmark_duration_seconds(start, end); + + buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); }); + + printf("Initial runtime %f, precompiled runtime %f, second runtime %f.\n", initial_runtime, precompiled_runtime, second_runtime); + + printf("Success!\n"); + return 0; +} diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index 2db1d3bc4f20..130af49b7f93 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -257,6 +257,19 @@ halide_define_aot_test(external_code GEN_DEPS external_code_generator_deps) # float16_t_generator.cpp halide_define_aot_test(float16_t) +# gpu_multi_context_threaded_aottest.cpp +# gpu_multi_context_threaded_generator.cpp +halide_define_aot_test(gpu_multi_context_threaded + OMIT_DEFAULT_GENERATOR + EXTRA_LIBS + gpu_multi_context_threaded_add + gpu_multi_context_threaded_mul) + +add_halide_library(gpu_multi_context_threaded_add FROM gpu_multi_context_threaded.generator + FEATURES user_context) +add_halide_library(gpu_multi_context_threaded_mul FROM gpu_multi_context_threaded.generator + FEATURES user_context) + # gpu_object_lifetime_aottest.cpp # gpu_object_lifetime_generator.cpp halide_define_aot_test(gpu_object_lifetime FEATURES debug) diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp index f0fd8ef9cd04..9e4fba70afe6 100644 --- a/test/generator/acquire_release_aottest.cpp +++ b/test/generator/acquire_release_aottest.cpp @@ -14,89 +14,25 @@ int main(int argc, char **argv) { #include #include "acquire_release.h" +#include "gpu_context.h" using namespace Halide::Runtime; const int W = 256, H = 256; #if defined(TEST_OPENCL) -// Implement OpenCL custom context. - -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#ifdef __APPLE__ -#include -#else -#include -#endif // Just use a global context and queue, created and destroyed by main. cl_context cl_ctx = nullptr; cl_command_queue cl_q = nullptr; // Create the global context. This is just a helper function not called by Halide. -int init_context() { - cl_int err = 0; - - const cl_uint maxPlatforms = 4; - cl_platform_id platforms[maxPlatforms]; - cl_uint platformCount = 0; - - err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount); - if (err != CL_SUCCESS) { - printf("clGetPlatformIDs failed (%d)\n", err); - return err; - } - - cl_platform_id platform = nullptr; - - if (platformCount > 0) { - platform = platforms[0]; - } - if (platform == nullptr) { - printf("Failed to get platform\n"); - return CL_INVALID_PLATFORM; - } - - cl_device_type device_type = CL_DEVICE_TYPE_ALL; - - // Make sure we have a device - const cl_uint maxDevices = 4; - cl_device_id devices[maxDevices]; - cl_uint deviceCount = 0; - err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount); - if (err != CL_SUCCESS) { - printf("clGetDeviceIDs failed (%d)\n", err); - return err; - } - if (deviceCount == 0) { - printf("Failed to get device\n"); - return CL_DEVICE_NOT_FOUND; - } - - cl_device_id dev = devices[deviceCount - 1]; - - // Create context and command queue. - cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, - 0}; - cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err); - if (err != CL_SUCCESS) { - printf("clCreateContext failed (%d)\n", err); - return err; - } - - cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err); - if (err != CL_SUCCESS) { - printf("clCreateCommandQueue failed (%d)\n", err); - return err; - } - printf("Created CL context %p\n", cl_ctx); - return 0; +bool init_context() { + return create_opencl_context(cl_ctx, cl_q); } void destroy_context() { - printf("Destroying CL context %p\n", cl_ctx); - clReleaseCommandQueue(cl_q); - clReleaseContext(cl_ctx); + destroy_opencl_context(cl_ctx, cl_q); cl_q = nullptr; cl_ctx = nullptr; } @@ -116,61 +52,14 @@ extern "C" int halide_release_cl_context(void *user_context) { return 0; } #elif defined(TEST_CUDA) -// Implement CUDA custom context. -#include - CUcontext cuda_ctx = nullptr; -int init_context() { - // Initialize CUDA - CUresult err = cuInit(0); - if (err != CUDA_SUCCESS) { - printf("cuInit failed (%d)\n", err); - return err; - } - - // Make sure we have a device - int deviceCount = 0; - err = cuDeviceGetCount(&deviceCount); - if (err != CUDA_SUCCESS) { - printf("cuGetDeviceCount failed (%d)\n", err); - return err; - } - if (deviceCount <= 0) { - printf("No CUDA devices available\n"); - return CUDA_ERROR_NO_DEVICE; - } - - CUdevice dev; - // Get device - CUresult status; - // Try to get a device >0 first, since 0 should be our display device - // For now, don't try devices > 2 to maintain compatibility with previous behavior. - if (deviceCount > 2) deviceCount = 2; - for (int id = deviceCount - 1; id >= 0; id--) { - status = cuDeviceGet(&dev, id); - if (status == CUDA_SUCCESS) break; - } - - if (status != CUDA_SUCCESS) { - printf("Failed to get CUDA device\n"); - return status; - } - - // Create context - err = cuCtxCreate(&cuda_ctx, 0, dev); - if (err != CUDA_SUCCESS) { - printf("cuCtxCreate failed (%d)\n", err); - return err; - } - printf("Created CUDA context %p\n", cuda_ctx); - - return 0; +bool init_context() { + return create_cuda_context(cuda_ctx); } void destroy_context() { - printf("Destroying CUDA context %p\n", cuda_ctx); - cuCtxDestroy(cuda_ctx); + destroy_cuda_context(cuda_ctx); cuda_ctx = nullptr; } @@ -189,19 +78,18 @@ extern "C" int halide_cuda_release_context(void *user_context) { } #else // Just use the default implementation of acquire/release. -int init_context() { +bool init_context() { printf("Using default implementation of acquire/release\n"); - return 0; + return true; } void destroy_context() { } #endif -int main(int argc, char **argv) { +bool run_test() { // Initialize the runtime specific GPU context. - int ret = init_context(); - if (ret != 0) { - return ret; + if (!init_context()) { + return false; } // Everything else is a normal Halide program. The GPU runtime will call @@ -227,19 +115,40 @@ int main(int argc, char **argv) { if (input(x, y) * 2.0f + 1.0f != output(x, y)) { printf("Error at (%d, %d): %f != %f\n", x, y, input(x, y) * 2.0f + 1.0f, output(x, y)); - return -1; + return false; } } } + const halide_device_interface_t *interface = output.raw_buffer()->device_interface; + // We need to free our GPU buffers before destroying the context. input.device_free(); output.device_free(); + if (interface != nullptr) { + halide_device_release(nullptr, interface); + } else { + printf("Device interface is nullptr.\n"); + return false; + } + // Free the context we created. destroy_context(); printf("Success!\n"); + return true; +} + +int main(int argc, char **argv) { + if (!run_test()) { + return -1; + } + + if (!run_test()) { + return -1; + } + return 0; } diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp new file mode 100644 index 000000000000..05852909ba57 --- /dev/null +++ b/test/generator/gpu_multi_context_threaded_aottest.cpp @@ -0,0 +1,193 @@ +#include + +// This test demonstrates how to use more than one GPU context with +// Halide generated GPU support, specifically in a multithreaded +// program. It of course also tests that this works correctly with the +// Halide GPU runtimes. + +#ifdef _WIN32 +int main(int argc, char **argv) { + printf("[SKIP] Test requires weak linkage, which is not available on Windows.\n"); + return 0; +} +#else + +#include "HalideBuffer.h" +#include "HalideRuntime.h" +#include +#include +#include +#include + +#include "gpu_context.h" + +#include "gpu_multi_context_threaded_add.h" +#include "gpu_multi_context_threaded_mul.h" + +using namespace Halide::Runtime; + +const int W = 32, H = 32; + +#if defined(TEST_OPENCL) + +struct gpu_context { + cl_context cl_ctx; + cl_command_queue cl_q; +}; + +// Create the global context. This is just a helper function not called by Halide. +bool init_context(gpu_context &context) { + return create_opencl_context(context.cl_ctx, context.cl_q); +} + +void destroy_context(gpu_context &context) { + destroy_opencl_context(context.cl_ctx, context.cl_q); + cl_q = nullptr; + cl_ctx = nullptr; +} + +// These functions replace the acquire/release implementation in src/runtime/opencl.cpp. +// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization +// in our implementation of these functions. +extern "C" int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl_command_queue *q, bool create) { + if (user_context == nullptr) { + assert(!create); + *ctx = nullptr; + *q = nullptr; + } else { + const gpu_context *context = (const gpu_context *)user_context; + *ctx = context->cl_ctx; + *q = context->cl_q; + } + return 0; +} + +extern "C" int halide_release_cl_context(void *user_context) { + return 0; +} + +#define HAS_MULTIPLE_CONTEXTS true +#elif defined(TEST_CUDA) + +typedef CUcontext gpu_context; + +bool init_context(CUcontext &cuda_ctx) { + return create_cuda_context(cuda_ctx); +} + +void destroy_context(CUcontext &cuda_ctx) { + destroy_cuda_context(cuda_ctx); + cuda_ctx = nullptr; +} + +// These functions replace the acquire/release implementation in src/runtime/cuda.cpp. +// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization +// in our implementation of these functions. +extern "C" int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create) { + if (user_context == nullptr) { + assert(!create); + *ctx = nullptr; + } else { + *ctx = *(CUcontext *)user_context; + } + return 0; +} + +extern "C" int halide_cuda_release_context(void *user_context) { + return 0; +} + +#define HAS_MULTIPLE_CONTEXTS true +#else +typedef int gpu_context; + +// Just use the default implementation of acquire/release. +bool init_context(int &context) { + printf("Using default implementation of acquire/release\n"); + context = 0; + return true; +} +void destroy_context(int & /* context */) { + +#define HAS_MULTIPLE_CONTEXTS false +#endif + +void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) { + gpu_context context2; + + Buffer buf1_in(W, H); + Buffer buf1_result(W, H); + buf1_in.fill(0); + + const halide_device_interface_t *device_interface; + + int val = 0; + for (int i = 0; i < 10; i++) { + init_context(context2); + + Buffer buf2_in(W, H); + Buffer buf2_result(W, H); + buf2_in.fill(0); + + gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result); + gpu_multi_context_threaded_mul(&context1, buf1_result, buf1_in); + gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result); + + gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result); + gpu_multi_context_threaded_mul(&context2, buf2_result, buf2_in); + gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result); + + buf1_result.copy_to_host(&context1); + buf2_result.copy_to_host(&context2); + + val += 2; + val *= 2; + assert(buf1_result.all_equal(val + 2)); + assert(buf2_result.all_equal(6)); + + device_interface = buf1_result.raw_buffer()->device_interface; + + // About to destroy context, so ensure allocations are freed first. + buf2_in.device_free(&context2); + buf2_result.device_free(&context2); + + if (device_interface != nullptr) { + halide_device_release(&context2, device_interface); + } + destroy_context(context2); + } + + // About to destroy context, so ensure allocations are freed first. + buf1_in.device_free(&context1); + buf1_result.device_free(&context1); + + if (destroy_when_done && device_interface != nullptr) { + halide_device_release(&context1, device_interface); + destroy_context(context1); + } +} + +int main(int argc, char **argv) { + gpu_context contexta; + init_context(contexta); + + gpu_context contextb; + init_context(contextb); + + std::thread thread1(run_kernels_on_thread, contexta, false); + std::thread thread2(run_kernels_on_thread, contextb, false); + + thread1.join(); + thread2.join(); + + // Make sure using the same context on different threads works. + std::thread thread3(run_kernels_on_thread, contexta, HAS_MULTIPLE_CONTEXTS); + std::thread thread4(run_kernels_on_thread, contextb, HAS_MULTIPLE_CONTEXTS); + + thread3.join(); + thread4.join(); + + printf("Success!\n"); + return 0; +} +#endif // !WIN32 diff --git a/test/generator/gpu_multi_context_threaded_generator.cpp b/test/generator/gpu_multi_context_threaded_generator.cpp new file mode 100644 index 000000000000..42f278b6379b --- /dev/null +++ b/test/generator/gpu_multi_context_threaded_generator.cpp @@ -0,0 +1,48 @@ +#include "Halide.h" + +namespace { + +class GpuAdd : public Halide::Generator { +public: + Input> input{"input", 2}; + + Output> output{"output", 2}; + + void generate() { + Var x("x"), y("y"); + + // Create a simple pipeline that scales pixel values by 2. + output(x, y) = input(x, y) + 2; + + Target target = get_target(); + if (target.has_gpu_feature()) { + Var xo, yo, xi, yi; + output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16); + } + } +}; + +class GpuMul : public Halide::Generator { +public: + Input> input{"input", 2}; + + Output> output{"output", 2}; + + void generate() { + Var x("x"), y("y"); + + // Create a simple pipeline that scales pixel values by 2. + output(x, y) = input(x, y) * 2; + + Target target = get_target(); + if (target.has_gpu_feature()) { + Var xo, yo, xi, yi; + output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16); + } + } +}; + +} // namespace + +HALIDE_REGISTER_GENERATOR(GpuAdd, gpu_multi_context_threaded_add) +HALIDE_REGISTER_GENERATOR(GpuMul, gpu_multi_context_threaded_mul) From f8df8eb8ba3a16a935b552ff19dca411be912309 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 7 Dec 2020 13:01:29 -0800 Subject: [PATCH 02/24] Revert "Revert "Fix broken destroy_context() in gpu_multi_context_threaded_aottest.cpp (#5512)" (#5514)" This reverts commit 2c8e3ea87f2ade9bafd4b498dd95df498fb17827. --- test/generator/gpu_multi_context_threaded_aottest.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp index 05852909ba57..fdaab364b7c9 100644 --- a/test/generator/gpu_multi_context_threaded_aottest.cpp +++ b/test/generator/gpu_multi_context_threaded_aottest.cpp @@ -107,7 +107,9 @@ bool init_context(int &context) { context = 0; return true; } -void destroy_context(int & /* context */) { +void destroy_context(int &context) { + context = 0; +} #define HAS_MULTIPLE_CONTEXTS false #endif From 805e14ba98f4898f3ea85c5f6bac6c7037e20919 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 7 Dec 2020 20:04:49 -0800 Subject: [PATCH 03/24] Solve the COMDAT in runtime failing on Mac OS X problem once and for all by removing Comdat IR annotations in runtime on Mac OS and iOS. --- src/LLVM_Runtime_Linker.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index 9637f0c2ee91..042a3543ba95 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -560,6 +560,20 @@ void link_modules(std::vector> &modules, Target t, const std::set retain = {"__stack_chk_guard", "__stack_chk_fail"}; + // COMDAT is not supported in MachO object files, hence it does not + // work on Mac OS or iOS. These sometimes show up in the runtime + // since we compile for an abstract target that is based on ELF. + // This code removes the Comdat and makes the symbol a regular one, + // which only works if there is a single instance, which is generally + // the case for the runtime. Presumably if this isn't true, linking the + // module will fail. + if (t.os == Target::IOS || t.os == Target::OSX) { + for (auto &global_obj : modules[0]->global_objects()) { + global_obj.setComdat(nullptr); + } + modules[0]->getComdatSymbolTable().clear(); + } + // Enumerate the global variables. for (auto &gv : modules[0]->globals()) { // No variables are part of the public interface (even the ones labelled halide_) From fbea278fde95f139887b050308bbcd0faeb6b2f3 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 7 Dec 2020 20:10:46 -0800 Subject: [PATCH 04/24] Improve comment. --- src/LLVM_Runtime_Linker.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index 042a3543ba95..8cd46446a105 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -560,13 +560,14 @@ void link_modules(std::vector> &modules, Target t, const std::set retain = {"__stack_chk_guard", "__stack_chk_fail"}; - // COMDAT is not supported in MachO object files, hence it does not - // work on Mac OS or iOS. These sometimes show up in the runtime - // since we compile for an abstract target that is based on ELF. - // This code removes the Comdat and makes the symbol a regular one, - // which only works if there is a single instance, which is generally - // the case for the runtime. Presumably if this isn't true, linking the - // module will fail. + // COMDAT is not supported in MachO object files, hence it does + // not work on Mac OS or iOS. These sometimes show up in the + // runtime since we compile for an abstract target that is based + // on ELF. This code removes all Comdat items and leaves the + // symbols they were attached to as regular definitions, which + // only works if there is a single instance, which is generally + // the case for the runtime. Presumably if this isn't true, + // linking the module will fail. if (t.os == Target::IOS || t.os == Target::OSX) { for (auto &global_obj : modules[0]->global_objects()) { global_obj.setComdat(nullptr); From b851d206f177470e4ff16084bb9b1a047f701e04 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 7 Dec 2020 20:21:06 -0800 Subject: [PATCH 05/24] Fix tabs in indentation. --- src/LLVM_Runtime_Linker.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index 8cd46446a105..220680a63e01 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -570,9 +570,9 @@ void link_modules(std::vector> &modules, Target t, // linking the module will fail. if (t.os == Target::IOS || t.os == Target::OSX) { for (auto &global_obj : modules[0]->global_objects()) { - global_obj.setComdat(nullptr); - } - modules[0]->getComdatSymbolTable().clear(); + global_obj.setComdat(nullptr); + } + modules[0]->getComdatSymbolTable().clear(); } // Enumerate the global variables. From 4120026512b7ed4a6174c19b9a590052d694eb95 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 10 Dec 2020 02:31:26 -0800 Subject: [PATCH 06/24] Make GPU context handling more consistent and use a common compilation cache for kernels. Introduces a finalization routine for kernel compilation to indicate when kernals are not strictly required to be defined. Thus allowing them to be unloaded or discarded, but not when they are needed. --- Makefile | 2 +- src/CodeGen_GPU_Host.cpp | 7 +++ src/runtime/HalideRuntimeCuda.h | 1 + src/runtime/HalideRuntimeD3D12Compute.h | 1 + src/runtime/HalideRuntimeHexagonHost.h | 1 + src/runtime/HalideRuntimeMetal.h | 1 + src/runtime/HalideRuntimeOpenCL.h | 1 + src/runtime/HalideRuntimeOpenGL.h | 1 + src/runtime/HalideRuntimeOpenGLCompute.h | 2 + src/runtime/cuda.cpp | 9 ++- src/runtime/d3d12compute.cpp | 13 ++++- src/runtime/gpu_context_common.h | 57 +++++++++++++------ src/runtime/hexagon_host.cpp | 3 + src/runtime/metal.cpp | 18 ++++-- src/runtime/opencl.cpp | 20 +++++-- src/runtime/opengl.cpp | 3 + src/runtime/openglcompute.cpp | 3 + src/runtime/runtime_api.cpp | 7 +++ test/common/gpu_context.h | 22 +++++++ .../gpu_multi_context_threaded_aottest.cpp | 42 +++++++++++++- 20 files changed, 178 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index d9bc23c9b970..b26984127804 100644 --- a/Makefile +++ b/Makefile @@ -333,7 +333,7 @@ TEST_CXX_FLAGS += -DTEST_OPENCL endif ifneq ($(TEST_METAL), ) -TEST_CXX_FLAGS += -DTEST_METAL +TEST_CXX_FLAGS += -DTEST_METAL -ObjC++ endif ifneq ($(TEST_CUDA), ) diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp index 4cb09da4712d..2815f42b62ee 100644 --- a/src/CodeGen_GPU_Host.cpp +++ b/src/CodeGen_GPU_Host.cpp @@ -212,6 +212,13 @@ void CodeGen_GPU_Host::compile_func(const LoweredFunc &f, Value *result = builder->CreateCall(init, init_kernels_args); Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0)); CodeGen_CPU::create_assertion(did_succeed, Expr(), result); + + // Generate a finalizer call as well to relase any refcounts or other resource usage + // specific to this filter call. + std::string finalize_kernels_name = "halide_" + api_unique_name + "_finalize_kernels"; + llvm::Function *finalize = module->getFunction(finalize_kernels_name); + Value *module_state_value = builder->CreateLoad(module_state); + register_destructor(finalize, module_state_value, CodeGen_CPU::Always); } // the init kernels block should branch to the post-entry block diff --git a/src/runtime/HalideRuntimeCuda.h b/src/runtime/HalideRuntimeCuda.h index 923860b3c315..7f8481642ba2 100644 --- a/src/runtime/HalideRuntimeCuda.h +++ b/src/runtime/HalideRuntimeCuda.h @@ -38,6 +38,7 @@ extern int halide_cuda_run(void *user_context, float *vertex_buffer, int num_coords_dim0, int num_coords_dim1); +extern void halide_cuda_finalize_kernels(void *user_context, void *state_ptr); // @} /** Set the underlying cuda device poiner for a buffer. The device diff --git a/src/runtime/HalideRuntimeD3D12Compute.h b/src/runtime/HalideRuntimeD3D12Compute.h index de78ed8bdbab..997120c3391a 100644 --- a/src/runtime/HalideRuntimeD3D12Compute.h +++ b/src/runtime/HalideRuntimeD3D12Compute.h @@ -37,6 +37,7 @@ extern int halide_d3d12compute_run(void *user_context, float *vertex_buffer, int num_coords_dim0, int num_coords_dim1); +extern void halide_d3d12compute_finalize_kernels(void *user_context, void *state_ptr); // @} /** Set the underlying ID3D12Resource for a halide_buffer_t. The memory backing diff --git a/src/runtime/HalideRuntimeHexagonHost.h b/src/runtime/HalideRuntimeHexagonHost.h index 7ca304e9f94b..c5756504ac4b 100644 --- a/src/runtime/HalideRuntimeHexagonHost.h +++ b/src/runtime/HalideRuntimeHexagonHost.h @@ -143,6 +143,7 @@ extern int halide_hexagon_run(void *user_context, uint64_t arg_sizes[], void *args[], int arg_flags[]); +extern void halide_hexagon_finalize_kernels(void *user_context, void *state_ptr); extern int halide_hexagon_device_release(void *user_context); // @} diff --git a/src/runtime/HalideRuntimeMetal.h b/src/runtime/HalideRuntimeMetal.h index 2b045390dc3f..6e795416e442 100644 --- a/src/runtime/HalideRuntimeMetal.h +++ b/src/runtime/HalideRuntimeMetal.h @@ -25,6 +25,7 @@ extern const struct halide_device_interface_t *halide_metal_device_interface(); // @{ extern int halide_metal_initialize_kernels(void *user_context, void **state_ptr, const char *src, int size); +void halide_metal_finalize_kernels(void *user_context, void *state_ptr); extern int halide_metal_run(void *user_context, void *state_ptr, diff --git a/src/runtime/HalideRuntimeOpenCL.h b/src/runtime/HalideRuntimeOpenCL.h index 9bc35def26c8..54b8b4157489 100644 --- a/src/runtime/HalideRuntimeOpenCL.h +++ b/src/runtime/HalideRuntimeOpenCL.h @@ -39,6 +39,7 @@ extern int halide_opencl_run(void *user_context, float *vertex_buffer, int num_coords_dim0, int num_coords_dim1); +extern void halide_opencl_finalize_kernels(void *user_context, void *state_ptr); // @} /** Set the platform name for OpenCL to use (e.g. "Intel" or diff --git a/src/runtime/HalideRuntimeOpenGL.h b/src/runtime/HalideRuntimeOpenGL.h index 2f253c9900a2..813fe5435b73 100644 --- a/src/runtime/HalideRuntimeOpenGL.h +++ b/src/runtime/HalideRuntimeOpenGL.h @@ -39,6 +39,7 @@ extern int halide_opengl_run(void *user_context, float *vertex_buffer, int num_coords_dim0, int num_coords_dim1); +extern void halide_opengl_finalize_kernels(void *user_context, void *state_ptr); // @} /** Set the underlying OpenGL texture for a buffer. The texture must diff --git a/src/runtime/HalideRuntimeOpenGLCompute.h b/src/runtime/HalideRuntimeOpenGLCompute.h index 8f7cd683845c..2fd76c2fd697 100644 --- a/src/runtime/HalideRuntimeOpenGLCompute.h +++ b/src/runtime/HalideRuntimeOpenGLCompute.h @@ -49,6 +49,8 @@ extern int halide_openglcompute_run(void *user_context, float *vertex_buffer, int num_coords_dim0, int num_coords_dim1); + +extern void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr); // @} /** This function retrieves pointers to OpenGL API functions. diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 1dc1cf27086e..872fd65b74fa 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1058,12 +1058,11 @@ WEAK int halide_cuda_run(void *user_context, uint64_t t_before = halide_current_time_ns(user_context); #endif - halide_assert(user_context, state_ptr); - CUmodule mod = nullptr; - bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod); - halide_assert(user_context, found_module && mod != nullptr); + CUmodule mod{}; + bool found = compilation_cache.lookup(ctx.context, state_ptr, mod); + halide_assert(user_context, found && mod != nullptr); + debug(user_context) << "Got module " << mod << "\n"; - halide_assert(user_context, mod); CUfunction f; err = cuModuleGetFunction(&f, mod, entry_name); debug(user_context) << "Got function " << f << "\n"; diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 3174c33f52a4..f298dfe26727 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -2776,6 +2776,13 @@ WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state return 0; } +WEAK void halide_d3d12compute_finalize_kernels(void *user_context, void *state_ptr) { + D3D12ContextHolder d3d12_context(user_context, true); + if (d3d12_context.error == 0) { + compilation_cache.release_hold(user_context, d3d12_context.device, state_ptr); + } +} + namespace { void compute_barrier(d3d12_copy_command_list *cmdList, d3d12_buffer *buffer) { @@ -3005,9 +3012,9 @@ WEAK int halide_d3d12compute_run(void *user_context, StartCapturingGPUActivity(); #endif - d3d12_library *library = nullptr; - bool found_module = compilation_cache.lookup(device, state_ptr, library); - halide_assert(user_context, found_module && library != nullptr); + d3d12_library *library{}; + bool found = compilation_cache.lookup(device, state_ptr, library); + halide_assert(user_context, found && library != nullptr); d3d12_frame *frame = acquire_frame(device); d3d12_compute_command_list *cmdList = frame->cmd_list; diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h index 9ef2662026a8..f45d2883755b 100644 --- a/src/runtime/gpu_context_common.h +++ b/src/runtime/gpu_context_common.h @@ -10,6 +10,14 @@ class GPUCompilationCache { ContextT context{}; ModuleStateT module_state{}; uint32_t kernel_id{}; + uint32_t use_count{0}; + + CachedCompilation(ContextT context, ModuleStateT module_state, + uint32_t kernel_id, uint32_t use_count) + : context(context), module_state(module_state), + kernel_id(kernel_id), use_count(use_count) { + } + }; halide_mutex mutex; @@ -37,7 +45,7 @@ class GPUCompilationCache { } } - HALIDE_MUST_USE_RESULT bool insert(ContextT context, uint32_t id, ModuleStateT module_state) { + HALIDE_MUST_USE_RESULT bool insert(const CachedCompilation &entry) { if (log2_compilations_size == 0) { if (!resize_table(kInitialTableBits)) { return false; @@ -49,13 +57,11 @@ class GPUCompilationCache { } } count += 1; - uintptr_t index = kernel_hash(context, id, log2_compilations_size); + uintptr_t index = kernel_hash(entry.context, entry.kernel_id, log2_compilations_size); for (int i = 0; i < (1 << log2_compilations_size); i++) { uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1); if (compilations[effective_index].kernel_id <= kDeletedId) { - compilations[effective_index].context = context; - compilations[effective_index].module_state = module_state; - compilations[effective_index].kernel_id = id; + compilations[effective_index] = entry; return true; } } @@ -65,7 +71,8 @@ class GPUCompilationCache { return false; } - HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) { + HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id, + ModuleStateT *&module_state, int increment) { if (log2_compilations_size == 0) { return false; } @@ -79,6 +86,9 @@ class GPUCompilationCache { if (compilations[effective_index].context == context && compilations[effective_index].kernel_id == id) { module_state = &compilations[effective_index].module_state; + if (increment != 0) { + compilations[effective_index].use_count += increment; + } return true; } } @@ -89,7 +99,7 @@ class GPUCompilationCache { ScopedMutexLock lock_guard(&mutex); uint32_t id = (uint32_t)(uintptr_t)state_ptr; ModuleStateT *mod_ptr; - if (find_internal(context, id, mod_ptr)) { + if (find_internal(context, id, mod_ptr, 0)) { module_state = *mod_ptr; return true; } @@ -114,8 +124,7 @@ class GPUCompilationCache { for (int32_t i = 0; i < old_size; i++) { if (old_table[i].kernel_id != kInvalidId && old_table[i].kernel_id != kDeletedId) { - bool result = insert(old_table[i].context, old_table[i].kernel_id, - old_table[i].module_state); + bool result = insert(old_table[i]); halide_assert(nullptr, result); // Resizing the table while resizing the table is a logic error. } } @@ -133,7 +142,11 @@ class GPUCompilationCache { for (int i = 0; i < (1 << log2_compilations_size); i++) { if (compilations[i].kernel_id > kInvalidId && - (all || (compilations[i].context == context))) { + (all || (compilations[i].context == context)) && + compilations[i].use_count == 0) { + debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state << + " id " << compilations[i].kernel_id << + " context " << compilations[i].context << "\n"; f(compilations[i].module_state); compilations[i].module_state = nullptr; compilations[i].kernel_id = kDeletedId; @@ -154,9 +167,12 @@ class GPUCompilationCache { ScopedMutexLock lock_guard(&mutex); release_context(user_context, true, nullptr, f); - free(compilations); - compilations = nullptr; - log2_compilations_size = 0; + // Some items may have been in use, so can't free. + if (count == 0) { + free(compilations); + compilations = nullptr; + log2_compilations_size = 0; + } } template @@ -172,25 +188,34 @@ class GPUCompilationCache { } ModuleStateT *mod; - if (find_internal(context, *id_ptr, mod)) { + if (find_internal(context, *id_ptr, mod, 1)) { result = *mod; return true; } // TODO(zvookin): figure out the calling signature here... ModuleStateT compiled_module = f(args...); - debug(user_context) << "Caching compiled kernel: " << compiled_module << " id " << *id_ptr << " context " << context << "\n"; + debug(user_context) << "Caching compiled kernel: " << compiled_module << + " id " << *id_ptr << " context " << context << "\n"; if (compiled_module == nullptr) { return false; } - if (!insert(context, *id_ptr, compiled_module)) { + + if (!insert({context, compiled_module, *id_ptr, 1})) { return false; } result = compiled_module; return true; } + + void release_hold(void *user_context, ContextT context, void *state_ptr) { + ModuleStateT *mod; + uint32_t id = (uint32_t)(uintptr_t)state_ptr; + bool result = find_internal(context, id, mod, -1); + halide_assert(user_context, result); // Value must be in cache to be released + } }; } // namespace Internal diff --git a/src/runtime/hexagon_host.cpp b/src/runtime/hexagon_host.cpp index cbef8cf7508f..b163c6f3e537 100644 --- a/src/runtime/hexagon_host.cpp +++ b/src/runtime/hexagon_host.cpp @@ -323,6 +323,9 @@ WEAK int halide_hexagon_initialize_kernels(void *user_context, void **state_ptr, return result != 0 ? -1 : 0; } +WEAK void halide_hexagon_finalize_kernels(void *user_context, void *state_ptr) { +} + namespace { // Prepare an array of remote_buffer arguments, mapping buffers if diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index 25fe29f3feda..0cd4492bf6db 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -562,6 +562,13 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c return 0; } +WEAK void halide_metal_finalize_kernels(void *user_context, void *state_ptr) { + MetalContextHolder metal_context(user_context, true); + if (metal_context.error == 0) { + compilation_cache.release_hold(user_context, metal_context.device, state_ptr); + } +} + namespace { WEAK void halide_metal_device_sync_internal(mtl_command_queue *queue, struct halide_buffer_t *buffer) { @@ -612,10 +619,11 @@ WEAK int halide_metal_device_release(void *user_context) { return error; } - if (device) { + if (acquired_device) { halide_metal_device_sync_internal(queue, nullptr); - compilation_cache.delete_context(user_context, device, release_ns_object); + debug(user_context) << "Calling delete context on device " << acquired_device << "\n"; + compilation_cache.delete_context(user_context, acquired_device, release_ns_object); // Release the device itself, if we created it. if (acquired_device == device) { @@ -740,9 +748,9 @@ WEAK int halide_metal_run(void *user_context, return -1; } - mtl_library *library; - bool found_library = compilation_cache.lookup(metal_context.device, state_ptr, library); - halide_assert(user_context, found_library && library != nullptr); + mtl_library *library{}; + bool found = compilation_cache.lookup(metal_context.device, state_ptr, library); + halide_assert(user_context, found && library != nullptr); mtl_function *function = new_function_with_name(library, entry_name, strlen(entry_name)); if (function == nullptr) { diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 7c7815b620d7..10bd05ccc315 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -731,7 +731,7 @@ WEAK int halide_opencl_compute_capability(void *user_context, int *major, int *m WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, const char *src, int size) { debug(user_context) - << "CL: halide_opencl_init_kernels (user_context: " << user_context + << "CL: halide_opencl_initialize_kernels (user_context: " << user_context << ", state_ptr: " << state_ptr << ", program: " << (void *)src << ", size: " << size << "\n"; @@ -760,6 +760,16 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, return 0; } +WEAK void halide_opencl_finalize_kernels(void *user_context, void *state_ptr) { + debug(user_context) + << "CL: halide_opencl_finalize_kernels (user_context: " << user_context + << ", state_ptr: " << state_ptr << "\n";; + ClContext ctx(user_context); + if (ctx.error_code == CL_SUCCESS) { + compilation_cache.release_hold(user_context, ctx.context, state_ptr); + } +} + // Used to generate correct timings when tracing WEAK int halide_opencl_device_sync(void *user_context, halide_buffer_t *) { debug(user_context) << "CL: halide_opencl_device_sync (user_context: " << user_context << ")\n"; @@ -1049,10 +1059,11 @@ WEAK int halide_opencl_run(void *user_context, // Create kernel object for entry_name from the program for this module. halide_assert(user_context, state_ptr); - cl_program program; - bool found_program = compilation_cache.lookup(ctx.context, state_ptr, program); + + cl_program program{}; + bool found = compilation_cache.lookup(ctx.context, state_ptr, program); + halide_assert(user_context, found && program != nullptr); - halide_assert(user_context, found_program && program != nullptr); debug(user_context) << " clCreateKernel " << entry_name << " -> "; cl_kernel f = clCreateKernel(program, entry_name, &err); if (err != CL_SUCCESS) { @@ -1339,6 +1350,7 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() { namespace { WEAK __attribute__((destructor)) void halide_opencl_cleanup() { + debug(nullptr) << "halide_opencl_cleanup\n"; compilation_cache.release_all(nullptr, clReleaseProgram); halide_opencl_device_release(nullptr); } diff --git a/src/runtime/opengl.cpp b/src/runtime/opengl.cpp index f18d79f867a7..a3e8b15d417d 100644 --- a/src/runtime/opengl.cpp +++ b/src/runtime/opengl.cpp @@ -1959,6 +1959,9 @@ WEAK int halide_opengl_initialize_kernels(void *user_context, void **state_ptr, return 0; } +WEAK void halide_opengl_finalize_kernels(void *user_context, void *state_ptr) { +} + WEAK int halide_opengl_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) { return halide_default_device_and_host_malloc(user_context, buf, &opengl_device_interface); } diff --git a/src/runtime/openglcompute.cpp b/src/runtime/openglcompute.cpp index 5476d854d35c..5928d18a0e51 100644 --- a/src/runtime/openglcompute.cpp +++ b/src/runtime/openglcompute.cpp @@ -904,6 +904,9 @@ WEAK int halide_openglcompute_initialize_kernels(void *user_context, void **stat return 0; } +WEAK void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr) { +} + WEAK int halide_openglcompute_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) { return halide_default_device_and_host_malloc(user_context, buf, &openglcompute_device_interface); } diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index 6071932000da..ef2609022748 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -29,6 +29,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_cuda_device_interface, (void *)&halide_cuda_get_device_ptr, (void *)&halide_cuda_initialize_kernels, + (void *)&halide_cuda_finalize_kernels, (void *)&halide_cuda_run, (void *)&halide_cuda_wrap_device_ptr, (void *)&halide_current_time_ns, @@ -91,6 +92,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_hexagon_get_device_handle, (void *)&halide_hexagon_get_device_size, (void *)&halide_hexagon_initialize_kernels, + (void *)&halide_hexagon_finalize_kernels, (void *)&halide_hexagon_power_hvx_off, (void *)&halide_hexagon_power_hvx_off_as_destructor, (void *)&halide_hexagon_power_hvx_on, @@ -115,6 +117,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_metal_get_buffer, (void *)&halide_metal_get_crop_offset, (void *)&halide_metal_initialize_kernels, + (void *)&halide_metal_finalize_kernels, (void *)&halide_metal_release_context, (void *)&halide_metal_run, (void *)&halide_metal_wrap_buffer, @@ -139,6 +142,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_opencl_image_device_interface, (void *)&halide_opencl_image_wrap_cl_mem, (void *)&halide_opencl_initialize_kernels, + (void *)&halide_opencl_finalize_kernels, (void *)&halide_opencl_run, (void *)&halide_opencl_set_build_options, (void *)&halide_opencl_set_device_type, @@ -151,11 +155,13 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_opengl_get_proc_address, (void *)&halide_opengl_get_texture, (void *)&halide_opengl_initialize_kernels, + (void *)&halide_opengl_finalize_kernels, (void *)&halide_opengl_run, (void *)&halide_opengl_wrap_render_target, (void *)&halide_opengl_wrap_texture, (void *)&halide_openglcompute_device_interface, (void *)&halide_openglcompute_initialize_kernels, + (void *)&halide_openglcompute_finalize_kernels, (void *)&halide_openglcompute_run, (void *)&halide_pointer_to_string, (void *)&halide_print, @@ -202,6 +208,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_d3d12compute_acquire_context, (void *)&halide_d3d12compute_device_interface, (void *)&halide_d3d12compute_initialize_kernels, + (void *)&halide_d3d12compute_finalize_kernels, (void *)&halide_d3d12compute_release_context, (void *)&halide_d3d12compute_run, }; diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index 4816ad205866..9051bebb30f9 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -131,4 +131,26 @@ void destroy_cuda_context(CUcontext cuda_ctx) { cuCtxDestroy(cuda_ctx); } +#elif defined(TEST_METAL) +#include +#include + +void create_metal_context(id &device, id &queue) { + device = MTLCreateSystemDefaultDevice(); + if (device == nullptr) { + NSArray> *devices = MTLCopyAllDevices(); + if (devices != nullptr) { + device = devices[0]; + } + } + assert(device != nullptr); + queue = [device newCommandQueue]; + assert(queue != nullptr); +} + +void destroy_metal_context(id device, id queue) { + [queue release]; + [device release]; +} + #endif diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp index fdaab364b7c9..05144a4c857d 100644 --- a/test/generator/gpu_multi_context_threaded_aottest.cpp +++ b/test/generator/gpu_multi_context_threaded_aottest.cpp @@ -71,11 +71,11 @@ extern "C" int halide_release_cl_context(void *user_context) { typedef CUcontext gpu_context; -bool init_context(CUcontext &cuda_ctx) { +bool init_context(gpu_context &cuda_ctx) { return create_cuda_context(cuda_ctx); } -void destroy_context(CUcontext &cuda_ctx) { +void destroy_context(gpu_context &cuda_ctx) { destroy_cuda_context(cuda_ctx); cuda_ctx = nullptr; } @@ -97,6 +97,44 @@ extern "C" int halide_cuda_release_context(void *user_context) { return 0; } +#define HAS_MULTIPLE_CONTEXTS true +#elif defined(TEST_METAL) + +struct gpu_context { + id device; + id queue; +}; + +bool init_context(gpu_context &context) { + create_metal_context(context.device, context.queue); + return 0; +} + +void destroy_context(gpu_context &context) { + destroy_metal_context(context.device, context.queue); + context.device = nullptr; + context.queue = nullptr; +} + +int halide_metal_acquire_context(void *user_context, id *device_ret, + id *queue_ret, bool create) { + if (user_context == nullptr) { + assert(!create); + *device_ret = nullptr; + *queue_ret = nullptr; + } else { + gpu_context *context = (gpu_context *)user_context; + *device_ret = context->device; + *queue_ret = context->queue; + } + return 0; + +} + +int halide_metal_release_context(void *user_context) { + return 0; +} + #define HAS_MULTIPLE_CONTEXTS true #else typedef int gpu_context; From 03062ddb6a49764fdca8ddc644ee0a1c8e19a1df Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 10 Dec 2020 16:55:40 -0800 Subject: [PATCH 07/24] Add CUDA finalizer method. Quick fix for syntax error in C codegen. Tab fixes. Makefile fixes. --- Makefile | 8 ++++++++ src/CodeGen_C.cpp | 2 +- src/CodeGen_GPU_Host.cpp | 8 ++++---- src/runtime/cuda.cpp | 7 +++++++ src/runtime/gpu_context_common.h | 12 ++++++------ src/runtime/metal.cpp | 2 +- 6 files changed, 27 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index b26984127804..aeee9d600d70 100644 --- a/Makefile +++ b/Makefile @@ -1153,6 +1153,8 @@ GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_async_parallel,$(GENERAT GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_stubtest,$(GENERATOR_AOTCPP_TESTS)) GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_stubuser,$(GENERATOR_AOTCPP_TESTS)) +GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_gpu_multi_context_threaded,$(GENERATOR_AOTCPP_TESTS)) + test_aotcpp_generator: $(GENERATOR_AOTCPP_TESTS) # This is just a test to ensure than RunGen builds and links for a critical mass of Generators; @@ -1168,6 +1170,7 @@ GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/multitarget.rungen,$ GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/nested_externs.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/tiled_blur.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/extern_output.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) +GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/gpu_multi_context_threaded.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS)) GENERATOR_BUILD_RUNGEN_TESTS := $(GENERATOR_BUILD_RUNGEN_TESTS) \ $(FILTERS_DIR)/multi_rungen \ $(FILTERS_DIR)/multi_rungen2 \ @@ -1652,6 +1655,11 @@ test_generator_nested_externs: test_generator_gpu_multi: @echo "Skipping" +# gpu_multi_context_threaded actually contain a generator named +# "gpu_multi", and has no internal tests in any case. +test_generator_gpu_multi_context_threaded: + @echo "Skipping" + $(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h @mkdir -p $(@D) $(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@ diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 77a294abdb4f..62ae9fda7a4f 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -1583,7 +1583,7 @@ void CodeGen_C::compile(const LoweredFunc &f) { if (uses_gpu_for_loops) { stream << get_indent() << "halide_error(" - << (have_user_context ? "__user_context_" : "nullptr") + << (have_user_context ? "const_cast(__user_context)" : "nullptr") << ", \"C++ Backend does not support gpu_blocks() or gpu_threads() yet, " << "this function will always fail at runtime\");\n"; stream << get_indent() << "return halide_error_code_device_malloc_failed;\n"; diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp index 2815f42b62ee..b2f5c1e0909e 100644 --- a/src/CodeGen_GPU_Host.cpp +++ b/src/CodeGen_GPU_Host.cpp @@ -213,12 +213,12 @@ void CodeGen_GPU_Host::compile_func(const LoweredFunc &f, Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0)); CodeGen_CPU::create_assertion(did_succeed, Expr(), result); - // Generate a finalizer call as well to relase any refcounts or other resource usage - // specific to this filter call. + // Generate a finalizer call as well to relase any refcounts or other resource usage + // specific to this filter call. std::string finalize_kernels_name = "halide_" + api_unique_name + "_finalize_kernels"; llvm::Function *finalize = module->getFunction(finalize_kernels_name); - Value *module_state_value = builder->CreateLoad(module_state); - register_destructor(finalize, module_state_value, CodeGen_CPU::Always); + Value *module_state_value = builder->CreateLoad(module_state); + register_destructor(finalize, module_state_value, CodeGen_CPU::Always); } // the init kernels block should branch to the post-entry block diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 872fd65b74fa..39f4e6f1fa81 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -533,6 +533,13 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co return 0; } +WEAK void halide_cuda_finalize_kernels(void *user_context, void *state_ptr) { + Context ctx(user_context); + if (ctx.error == 0) { + compilation_cache.release_hold(user_context, ctx.context, state_ptr); + } +} + WEAK int halide_cuda_release_unused_device_allocations(void *user_context) { FreeListItem *to_free; { diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h index f45d2883755b..b6d8ddadb317 100644 --- a/src/runtime/gpu_context_common.h +++ b/src/runtime/gpu_context_common.h @@ -167,12 +167,12 @@ class GPUCompilationCache { ScopedMutexLock lock_guard(&mutex); release_context(user_context, true, nullptr, f); - // Some items may have been in use, so can't free. - if (count == 0) { - free(compilations); - compilations = nullptr; - log2_compilations_size = 0; - } + // Some items may have been in use, so can't free. + if (count == 0) { + free(compilations); + compilations = nullptr; + log2_compilations_size = 0; + } } template diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index 0cd4492bf6db..52b43f4c830b 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -622,7 +622,7 @@ WEAK int halide_metal_device_release(void *user_context) { if (acquired_device) { halide_metal_device_sync_internal(queue, nullptr); - debug(user_context) << "Calling delete context on device " << acquired_device << "\n"; + debug(user_context) << "Calling delete context on device " << acquired_device << "\n"; compilation_cache.delete_context(user_context, acquired_device, release_ns_object); // Release the device itself, if we created it. From 42f9ccc744632371c2d957ff71506eed4226064b Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 10 Dec 2020 17:05:27 -0800 Subject: [PATCH 08/24] Conditionalize Objective C support. --- test/common/gpu_context.h | 2 +- test/generator/gpu_multi_context_threaded_aottest.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index 9051bebb30f9..13a42ea64b88 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -131,7 +131,7 @@ void destroy_cuda_context(CUcontext cuda_ctx) { cuCtxDestroy(cuda_ctx); } -#elif defined(TEST_METAL) +#elif defined(TEST_METAL) && defined(__OBJC__) #include #include diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp index 05144a4c857d..8b629defb9dd 100644 --- a/test/generator/gpu_multi_context_threaded_aottest.cpp +++ b/test/generator/gpu_multi_context_threaded_aottest.cpp @@ -98,7 +98,7 @@ extern "C" int halide_cuda_release_context(void *user_context) { } #define HAS_MULTIPLE_CONTEXTS true -#elif defined(TEST_METAL) +#elif defined(TEST_METAL) && defined(__OBJC__) struct gpu_context { id device; From b634ac07a48238b018b60ca383e85babd61ee997 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 10 Dec 2020 17:24:09 -0800 Subject: [PATCH 09/24] Fix clang-format complaints. --- src/runtime/gpu_context_common.h | 14 ++++++-------- src/runtime/opencl.cpp | 5 ++--- test/common/gpu_context.h | 8 ++++---- .../gpu_multi_context_threaded_aottest.cpp | 6 +++--- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h index b6d8ddadb317..9b9bbaab4a65 100644 --- a/src/runtime/gpu_context_common.h +++ b/src/runtime/gpu_context_common.h @@ -17,7 +17,6 @@ class GPUCompilationCache { : context(context), module_state(module_state), kernel_id(kernel_id), use_count(use_count) { } - }; halide_mutex mutex; @@ -45,7 +44,7 @@ class GPUCompilationCache { } } - HALIDE_MUST_USE_RESULT bool insert(const CachedCompilation &entry) { + HALIDE_MUST_USE_RESULT bool insert(const CachedCompilation &entry) { if (log2_compilations_size == 0) { if (!resize_table(kInitialTableBits)) { return false; @@ -144,9 +143,9 @@ class GPUCompilationCache { if (compilations[i].kernel_id > kInvalidId && (all || (compilations[i].context == context)) && compilations[i].use_count == 0) { - debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state << - " id " << compilations[i].kernel_id << - " context " << compilations[i].context << "\n"; + debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state + << " id " << compilations[i].kernel_id + << " context " << compilations[i].context << "\n"; f(compilations[i].module_state); compilations[i].module_state = nullptr; compilations[i].kernel_id = kDeletedId; @@ -195,13 +194,12 @@ class GPUCompilationCache { // TODO(zvookin): figure out the calling signature here... ModuleStateT compiled_module = f(args...); - debug(user_context) << "Caching compiled kernel: " << compiled_module << - " id " << *id_ptr << " context " << context << "\n"; + debug(user_context) << "Caching compiled kernel: " << compiled_module + << " id " << *id_ptr << " context " << context << "\n"; if (compiled_module == nullptr) { return false; } - if (!insert({context, compiled_module, *id_ptr, 1})) { return false; } diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index 10bd05ccc315..5ee0d7559ab1 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -763,7 +763,7 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, WEAK void halide_opencl_finalize_kernels(void *user_context, void *state_ptr) { debug(user_context) << "CL: halide_opencl_finalize_kernels (user_context: " << user_context - << ", state_ptr: " << state_ptr << "\n";; + << ", state_ptr: " << state_ptr << "\n"; ClContext ctx(user_context); if (ctx.error_code == CL_SUCCESS) { compilation_cache.release_hold(user_context, ctx.context, state_ptr); @@ -1059,7 +1059,7 @@ WEAK int halide_opencl_run(void *user_context, // Create kernel object for entry_name from the program for this module. halide_assert(user_context, state_ptr); - + cl_program program{}; bool found = compilation_cache.lookup(ctx.context, state_ptr, program); halide_assert(user_context, found && program != nullptr); @@ -1350,7 +1350,6 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() { namespace { WEAK __attribute__((destructor)) void halide_opencl_cleanup() { - debug(nullptr) << "halide_opencl_cleanup\n"; compilation_cache.release_all(nullptr, clReleaseProgram); halide_opencl_device_release(nullptr); } diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index 13a42ea64b88..b3a7c3ea3e29 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -132,16 +132,16 @@ void destroy_cuda_context(CUcontext cuda_ctx) { } #elif defined(TEST_METAL) && defined(__OBJC__) -#include #include +#include void create_metal_context(id &device, id &queue) { device = MTLCreateSystemDefaultDevice(); if (device == nullptr) { NSArray> *devices = MTLCopyAllDevices(); - if (devices != nullptr) { - device = devices[0]; - } + if (devices != nullptr) { + device = devices[0]; + } } assert(device != nullptr); queue = [device newCommandQueue]; diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp index 8b629defb9dd..388f724eb78f 100644 --- a/test/generator/gpu_multi_context_threaded_aottest.cpp +++ b/test/generator/gpu_multi_context_threaded_aottest.cpp @@ -117,15 +117,15 @@ void destroy_context(gpu_context &context) { } int halide_metal_acquire_context(void *user_context, id *device_ret, - id *queue_ret, bool create) { + id *queue_ret, bool create) { if (user_context == nullptr) { assert(!create); *device_ret = nullptr; *queue_ret = nullptr; } else { gpu_context *context = (gpu_context *)user_context; - *device_ret = context->device; - *queue_ret = context->queue; + *device_ret = context->device; + *queue_ret = context->queue; } return 0; From 285750a71bc3b9b13ad3c16f0ef167c30860f674 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 10 Dec 2020 17:29:42 -0800 Subject: [PATCH 10/24] Fix clang-format complaints. --- test/common/gpu_context.h | 2 +- test/generator/gpu_multi_context_threaded_aottest.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index b3a7c3ea3e29..6c697c4fd757 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -146,7 +146,7 @@ void create_metal_context(id &device, id &queue) { assert(device != nullptr); queue = [device newCommandQueue]; assert(queue != nullptr); -} +} void destroy_metal_context(id device, id queue) { [queue release]; diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp index 388f724eb78f..0d58fa6b91ef 100644 --- a/test/generator/gpu_multi_context_threaded_aottest.cpp +++ b/test/generator/gpu_multi_context_threaded_aottest.cpp @@ -128,7 +128,6 @@ int halide_metal_acquire_context(void *user_context, id *device_ret, *queue_ret = context->queue; } return 0; - } int halide_metal_release_context(void *user_context) { From e78ce1400fc8057234e1f1d973aab1ddda0dfb12 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 10 Dec 2020 20:53:07 -0800 Subject: [PATCH 11/24] Attempt to fix new test failure with cmake. --- test/generator/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index 130af49b7f93..af7b9b8728ad 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -124,10 +124,12 @@ halide_define_aot_test(acquire_release) if (TARGET_NVPTX AND Halide_TARGET MATCHES "cuda") include(AddCudaToTarget) add_cuda_to_target(generator_aot_acquire_release PRIVATE) + add_cuda_to_target(generator_aot_multi_context_threaded PRIVATE) endif () if (TARGET_NVPTX AND Halide_TARGET MATCHES "opencl") find_package(OpenCL REQUIRED) target_link_libraries(generator_aot_acquire_release PRIVATE OpenCL::OpenCL) + target_link_libraries(generator_aot_multi_context_threaded PRIVATE OpenCL::OpenCL) endif () # TODO: what are these? From 1046ccc78cc061cd23461ab31bac8caf85a5c22d Mon Sep 17 00:00:00 2001 From: Z Stern Date: Mon, 14 Dec 2020 14:52:54 -0800 Subject: [PATCH 12/24] Add Metal support to acquire_release test and make it so it doesn't fail without a GPU. --- test/generator/acquire_release_aottest.cpp | 42 +++++++++++++++++++--- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp index 9e4fba70afe6..dbb16c29606a 100644 --- a/test/generator/acquire_release_aottest.cpp +++ b/test/generator/acquire_release_aottest.cpp @@ -76,6 +76,41 @@ extern "C" int halide_cuda_release_context(void *user_context) { printf("Releasing CUDA context %p\n", cuda_ctx); return 0; } +#elif defined(TEST_METAL) && defined(__OBJC__) + +struct gpu_context { + id device; + id queue; +}; + +bool init_context(gpu_context &context) { + create_metal_context(context.device, context.queue); + return 0; +} + +void destroy_context(gpu_context &context) { + destroy_metal_context(context.device, context.queue); + context.device = nullptr; + context.queue = nullptr; +} + +int halide_metal_acquire_context(void *user_context, id *device_ret, + id *queue_ret, bool create) { + if (user_context == nullptr) { + assert(!create); + *device_ret = nullptr; + *queue_ret = nullptr; + } else { + gpu_context *context = (gpu_context *)user_context; + *device_ret = context->device; + *queue_ret = context->queue; + } + return 0; +} + +int halide_metal_release_context(void *user_context) { + return 0; +} #else // Just use the default implementation of acquire/release. bool init_context() { @@ -128,14 +163,13 @@ bool run_test() { if (interface != nullptr) { halide_device_release(nullptr, interface); + + // Free the context we created. + destroy_context(); } else { printf("Device interface is nullptr.\n"); - return false; } - // Free the context we created. - destroy_context(); - printf("Success!\n"); return true; } From 3352bf587c6b49b3626b29749ceacfe5dd92aa0a Mon Sep 17 00:00:00 2001 From: Z Stern Date: Wed, 16 Dec 2020 23:20:42 -0800 Subject: [PATCH 13/24] Fix CMake cuda target issue. Add comment to Makefile per review feedback. --- Makefile | 5 +++++ test/generator/CMakeLists.txt | 19 +++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 6e41c930af86..e2e1cb129563 100644 --- a/Makefile +++ b/Makefile @@ -333,6 +333,11 @@ TEST_CXX_FLAGS += -DTEST_OPENCL endif ifneq ($(TEST_METAL), ) +# Using Metal APIs requires writing Objective-C++ (or Swift). Add ObjC++ +# to allow tests to create and destroy Metal contexts, etc. This requires +# tests to be valid Objective-C++, e.g. avoiding using the identifier "id" +# in certain ways. In practice this is not enough of a problem to justify +# the work to limit which files are compiled this way. TEST_CXX_FLAGS += -DTEST_METAL -ObjC++ endif diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt index af7b9b8728ad..95942efd854a 100644 --- a/test/generator/CMakeLists.txt +++ b/test/generator/CMakeLists.txt @@ -114,22 +114,26 @@ if (NOT ${USING_WASM}) target_link_libraries(cxx_mangling_define_extern_externs PRIVATE cxx_mangling) endif () +if (TARGET_NVPTX AND Halide_TARGET MATCHES "cuda") + include(AddCudaToTarget) +endif () +if (TARGET_NVPTX AND Halide_TARGET MATCHES "opencl") + find_package(OpenCL REQUIRED) +endif () + ## # Create targets for the AOT tests ## # acquire_release_aottest.cpp # acquire_release_generator.cpp +# acquire_release_generator.cpp halide_define_aot_test(acquire_release) if (TARGET_NVPTX AND Halide_TARGET MATCHES "cuda") - include(AddCudaToTarget) add_cuda_to_target(generator_aot_acquire_release PRIVATE) - add_cuda_to_target(generator_aot_multi_context_threaded PRIVATE) endif () if (TARGET_NVPTX AND Halide_TARGET MATCHES "opencl") - find_package(OpenCL REQUIRED) target_link_libraries(generator_aot_acquire_release PRIVATE OpenCL::OpenCL) - target_link_libraries(generator_aot_multi_context_threaded PRIVATE OpenCL::OpenCL) endif () # TODO: what are these? @@ -272,6 +276,13 @@ add_halide_library(gpu_multi_context_threaded_add FROM gpu_multi_context_threade add_halide_library(gpu_multi_context_threaded_mul FROM gpu_multi_context_threaded.generator FEATURES user_context) +if (TARGET_NVPTX AND Halide_TARGET MATCHES "cuda") + add_cuda_to_target(generator_aot_gpu_multi_context_threaded PRIVATE) +endif () +if (TARGET_NVPTX AND Halide_TARGET MATCHES "opencl") + target_link_libraries(generator_aot_gpu_multi_context_threaded PRIVATE OpenCL::OpenCL) +endif () + # gpu_object_lifetime_aottest.cpp # gpu_object_lifetime_generator.cpp halide_define_aot_test(gpu_object_lifetime FEATURES debug) From 05c8c7cd5b48b591bc54f283cf9d74f0c8a2be55 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Wed, 16 Dec 2020 23:38:06 -0800 Subject: [PATCH 14/24] Add a couple more locals initializations for safety. --- src/runtime/d3d12compute.cpp | 2 +- src/runtime/metal.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index f298dfe26727..bae6260fa8dd 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -2766,7 +2766,7 @@ WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state D3D12ContextHolder d3d12_context(user_context, true); int error = halide_error_code_generic_error; - d3d12_library *library; + d3d12_library *library{}; if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device, library, compile_kernel, user_context, source, source_size, &error)) { diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index 52b43f4c830b..ead7591fcccc 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -546,7 +546,7 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c uint64_t t_before = halide_current_time_ns(user_context); #endif - mtl_library *library; + mtl_library *library{}; if (!compilation_cache.kernel_state_setup(user_context, state_ptr, metal_context.device, library, new_library_with_source, metal_context.device, source, source_size)) { From 1dd25f7608687e320ed4bee8acdf61c4586d00a1 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Thu, 17 Dec 2020 12:41:07 -0800 Subject: [PATCH 15/24] Remove extraneous test that somehow got moved into header file. --- test/common/gpu_context.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index 6c697c4fd757..d0ca579d6bb9 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -9,10 +9,6 @@ #include #endif -// Just use a global context and queue, created and destroyed by main. -cl_context cl_ctx = nullptr; -cl_command_queue cl_q = nullptr; - // Create the global context. This is just a helper function not called by Halide. bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) { cl_int err = 0; From 10a1388ccaf6787e35ced172b953acdf781a8e01 Mon Sep 17 00:00:00 2001 From: Z Stern Date: Fri, 18 Dec 2020 02:18:48 -0800 Subject: [PATCH 16/24] Fix OpenCL code per erroneous use of globals. --- test/generator/gpu_multi_context_threaded_aottest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp index 0d58fa6b91ef..212c00abad89 100644 --- a/test/generator/gpu_multi_context_threaded_aottest.cpp +++ b/test/generator/gpu_multi_context_threaded_aottest.cpp @@ -42,8 +42,8 @@ bool init_context(gpu_context &context) { void destroy_context(gpu_context &context) { destroy_opencl_context(context.cl_ctx, context.cl_q); - cl_q = nullptr; - cl_ctx = nullptr; + context.cl_q = nullptr; + context.cl_ctx = nullptr; } // These functions replace the acquire/release implementation in src/runtime/opencl.cpp. From 1962b2573e80e4dce766af6e1457418069a59d3b Mon Sep 17 00:00:00 2001 From: Z Stern Date: Wed, 6 Jan 2021 02:04:28 -0800 Subject: [PATCH 17/24] Fix errors for MEtal case in acquire_release_aottest.cpp. Make Metal context creation test API consistent with CUDA and OpenCL by having it return a success/fail indication instead of asserting internally. --- test/common/gpu_context.h | 13 ++++++++--- test/generator/acquire_release_aottest.cpp | 27 ++++++++-------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index d0ca579d6bb9..c09f601917ee 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -131,7 +131,7 @@ void destroy_cuda_context(CUcontext cuda_ctx) { #include #include -void create_metal_context(id &device, id &queue) { +bool create_metal_context(id &device, id &queue) { device = MTLCreateSystemDefaultDevice(); if (device == nullptr) { NSArray> *devices = MTLCopyAllDevices(); @@ -139,9 +139,16 @@ void create_metal_context(id &device, id &queue) { device = devices[0]; } } - assert(device != nullptr); + if (device == nullptr) { + printf("Failed to find Metal device.\n"); + return false; + } queue = [device newCommandQueue]; - assert(queue != nullptr); + if (queue == nullptr) { + printf("Failed to create Metal command queue.\n"); + return false; + } + return true; } void destroy_metal_context(id device, id queue) { diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp index dbb16c29606a..3bc4b7efc20c 100644 --- a/test/generator/acquire_release_aottest.cpp +++ b/test/generator/acquire_release_aottest.cpp @@ -81,30 +81,23 @@ extern "C" int halide_cuda_release_context(void *user_context) { struct gpu_context { id device; id queue; -}; +} metal_context; -bool init_context(gpu_context &context) { - create_metal_context(context.device, context.queue); - return 0; +bool init_context() { + return create_metal_context(metal_context.device, metal_context.queue); } -void destroy_context(gpu_context &context) { - destroy_metal_context(context.device, context.queue); - context.device = nullptr; - context.queue = nullptr; +void destroy_context() { + destroy_metal_context(metal_context.device, metal_context.queue); + metal_context.device = nullptr; + metal_context.queue = nullptr; } int halide_metal_acquire_context(void *user_context, id *device_ret, id *queue_ret, bool create) { - if (user_context == nullptr) { - assert(!create); - *device_ret = nullptr; - *queue_ret = nullptr; - } else { - gpu_context *context = (gpu_context *)user_context; - *device_ret = context->device; - *queue_ret = context->queue; - } + *device_ret = metal_context.device; + *queue_ret = metal_context.queue; + return 0; } From 054a5355a1dee0afcf60acbd4b6d2fa47bb2f06a Mon Sep 17 00:00:00 2001 From: Z Stern Date: Wed, 6 Jan 2021 02:21:37 -0800 Subject: [PATCH 18/24] Fix formatting. --- test/common/gpu_context.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index c09f601917ee..23423bcb0cd5 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -141,12 +141,12 @@ bool create_metal_context(id &device, id &queue) { } if (device == nullptr) { printf("Failed to find Metal device.\n"); - return false; + return false; } queue = [device newCommandQueue]; if (queue == nullptr) { printf("Failed to create Metal command queue.\n"); - return false; + return false; } return true; } From cf862ceeeeb96c71743a03d0f99290568b3765de Mon Sep 17 00:00:00 2001 From: Z Stern Date: Wed, 13 Jan 2021 19:17:35 -0800 Subject: [PATCH 19/24] Fix D3D runtime to work like Metal does and not reentrantly acquire the device context when compiling a kernel. --- src/runtime/d3d12compute.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 72b05118c761..4a91c8047209 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -2440,14 +2440,8 @@ volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0; WEAK Halide::Internal::GPUCompilationCache compilation_cache; -WEAK d3d12_library *compile_kernel(void *user_context, const char *source, int source_size, int *error_ret) { - D3D12ContextHolder d3d12_context(user_context, true); - if (d3d12_context.error != 0) { - *error_ret = d3d12_context.error; - return nullptr; - } - - d3d12_library *library = new_library_with_source(d3d12_context.device, source, source_size); +WEAK d3d12_library *compile_kernel(d3d12_device *device, const char *source, int source_size, int *error_ret) { + d3d12_library *library = new_library_with_source(device, source, source_size); if (library == nullptr) { TRACEFATAL("D3D12Compute: new_library_with_source failed."); *error_ret = halide_error_code_out_of_memory; @@ -2768,7 +2762,7 @@ WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state int error = halide_error_code_generic_error; d3d12_library *library{}; if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device, - library, compile_kernel, user_context, + library, compile_kernel, d3d12_context.device, source, source_size, &error)) { return error; } From 871acc3d27ac1c9bff61f3fa4eb5280a36678448 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 25 Jan 2021 11:38:08 -0800 Subject: [PATCH 20/24] trigger buildbots From 593a59ab95847f31eb695565a4460b45c9c75998 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 25 Jan 2021 12:45:01 -0800 Subject: [PATCH 21/24] trigger buildbots From 6b877df494abf96b52a1c3b25b462de8e9b25dee Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 25 Jan 2021 14:22:54 -0800 Subject: [PATCH 22/24] trigger buildbots From c61bf3971e84240f4211d7ef054179ce8e74504a Mon Sep 17 00:00:00 2001 From: Marcos Slomp Date: Tue, 26 Jan 2021 14:34:02 -0800 Subject: [PATCH 23/24] bugfix --- src/runtime/d3d12compute.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index 4a91c8047209..64085a83f7b8 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -2493,7 +2493,7 @@ static int d3d12_create_context(void *user_context) { } if (status == halide_error_code_success) { - halide_assert(cmd_allocator_main, (cmd_allocator_main == nullptr)); + halide_assert(user_context, (cmd_allocator_main == nullptr)); cmd_allocator_main = new_command_allocator(device); if (cmd_allocator_main == nullptr) { status = halide_error_code_generic_error; @@ -2503,8 +2503,8 @@ static int d3d12_create_context(void *user_context) { if (status == halide_error_code_success) { // NOTE(marcos): a small amount of hard-coded staging buffer storage is // sufficient to get started as suballocations will grow them as needed - halide_assert(cmd_allocator_main, (upload == 0)); - halide_assert(cmd_allocator_main, (readback == 0)); + halide_assert(user_context, (upload == 0)); + halide_assert(user_context, (readback == 0)); size_t heap_size = 4 * 1024 * 1024; upload = new_upload_buffer(device, heap_size); readback = new_readback_buffer(device, heap_size); @@ -2853,6 +2853,7 @@ WEAK int halide_d3d12compute_device_release(void *user_context) { hFenceEvent = nullptr; release_object(cmd_allocator_main); + cmd_allocator_main = nullptr; release_object(device); device = nullptr; From 6b79b654c95b92df9ad7f1abe7ac91634e30c646 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 27 Jan 2021 15:02:09 -0800 Subject: [PATCH 24/24] trigger buildbots