From d6f60538c67cddf830890694c97c1385f7c91032 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Mon, 7 Dec 2020 13:01:04 -0800
Subject: [PATCH 01/24] Revert "Revert "Make context handling in GPU runtimes
 more consistent and robust. (#5474)" (#5515)"

This reverts commit 2ddd0b0fe70c709813079c89ec6bdc52d27c9093.
---
 Makefile                                      |  26 +++
 src/runtime/cuda.cpp                          | 163 ++++----------
 src/runtime/d3d12compute.cpp                  |  78 +++----
 src/runtime/destructors.cpp                   |   1 +
 src/runtime/gpu_context_common.h              | 197 +++++++++++++++++
 src/runtime/metal.cpp                         |  68 ++----
 src/runtime/opencl.cpp                        | 208 ++++++++----------
 test/common/gpu_context.h                     | 134 +++++++++++
 test/common/gpu_object_lifetime_tracker.h     |   7 +-
 test/correctness/CMakeLists.txt               |   1 +
 test/correctness/gpu_many_kernels.cpp         |  92 ++++++++
 test/generator/CMakeLists.txt                 |  13 ++
 test/generator/acquire_release_aottest.cpp    | 159 +++----------
 .../gpu_multi_context_threaded_aottest.cpp    | 193 ++++++++++++++++
 .../gpu_multi_context_threaded_generator.cpp  |  48 ++++
 15 files changed, 923 insertions(+), 465 deletions(-)
 create mode 100644 src/runtime/gpu_context_common.h
 create mode 100644 test/common/gpu_context.h
 create mode 100644 test/correctness/gpu_many_kernels.cpp
 create mode 100644 test/generator/gpu_multi_context_threaded_aottest.cpp
 create mode 100644 test/generator/gpu_multi_context_threaded_generator.cpp

diff --git a/Makefile b/Makefile
index ba9fcd3f9b4d..6ce06a94b568 100644
--- a/Makefile
+++ b/Makefile
@@ -1527,6 +1527,12 @@ $(FILTERS_DIR)/nested_externs_%.a: $(BIN_DIR)/nested_externs.generator
 	@mkdir -p $(@D)
 	$(CURDIR)/$< -g nested_externs_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime
 
+# Similarly, gpu_multi needs two different kernels to test compilation caching.
+# Also requies user-context.
+$(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_threaded.generator
+	@mkdir -p $(@D)
+	$(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context
+
 GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas
 GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools
 GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS)
@@ -1622,11 +1628,31 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
 	HL_MULTITARGET_TEST_USE_NOBOUNDSQUERY_FEATURE=1 $(CURDIR)/$<
 	@-echo
 
+# gpu_multi_context_threaded has additional deps to link in
+$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
+	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a \
+	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \
+	                                                       $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
+	@mkdir -p $(@D)
+	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
+
+$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
+	                                                          $(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp \
+	                                                          $(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp \
+	                                                          $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
+	@mkdir -p $(@D)
+	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
+
 # nested externs doesn't actually contain a generator named
 # "nested_externs", and has no internal tests in any case.
 test_generator_nested_externs:
 	@echo "Skipping"
 
+# gpu_multi actually contain a generator named
+# "gpu_multi", and has no internal tests in any case.
+test_generator_gpu_multi:
+	@echo "Skipping"
+
 $(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h
 	@mkdir -p $(@D)
 	$(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 7c423e179d85..1dc1cf27086e 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1,6 +1,7 @@
 #include "HalideRuntimeCuda.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
+#include "gpu_context_common.h"
 #include "mini_cuda.h"
 #include "printer.h"
 #include "scoped_mutex_lock.h"
@@ -239,43 +240,7 @@ class Context {
     }
 };
 
-// Halide allocates a device API controlled pointer slot as part of
-// each compiled module. The slot is used to store information to
-// avoid having to reload/recompile kernel code on each call into a
-// Halide filter. The cuda runtime uses this pointer to maintain a
-// linked list of contexts into which the module has been loaded.
-//
-// A global list of all registered filters is also kept so all modules
-// loaded on a given context can be unloaded and removed from the list
-// when halide_device_release is called on a specific context.
-//
-// The registered_filters struct is not freed as it is pointed to by the
-// Halide generated code. The module_state structs are freed.
-
-struct module_state {
-    CUcontext context;
-    CUmodule module;
-    module_state *next;
-};
-
-struct registered_filters {
-    module_state *modules;
-    registered_filters *next;
-};
-WEAK registered_filters *filters_list = nullptr;
-// This spinlock protects the above filters_list.
-WEAK halide_mutex filters_list_lock;
-
-WEAK module_state *find_module_for_context(const registered_filters *filters, CUcontext ctx) {
-    module_state *modules = filters->modules;
-    while (modules != nullptr) {
-        if (modules->context == ctx) {
-            return modules;
-        }
-        modules = modules->next;
-    }
-    return nullptr;
-}
+WEAK Halide::Internal::GPUCompilationCache<CUcontext, CUmodule> compilation_cache;
 
 WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
     // Initialize CUDA
@@ -505,6 +470,33 @@ WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size
 #endif
 }
 
+WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size) {
+    debug(user_context) << "CUDA: compile_kernel cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
+
+    CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
+    unsigned int max_regs_per_thread = 64;
+
+    // A hack to enable control over max register count for
+    // testing. This should be surfaced in the schedule somehow
+    // instead.
+    char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
+    if (regs) {
+        max_regs_per_thread = atoi(regs);
+    }
+    void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
+    CUmodule loaded_module;
+    CUresult err = cuModuleLoadDataEx(&loaded_module, ptx_src, 1, options, optionValues);
+
+    if (err != CUDA_SUCCESS) {
+        error(user_context) << "CUDA: cuModuleLoadData failed: "
+                            << get_error_name(err);
+        return nullptr;
+    } else {
+        debug(user_context) << (void *)(loaded_module) << "\n";
+    }
+    return loaded_module;
+}
+
 }  // namespace Cuda
 }  // namespace Internal
 }  // namespace Runtime
@@ -526,54 +518,12 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    halide_assert(user_context, &filters_list_lock != nullptr);
-    {
-        ScopedMutexLock spinlock(&filters_list_lock);
-
-        // Create the state object if necessary. This only happens once, regardless
-        // of how many times halide_initialize_kernels/halide_release is called.
-        // halide_release traverses this list and releases the module objects, but
-        // it does not modify the list nodes created/inserted here.
-        registered_filters **filters = (registered_filters **)state_ptr;
-        if (!(*filters)) {
-            *filters = (registered_filters *)malloc(sizeof(registered_filters));
-            (*filters)->modules = nullptr;
-            (*filters)->next = filters_list;
-            filters_list = *filters;
-        }
-
-        // Create the module itself if necessary.
-        module_state *loaded_module = find_module_for_context(*filters, ctx.context);
-        if (loaded_module == nullptr) {
-            loaded_module = (module_state *)malloc(sizeof(module_state));
-            debug(user_context) << "    cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
-
-            CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
-            unsigned int max_regs_per_thread = 64;
-
-            // A hack to enable control over max register count for
-            // testing. This should be surfaced in the schedule somehow
-            // instead.
-            char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
-            if (regs) {
-                max_regs_per_thread = atoi(regs);
-            }
-            void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
-            CUresult err = cuModuleLoadDataEx(&loaded_module->module, ptx_src, 1, options, optionValues);
-
-            if (err != CUDA_SUCCESS) {
-                free(loaded_module);
-                error(user_context) << "CUDA: cuModuleLoadData failed: "
-                                    << get_error_name(err);
-                return err;
-            } else {
-                debug(user_context) << (void *)(loaded_module->module) << "\n";
-            }
-            loaded_module->context = ctx.context;
-            loaded_module->next = (*filters)->modules;
-            (*filters)->modules = loaded_module;
-        }
-    }  // spinlock
+    CUmodule loaded_module;
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, loaded_module,
+                                              compile_kernel, user_context, ptx_src, size)) {
+        return halide_error_code_generic_error;
+    }
+    halide_assert(user_context, loaded_module != nullptr);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -704,7 +654,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
         << "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n";
 
     // If we haven't even loaded libcuda, don't load it just to quit.
-    if (!lib_cuda) {
+    if (!cuInit) {
         return 0;
     }
 
@@ -728,34 +678,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
         // Dump the contents of the free list, ignoring errors.
         halide_cuda_release_unused_device_allocations(user_context);
 
-        {
-            ScopedMutexLock spinlock(&filters_list_lock);
-
-            // Unload the modules attached to this context. Note that the list
-            // nodes themselves are not freed, only the module objects are
-            // released. Subsequent calls to halide_init_kernels might re-create
-            // the program object using the same list node to store the module
-            // object.
-            registered_filters *filters = filters_list;
-            while (filters) {
-                module_state **prev_ptr = &filters->modules;
-                module_state *loaded_module = filters->modules;
-                while (loaded_module != nullptr) {
-                    if (loaded_module->context == ctx) {
-                        debug(user_context) << "    cuModuleUnload " << loaded_module->module << "\n";
-                        err = cuModuleUnload(loaded_module->module);
-                        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
-                        *prev_ptr = loaded_module->next;
-                        free(loaded_module);
-                        loaded_module = *prev_ptr;
-                    } else {
-                        loaded_module = loaded_module->next;
-                        prev_ptr = &loaded_module->next;
-                    }
-                }
-                filters = filters->next;
-            }
-        }  // spinlock
+        compilation_cache.delete_context(user_context, ctx, cuModuleUnload);
 
         CUcontext old_ctx;
         cuCtxPopCurrent(&old_ctx);
@@ -919,12 +842,15 @@ WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c,
                             << (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n";
         if (!from_host && to_host) {
             debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
+            copy_name = "cuMemcpyDtoH";
             err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size);
         } else if (from_host && !to_host) {
             debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
+            copy_name = "cuMemcpyHtoD";
             err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size);
         } else if (!from_host && !to_host) {
             debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
+            copy_name = "cuMemcpyDtoD";
             err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size);
         } else if (dst != src) {
             debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
@@ -1133,9 +1059,9 @@ WEAK int halide_cuda_run(void *user_context,
 #endif
 
     halide_assert(user_context, state_ptr);
-    module_state *loaded_module = find_module_for_context((registered_filters *)state_ptr, ctx.context);
-    halide_assert(user_context, loaded_module != nullptr);
-    CUmodule mod = loaded_module->module;
+    CUmodule mod = nullptr;
+    bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod);
+    halide_assert(user_context, found_module && mod != nullptr);
     debug(user_context) << "Got module " << mod << "\n";
     halide_assert(user_context, mod);
     CUfunction f;
@@ -1264,7 +1190,7 @@ WEAK const halide_device_interface_t *halide_cuda_device_interface() {
 }
 
 WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) {
-    if (!lib_cuda) {
+    if (!lib_cuda && !cuInit) {
         // If cuda can't be found, we want to return 0, 0 and it's not
         // considered an error. So we should be very careful about
         // looking for libcuda without tripping any errors in the rest
@@ -1313,6 +1239,7 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min
 
 namespace {
 WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
+    compilation_cache.release_all(nullptr, cuModuleUnload);
     halide_cuda_device_release(nullptr);
 }
 }  // namespace
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 49ef68a0118a..3174c33f52a4 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -45,6 +45,7 @@
 #include "HalideRuntimeD3D12Compute.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
+#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -2437,16 +2438,24 @@ static void *buffer_contents(d3d12_buffer *buffer) {
 
 volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0;
 
-// Structure to hold the state of a module attached to the context.
-// Also used as a linked-list to keep track of all the different
-// modules that are attached to a context in order to release them all
-// when then context is released.
-struct module_state {
-    d3d12_library *library;
-    module_state *next;
-};
-D3D12TYPENAME(module_state)
-WEAK module_state *state_list = nullptr;
+WEAK Halide::Internal::GPUCompilationCache<d3d12_device *, d3d12_library *> compilation_cache;
+
+WEAK d3d12_library *compile_kernel(void *user_context, const char *source, int source_size, int *error_ret) {
+    D3D12ContextHolder d3d12_context(user_context, true);
+    if (d3d12_context.error != 0) {
+        *error_ret = d3d12_context.error;
+        return nullptr;
+    }
+
+    d3d12_library *library = new_library_with_source(d3d12_context.device, source, source_size);
+    if (library == nullptr) {
+        TRACEFATAL("D3D12Compute: new_library_with_source failed.");
+        *error_ret = halide_error_code_out_of_memory;
+        return nullptr;
+    }
+
+    return library;
+}
 
 }  // namespace D3D12Compute
 }  // namespace Internal
@@ -2754,29 +2763,14 @@ WEAK int halide_d3d12compute_device_free(void *user_context, halide_buffer_t *bu
 WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) {
     TRACELOG;
 
-    // Create the state object if necessary. This only happens once, regardless
-    // of how many times halide_initialize_kernels/halide_release is called.
-    // halide_release traverses this list and releases the module objects, but
-    // it does not modify the list nodes created/inserted here.
-    module_state *&state = *(module_state **)state_ptr;
-    if (!state) {
-        state = malloct<module_state>();
-        state->library = nullptr;
-        state->next = state_list;
-        state_list = state;
-    }
-
     D3D12ContextHolder d3d12_context(user_context, true);
-    if (d3d12_context.error != 0) {
-        return d3d12_context.error;
-    }
 
-    if (state->library == nullptr) {
-        state->library = new_library_with_source(d3d12_context.device, source, source_size);
-        if (state->library == nullptr) {
-            TRACEFATAL("D3D12Compute: new_library_with_source failed.");
-            return halide_error_code_out_of_memory;
-        }
+    int error = halide_error_code_generic_error;
+    d3d12_library *library;
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device,
+                                              library, compile_kernel, user_context,
+                                              source, source_size, &error)) {
+        return error;
     }
 
     return 0;
@@ -2839,19 +2833,7 @@ WEAK int halide_d3d12compute_device_release(void *user_context) {
             release_object(buffer);
         }
 
-        // Unload the modules attached to this device. Note that the list
-        // nodes themselves are not freed, only the program objects are
-        // released. Subsequent calls to halide_init_kernels might re-create
-        // the program object using the same list node to store the program
-        // object.
-        module_state *state = state_list;
-        while (state) {
-            if (state->library) {
-                release_object(state->library);
-                state->library = nullptr;
-            }
-            state = state->next;
-        }
+        compilation_cache.delete_context(user_context, device, release_object<d3d12_library>);
 
         // Release the device itself, if we created it.
         if (acquired_device == device) {
@@ -3023,8 +3005,9 @@ WEAK int halide_d3d12compute_run(void *user_context,
     StartCapturingGPUActivity();
 #endif
 
-    halide_assert(user_context, state_ptr);
-    module_state *state = (module_state *)state_ptr;
+    d3d12_library *library = nullptr;
+    bool found_module = compilation_cache.lookup(device, state_ptr, library);
+    halide_assert(user_context, found_module && library != nullptr);
 
     d3d12_frame *frame = acquire_frame(device);
     d3d12_compute_command_list *cmdList = frame->cmd_list;
@@ -3036,7 +3019,7 @@ WEAK int halide_d3d12compute_run(void *user_context,
     d3d12_compute_pipeline_state *pipeline_state = nullptr;
     {
         TRACE_SCOPE("kernel shader selection");
-        function = new_function_with_name(device, state->library, entry_name, strlen(entry_name),
+        function = new_function_with_name(device, library, entry_name, strlen(entry_name),
                                           shared_mem_bytes, threadsX, threadsY, threadsZ);
         halide_assert(user_context, function);
         pipeline_state = function->pipeline_state;
@@ -3531,6 +3514,7 @@ WEAK const struct halide_device_interface_t *halide_d3d12compute_device_interfac
 namespace {
 WEAK __attribute__((destructor)) void halide_d3d12compute_cleanup() {
     TRACELOG;
+    compilation_cache.release_all(nullptr, release_object<d3d12_library>);
     halide_d3d12compute_device_release(nullptr);
 }
 }  // namespace
diff --git a/src/runtime/destructors.cpp b/src/runtime/destructors.cpp
index 1fa7228a86e2..b7187c6c4d86 100644
--- a/src/runtime/destructors.cpp
+++ b/src/runtime/destructors.cpp
@@ -1,4 +1,5 @@
 #include "HalideRuntime.h"
+#include "printer.h"
 
 extern "C" {
 
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
new file mode 100644
index 000000000000..9ef2662026a8
--- /dev/null
+++ b/src/runtime/gpu_context_common.h
@@ -0,0 +1,197 @@
+#include "printer.h"
+#include "scoped_mutex_lock.h"
+
+namespace Halide {
+namespace Internal {
+
+template<typename ContextT, typename ModuleStateT>
+class GPUCompilationCache {
+    struct CachedCompilation {
+        ContextT context{};
+        ModuleStateT module_state{};
+        uint32_t kernel_id{};
+    };
+
+    halide_mutex mutex;
+
+    static constexpr float kLoadFactor{.5f};
+    static constexpr int kInitialTableBits{7};
+    int log2_compilations_size{0};  // number of bits in index into compilations table.
+    CachedCompilation *compilations{nullptr};
+    int count{0};
+
+    static constexpr uint32_t kInvalidId{0};
+    static constexpr uint32_t kDeletedId{1};
+
+    uint32_t unique_id{2};  // zero is an invalid id
+
+public:
+    static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) {
+        uintptr_t addr = (uintptr_t)context + id;
+        // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...
+        // in hexadecimal.
+        if (sizeof(uintptr_t) >= 8) {
+            return (addr * (uintptr_t)0x9E3779B97F4A7C15) >> (64 - bits);
+        } else {
+            return (addr * (uintptr_t)0x9E3779B9) >> (32 - bits);
+        }
+    }
+
+    HALIDE_MUST_USE_RESULT bool insert(ContextT context, uint32_t id, ModuleStateT module_state) {
+        if (log2_compilations_size == 0) {
+            if (!resize_table(kInitialTableBits)) {
+                return false;
+            }
+        }
+        if ((count + 1) > (1 << log2_compilations_size) * kLoadFactor) {
+            if (!resize_table(log2_compilations_size + 1)) {
+                return false;
+            }
+        }
+        count += 1;
+        uintptr_t index = kernel_hash(context, id, log2_compilations_size);
+        for (int i = 0; i < (1 << log2_compilations_size); i++) {
+            uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
+            if (compilations[effective_index].kernel_id <= kDeletedId) {
+                compilations[effective_index].context = context;
+                compilations[effective_index].module_state = module_state;
+                compilations[effective_index].kernel_id = id;
+                return true;
+            }
+        }
+        // This is a logic error that should never occur. It means the table is
+        // full, but it should have been resized.
+        halide_assert(nullptr, false);
+        return false;
+    }
+
+    HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) {
+        if (log2_compilations_size == 0) {
+            return false;
+        }
+        uintptr_t index = kernel_hash(context, id, log2_compilations_size);
+        for (int i = 0; i < (1 << log2_compilations_size); i++) {
+            uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
+
+            if (compilations[effective_index].kernel_id == kInvalidId) {
+                return false;
+            }
+            if (compilations[effective_index].context == context &&
+                compilations[effective_index].kernel_id == id) {
+                module_state = &compilations[effective_index].module_state;
+                return true;
+            }
+        }
+        return false;
+    }
+
+    HALIDE_MUST_USE_RESULT bool lookup(ContextT context, void *state_ptr, ModuleStateT &module_state) {
+        ScopedMutexLock lock_guard(&mutex);
+        uint32_t id = (uint32_t)(uintptr_t)state_ptr;
+        ModuleStateT *mod_ptr;
+        if (find_internal(context, id, mod_ptr)) {
+            module_state = *mod_ptr;
+            return true;
+        }
+        return false;
+    }
+
+    HALIDE_MUST_USE_RESULT bool resize_table(int size_bits) {
+        if (size_bits != log2_compilations_size) {
+            int new_size = (1 << size_bits);
+            int old_size = (1 << log2_compilations_size);
+            CachedCompilation *new_table = (CachedCompilation *)malloc(new_size * sizeof(CachedCompilation));
+            if (new_table == nullptr) {
+                // signal error.
+                return false;
+            }
+            memset(new_table, 0, new_size * sizeof(CachedCompilation));
+            CachedCompilation *old_table = compilations;
+            compilations = new_table;
+            log2_compilations_size = size_bits;
+
+            if (count > 0) {  // Mainly to catch empty initial table case
+                for (int32_t i = 0; i < old_size; i++) {
+                    if (old_table[i].kernel_id != kInvalidId &&
+                        old_table[i].kernel_id != kDeletedId) {
+                        bool result = insert(old_table[i].context, old_table[i].kernel_id,
+                                             old_table[i].module_state);
+                        halide_assert(nullptr, result);  // Resizing the table while resizing the table is a logic error.
+                    }
+                }
+            }
+            free(old_table);
+        }
+        return true;
+    }
+
+    template<typename FreeModuleT>
+    void release_context(void *user_context, bool all, ContextT context, FreeModuleT &f) {
+        if (count == 0) {
+            return;
+        }
+
+        for (int i = 0; i < (1 << log2_compilations_size); i++) {
+            if (compilations[i].kernel_id > kInvalidId &&
+                (all || (compilations[i].context == context))) {
+                f(compilations[i].module_state);
+                compilations[i].module_state = nullptr;
+                compilations[i].kernel_id = kDeletedId;
+                count--;
+            }
+        }
+    }
+
+    template<typename FreeModuleT>
+    void delete_context(void *user_context, ContextT context, FreeModuleT &f) {
+        ScopedMutexLock lock_guard(&mutex);
+
+        release_context(user_context, false, context, f);
+    }
+
+    template<typename FreeModuleT>
+    void release_all(void *user_context, FreeModuleT &f) {
+        ScopedMutexLock lock_guard(&mutex);
+
+        release_context(user_context, true, nullptr, f);
+        free(compilations);
+        compilations = nullptr;
+        log2_compilations_size = 0;
+    }
+
+    template<typename CompileModuleT, typename... Args>
+    HALIDE_MUST_USE_RESULT bool kernel_state_setup(void *user_context, void **state_ptr,
+                                                   ContextT context, ModuleStateT &result,
+                                                   CompileModuleT f,
+                                                   Args... args) {
+        ScopedMutexLock lock_guard(&mutex);
+
+        uint32_t *id_ptr = (uint32_t *)state_ptr;
+        if (*id_ptr == 0) {
+            *id_ptr = unique_id++;
+        }
+
+        ModuleStateT *mod;
+        if (find_internal(context, *id_ptr, mod)) {
+            result = *mod;
+            return true;
+        }
+
+        // TODO(zvookin): figure out the calling signature here...
+        ModuleStateT compiled_module = f(args...);
+        debug(user_context) << "Caching compiled kernel: " << compiled_module << " id " << *id_ptr << " context " << context << "\n";
+        if (compiled_module == nullptr) {
+            return false;
+        }
+
+        if (!insert(context, *id_ptr, compiled_module)) {
+            return false;
+        }
+        result = compiled_module;
+
+        return true;
+    }
+};
+
+}  // namespace Internal
+}  // namespace Halide
diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp
index 1a2ba0ce52b4..25fe29f3feda 100644
--- a/src/runtime/metal.cpp
+++ b/src/runtime/metal.cpp
@@ -1,6 +1,7 @@
 #include "HalideRuntimeMetal.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
+#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -282,15 +283,7 @@ struct device_handle {
     uint64_t offset;
 };
 
-// Structure to hold the state of a module attached to the context.
-// Also used as a linked-list to keep track of all the different
-// modules that are attached to a context in order to release them all
-// when then context is released.
-struct module_state {
-    mtl_library *library;
-    module_state *next;
-};
-WEAK module_state *state_list = nullptr;
+WEAK Halide::Internal::GPUCompilationCache<mtl_device *, mtl_library *> compilation_cache;
 
 // API Capabilities.  If more capabilities need to be checked,
 // this can be refactored to something more robust/general.
@@ -544,18 +537,6 @@ WEAK int halide_metal_device_free(void *user_context, halide_buffer_t *buf) {
 }
 
 WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) {
-    // Create the state object if necessary. This only happens once, regardless
-    // of how many times halide_initialize_kernels/halide_release is called.
-    // halide_release traverses this list and releases the module objects, but
-    // it does not modify the list nodes created/inserted here.
-    module_state **state = (module_state **)state_ptr;
-    if (!(*state)) {
-        *state = (module_state *)malloc(sizeof(module_state));
-        (*state)->library = nullptr;
-        (*state)->next = state_list;
-        state_list = *state;
-    }
-
     MetalContextHolder metal_context(user_context, true);
     if (metal_context.error != 0) {
         return metal_context.error;
@@ -565,23 +546,13 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    if ((*state)->library == nullptr) {
-#ifdef DEBUG_RUNTIME
-        uint64_t t_before_compile = halide_current_time_ns(user_context);
-#endif
-
-        debug(user_context) << "Metal - Allocating: new_library_with_source " << (*state)->library << "\n";
-        (*state)->library = new_library_with_source(metal_context.device, source, source_size);
-        if ((*state)->library == nullptr) {
-            error(user_context) << "Metal: new_library_with_source failed.\n";
-            return -1;
-        }
-
-#ifdef DEBUG_RUNTIME
-        uint64_t t_after_compile = halide_current_time_ns(user_context);
-        debug(user_context) << "Time for halide_metal_initialize_kernels compilation: " << (t_after_compile - t_before_compile) / 1.0e6 << " ms\n";
-#endif
+    mtl_library *library;
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, metal_context.device, library,
+                                              new_library_with_source, metal_context.device,
+                                              source, source_size)) {
+        return halide_error_code_generic_error;
     }
+    halide_assert(user_context, library != nullptr);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -644,20 +615,7 @@ WEAK int halide_metal_device_release(void *user_context) {
     if (device) {
         halide_metal_device_sync_internal(queue, nullptr);
 
-        // Unload the modules attached to this device. Note that the list
-        // nodes themselves are not freed, only the program objects are
-        // released. Subsequent calls to halide_init_kernels might re-create
-        // the program object using the same list node to store the program
-        // object.
-        module_state *state = state_list;
-        while (state) {
-            if (state->library) {
-                debug(user_context) << "Metal - Releasing: new_library_with_source " << state->library << "\n";
-                release_ns_object(state->library);
-                state->library = nullptr;
-            }
-            state = state->next;
-        }
+        compilation_cache.delete_context(user_context, device, release_ns_object);
 
         // Release the device itself, if we created it.
         if (acquired_device == device) {
@@ -782,10 +740,11 @@ WEAK int halide_metal_run(void *user_context,
         return -1;
     }
 
-    halide_assert(user_context, state_ptr);
-    module_state *state = (module_state *)state_ptr;
+    mtl_library *library;
+    bool found_library = compilation_cache.lookup(metal_context.device, state_ptr, library);
+    halide_assert(user_context, found_library && library != nullptr);
 
-    mtl_function *function = new_function_with_name(state->library, entry_name, strlen(entry_name));
+    mtl_function *function = new_function_with_name(library, entry_name, strlen(entry_name));
     if (function == nullptr) {
         error(user_context) << "Metal: Could not get function " << entry_name << "from Metal library.\n";
         return -1;
@@ -1147,6 +1106,7 @@ WEAK const struct halide_device_interface_t *halide_metal_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_metal_cleanup() {
+    compilation_cache.release_all(nullptr, release_ns_object);
     halide_metal_device_release(nullptr);
 }
 }  // namespace
diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp
index 7c10033bde31..7c7815b620d7 100644
--- a/src/runtime/opencl.cpp
+++ b/src/runtime/opencl.cpp
@@ -1,6 +1,7 @@
 #include "HalideRuntimeOpenCL.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
+#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -286,15 +287,7 @@ struct device_handle {
     cl_mem mem;
 };
 
-// Structure to hold the state of a module attached to the context.
-// Also used as a linked-list to keep track of all the different
-// modules that are attached to a context in order to release them all
-// when then context is released.
-struct module_state {
-    cl_program program;
-    module_state *next;
-};
-WEAK module_state *state_list = nullptr;
+WEAK Halide::Internal::GPUCompilationCache<cl_context, cl_program> compilation_cache;
 
 WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) {
     if (buf->device == 0) {
@@ -556,6 +549,83 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
     return err;
 }
 
+WEAK cl_program compile_kernel(void *user_context, cl_context ctx, const char *src, int size) {
+    cl_int err = 0;
+    cl_device_id dev;
+
+    err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr);
+    if (err != CL_SUCCESS) {
+        error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
+                            << get_opencl_error_name(err);
+        return nullptr;
+    }
+
+    cl_device_id devices[] = {dev};
+
+    // Get the max constant buffer size supported by this OpenCL implementation.
+    cl_ulong max_constant_buffer_size = 0;
+    err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr);
+    if (err != CL_SUCCESS) {
+        error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: "
+                            << get_opencl_error_name(err);
+        return nullptr;
+    }
+    // Get the max number of constant arguments supported by this OpenCL implementation.
+    cl_uint max_constant_args = 0;
+    err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr);
+    if (err != CL_SUCCESS) {
+        error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: "
+                            << get_opencl_error_name(err);
+        return nullptr;
+    }
+
+    // Build the compile argument options.
+    stringstream options(user_context);
+    options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size
+            << " -D MAX_CONSTANT_ARGS=" << max_constant_args;
+
+    const char *extra_options = halide_opencl_get_build_options(user_context);
+    options << " " << extra_options;
+
+    const char *sources[] = {src};
+    debug(user_context) << "    clCreateProgramWithSource -> ";
+    cl_program program = clCreateProgramWithSource(ctx, 1, &sources[0], nullptr, &err);
+    if (err != CL_SUCCESS) {
+        debug(user_context) << get_opencl_error_name(err) << "\n";
+        error(user_context) << "CL: clCreateProgramWithSource failed: "
+                            << get_opencl_error_name(err);
+        return nullptr;
+    } else {
+        debug(user_context) << (void *)program << "\n";
+    }
+
+    debug(user_context) << "    clBuildProgram " << (void *)program
+                        << " " << options.str() << "\n";
+    err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr);
+    if (err != CL_SUCCESS) {
+
+        {
+            // Allocate an appropriately sized buffer for the build log.
+            Printer<ErrorPrinter, 16384> p(user_context);
+
+            p << "CL: clBuildProgram failed: "
+              << get_opencl_error_name(err)
+              << "\nBuild Log:\n";
+
+            // Get build log
+            if (clGetProgramBuildInfo(program, dev,
+                                      CL_PROGRAM_BUILD_LOG,
+                                      p.capacity() - p.size() - 1, p.dst,
+                                      nullptr) != CL_SUCCESS) {
+                p << "clGetProgramBuildInfo failed (Printer buffer too small?)";
+            }
+        }
+
+        return nullptr;
+    }
+    return program;
+}
+
 }  // namespace OpenCL
 }  // namespace Internal
 }  // namespace Runtime
@@ -675,97 +745,13 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr,
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    // Create the state object if necessary. This only happens once, regardless
-    // of how many times halide_init_kernels/halide_release is called.
-    // halide_release traverses this list and releases the program objects, but
-    // it does not modify the list nodes created/inserted here.
-    module_state **state = (module_state **)state_ptr;
-    if (!(*state)) {
-        *state = (module_state *)malloc(sizeof(module_state));
-        (*state)->program = nullptr;
-        (*state)->next = state_list;
-        state_list = *state;
-    }
-
-    // Create the program if necessary. TODO: The program object needs to not
-    // only already exist, but be created for the same context/device as the
-    // calling context/device.
-    if (!(*state && (*state)->program) && size > 1) {
-        cl_int err = 0;
-        cl_device_id dev;
-
-        err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr);
-        if (err != CL_SUCCESS) {
-            error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
-                                << get_opencl_error_name(err);
-            return err;
-        }
-
-        cl_device_id devices[] = {dev};
-
-        // Get the max constant buffer size supported by this OpenCL implementation.
-        cl_ulong max_constant_buffer_size = 0;
-        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr);
-        if (err != CL_SUCCESS) {
-            error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: "
-                                << get_opencl_error_name(err);
-            return err;
-        }
-        // Get the max number of constant arguments supported by this OpenCL implementation.
-        cl_uint max_constant_args = 0;
-        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr);
-        if (err != CL_SUCCESS) {
-            error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: "
-                                << get_opencl_error_name(err);
-            return err;
-        }
-
-        // Build the compile argument options.
-        stringstream options(user_context);
-        options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size
-                << " -D MAX_CONSTANT_ARGS=" << max_constant_args;
-
-        const char *extra_options = halide_opencl_get_build_options(user_context);
-        options << " " << extra_options;
-
-        const char *sources[] = {src};
-        debug(user_context) << "    clCreateProgramWithSource -> ";
-        cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], nullptr, &err);
-        if (err != CL_SUCCESS) {
-            debug(user_context) << get_opencl_error_name(err) << "\n";
-            error(user_context) << "CL: clCreateProgramWithSource failed: "
-                                << get_opencl_error_name(err);
-            return err;
-        } else {
-            debug(user_context) << (void *)program << "\n";
-        }
-
-        (*state)->program = program;
-        debug(user_context) << "    clBuildProgram " << (void *)program
-                            << " " << options.str() << "\n";
-        err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr);
-        if (err != CL_SUCCESS) {
-
-            {
-                // Allocate an appropriately sized buffer for the build log.
-                Printer<ErrorPrinter, 16384> p(user_context);
-
-                p << "CL: clBuildProgram failed: "
-                  << get_opencl_error_name(err)
-                  << "\nBuild Log:\n";
-
-                // Get build log
-                if (clGetProgramBuildInfo(program, dev,
-                                          CL_PROGRAM_BUILD_LOG,
-                                          p.capacity() - p.size() - 1, p.dst,
-                                          nullptr) != CL_SUCCESS) {
-                    p << "clGetProgramBuildInfo failed (Printer buffer too small?)";
-                }
-            }
-
-            return err;
-        }
+    debug(user_context) << "halide_cuda_initialize_kernels got compilation_cache mutex.\n";
+    cl_program program;
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, program,
+                                              compile_kernel, user_context, ctx.context, src, size)) {
+        return halide_error_code_generic_error;
     }
+    halide_assert(user_context, program != nullptr);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -820,21 +806,7 @@ WEAK int halide_opencl_device_release(void *user_context) {
         err = clFinish(q);
         halide_assert(user_context, err == CL_SUCCESS);
 
-        // Unload the modules attached to this context. Note that the list
-        // nodes themselves are not freed, only the program objects are
-        // released. Subsequent calls to halide_init_kernels might re-create
-        // the program object using the same list node to store the program
-        // object.
-        module_state *state = state_list;
-        while (state) {
-            if (state->program) {
-                debug(user_context) << "    clReleaseProgram " << state->program << "\n";
-                err = clReleaseProgram(state->program);
-                halide_assert(user_context, err == CL_SUCCESS);
-                state->program = nullptr;
-            }
-            state = state->next;
-        }
+        compilation_cache.delete_context(user_context, ctx, clReleaseProgram);
 
         // Release the context itself, if we created it.
         if (ctx == context) {
@@ -1077,9 +1049,10 @@ WEAK int halide_opencl_run(void *user_context,
 
     // Create kernel object for entry_name from the program for this module.
     halide_assert(user_context, state_ptr);
-    cl_program program = ((module_state *)state_ptr)->program;
+    cl_program program;
+    bool found_program = compilation_cache.lookup(ctx.context, state_ptr, program);
 
-    halide_assert(user_context, program);
+    halide_assert(user_context, found_program && program != nullptr);
     debug(user_context) << "    clCreateKernel " << entry_name << " -> ";
     cl_kernel f = clCreateKernel(program, entry_name, &err);
     if (err != CL_SUCCESS) {
@@ -1366,6 +1339,7 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_opencl_cleanup() {
+    compilation_cache.release_all(nullptr, clReleaseProgram);
     halide_opencl_device_release(nullptr);
 }
 }  // namespace
@@ -1940,4 +1914,4 @@ extern "C" {
 WEAK const struct halide_device_interface_t *halide_opencl_image_device_interface() {
     return &opencl_image_device_interface;
 }
-}
\ No newline at end of file
+}
diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
new file mode 100644
index 000000000000..4816ad205866
--- /dev/null
+++ b/test/common/gpu_context.h
@@ -0,0 +1,134 @@
+#if defined(TEST_OPENCL)
+// Implement OpenCL custom context.
+
+#define CL_TARGET_OPENCL_VERSION 120
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+// Just use a global context and queue, created and destroyed by main.
+cl_context cl_ctx = nullptr;
+cl_command_queue cl_q = nullptr;
+
+// Create the global context. This is just a helper function not called by Halide.
+bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) {
+    cl_int err = 0;
+
+    const cl_uint maxPlatforms = 4;
+    cl_platform_id platforms[maxPlatforms];
+    cl_uint platformCount = 0;
+
+    err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
+    if (err != CL_SUCCESS) {
+        printf("clGetPlatformIDs failed (%d)\n", err);
+        return false;
+    }
+
+    cl_platform_id platform = nullptr;
+
+    if (platformCount > 0) {
+        platform = platforms[0];
+    }
+    if (platform == nullptr) {
+        printf("Failed to get platform\n");
+        return false;
+    }
+
+    cl_device_type device_type = CL_DEVICE_TYPE_ALL;
+
+    // Make sure we have a device
+    const cl_uint maxDevices = 4;
+    cl_device_id devices[maxDevices];
+    cl_uint deviceCount = 0;
+    err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
+    if (err != CL_SUCCESS) {
+        printf("clGetDeviceIDs failed (%d)\n", err);
+        return false;
+    }
+    if (deviceCount == 0) {
+        printf("Failed to get device\n");
+        return false;
+    }
+
+    cl_device_id dev = devices[deviceCount - 1];
+
+    // Create context and command queue.
+    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
+                                          0};
+    cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
+    if (err != CL_SUCCESS) {
+        printf("clCreateContext failed (%d)\n", err);
+        return false;
+    }
+
+    cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf("clCreateCommandQueue failed (%d)\n", err);
+        return false;
+    }
+    return true;
+}
+
+void destroy_opencl_context(cl_context cl_ctx, cl_command_queue cl_q) {
+    clReleaseCommandQueue(cl_q);
+    clReleaseContext(cl_ctx);
+}
+
+#elif defined(TEST_CUDA)
+// Implement CUDA custom context.
+#include <cuda.h>
+
+bool create_cuda_context(CUcontext &cuda_ctx) {
+    // Initialize CUDA
+    CUresult err = cuInit(0);
+    if (err != CUDA_SUCCESS) {
+        printf("cuInit failed (%d)\n", err);
+        return false;
+    }
+
+    // Make sure we have a device
+    int deviceCount = 0;
+    err = cuDeviceGetCount(&deviceCount);
+    if (err != CUDA_SUCCESS) {
+        printf("cuGetDeviceCount failed (%d)\n", err);
+        return false;
+    }
+    if (deviceCount <= 0) {
+        printf("No CUDA devices available\n");
+        return false;
+    }
+
+    CUdevice dev;
+    // Get device
+    CUresult status;
+    // Try to get a device >0 first, since 0 should be our display device
+    // For now, don't try devices > 2 to maintain compatibility with previous behavior.
+    if (deviceCount > 2) deviceCount = 2;
+    for (int id = deviceCount - 1; id >= 0; id--) {
+        status = cuDeviceGet(&dev, id);
+        if (status == CUDA_SUCCESS) break;
+    }
+
+    if (status != CUDA_SUCCESS) {
+        printf("Failed to get CUDA device\n");
+        return status;
+    }
+
+    // Create context
+    err = cuCtxCreate(&cuda_ctx, 0, dev);
+    if (err != CUDA_SUCCESS) {
+        printf("cuCtxCreate failed (%d)\n", err);
+        return false;
+    }
+
+    return true;
+}
+
+void destroy_cuda_context(CUcontext cuda_ctx) {
+    cuCtxDestroy(cuda_ctx);
+}
+
+#endif
diff --git a/test/common/gpu_object_lifetime_tracker.h b/test/common/gpu_object_lifetime_tracker.h
index b17dd9118413..1734a59f2f00 100644
--- a/test/common/gpu_object_lifetime_tracker.h
+++ b/test/common/gpu_object_lifetime_tracker.h
@@ -22,25 +22,24 @@ class GpuObjectLifetimeTracker {
         }
     };
 
-    std::array<ObjectType, 13> object_types = {{
+    std::array<ObjectType, 11> object_types = {{
+        {"Caching compiled kernel:", "Releasing cached compilation:"},
+
         // OpenCL objects
         {"clCreateContext", "clReleaseContext", true},
         {"clCreateCommandQueue", "clReleaseCommandQueue", true},
         // This handles both "clCreateProgramWithSource" and
         // "clCreateProgramWithBinary".
-        {"clCreateProgram", "clReleaseProgram"},
         {"clCreateBuffer", "clReleaseMemObject"},
         {"clCreateKernel", "clReleaseKernel"},
 
         // CUDA objects
         {"cuCtxCreate", "cuCtxDestroy", true},
-        {"cuModuleLoad", "cuModuleUnload"},
         {"cuMemAlloc", "cuMemFree"},
 
         // Metal objects
         {"Allocating: MTLCreateSystemDefaultDevice", "Releasing: MTLCreateSystemDefaultDevice", true},
         {"Allocating: new_command_queue", "Releasing: new_command_queue"},
-        {"Allocating: new_library_with_source", "Releasing: new_library_with_source"},
 
         // Hexagon objects
         {"halide_remote_load_library", "halide_remote_release_library"},
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index e1ba9d8e4eb3..68692f12f1ba 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -134,6 +134,7 @@ tests(GROUPS correctness
       gpu_give_input_buffers_device_allocations.cpp
       gpu_jit_explicit_copy_to_device.cpp
       gpu_large_alloc.cpp
+      gpu_many_kernels.cpp
       gpu_mixed_dimensionality.cpp
       gpu_mixed_shared_mem_types.cpp
       gpu_multi_kernel.cpp
diff --git a/test/correctness/gpu_many_kernels.cpp b/test/correctness/gpu_many_kernels.cpp
new file mode 100644
index 000000000000..d9572b38c18c
--- /dev/null
+++ b/test/correctness/gpu_many_kernels.cpp
@@ -0,0 +1,92 @@
+#include "Halide.h"
+#include <algorithm>
+
+#include "halide_benchmark.h"
+
+// This test makes sure GPU runtimes can handle many different small
+// kernels and can handle releasing a device context and making a new
+// one and still have many kernels work. This is needed due to kernel
+// compilation caching mechanisms in the GPU runtimes.
+
+using namespace Halide;
+
+constexpr size_t kNumKernels = 70;
+
+int main(int argc, char **argv) {
+    Var x, y, xi, yi;
+    Func adders[kNumKernels];
+    ImageParam input(Int(32), 2);
+
+    Target target = get_jit_target_from_environment();
+    int i = 1;
+    for (Func &f : adders) {
+        f(x, y) = input(x, y) + i;
+        if (target.has_gpu_feature()) {
+            f.compute_root().gpu_tile(x, y, xi, yi, 16, 16);
+        } else {
+            f.compute_root().vectorize(x, target.natural_vector_size<int32_t>());
+        }
+        i += 1;
+    }
+
+    auto start = Halide::Tools::benchmark_now();
+
+    Buffer<int32_t> buf_a_store(32, 32);
+    Buffer<int32_t> buf_b_store(32, 32);
+    Buffer<int32_t> *buf_in = &buf_a_store;
+    Buffer<int32_t> *buf_out = &buf_b_store;
+    buf_in->fill(0);
+    for (Func &f : adders) {
+        input.set(*buf_in);
+        f.realize(*buf_out);
+        std::swap(buf_in, buf_out);
+    }
+    buf_in->copy_to_host();
+
+    auto end = Halide::Tools::benchmark_now();
+    double initial_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
+
+    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
+
+    start = Halide::Tools::benchmark_now();
+
+    buf_in->fill(0);
+    for (Func &f : adders) {
+        input.set(*buf_in);
+        f.realize(*buf_out);
+        std::swap(buf_in, buf_out);
+    }
+    buf_in->copy_to_host();
+
+    end = Halide::Tools::benchmark_now();
+    double precompiled_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
+
+    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
+
+    buf_a_store.device_free();
+    buf_b_store.device_free();
+    const halide_device_interface_t *device = get_device_interface_for_device_api(DeviceAPI::Default_GPU, target);
+    if (device != nullptr) {
+        device->device_release(nullptr, device);
+    }
+
+    start = Halide::Tools::benchmark_now();
+
+    buf_in->fill(0);
+    for (Func &f : adders) {
+        input.set(*buf_in);
+        f.realize(*buf_out);
+        std::swap(buf_in, buf_out);
+    }
+    buf_in->copy_to_host();
+
+    end = Halide::Tools::benchmark_now();
+    double second_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
+
+    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
+
+    printf("Initial runtime %f, precompiled runtime %f, second runtime %f.\n", initial_runtime, precompiled_runtime, second_runtime);
+
+    printf("Success!\n");
+    return 0;
+}
diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt
index 2db1d3bc4f20..130af49b7f93 100644
--- a/test/generator/CMakeLists.txt
+++ b/test/generator/CMakeLists.txt
@@ -257,6 +257,19 @@ halide_define_aot_test(external_code GEN_DEPS external_code_generator_deps)
 # float16_t_generator.cpp
 halide_define_aot_test(float16_t)
 
+# gpu_multi_context_threaded_aottest.cpp
+# gpu_multi_context_threaded_generator.cpp
+halide_define_aot_test(gpu_multi_context_threaded
+                       OMIT_DEFAULT_GENERATOR
+                       EXTRA_LIBS
+                       gpu_multi_context_threaded_add
+                       gpu_multi_context_threaded_mul)
+
+add_halide_library(gpu_multi_context_threaded_add FROM gpu_multi_context_threaded.generator 
+                   FEATURES user_context)
+add_halide_library(gpu_multi_context_threaded_mul FROM gpu_multi_context_threaded.generator
+                   FEATURES user_context)
+
 # gpu_object_lifetime_aottest.cpp
 # gpu_object_lifetime_generator.cpp
 halide_define_aot_test(gpu_object_lifetime FEATURES debug)
diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp
index f0fd8ef9cd04..9e4fba70afe6 100644
--- a/test/generator/acquire_release_aottest.cpp
+++ b/test/generator/acquire_release_aottest.cpp
@@ -14,89 +14,25 @@ int main(int argc, char **argv) {
 #include <string.h>
 
 #include "acquire_release.h"
+#include "gpu_context.h"
 
 using namespace Halide::Runtime;
 
 const int W = 256, H = 256;
 
 #if defined(TEST_OPENCL)
-// Implement OpenCL custom context.
-
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
 
 // Just use a global context and queue, created and destroyed by main.
 cl_context cl_ctx = nullptr;
 cl_command_queue cl_q = nullptr;
 
 // Create the global context. This is just a helper function not called by Halide.
-int init_context() {
-    cl_int err = 0;
-
-    const cl_uint maxPlatforms = 4;
-    cl_platform_id platforms[maxPlatforms];
-    cl_uint platformCount = 0;
-
-    err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
-    if (err != CL_SUCCESS) {
-        printf("clGetPlatformIDs failed (%d)\n", err);
-        return err;
-    }
-
-    cl_platform_id platform = nullptr;
-
-    if (platformCount > 0) {
-        platform = platforms[0];
-    }
-    if (platform == nullptr) {
-        printf("Failed to get platform\n");
-        return CL_INVALID_PLATFORM;
-    }
-
-    cl_device_type device_type = CL_DEVICE_TYPE_ALL;
-
-    // Make sure we have a device
-    const cl_uint maxDevices = 4;
-    cl_device_id devices[maxDevices];
-    cl_uint deviceCount = 0;
-    err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
-    if (err != CL_SUCCESS) {
-        printf("clGetDeviceIDs failed (%d)\n", err);
-        return err;
-    }
-    if (deviceCount == 0) {
-        printf("Failed to get device\n");
-        return CL_DEVICE_NOT_FOUND;
-    }
-
-    cl_device_id dev = devices[deviceCount - 1];
-
-    // Create context and command queue.
-    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
-                                          0};
-    cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
-    if (err != CL_SUCCESS) {
-        printf("clCreateContext failed (%d)\n", err);
-        return err;
-    }
-
-    cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
-    if (err != CL_SUCCESS) {
-        printf("clCreateCommandQueue failed (%d)\n", err);
-        return err;
-    }
-    printf("Created CL context %p\n", cl_ctx);
-    return 0;
+bool init_context() {
+    return create_opencl_context(cl_ctx, cl_q);
 }
 
 void destroy_context() {
-    printf("Destroying CL context %p\n", cl_ctx);
-    clReleaseCommandQueue(cl_q);
-    clReleaseContext(cl_ctx);
+    destroy_opencl_context(cl_ctx, cl_q);
     cl_q = nullptr;
     cl_ctx = nullptr;
 }
@@ -116,61 +52,14 @@ extern "C" int halide_release_cl_context(void *user_context) {
     return 0;
 }
 #elif defined(TEST_CUDA)
-// Implement CUDA custom context.
-#include <cuda.h>
-
 CUcontext cuda_ctx = nullptr;
 
-int init_context() {
-    // Initialize CUDA
-    CUresult err = cuInit(0);
-    if (err != CUDA_SUCCESS) {
-        printf("cuInit failed (%d)\n", err);
-        return err;
-    }
-
-    // Make sure we have a device
-    int deviceCount = 0;
-    err = cuDeviceGetCount(&deviceCount);
-    if (err != CUDA_SUCCESS) {
-        printf("cuGetDeviceCount failed (%d)\n", err);
-        return err;
-    }
-    if (deviceCount <= 0) {
-        printf("No CUDA devices available\n");
-        return CUDA_ERROR_NO_DEVICE;
-    }
-
-    CUdevice dev;
-    // Get device
-    CUresult status;
-    // Try to get a device >0 first, since 0 should be our display device
-    // For now, don't try devices > 2 to maintain compatibility with previous behavior.
-    if (deviceCount > 2) deviceCount = 2;
-    for (int id = deviceCount - 1; id >= 0; id--) {
-        status = cuDeviceGet(&dev, id);
-        if (status == CUDA_SUCCESS) break;
-    }
-
-    if (status != CUDA_SUCCESS) {
-        printf("Failed to get CUDA device\n");
-        return status;
-    }
-
-    // Create context
-    err = cuCtxCreate(&cuda_ctx, 0, dev);
-    if (err != CUDA_SUCCESS) {
-        printf("cuCtxCreate failed (%d)\n", err);
-        return err;
-    }
-    printf("Created CUDA context %p\n", cuda_ctx);
-
-    return 0;
+bool init_context() {
+    return create_cuda_context(cuda_ctx);
 }
 
 void destroy_context() {
-    printf("Destroying CUDA context %p\n", cuda_ctx);
-    cuCtxDestroy(cuda_ctx);
+    destroy_cuda_context(cuda_ctx);
     cuda_ctx = nullptr;
 }
 
@@ -189,19 +78,18 @@ extern "C" int halide_cuda_release_context(void *user_context) {
 }
 #else
 // Just use the default implementation of acquire/release.
-int init_context() {
+bool init_context() {
     printf("Using default implementation of acquire/release\n");
-    return 0;
+    return true;
 }
 void destroy_context() {
 }
 #endif
 
-int main(int argc, char **argv) {
+bool run_test() {
     // Initialize the runtime specific GPU context.
-    int ret = init_context();
-    if (ret != 0) {
-        return ret;
+    if (!init_context()) {
+        return false;
     }
 
     // Everything else is a normal Halide program. The GPU runtime will call
@@ -227,19 +115,40 @@ int main(int argc, char **argv) {
             if (input(x, y) * 2.0f + 1.0f != output(x, y)) {
                 printf("Error at (%d, %d): %f != %f\n", x, y, input(x, y) * 2.0f + 1.0f,
                        output(x, y));
-                return -1;
+                return false;
             }
         }
     }
 
+    const halide_device_interface_t *interface = output.raw_buffer()->device_interface;
+
     // We need to free our GPU buffers before destroying the context.
     input.device_free();
     output.device_free();
 
+    if (interface != nullptr) {
+        halide_device_release(nullptr, interface);
+    } else {
+        printf("Device interface is nullptr.\n");
+        return false;
+    }
+
     // Free the context we created.
     destroy_context();
 
     printf("Success!\n");
+    return true;
+}
+
+int main(int argc, char **argv) {
+    if (!run_test()) {
+        return -1;
+    }
+
+    if (!run_test()) {
+        return -1;
+    }
+
     return 0;
 }
 
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
new file mode 100644
index 000000000000..05852909ba57
--- /dev/null
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -0,0 +1,193 @@
+#include <stdio.h>
+
+// This test demonstrates how to use more than one GPU context with
+// Halide generated GPU support, specifically in a multithreaded
+// program. It of course also tests that this works correctly with the
+// Halide GPU runtimes.
+
+#ifdef _WIN32
+int main(int argc, char **argv) {
+    printf("[SKIP] Test requires weak linkage, which is not available on Windows.\n");
+    return 0;
+}
+#else
+
+#include "HalideBuffer.h"
+#include "HalideRuntime.h"
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+#include <thread>
+
+#include "gpu_context.h"
+
+#include "gpu_multi_context_threaded_add.h"
+#include "gpu_multi_context_threaded_mul.h"
+
+using namespace Halide::Runtime;
+
+const int W = 32, H = 32;
+
+#if defined(TEST_OPENCL)
+
+struct gpu_context {
+    cl_context cl_ctx;
+    cl_command_queue cl_q;
+};
+
+// Create the global context. This is just a helper function not called by Halide.
+bool init_context(gpu_context &context) {
+    return create_opencl_context(context.cl_ctx, context.cl_q);
+}
+
+void destroy_context(gpu_context &context) {
+    destroy_opencl_context(context.cl_ctx, context.cl_q);
+    cl_q = nullptr;
+    cl_ctx = nullptr;
+}
+
+// These functions replace the acquire/release implementation in src/runtime/opencl.cpp.
+// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization
+// in our implementation of these functions.
+extern "C" int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl_command_queue *q, bool create) {
+    if (user_context == nullptr) {
+        assert(!create);
+        *ctx = nullptr;
+        *q = nullptr;
+    } else {
+        const gpu_context *context = (const gpu_context *)user_context;
+        *ctx = context->cl_ctx;
+        *q = context->cl_q;
+    }
+    return 0;
+}
+
+extern "C" int halide_release_cl_context(void *user_context) {
+    return 0;
+}
+
+#define HAS_MULTIPLE_CONTEXTS true
+#elif defined(TEST_CUDA)
+
+typedef CUcontext gpu_context;
+
+bool init_context(CUcontext &cuda_ctx) {
+    return create_cuda_context(cuda_ctx);
+}
+
+void destroy_context(CUcontext &cuda_ctx) {
+    destroy_cuda_context(cuda_ctx);
+    cuda_ctx = nullptr;
+}
+
+// These functions replace the acquire/release implementation in src/runtime/cuda.cpp.
+// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization
+// in our implementation of these functions.
+extern "C" int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create) {
+    if (user_context == nullptr) {
+        assert(!create);
+        *ctx = nullptr;
+    } else {
+        *ctx = *(CUcontext *)user_context;
+    }
+    return 0;
+}
+
+extern "C" int halide_cuda_release_context(void *user_context) {
+    return 0;
+}
+
+#define HAS_MULTIPLE_CONTEXTS true
+#else
+typedef int gpu_context;
+
+// Just use the default implementation of acquire/release.
+bool init_context(int &context) {
+    printf("Using default implementation of acquire/release\n");
+    context = 0;
+    return true;
+}
+void destroy_context(int & /* context */) {
+
+#define HAS_MULTIPLE_CONTEXTS false
+#endif
+
+void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) {
+    gpu_context context2;
+
+    Buffer<int32_t> buf1_in(W, H);
+    Buffer<int32_t> buf1_result(W, H);
+    buf1_in.fill(0);
+
+    const halide_device_interface_t *device_interface;
+
+    int val = 0;
+    for (int i = 0; i < 10; i++) {
+        init_context(context2);
+
+        Buffer<int32_t> buf2_in(W, H);
+        Buffer<int32_t> buf2_result(W, H);
+        buf2_in.fill(0);
+
+        gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result);
+        gpu_multi_context_threaded_mul(&context1, buf1_result, buf1_in);
+        gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result);
+
+        gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result);
+        gpu_multi_context_threaded_mul(&context2, buf2_result, buf2_in);
+        gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result);
+
+        buf1_result.copy_to_host(&context1);
+        buf2_result.copy_to_host(&context2);
+
+        val += 2;
+        val *= 2;
+        assert(buf1_result.all_equal(val + 2));
+        assert(buf2_result.all_equal(6));
+
+        device_interface = buf1_result.raw_buffer()->device_interface;
+
+        // About to destroy context, so ensure allocations are freed first.
+        buf2_in.device_free(&context2);
+        buf2_result.device_free(&context2);
+
+        if (device_interface != nullptr) {
+            halide_device_release(&context2, device_interface);
+        }
+        destroy_context(context2);
+    }
+
+    // About to destroy context, so ensure allocations are freed first.
+    buf1_in.device_free(&context1);
+    buf1_result.device_free(&context1);
+
+    if (destroy_when_done && device_interface != nullptr) {
+        halide_device_release(&context1, device_interface);
+        destroy_context(context1);
+    }
+}
+
+int main(int argc, char **argv) {
+    gpu_context contexta;
+    init_context(contexta);
+
+    gpu_context contextb;
+    init_context(contextb);
+
+    std::thread thread1(run_kernels_on_thread, contexta, false);
+    std::thread thread2(run_kernels_on_thread, contextb, false);
+
+    thread1.join();
+    thread2.join();
+
+    // Make sure using the same context on different threads works.
+    std::thread thread3(run_kernels_on_thread, contexta, HAS_MULTIPLE_CONTEXTS);
+    std::thread thread4(run_kernels_on_thread, contextb, HAS_MULTIPLE_CONTEXTS);
+
+    thread3.join();
+    thread4.join();
+
+    printf("Success!\n");
+    return 0;
+}
+#endif  // !WIN32
diff --git a/test/generator/gpu_multi_context_threaded_generator.cpp b/test/generator/gpu_multi_context_threaded_generator.cpp
new file mode 100644
index 000000000000..42f278b6379b
--- /dev/null
+++ b/test/generator/gpu_multi_context_threaded_generator.cpp
@@ -0,0 +1,48 @@
+#include "Halide.h"
+
+namespace {
+
+class GpuAdd : public Halide::Generator<GpuAdd> {
+public:
+    Input<Buffer<int32_t>> input{"input", 2};
+
+    Output<Buffer<int32_t>> output{"output", 2};
+
+    void generate() {
+        Var x("x"), y("y");
+
+        // Create a simple pipeline that scales pixel values by 2.
+        output(x, y) = input(x, y) + 2;
+
+        Target target = get_target();
+        if (target.has_gpu_feature()) {
+            Var xo, yo, xi, yi;
+            output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
+        }
+    }
+};
+
+class GpuMul : public Halide::Generator<GpuMul> {
+public:
+    Input<Buffer<int32_t>> input{"input", 2};
+
+    Output<Buffer<int32_t>> output{"output", 2};
+
+    void generate() {
+        Var x("x"), y("y");
+
+        // Create a simple pipeline that scales pixel values by 2.
+        output(x, y) = input(x, y) * 2;
+
+        Target target = get_target();
+        if (target.has_gpu_feature()) {
+            Var xo, yo, xi, yi;
+            output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
+        }
+    }
+};
+
+}  // namespace
+
+HALIDE_REGISTER_GENERATOR(GpuAdd, gpu_multi_context_threaded_add)
+HALIDE_REGISTER_GENERATOR(GpuMul, gpu_multi_context_threaded_mul)

From f8df8eb8ba3a16a935b552ff19dca411be912309 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Mon, 7 Dec 2020 13:01:29 -0800
Subject: [PATCH 02/24] Revert "Revert "Fix broken destroy_context() in
 gpu_multi_context_threaded_aottest.cpp (#5512)" (#5514)"

This reverts commit 2c8e3ea87f2ade9bafd4b498dd95df498fb17827.
---
 test/generator/gpu_multi_context_threaded_aottest.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index 05852909ba57..fdaab364b7c9 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -107,7 +107,9 @@ bool init_context(int &context) {
     context = 0;
     return true;
 }
-void destroy_context(int & /* context */) {
+void destroy_context(int &context) {
+    context = 0;
+}
 
 #define HAS_MULTIPLE_CONTEXTS false
 #endif

From 805e14ba98f4898f3ea85c5f6bac6c7037e20919 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Mon, 7 Dec 2020 20:04:49 -0800
Subject: [PATCH 03/24] Solve the COMDAT in runtime failing on Mac OS X problem
 once and for all by removing Comdat IR annotations in runtime on Mac OS and
 iOS.

---
 src/LLVM_Runtime_Linker.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp
index 9637f0c2ee91..042a3543ba95 100644
--- a/src/LLVM_Runtime_Linker.cpp
+++ b/src/LLVM_Runtime_Linker.cpp
@@ -560,6 +560,20 @@ void link_modules(std::vector<std::unique_ptr<llvm::Module>> &modules, Target t,
     const std::set<string> retain = {"__stack_chk_guard",
                                      "__stack_chk_fail"};
 
+    // COMDAT is not supported in MachO object files, hence it does not
+    // work on Mac OS or iOS. These sometimes show up in the runtime
+    // since we compile for an abstract target that is based on ELF.
+    // This code removes the Comdat and makes the symbol a regular one,
+    // which only works if there is a single instance, which is generally
+    // the case for the runtime. Presumably if this isn't true, linking the
+    // module will fail.
+    if (t.os == Target::IOS || t.os == Target::OSX) {
+        for (auto &global_obj : modules[0]->global_objects()) {
+	    global_obj.setComdat(nullptr);
+	}
+	modules[0]->getComdatSymbolTable().clear();
+    }
+
     // Enumerate the global variables.
     for (auto &gv : modules[0]->globals()) {
         // No variables are part of the public interface (even the ones labelled halide_)

From fbea278fde95f139887b050308bbcd0faeb6b2f3 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Mon, 7 Dec 2020 20:10:46 -0800
Subject: [PATCH 04/24] Improve comment.

---
 src/LLVM_Runtime_Linker.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp
index 042a3543ba95..8cd46446a105 100644
--- a/src/LLVM_Runtime_Linker.cpp
+++ b/src/LLVM_Runtime_Linker.cpp
@@ -560,13 +560,14 @@ void link_modules(std::vector<std::unique_ptr<llvm::Module>> &modules, Target t,
     const std::set<string> retain = {"__stack_chk_guard",
                                      "__stack_chk_fail"};
 
-    // COMDAT is not supported in MachO object files, hence it does not
-    // work on Mac OS or iOS. These sometimes show up in the runtime
-    // since we compile for an abstract target that is based on ELF.
-    // This code removes the Comdat and makes the symbol a regular one,
-    // which only works if there is a single instance, which is generally
-    // the case for the runtime. Presumably if this isn't true, linking the
-    // module will fail.
+    // COMDAT is not supported in MachO object files, hence it does
+    // not work on Mac OS or iOS. These sometimes show up in the
+    // runtime since we compile for an abstract target that is based
+    // on ELF. This code removes all Comdat items and leaves the
+    // symbols they were attached to as regular definitions, which
+    // only works if there is a single instance, which is generally
+    // the case for the runtime. Presumably if this isn't true,
+    // linking the module will fail.
     if (t.os == Target::IOS || t.os == Target::OSX) {
         for (auto &global_obj : modules[0]->global_objects()) {
 	    global_obj.setComdat(nullptr);

From b851d206f177470e4ff16084bb9b1a047f701e04 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Mon, 7 Dec 2020 20:21:06 -0800
Subject: [PATCH 05/24] Fix tabs in indentation.

---
 src/LLVM_Runtime_Linker.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp
index 8cd46446a105..220680a63e01 100644
--- a/src/LLVM_Runtime_Linker.cpp
+++ b/src/LLVM_Runtime_Linker.cpp
@@ -570,9 +570,9 @@ void link_modules(std::vector<std::unique_ptr<llvm::Module>> &modules, Target t,
     // linking the module will fail.
     if (t.os == Target::IOS || t.os == Target::OSX) {
         for (auto &global_obj : modules[0]->global_objects()) {
-	    global_obj.setComdat(nullptr);
-	}
-	modules[0]->getComdatSymbolTable().clear();
+            global_obj.setComdat(nullptr);
+        }
+        modules[0]->getComdatSymbolTable().clear();
     }
 
     // Enumerate the global variables.

From 4120026512b7ed4a6174c19b9a590052d694eb95 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Thu, 10 Dec 2020 02:31:26 -0800
Subject: [PATCH 06/24] Make GPU context handling more consistent and use a
 common compilation cache for kernels. Introduces a finalization routine for
 kernel compilation to indicate when kernals are not strictly required to be
 defined. Thus allowing them to be unloaded or discarded, but not when they
 are needed.

---
 Makefile                                      |  2 +-
 src/CodeGen_GPU_Host.cpp                      |  7 +++
 src/runtime/HalideRuntimeCuda.h               |  1 +
 src/runtime/HalideRuntimeD3D12Compute.h       |  1 +
 src/runtime/HalideRuntimeHexagonHost.h        |  1 +
 src/runtime/HalideRuntimeMetal.h              |  1 +
 src/runtime/HalideRuntimeOpenCL.h             |  1 +
 src/runtime/HalideRuntimeOpenGL.h             |  1 +
 src/runtime/HalideRuntimeOpenGLCompute.h      |  2 +
 src/runtime/cuda.cpp                          |  9 ++-
 src/runtime/d3d12compute.cpp                  | 13 ++++-
 src/runtime/gpu_context_common.h              | 57 +++++++++++++------
 src/runtime/hexagon_host.cpp                  |  3 +
 src/runtime/metal.cpp                         | 18 ++++--
 src/runtime/opencl.cpp                        | 20 +++++--
 src/runtime/opengl.cpp                        |  3 +
 src/runtime/openglcompute.cpp                 |  3 +
 src/runtime/runtime_api.cpp                   |  7 +++
 test/common/gpu_context.h                     | 22 +++++++
 .../gpu_multi_context_threaded_aottest.cpp    | 42 +++++++++++++-
 20 files changed, 178 insertions(+), 36 deletions(-)

diff --git a/Makefile b/Makefile
index d9bc23c9b970..b26984127804 100644
--- a/Makefile
+++ b/Makefile
@@ -333,7 +333,7 @@ TEST_CXX_FLAGS += -DTEST_OPENCL
 endif
 
 ifneq ($(TEST_METAL), )
-TEST_CXX_FLAGS += -DTEST_METAL
+TEST_CXX_FLAGS += -DTEST_METAL -ObjC++
 endif
 
 ifneq ($(TEST_CUDA), )
diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp
index 4cb09da4712d..2815f42b62ee 100644
--- a/src/CodeGen_GPU_Host.cpp
+++ b/src/CodeGen_GPU_Host.cpp
@@ -212,6 +212,13 @@ void CodeGen_GPU_Host<CodeGen_CPU>::compile_func(const LoweredFunc &f,
         Value *result = builder->CreateCall(init, init_kernels_args);
         Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0));
         CodeGen_CPU::create_assertion(did_succeed, Expr(), result);
+
+	// Generate a finalizer call as well to relase any refcounts or other resource usage
+	// specific to this filter call.
+        std::string finalize_kernels_name = "halide_" + api_unique_name + "_finalize_kernels";
+        llvm::Function *finalize = module->getFunction(finalize_kernels_name);
+	Value *module_state_value = builder->CreateLoad(module_state);
+	register_destructor(finalize, module_state_value, CodeGen_CPU::Always);
     }
 
     // the init kernels block should branch to the post-entry block
diff --git a/src/runtime/HalideRuntimeCuda.h b/src/runtime/HalideRuntimeCuda.h
index 923860b3c315..7f8481642ba2 100644
--- a/src/runtime/HalideRuntimeCuda.h
+++ b/src/runtime/HalideRuntimeCuda.h
@@ -38,6 +38,7 @@ extern int halide_cuda_run(void *user_context,
                            float *vertex_buffer,
                            int num_coords_dim0,
                            int num_coords_dim1);
+extern void halide_cuda_finalize_kernels(void *user_context, void *state_ptr);
 // @}
 
 /** Set the underlying cuda device poiner for a buffer. The device
diff --git a/src/runtime/HalideRuntimeD3D12Compute.h b/src/runtime/HalideRuntimeD3D12Compute.h
index de78ed8bdbab..997120c3391a 100644
--- a/src/runtime/HalideRuntimeD3D12Compute.h
+++ b/src/runtime/HalideRuntimeD3D12Compute.h
@@ -37,6 +37,7 @@ extern int halide_d3d12compute_run(void *user_context,
                                    float *vertex_buffer,
                                    int num_coords_dim0,
                                    int num_coords_dim1);
+extern void halide_d3d12compute_finalize_kernels(void *user_context, void *state_ptr);
 // @}
 
 /** Set the underlying ID3D12Resource for a halide_buffer_t. The memory backing
diff --git a/src/runtime/HalideRuntimeHexagonHost.h b/src/runtime/HalideRuntimeHexagonHost.h
index 7ca304e9f94b..c5756504ac4b 100644
--- a/src/runtime/HalideRuntimeHexagonHost.h
+++ b/src/runtime/HalideRuntimeHexagonHost.h
@@ -143,6 +143,7 @@ extern int halide_hexagon_run(void *user_context,
                               uint64_t arg_sizes[],
                               void *args[],
                               int arg_flags[]);
+extern void halide_hexagon_finalize_kernels(void *user_context, void *state_ptr);
 extern int halide_hexagon_device_release(void *user_context);
 // @}
 
diff --git a/src/runtime/HalideRuntimeMetal.h b/src/runtime/HalideRuntimeMetal.h
index 2b045390dc3f..6e795416e442 100644
--- a/src/runtime/HalideRuntimeMetal.h
+++ b/src/runtime/HalideRuntimeMetal.h
@@ -25,6 +25,7 @@ extern const struct halide_device_interface_t *halide_metal_device_interface();
 // @{
 extern int halide_metal_initialize_kernels(void *user_context, void **state_ptr,
                                            const char *src, int size);
+void halide_metal_finalize_kernels(void *user_context, void *state_ptr);
 
 extern int halide_metal_run(void *user_context,
                             void *state_ptr,
diff --git a/src/runtime/HalideRuntimeOpenCL.h b/src/runtime/HalideRuntimeOpenCL.h
index 9bc35def26c8..54b8b4157489 100644
--- a/src/runtime/HalideRuntimeOpenCL.h
+++ b/src/runtime/HalideRuntimeOpenCL.h
@@ -39,6 +39,7 @@ extern int halide_opencl_run(void *user_context,
                              float *vertex_buffer,
                              int num_coords_dim0,
                              int num_coords_dim1);
+extern void halide_opencl_finalize_kernels(void *user_context, void *state_ptr);
 // @}
 
 /** Set the platform name for OpenCL to use (e.g. "Intel" or
diff --git a/src/runtime/HalideRuntimeOpenGL.h b/src/runtime/HalideRuntimeOpenGL.h
index 2f253c9900a2..813fe5435b73 100644
--- a/src/runtime/HalideRuntimeOpenGL.h
+++ b/src/runtime/HalideRuntimeOpenGL.h
@@ -39,6 +39,7 @@ extern int halide_opengl_run(void *user_context,
                              float *vertex_buffer,
                              int num_coords_dim0,
                              int num_coords_dim1);
+extern void halide_opengl_finalize_kernels(void *user_context, void *state_ptr);
 // @}
 
 /** Set the underlying OpenGL texture for a buffer. The texture must
diff --git a/src/runtime/HalideRuntimeOpenGLCompute.h b/src/runtime/HalideRuntimeOpenGLCompute.h
index 8f7cd683845c..2fd76c2fd697 100644
--- a/src/runtime/HalideRuntimeOpenGLCompute.h
+++ b/src/runtime/HalideRuntimeOpenGLCompute.h
@@ -49,6 +49,8 @@ extern int halide_openglcompute_run(void *user_context,
                                     float *vertex_buffer,
                                     int num_coords_dim0,
                                     int num_coords_dim1);
+
+extern void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr);
 // @}
 
 /** This function retrieves pointers to OpenGL API functions.
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 1dc1cf27086e..872fd65b74fa 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1058,12 +1058,11 @@ WEAK int halide_cuda_run(void *user_context,
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    halide_assert(user_context, state_ptr);
-    CUmodule mod = nullptr;
-    bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod);
-    halide_assert(user_context, found_module && mod != nullptr);
+    CUmodule mod{};
+    bool found = compilation_cache.lookup(ctx.context, state_ptr, mod);
+    halide_assert(user_context, found && mod != nullptr);
+
     debug(user_context) << "Got module " << mod << "\n";
-    halide_assert(user_context, mod);
     CUfunction f;
     err = cuModuleGetFunction(&f, mod, entry_name);
     debug(user_context) << "Got function " << f << "\n";
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 3174c33f52a4..f298dfe26727 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -2776,6 +2776,13 @@ WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state
     return 0;
 }
 
+WEAK void halide_d3d12compute_finalize_kernels(void *user_context, void *state_ptr) {
+    D3D12ContextHolder d3d12_context(user_context, true);
+    if (d3d12_context.error == 0) {
+        compilation_cache.release_hold(user_context, d3d12_context.device, state_ptr);
+    }
+}
+
 namespace {
 
 void compute_barrier(d3d12_copy_command_list *cmdList, d3d12_buffer *buffer) {
@@ -3005,9 +3012,9 @@ WEAK int halide_d3d12compute_run(void *user_context,
     StartCapturingGPUActivity();
 #endif
 
-    d3d12_library *library = nullptr;
-    bool found_module = compilation_cache.lookup(device, state_ptr, library);
-    halide_assert(user_context, found_module && library != nullptr);
+    d3d12_library *library{};
+    bool found = compilation_cache.lookup(device, state_ptr, library);
+    halide_assert(user_context, found && library != nullptr);
 
     d3d12_frame *frame = acquire_frame(device);
     d3d12_compute_command_list *cmdList = frame->cmd_list;
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index 9ef2662026a8..f45d2883755b 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -10,6 +10,14 @@ class GPUCompilationCache {
         ContextT context{};
         ModuleStateT module_state{};
         uint32_t kernel_id{};
+        uint32_t use_count{0};
+
+        CachedCompilation(ContextT context, ModuleStateT module_state,
+                          uint32_t kernel_id, uint32_t use_count)
+            : context(context), module_state(module_state),
+              kernel_id(kernel_id), use_count(use_count) {
+        }
+
     };
 
     halide_mutex mutex;
@@ -37,7 +45,7 @@ class GPUCompilationCache {
         }
     }
 
-    HALIDE_MUST_USE_RESULT bool insert(ContextT context, uint32_t id, ModuleStateT module_state) {
+  HALIDE_MUST_USE_RESULT bool insert(const CachedCompilation &entry) {
         if (log2_compilations_size == 0) {
             if (!resize_table(kInitialTableBits)) {
                 return false;
@@ -49,13 +57,11 @@ class GPUCompilationCache {
             }
         }
         count += 1;
-        uintptr_t index = kernel_hash(context, id, log2_compilations_size);
+        uintptr_t index = kernel_hash(entry.context, entry.kernel_id, log2_compilations_size);
         for (int i = 0; i < (1 << log2_compilations_size); i++) {
             uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
             if (compilations[effective_index].kernel_id <= kDeletedId) {
-                compilations[effective_index].context = context;
-                compilations[effective_index].module_state = module_state;
-                compilations[effective_index].kernel_id = id;
+                compilations[effective_index] = entry;
                 return true;
             }
         }
@@ -65,7 +71,8 @@ class GPUCompilationCache {
         return false;
     }
 
-    HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) {
+    HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id,
+                                              ModuleStateT *&module_state, int increment) {
         if (log2_compilations_size == 0) {
             return false;
         }
@@ -79,6 +86,9 @@ class GPUCompilationCache {
             if (compilations[effective_index].context == context &&
                 compilations[effective_index].kernel_id == id) {
                 module_state = &compilations[effective_index].module_state;
+                if (increment != 0) {
+                    compilations[effective_index].use_count += increment;
+                }
                 return true;
             }
         }
@@ -89,7 +99,7 @@ class GPUCompilationCache {
         ScopedMutexLock lock_guard(&mutex);
         uint32_t id = (uint32_t)(uintptr_t)state_ptr;
         ModuleStateT *mod_ptr;
-        if (find_internal(context, id, mod_ptr)) {
+        if (find_internal(context, id, mod_ptr, 0)) {
             module_state = *mod_ptr;
             return true;
         }
@@ -114,8 +124,7 @@ class GPUCompilationCache {
                 for (int32_t i = 0; i < old_size; i++) {
                     if (old_table[i].kernel_id != kInvalidId &&
                         old_table[i].kernel_id != kDeletedId) {
-                        bool result = insert(old_table[i].context, old_table[i].kernel_id,
-                                             old_table[i].module_state);
+                        bool result = insert(old_table[i]);
                         halide_assert(nullptr, result);  // Resizing the table while resizing the table is a logic error.
                     }
                 }
@@ -133,7 +142,11 @@ class GPUCompilationCache {
 
         for (int i = 0; i < (1 << log2_compilations_size); i++) {
             if (compilations[i].kernel_id > kInvalidId &&
-                (all || (compilations[i].context == context))) {
+                (all || (compilations[i].context == context)) &&
+                compilations[i].use_count == 0) {
+                debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state <<
+                                       " id " << compilations[i].kernel_id <<
+                                       " context " << compilations[i].context << "\n";
                 f(compilations[i].module_state);
                 compilations[i].module_state = nullptr;
                 compilations[i].kernel_id = kDeletedId;
@@ -154,9 +167,12 @@ class GPUCompilationCache {
         ScopedMutexLock lock_guard(&mutex);
 
         release_context(user_context, true, nullptr, f);
-        free(compilations);
-        compilations = nullptr;
-        log2_compilations_size = 0;
+	// Some items may have been in use, so can't free.
+	if (count == 0) {
+	    free(compilations);
+	    compilations = nullptr;
+	    log2_compilations_size = 0;
+	}
     }
 
     template<typename CompileModuleT, typename... Args>
@@ -172,25 +188,34 @@ class GPUCompilationCache {
         }
 
         ModuleStateT *mod;
-        if (find_internal(context, *id_ptr, mod)) {
+        if (find_internal(context, *id_ptr, mod, 1)) {
             result = *mod;
             return true;
         }
 
         // TODO(zvookin): figure out the calling signature here...
         ModuleStateT compiled_module = f(args...);
-        debug(user_context) << "Caching compiled kernel: " << compiled_module << " id " << *id_ptr << " context " << context << "\n";
+        debug(user_context) << "Caching compiled kernel: " << compiled_module <<
+                               " id " << *id_ptr << " context " << context << "\n";
         if (compiled_module == nullptr) {
             return false;
         }
 
-        if (!insert(context, *id_ptr, compiled_module)) {
+        
+        if (!insert({context, compiled_module, *id_ptr, 1})) {
             return false;
         }
         result = compiled_module;
 
         return true;
     }
+
+    void release_hold(void *user_context, ContextT context, void *state_ptr) {
+        ModuleStateT *mod;
+        uint32_t id = (uint32_t)(uintptr_t)state_ptr;
+        bool result = find_internal(context, id, mod, -1);
+        halide_assert(user_context, result);  // Value must be in cache to be released
+    }
 };
 
 }  // namespace Internal
diff --git a/src/runtime/hexagon_host.cpp b/src/runtime/hexagon_host.cpp
index cbef8cf7508f..b163c6f3e537 100644
--- a/src/runtime/hexagon_host.cpp
+++ b/src/runtime/hexagon_host.cpp
@@ -323,6 +323,9 @@ WEAK int halide_hexagon_initialize_kernels(void *user_context, void **state_ptr,
     return result != 0 ? -1 : 0;
 }
 
+WEAK void halide_hexagon_finalize_kernels(void *user_context, void *state_ptr) {
+}
+
 namespace {
 
 // Prepare an array of remote_buffer arguments, mapping buffers if
diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp
index 25fe29f3feda..0cd4492bf6db 100644
--- a/src/runtime/metal.cpp
+++ b/src/runtime/metal.cpp
@@ -562,6 +562,13 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c
     return 0;
 }
 
+WEAK void halide_metal_finalize_kernels(void *user_context, void *state_ptr) {
+    MetalContextHolder metal_context(user_context, true);
+    if (metal_context.error == 0) {
+        compilation_cache.release_hold(user_context, metal_context.device, state_ptr);
+    }
+}
+
 namespace {
 
 WEAK void halide_metal_device_sync_internal(mtl_command_queue *queue, struct halide_buffer_t *buffer) {
@@ -612,10 +619,11 @@ WEAK int halide_metal_device_release(void *user_context) {
         return error;
     }
 
-    if (device) {
+    if (acquired_device) {
         halide_metal_device_sync_internal(queue, nullptr);
 
-        compilation_cache.delete_context(user_context, device, release_ns_object);
+	debug(user_context) << "Calling delete context on device " << acquired_device << "\n";
+        compilation_cache.delete_context(user_context, acquired_device, release_ns_object);
 
         // Release the device itself, if we created it.
         if (acquired_device == device) {
@@ -740,9 +748,9 @@ WEAK int halide_metal_run(void *user_context,
         return -1;
     }
 
-    mtl_library *library;
-    bool found_library = compilation_cache.lookup(metal_context.device, state_ptr, library);
-    halide_assert(user_context, found_library && library != nullptr);
+    mtl_library *library{};
+    bool found = compilation_cache.lookup(metal_context.device, state_ptr, library);
+    halide_assert(user_context, found && library != nullptr);
 
     mtl_function *function = new_function_with_name(library, entry_name, strlen(entry_name));
     if (function == nullptr) {
diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp
index 7c7815b620d7..10bd05ccc315 100644
--- a/src/runtime/opencl.cpp
+++ b/src/runtime/opencl.cpp
@@ -731,7 +731,7 @@ WEAK int halide_opencl_compute_capability(void *user_context, int *major, int *m
 
 WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr, const char *src, int size) {
     debug(user_context)
-        << "CL: halide_opencl_init_kernels (user_context: " << user_context
+        << "CL: halide_opencl_initialize_kernels (user_context: " << user_context
         << ", state_ptr: " << state_ptr
         << ", program: " << (void *)src
         << ", size: " << size << "\n";
@@ -760,6 +760,16 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr,
     return 0;
 }
 
+WEAK void halide_opencl_finalize_kernels(void *user_context, void *state_ptr) {
+    debug(user_context)
+        << "CL: halide_opencl_finalize_kernels (user_context: " << user_context
+        << ", state_ptr: " << state_ptr << "\n";;
+    ClContext ctx(user_context);
+    if (ctx.error_code == CL_SUCCESS) {
+        compilation_cache.release_hold(user_context, ctx.context, state_ptr);
+    }
+}
+
 // Used to generate correct timings when tracing
 WEAK int halide_opencl_device_sync(void *user_context, halide_buffer_t *) {
     debug(user_context) << "CL: halide_opencl_device_sync (user_context: " << user_context << ")\n";
@@ -1049,10 +1059,11 @@ WEAK int halide_opencl_run(void *user_context,
 
     // Create kernel object for entry_name from the program for this module.
     halide_assert(user_context, state_ptr);
-    cl_program program;
-    bool found_program = compilation_cache.lookup(ctx.context, state_ptr, program);
+    
+    cl_program program{};
+    bool found = compilation_cache.lookup(ctx.context, state_ptr, program);
+    halide_assert(user_context, found && program != nullptr);
 
-    halide_assert(user_context, found_program && program != nullptr);
     debug(user_context) << "    clCreateKernel " << entry_name << " -> ";
     cl_kernel f = clCreateKernel(program, entry_name, &err);
     if (err != CL_SUCCESS) {
@@ -1339,6 +1350,7 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_opencl_cleanup() {
+      debug(nullptr) << "halide_opencl_cleanup\n";
     compilation_cache.release_all(nullptr, clReleaseProgram);
     halide_opencl_device_release(nullptr);
 }
diff --git a/src/runtime/opengl.cpp b/src/runtime/opengl.cpp
index f18d79f867a7..a3e8b15d417d 100644
--- a/src/runtime/opengl.cpp
+++ b/src/runtime/opengl.cpp
@@ -1959,6 +1959,9 @@ WEAK int halide_opengl_initialize_kernels(void *user_context, void **state_ptr,
     return 0;
 }
 
+WEAK void halide_opengl_finalize_kernels(void *user_context, void *state_ptr) {
+}
+
 WEAK int halide_opengl_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) {
     return halide_default_device_and_host_malloc(user_context, buf, &opengl_device_interface);
 }
diff --git a/src/runtime/openglcompute.cpp b/src/runtime/openglcompute.cpp
index 5476d854d35c..5928d18a0e51 100644
--- a/src/runtime/openglcompute.cpp
+++ b/src/runtime/openglcompute.cpp
@@ -904,6 +904,9 @@ WEAK int halide_openglcompute_initialize_kernels(void *user_context, void **stat
     return 0;
 }
 
+WEAK void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr) {
+}
+
 WEAK int halide_openglcompute_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) {
     return halide_default_device_and_host_malloc(user_context, buf, &openglcompute_device_interface);
 }
diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp
index 6071932000da..ef2609022748 100644
--- a/src/runtime/runtime_api.cpp
+++ b/src/runtime/runtime_api.cpp
@@ -29,6 +29,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_cuda_device_interface,
     (void *)&halide_cuda_get_device_ptr,
     (void *)&halide_cuda_initialize_kernels,
+    (void *)&halide_cuda_finalize_kernels,
     (void *)&halide_cuda_run,
     (void *)&halide_cuda_wrap_device_ptr,
     (void *)&halide_current_time_ns,
@@ -91,6 +92,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_hexagon_get_device_handle,
     (void *)&halide_hexagon_get_device_size,
     (void *)&halide_hexagon_initialize_kernels,
+    (void *)&halide_hexagon_finalize_kernels,
     (void *)&halide_hexagon_power_hvx_off,
     (void *)&halide_hexagon_power_hvx_off_as_destructor,
     (void *)&halide_hexagon_power_hvx_on,
@@ -115,6 +117,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_metal_get_buffer,
     (void *)&halide_metal_get_crop_offset,
     (void *)&halide_metal_initialize_kernels,
+    (void *)&halide_metal_finalize_kernels,
     (void *)&halide_metal_release_context,
     (void *)&halide_metal_run,
     (void *)&halide_metal_wrap_buffer,
@@ -139,6 +142,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_opencl_image_device_interface,
     (void *)&halide_opencl_image_wrap_cl_mem,
     (void *)&halide_opencl_initialize_kernels,
+    (void *)&halide_opencl_finalize_kernels,
     (void *)&halide_opencl_run,
     (void *)&halide_opencl_set_build_options,
     (void *)&halide_opencl_set_device_type,
@@ -151,11 +155,13 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_opengl_get_proc_address,
     (void *)&halide_opengl_get_texture,
     (void *)&halide_opengl_initialize_kernels,
+    (void *)&halide_opengl_finalize_kernels,
     (void *)&halide_opengl_run,
     (void *)&halide_opengl_wrap_render_target,
     (void *)&halide_opengl_wrap_texture,
     (void *)&halide_openglcompute_device_interface,
     (void *)&halide_openglcompute_initialize_kernels,
+    (void *)&halide_openglcompute_finalize_kernels,
     (void *)&halide_openglcompute_run,
     (void *)&halide_pointer_to_string,
     (void *)&halide_print,
@@ -202,6 +208,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_d3d12compute_acquire_context,
     (void *)&halide_d3d12compute_device_interface,
     (void *)&halide_d3d12compute_initialize_kernels,
+    (void *)&halide_d3d12compute_finalize_kernels,
     (void *)&halide_d3d12compute_release_context,
     (void *)&halide_d3d12compute_run,
 };
diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
index 4816ad205866..9051bebb30f9 100644
--- a/test/common/gpu_context.h
+++ b/test/common/gpu_context.h
@@ -131,4 +131,26 @@ void destroy_cuda_context(CUcontext cuda_ctx) {
     cuCtxDestroy(cuda_ctx);
 }
 
+#elif defined(TEST_METAL)
+#include <Metal/MTLDevice.h>
+#include <Metal/MTLCommandQueue.h>
+
+void create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
+    device = MTLCreateSystemDefaultDevice();
+    if (device == nullptr) {
+        NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
+	if (devices != nullptr) {
+	    device = devices[0];
+	}
+    }
+    assert(device != nullptr);
+    queue = [device newCommandQueue];
+    assert(queue != nullptr);
+}  
+
+void destroy_metal_context(id<MTLDevice> device, id<MTLCommandQueue> queue) {
+    [queue release];
+    [device release];
+}
+
 #endif
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index fdaab364b7c9..05144a4c857d 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -71,11 +71,11 @@ extern "C" int halide_release_cl_context(void *user_context) {
 
 typedef CUcontext gpu_context;
 
-bool init_context(CUcontext &cuda_ctx) {
+bool init_context(gpu_context &cuda_ctx) {
     return create_cuda_context(cuda_ctx);
 }
 
-void destroy_context(CUcontext &cuda_ctx) {
+void destroy_context(gpu_context &cuda_ctx) {
     destroy_cuda_context(cuda_ctx);
     cuda_ctx = nullptr;
 }
@@ -97,6 +97,44 @@ extern "C" int halide_cuda_release_context(void *user_context) {
     return 0;
 }
 
+#define HAS_MULTIPLE_CONTEXTS true
+#elif defined(TEST_METAL)
+
+struct gpu_context {
+    id<MTLDevice> device;
+    id<MTLCommandQueue> queue;
+};
+
+bool init_context(gpu_context &context) {
+    create_metal_context(context.device, context.queue);
+    return 0;
+}
+
+void destroy_context(gpu_context &context) {
+    destroy_metal_context(context.device, context.queue);
+    context.device = nullptr;
+    context.queue = nullptr;
+}
+
+int halide_metal_acquire_context(void *user_context, id<MTLDevice> *device_ret,
+				 id<MTLCommandQueue> *queue_ret, bool create) {
+    if (user_context == nullptr) {
+        assert(!create);
+        *device_ret = nullptr;
+        *queue_ret = nullptr;
+    } else {
+        gpu_context *context = (gpu_context *)user_context;
+	*device_ret = context->device;
+	*queue_ret = context->queue;
+    }
+    return 0;
+
+}
+
+int halide_metal_release_context(void *user_context) {
+    return 0;
+}
+
 #define HAS_MULTIPLE_CONTEXTS true
 #else
 typedef int gpu_context;

From 03062ddb6a49764fdca8ddc644ee0a1c8e19a1df Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Thu, 10 Dec 2020 16:55:40 -0800
Subject: [PATCH 07/24] Add CUDA finalizer method.

Quick fix for syntax error in C codegen.

Tab fixes.

Makefile fixes.
---
 Makefile                         |  8 ++++++++
 src/CodeGen_C.cpp                |  2 +-
 src/CodeGen_GPU_Host.cpp         |  8 ++++----
 src/runtime/cuda.cpp             |  7 +++++++
 src/runtime/gpu_context_common.h | 12 ++++++------
 src/runtime/metal.cpp            |  2 +-
 6 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index b26984127804..aeee9d600d70 100644
--- a/Makefile
+++ b/Makefile
@@ -1153,6 +1153,8 @@ GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_async_parallel,$(GENERAT
 GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_stubtest,$(GENERATOR_AOTCPP_TESTS))
 GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_stubuser,$(GENERATOR_AOTCPP_TESTS))
 
+GENERATOR_AOTCPP_TESTS := $(filter-out generator_aotcpp_gpu_multi_context_threaded,$(GENERATOR_AOTCPP_TESTS))
+
 test_aotcpp_generator: $(GENERATOR_AOTCPP_TESTS)
 
 # This is just a test to ensure than RunGen builds and links for a critical mass of Generators;
@@ -1168,6 +1170,7 @@ GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/multitarget.rungen,$
 GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/nested_externs.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS))
 GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/tiled_blur.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS))
 GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/extern_output.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS))
+GENERATOR_BUILD_RUNGEN_TESTS := $(filter-out $(FILTERS_DIR)/gpu_multi_context_threaded.rungen,$(GENERATOR_BUILD_RUNGEN_TESTS))
 GENERATOR_BUILD_RUNGEN_TESTS := $(GENERATOR_BUILD_RUNGEN_TESTS) \
 	$(FILTERS_DIR)/multi_rungen \
 	$(FILTERS_DIR)/multi_rungen2 \
@@ -1652,6 +1655,11 @@ test_generator_nested_externs:
 test_generator_gpu_multi:
 	@echo "Skipping"
 
+# gpu_multi_context_threaded actually contain a generator named
+# "gpu_multi", and has no internal tests in any case.
+test_generator_gpu_multi_context_threaded:
+	@echo "Skipping"
+
 $(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h
 	@mkdir -p $(@D)
 	$(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@
diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp
index 77a294abdb4f..62ae9fda7a4f 100644
--- a/src/CodeGen_C.cpp
+++ b/src/CodeGen_C.cpp
@@ -1583,7 +1583,7 @@ void CodeGen_C::compile(const LoweredFunc &f) {
 
         if (uses_gpu_for_loops) {
             stream << get_indent() << "halide_error("
-                   << (have_user_context ? "__user_context_" : "nullptr")
+                   << (have_user_context ? "const_cast<void *>(__user_context)" : "nullptr")
                    << ", \"C++ Backend does not support gpu_blocks() or gpu_threads() yet, "
                    << "this function will always fail at runtime\");\n";
             stream << get_indent() << "return halide_error_code_device_malloc_failed;\n";
diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp
index 2815f42b62ee..b2f5c1e0909e 100644
--- a/src/CodeGen_GPU_Host.cpp
+++ b/src/CodeGen_GPU_Host.cpp
@@ -213,12 +213,12 @@ void CodeGen_GPU_Host<CodeGen_CPU>::compile_func(const LoweredFunc &f,
         Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0));
         CodeGen_CPU::create_assertion(did_succeed, Expr(), result);
 
-	// Generate a finalizer call as well to relase any refcounts or other resource usage
-	// specific to this filter call.
+        // Generate a finalizer call as well to relase any refcounts or other resource usage
+        // specific to this filter call.
         std::string finalize_kernels_name = "halide_" + api_unique_name + "_finalize_kernels";
         llvm::Function *finalize = module->getFunction(finalize_kernels_name);
-	Value *module_state_value = builder->CreateLoad(module_state);
-	register_destructor(finalize, module_state_value, CodeGen_CPU::Always);
+        Value *module_state_value = builder->CreateLoad(module_state);
+        register_destructor(finalize, module_state_value, CodeGen_CPU::Always);
     }
 
     // the init kernels block should branch to the post-entry block
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 872fd65b74fa..39f4e6f1fa81 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -533,6 +533,13 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co
     return 0;
 }
 
+WEAK void halide_cuda_finalize_kernels(void *user_context, void *state_ptr) {
+    Context ctx(user_context);
+    if (ctx.error == 0) {
+        compilation_cache.release_hold(user_context, ctx.context, state_ptr);
+    }
+}
+
 WEAK int halide_cuda_release_unused_device_allocations(void *user_context) {
     FreeListItem *to_free;
     {
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index f45d2883755b..b6d8ddadb317 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -167,12 +167,12 @@ class GPUCompilationCache {
         ScopedMutexLock lock_guard(&mutex);
 
         release_context(user_context, true, nullptr, f);
-	// Some items may have been in use, so can't free.
-	if (count == 0) {
-	    free(compilations);
-	    compilations = nullptr;
-	    log2_compilations_size = 0;
-	}
+        // Some items may have been in use, so can't free.
+        if (count == 0) {
+            free(compilations);
+            compilations = nullptr;
+            log2_compilations_size = 0;
+        }
     }
 
     template<typename CompileModuleT, typename... Args>
diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp
index 0cd4492bf6db..52b43f4c830b 100644
--- a/src/runtime/metal.cpp
+++ b/src/runtime/metal.cpp
@@ -622,7 +622,7 @@ WEAK int halide_metal_device_release(void *user_context) {
     if (acquired_device) {
         halide_metal_device_sync_internal(queue, nullptr);
 
-	debug(user_context) << "Calling delete context on device " << acquired_device << "\n";
+        debug(user_context) << "Calling delete context on device " << acquired_device << "\n";
         compilation_cache.delete_context(user_context, acquired_device, release_ns_object);
 
         // Release the device itself, if we created it.

From 42f9ccc744632371c2d957ff71506eed4226064b Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Thu, 10 Dec 2020 17:05:27 -0800
Subject: [PATCH 08/24] Conditionalize Objective C support.

---
 test/common/gpu_context.h                             | 2 +-
 test/generator/gpu_multi_context_threaded_aottest.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
index 9051bebb30f9..13a42ea64b88 100644
--- a/test/common/gpu_context.h
+++ b/test/common/gpu_context.h
@@ -131,7 +131,7 @@ void destroy_cuda_context(CUcontext cuda_ctx) {
     cuCtxDestroy(cuda_ctx);
 }
 
-#elif defined(TEST_METAL)
+#elif defined(TEST_METAL) && defined(__OBJC__)
 #include <Metal/MTLDevice.h>
 #include <Metal/MTLCommandQueue.h>
 
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index 05144a4c857d..8b629defb9dd 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -98,7 +98,7 @@ extern "C" int halide_cuda_release_context(void *user_context) {
 }
 
 #define HAS_MULTIPLE_CONTEXTS true
-#elif defined(TEST_METAL)
+#elif defined(TEST_METAL) && defined(__OBJC__)
 
 struct gpu_context {
     id<MTLDevice> device;

From b634ac07a48238b018b60ca383e85babd61ee997 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Thu, 10 Dec 2020 17:24:09 -0800
Subject: [PATCH 09/24] Fix clang-format complaints.

---
 src/runtime/gpu_context_common.h                   | 14 ++++++--------
 src/runtime/opencl.cpp                             |  5 ++---
 test/common/gpu_context.h                          |  8 ++++----
 .../gpu_multi_context_threaded_aottest.cpp         |  6 +++---
 4 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index b6d8ddadb317..9b9bbaab4a65 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -17,7 +17,6 @@ class GPUCompilationCache {
             : context(context), module_state(module_state),
               kernel_id(kernel_id), use_count(use_count) {
         }
-
     };
 
     halide_mutex mutex;
@@ -45,7 +44,7 @@ class GPUCompilationCache {
         }
     }
 
-  HALIDE_MUST_USE_RESULT bool insert(const CachedCompilation &entry) {
+    HALIDE_MUST_USE_RESULT bool insert(const CachedCompilation &entry) {
         if (log2_compilations_size == 0) {
             if (!resize_table(kInitialTableBits)) {
                 return false;
@@ -144,9 +143,9 @@ class GPUCompilationCache {
             if (compilations[i].kernel_id > kInvalidId &&
                 (all || (compilations[i].context == context)) &&
                 compilations[i].use_count == 0) {
-                debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state <<
-                                       " id " << compilations[i].kernel_id <<
-                                       " context " << compilations[i].context << "\n";
+                debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state
+                                    << " id " << compilations[i].kernel_id
+                                    << " context " << compilations[i].context << "\n";
                 f(compilations[i].module_state);
                 compilations[i].module_state = nullptr;
                 compilations[i].kernel_id = kDeletedId;
@@ -195,13 +194,12 @@ class GPUCompilationCache {
 
         // TODO(zvookin): figure out the calling signature here...
         ModuleStateT compiled_module = f(args...);
-        debug(user_context) << "Caching compiled kernel: " << compiled_module <<
-                               " id " << *id_ptr << " context " << context << "\n";
+        debug(user_context) << "Caching compiled kernel: " << compiled_module
+                            << " id " << *id_ptr << " context " << context << "\n";
         if (compiled_module == nullptr) {
             return false;
         }
 
-        
         if (!insert({context, compiled_module, *id_ptr, 1})) {
             return false;
         }
diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp
index 10bd05ccc315..5ee0d7559ab1 100644
--- a/src/runtime/opencl.cpp
+++ b/src/runtime/opencl.cpp
@@ -763,7 +763,7 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr,
 WEAK void halide_opencl_finalize_kernels(void *user_context, void *state_ptr) {
     debug(user_context)
         << "CL: halide_opencl_finalize_kernels (user_context: " << user_context
-        << ", state_ptr: " << state_ptr << "\n";;
+        << ", state_ptr: " << state_ptr << "\n";
     ClContext ctx(user_context);
     if (ctx.error_code == CL_SUCCESS) {
         compilation_cache.release_hold(user_context, ctx.context, state_ptr);
@@ -1059,7 +1059,7 @@ WEAK int halide_opencl_run(void *user_context,
 
     // Create kernel object for entry_name from the program for this module.
     halide_assert(user_context, state_ptr);
-    
+
     cl_program program{};
     bool found = compilation_cache.lookup(ctx.context, state_ptr, program);
     halide_assert(user_context, found && program != nullptr);
@@ -1350,7 +1350,6 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_opencl_cleanup() {
-      debug(nullptr) << "halide_opencl_cleanup\n";
     compilation_cache.release_all(nullptr, clReleaseProgram);
     halide_opencl_device_release(nullptr);
 }
diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
index 13a42ea64b88..b3a7c3ea3e29 100644
--- a/test/common/gpu_context.h
+++ b/test/common/gpu_context.h
@@ -132,16 +132,16 @@ void destroy_cuda_context(CUcontext cuda_ctx) {
 }
 
 #elif defined(TEST_METAL) && defined(__OBJC__)
-#include <Metal/MTLDevice.h>
 #include <Metal/MTLCommandQueue.h>
+#include <Metal/MTLDevice.h>
 
 void create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
     device = MTLCreateSystemDefaultDevice();
     if (device == nullptr) {
         NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
-	if (devices != nullptr) {
-	    device = devices[0];
-	}
+        if (devices != nullptr) {
+            device = devices[0];
+        }
     }
     assert(device != nullptr);
     queue = [device newCommandQueue];
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index 8b629defb9dd..388f724eb78f 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -117,15 +117,15 @@ void destroy_context(gpu_context &context) {
 }
 
 int halide_metal_acquire_context(void *user_context, id<MTLDevice> *device_ret,
-				 id<MTLCommandQueue> *queue_ret, bool create) {
+                                 id<MTLCommandQueue> *queue_ret, bool create) {
     if (user_context == nullptr) {
         assert(!create);
         *device_ret = nullptr;
         *queue_ret = nullptr;
     } else {
         gpu_context *context = (gpu_context *)user_context;
-	*device_ret = context->device;
-	*queue_ret = context->queue;
+        *device_ret = context->device;
+        *queue_ret = context->queue;
     }
     return 0;
 

From 285750a71bc3b9b13ad3c16f0ef167c30860f674 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Thu, 10 Dec 2020 17:29:42 -0800
Subject: [PATCH 10/24] Fix clang-format complaints.

---
 test/common/gpu_context.h                             | 2 +-
 test/generator/gpu_multi_context_threaded_aottest.cpp | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
index b3a7c3ea3e29..6c697c4fd757 100644
--- a/test/common/gpu_context.h
+++ b/test/common/gpu_context.h
@@ -146,7 +146,7 @@ void create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
     assert(device != nullptr);
     queue = [device newCommandQueue];
     assert(queue != nullptr);
-}  
+}
 
 void destroy_metal_context(id<MTLDevice> device, id<MTLCommandQueue> queue) {
     [queue release];
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index 388f724eb78f..0d58fa6b91ef 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -128,7 +128,6 @@ int halide_metal_acquire_context(void *user_context, id<MTLDevice> *device_ret,
         *queue_ret = context->queue;
     }
     return 0;
-
 }
 
 int halide_metal_release_context(void *user_context) {

From e78ce1400fc8057234e1f1d973aab1ddda0dfb12 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Thu, 10 Dec 2020 20:53:07 -0800
Subject: [PATCH 11/24] Attempt to fix new test failure with cmake.

---
 test/generator/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt
index 130af49b7f93..af7b9b8728ad 100644
--- a/test/generator/CMakeLists.txt
+++ b/test/generator/CMakeLists.txt
@@ -124,10 +124,12 @@ halide_define_aot_test(acquire_release)
 if (TARGET_NVPTX AND Halide_TARGET MATCHES "cuda")
     include(AddCudaToTarget)
     add_cuda_to_target(generator_aot_acquire_release PRIVATE)
+    add_cuda_to_target(generator_aot_multi_context_threaded PRIVATE)
 endif ()
 if (TARGET_NVPTX AND Halide_TARGET MATCHES "opencl")
     find_package(OpenCL REQUIRED)
     target_link_libraries(generator_aot_acquire_release PRIVATE OpenCL::OpenCL)
+    target_link_libraries(generator_aot_multi_context_threaded PRIVATE OpenCL::OpenCL)
 endif ()
 
 # TODO: what are these?

From 1046ccc78cc061cd23461ab31bac8caf85a5c22d Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Mon, 14 Dec 2020 14:52:54 -0800
Subject: [PATCH 12/24] Add Metal support to acquire_release test and make it
 so it doesn't fail without a GPU.

---
 test/generator/acquire_release_aottest.cpp | 42 +++++++++++++++++++---
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp
index 9e4fba70afe6..dbb16c29606a 100644
--- a/test/generator/acquire_release_aottest.cpp
+++ b/test/generator/acquire_release_aottest.cpp
@@ -76,6 +76,41 @@ extern "C" int halide_cuda_release_context(void *user_context) {
     printf("Releasing CUDA context %p\n", cuda_ctx);
     return 0;
 }
+#elif defined(TEST_METAL) && defined(__OBJC__)
+
+struct gpu_context {
+    id<MTLDevice> device;
+    id<MTLCommandQueue> queue;
+};
+
+bool init_context(gpu_context &context) {
+    create_metal_context(context.device, context.queue);
+    return 0;
+}
+
+void destroy_context(gpu_context &context) {
+    destroy_metal_context(context.device, context.queue);
+    context.device = nullptr;
+    context.queue = nullptr;
+}
+
+int halide_metal_acquire_context(void *user_context, id<MTLDevice> *device_ret,
+                                 id<MTLCommandQueue> *queue_ret, bool create) {
+    if (user_context == nullptr) {
+        assert(!create);
+        *device_ret = nullptr;
+        *queue_ret = nullptr;
+    } else {
+        gpu_context *context = (gpu_context *)user_context;
+        *device_ret = context->device;
+        *queue_ret = context->queue;
+    }
+    return 0;
+}
+
+int halide_metal_release_context(void *user_context) {
+    return 0;
+}
 #else
 // Just use the default implementation of acquire/release.
 bool init_context() {
@@ -128,14 +163,13 @@ bool run_test() {
 
     if (interface != nullptr) {
         halide_device_release(nullptr, interface);
+
+        // Free the context we created.
+        destroy_context();
     } else {
         printf("Device interface is nullptr.\n");
-        return false;
     }
 
-    // Free the context we created.
-    destroy_context();
-
     printf("Success!\n");
     return true;
 }

From 3352bf587c6b49b3626b29749ceacfe5dd92aa0a Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 16 Dec 2020 23:20:42 -0800
Subject: [PATCH 13/24] Fix CMake cuda target issue. Add comment to Makefile
 per review feedback.

---
 Makefile                      |  5 +++++
 test/generator/CMakeLists.txt | 19 +++++++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 6e41c930af86..e2e1cb129563 100644
--- a/Makefile
+++ b/Makefile
@@ -333,6 +333,11 @@ TEST_CXX_FLAGS += -DTEST_OPENCL
 endif
 
 ifneq ($(TEST_METAL), )
+# Using Metal APIs requires writing Objective-C++ (or Swift). Add ObjC++
+# to allow tests to create and destroy Metal contexts, etc. This requires
+# tests to be valid Objective-C++, e.g. avoiding using the identifier "id"
+# in certain ways. In practice this is not enough of a problem to justify
+# the work to limit which files are compiled this way.
 TEST_CXX_FLAGS += -DTEST_METAL -ObjC++
 endif
 
diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt
index af7b9b8728ad..95942efd854a 100644
--- a/test/generator/CMakeLists.txt
+++ b/test/generator/CMakeLists.txt
@@ -114,22 +114,26 @@ if (NOT ${USING_WASM})
     target_link_libraries(cxx_mangling_define_extern_externs PRIVATE cxx_mangling)
 endif ()
 
+if (TARGET_NVPTX AND Halide_TARGET MATCHES "cuda")
+    include(AddCudaToTarget)
+endif ()
+if (TARGET_NVPTX AND Halide_TARGET MATCHES "opencl")
+    find_package(OpenCL REQUIRED)
+endif ()
+
 ##
 # Create targets for the AOT tests
 ##
 
 # acquire_release_aottest.cpp
 # acquire_release_generator.cpp
+# acquire_release_generator.cpp
 halide_define_aot_test(acquire_release)
 if (TARGET_NVPTX AND Halide_TARGET MATCHES "cuda")
-    include(AddCudaToTarget)
     add_cuda_to_target(generator_aot_acquire_release PRIVATE)
-    add_cuda_to_target(generator_aot_multi_context_threaded PRIVATE)
 endif ()
 if (TARGET_NVPTX AND Halide_TARGET MATCHES "opencl")
-    find_package(OpenCL REQUIRED)
     target_link_libraries(generator_aot_acquire_release PRIVATE OpenCL::OpenCL)
-    target_link_libraries(generator_aot_multi_context_threaded PRIVATE OpenCL::OpenCL)
 endif ()
 
 # TODO: what are these?
@@ -272,6 +276,13 @@ add_halide_library(gpu_multi_context_threaded_add FROM gpu_multi_context_threade
 add_halide_library(gpu_multi_context_threaded_mul FROM gpu_multi_context_threaded.generator
                    FEATURES user_context)
 
+if (TARGET_NVPTX AND Halide_TARGET MATCHES "cuda")
+    add_cuda_to_target(generator_aot_gpu_multi_context_threaded PRIVATE)
+endif ()
+if (TARGET_NVPTX AND Halide_TARGET MATCHES "opencl")
+    target_link_libraries(generator_aot_gpu_multi_context_threaded PRIVATE OpenCL::OpenCL)
+endif ()
+
 # gpu_object_lifetime_aottest.cpp
 # gpu_object_lifetime_generator.cpp
 halide_define_aot_test(gpu_object_lifetime FEATURES debug)

From 05c8c7cd5b48b591bc54f283cf9d74f0c8a2be55 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 16 Dec 2020 23:38:06 -0800
Subject: [PATCH 14/24] Add a couple more locals initializations for safety.

---
 src/runtime/d3d12compute.cpp | 2 +-
 src/runtime/metal.cpp        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index f298dfe26727..bae6260fa8dd 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -2766,7 +2766,7 @@ WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state
     D3D12ContextHolder d3d12_context(user_context, true);
 
     int error = halide_error_code_generic_error;
-    d3d12_library *library;
+    d3d12_library *library{};
     if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device,
                                               library, compile_kernel, user_context,
                                               source, source_size, &error)) {
diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp
index 52b43f4c830b..ead7591fcccc 100644
--- a/src/runtime/metal.cpp
+++ b/src/runtime/metal.cpp
@@ -546,7 +546,7 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    mtl_library *library;
+    mtl_library *library{};
     if (!compilation_cache.kernel_state_setup(user_context, state_ptr, metal_context.device, library,
                                               new_library_with_source, metal_context.device,
                                               source, source_size)) {

From 1dd25f7608687e320ed4bee8acdf61c4586d00a1 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Thu, 17 Dec 2020 12:41:07 -0800
Subject: [PATCH 15/24] Remove extraneous test that somehow got moved into
 header file.

---
 test/common/gpu_context.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
index 6c697c4fd757..d0ca579d6bb9 100644
--- a/test/common/gpu_context.h
+++ b/test/common/gpu_context.h
@@ -9,10 +9,6 @@
 #include <CL/cl.h>
 #endif
 
-// Just use a global context and queue, created and destroyed by main.
-cl_context cl_ctx = nullptr;
-cl_command_queue cl_q = nullptr;
-
 // Create the global context. This is just a helper function not called by Halide.
 bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) {
     cl_int err = 0;

From 10a1388ccaf6787e35ced172b953acdf781a8e01 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Fri, 18 Dec 2020 02:18:48 -0800
Subject: [PATCH 16/24] Fix OpenCL code per erroneous use of globals.

---
 test/generator/gpu_multi_context_threaded_aottest.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index 0d58fa6b91ef..212c00abad89 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -42,8 +42,8 @@ bool init_context(gpu_context &context) {
 
 void destroy_context(gpu_context &context) {
     destroy_opencl_context(context.cl_ctx, context.cl_q);
-    cl_q = nullptr;
-    cl_ctx = nullptr;
+    context.cl_q = nullptr;
+    context.cl_ctx = nullptr;
 }
 
 // These functions replace the acquire/release implementation in src/runtime/opencl.cpp.

From 1962b2573e80e4dce766af6e1457418069a59d3b Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 6 Jan 2021 02:04:28 -0800
Subject: [PATCH 17/24] Fix errors for MEtal case in
 acquire_release_aottest.cpp.

Make Metal context creation test API consistent with CUDA and OpenCL
by having it return a success/fail indication instead of asserting
internally.
---
 test/common/gpu_context.h                  | 13 ++++++++---
 test/generator/acquire_release_aottest.cpp | 27 ++++++++--------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
index d0ca579d6bb9..c09f601917ee 100644
--- a/test/common/gpu_context.h
+++ b/test/common/gpu_context.h
@@ -131,7 +131,7 @@ void destroy_cuda_context(CUcontext cuda_ctx) {
 #include <Metal/MTLCommandQueue.h>
 #include <Metal/MTLDevice.h>
 
-void create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
+bool create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
     device = MTLCreateSystemDefaultDevice();
     if (device == nullptr) {
         NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
@@ -139,9 +139,16 @@ void create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
             device = devices[0];
         }
     }
-    assert(device != nullptr);
+    if (device == nullptr) {
+        printf("Failed to find Metal device.\n");
+	return false;
+    }
     queue = [device newCommandQueue];
-    assert(queue != nullptr);
+    if (queue == nullptr) {
+        printf("Failed to create Metal command queue.\n");
+	return false;
+    }
+    return true;
 }
 
 void destroy_metal_context(id<MTLDevice> device, id<MTLCommandQueue> queue) {
diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp
index dbb16c29606a..3bc4b7efc20c 100644
--- a/test/generator/acquire_release_aottest.cpp
+++ b/test/generator/acquire_release_aottest.cpp
@@ -81,30 +81,23 @@ extern "C" int halide_cuda_release_context(void *user_context) {
 struct gpu_context {
     id<MTLDevice> device;
     id<MTLCommandQueue> queue;
-};
+} metal_context;
 
-bool init_context(gpu_context &context) {
-    create_metal_context(context.device, context.queue);
-    return 0;
+bool init_context() {
+    return create_metal_context(metal_context.device, metal_context.queue);
 }
 
-void destroy_context(gpu_context &context) {
-    destroy_metal_context(context.device, context.queue);
-    context.device = nullptr;
-    context.queue = nullptr;
+void destroy_context() {
+    destroy_metal_context(metal_context.device, metal_context.queue);
+    metal_context.device = nullptr;
+    metal_context.queue = nullptr;
 }
 
 int halide_metal_acquire_context(void *user_context, id<MTLDevice> *device_ret,
                                  id<MTLCommandQueue> *queue_ret, bool create) {
-    if (user_context == nullptr) {
-        assert(!create);
-        *device_ret = nullptr;
-        *queue_ret = nullptr;
-    } else {
-        gpu_context *context = (gpu_context *)user_context;
-        *device_ret = context->device;
-        *queue_ret = context->queue;
-    }
+    *device_ret = metal_context.device;
+    *queue_ret = metal_context.queue;
+
     return 0;
 }
 

From 054a5355a1dee0afcf60acbd4b6d2fa47bb2f06a Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 6 Jan 2021 02:21:37 -0800
Subject: [PATCH 18/24] Fix formatting.

---
 test/common/gpu_context.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
index c09f601917ee..23423bcb0cd5 100644
--- a/test/common/gpu_context.h
+++ b/test/common/gpu_context.h
@@ -141,12 +141,12 @@ bool create_metal_context(id<MTLDevice> &device, id<MTLCommandQueue> &queue) {
     }
     if (device == nullptr) {
         printf("Failed to find Metal device.\n");
-	return false;
+        return false;
     }
     queue = [device newCommandQueue];
     if (queue == nullptr) {
         printf("Failed to create Metal command queue.\n");
-	return false;
+        return false;
     }
     return true;
 }

From cf862ceeeeb96c71743a03d0f99290568b3765de Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 13 Jan 2021 19:17:35 -0800
Subject: [PATCH 19/24] Fix D3D runtime to work like Metal does and not
 reentrantly acquire the device context when compiling a kernel.

---
 src/runtime/d3d12compute.cpp | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 72b05118c761..4a91c8047209 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -2440,14 +2440,8 @@ volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0;
 
 WEAK Halide::Internal::GPUCompilationCache<d3d12_device *, d3d12_library *> compilation_cache;
 
-WEAK d3d12_library *compile_kernel(void *user_context, const char *source, int source_size, int *error_ret) {
-    D3D12ContextHolder d3d12_context(user_context, true);
-    if (d3d12_context.error != 0) {
-        *error_ret = d3d12_context.error;
-        return nullptr;
-    }
-
-    d3d12_library *library = new_library_with_source(d3d12_context.device, source, source_size);
+WEAK d3d12_library *compile_kernel(d3d12_device *device, const char *source, int source_size, int *error_ret) {
+    d3d12_library *library = new_library_with_source(device, source, source_size);
     if (library == nullptr) {
         TRACEFATAL("D3D12Compute: new_library_with_source failed.");
         *error_ret = halide_error_code_out_of_memory;
@@ -2768,7 +2762,7 @@ WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state
     int error = halide_error_code_generic_error;
     d3d12_library *library{};
     if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device,
-                                              library, compile_kernel, user_context,
+                                              library, compile_kernel, d3d12_context.device,
                                               source, source_size, &error)) {
         return error;
     }

From 871acc3d27ac1c9bff61f3fa4eb5280a36678448 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Mon, 25 Jan 2021 11:38:08 -0800
Subject: [PATCH 20/24] trigger buildbots


From 593a59ab95847f31eb695565a4460b45c9c75998 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Mon, 25 Jan 2021 12:45:01 -0800
Subject: [PATCH 21/24] trigger buildbots


From 6b877df494abf96b52a1c3b25b462de8e9b25dee Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Mon, 25 Jan 2021 14:22:54 -0800
Subject: [PATCH 22/24] trigger buildbots


From c61bf3971e84240f4211d7ef054179ce8e74504a Mon Sep 17 00:00:00 2001
From: Marcos Slomp <slomp@adobe.com>
Date: Tue, 26 Jan 2021 14:34:02 -0800
Subject: [PATCH 23/24] bugfix

---
 src/runtime/d3d12compute.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 4a91c8047209..64085a83f7b8 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -2493,7 +2493,7 @@ static int d3d12_create_context(void *user_context) {
     }
 
     if (status == halide_error_code_success) {
-        halide_assert(cmd_allocator_main, (cmd_allocator_main == nullptr));
+        halide_assert(user_context, (cmd_allocator_main == nullptr));
         cmd_allocator_main = new_command_allocator<HALIDE_D3D12_COMMAND_LIST_TYPE>(device);
         if (cmd_allocator_main == nullptr) {
             status = halide_error_code_generic_error;
@@ -2503,8 +2503,8 @@ static int d3d12_create_context(void *user_context) {
     if (status == halide_error_code_success) {
         // NOTE(marcos): a small amount of hard-coded staging buffer storage is
         // sufficient to get started as suballocations will grow them as needed
-        halide_assert(cmd_allocator_main, (upload == 0));
-        halide_assert(cmd_allocator_main, (readback == 0));
+        halide_assert(user_context, (upload == 0));
+        halide_assert(user_context, (readback == 0));
         size_t heap_size = 4 * 1024 * 1024;
         upload = new_upload_buffer(device, heap_size);
         readback = new_readback_buffer(device, heap_size);
@@ -2853,6 +2853,7 @@ WEAK int halide_d3d12compute_device_release(void *user_context) {
             hFenceEvent = nullptr;
 
             release_object(cmd_allocator_main);
+            cmd_allocator_main = nullptr;
 
             release_object(device);
             device = nullptr;

From 6b79b654c95b92df9ad7f1abe7ac91634e30c646 Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Wed, 27 Jan 2021 15:02:09 -0800
Subject: [PATCH 24/24] trigger buildbots