From e74663c7ed432c7bffada9b466148cacfa808bbe Mon Sep 17 00:00:00 2001
From: Steven Johnson <srj@google.com>
Date: Wed, 2 Dec 2020 18:08:45 -0800
Subject: [PATCH] Revert "Make context handling in GPU runtimes more consistent
 and robust. (#5474)"

This reverts commit f47c5c99deac86c6d1f16cfcb1743a0e9e79317d.
---
 Makefile                                      |  26 ---
 src/runtime/cuda.cpp                          | 163 ++++++++++----
 src/runtime/d3d12compute.cpp                  |  78 ++++---
 src/runtime/destructors.cpp                   |   1 -
 src/runtime/gpu_context_common.h              | 197 -----------------
 src/runtime/metal.cpp                         |  68 ++++--
 src/runtime/opencl.cpp                        | 208 ++++++++++--------
 test/common/gpu_context.h                     | 134 -----------
 test/common/gpu_object_lifetime_tracker.h     |   7 +-
 test/correctness/CMakeLists.txt               |   1 -
 test/correctness/gpu_many_kernels.cpp         |  92 --------
 test/generator/CMakeLists.txt                 |  13 --
 test/generator/acquire_release_aottest.cpp    | 159 ++++++++++---
 .../gpu_multi_context_threaded_aottest.cpp    | 193 ----------------
 .../gpu_multi_context_threaded_generator.cpp  |  48 ----
 15 files changed, 465 insertions(+), 923 deletions(-)
 delete mode 100644 src/runtime/gpu_context_common.h
 delete mode 100644 test/common/gpu_context.h
 delete mode 100644 test/correctness/gpu_many_kernels.cpp
 delete mode 100644 test/generator/gpu_multi_context_threaded_aottest.cpp
 delete mode 100644 test/generator/gpu_multi_context_threaded_generator.cpp

diff --git a/Makefile b/Makefile
index 6ce06a94b568..ba9fcd3f9b4d 100644
--- a/Makefile
+++ b/Makefile
@@ -1527,12 +1527,6 @@ $(FILTERS_DIR)/nested_externs_%.a: $(BIN_DIR)/nested_externs.generator
 	@mkdir -p $(@D)
 	$(CURDIR)/$< -g nested_externs_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime
 
-# Similarly, gpu_multi needs two different kernels to test compilation caching.
-# Also requies user-context.
-$(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_threaded.generator
-	@mkdir -p $(@D)
-	$(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context
-
 GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas
 GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools
 GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS)
@@ -1628,31 +1622,11 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
 	HL_MULTITARGET_TEST_USE_NOBOUNDSQUERY_FEATURE=1 $(CURDIR)/$<
 	@-echo
 
-# gpu_multi_context_threaded has additional deps to link in
-$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
-	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a \
-	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \
-	                                                       $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
-	@mkdir -p $(@D)
-	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
-
-$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
-	                                                          $(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp \
-	                                                          $(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp \
-	                                                          $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
-	@mkdir -p $(@D)
-	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
-
 # nested externs doesn't actually contain a generator named
 # "nested_externs", and has no internal tests in any case.
 test_generator_nested_externs:
 	@echo "Skipping"
 
-# gpu_multi actually contain a generator named
-# "gpu_multi", and has no internal tests in any case.
-test_generator_gpu_multi:
-	@echo "Skipping"
-
 $(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h
 	@mkdir -p $(@D)
 	$(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 1dc1cf27086e..7c423e179d85 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1,7 +1,6 @@
 #include "HalideRuntimeCuda.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
-#include "gpu_context_common.h"
 #include "mini_cuda.h"
 #include "printer.h"
 #include "scoped_mutex_lock.h"
@@ -240,7 +239,43 @@ class Context {
     }
 };
 
-WEAK Halide::Internal::GPUCompilationCache<CUcontext, CUmodule> compilation_cache;
+// Halide allocates a device API controlled pointer slot as part of
+// each compiled module. The slot is used to store information to
+// avoid having to reload/recompile kernel code on each call into a
+// Halide filter. The cuda runtime uses this pointer to maintain a
+// linked list of contexts into which the module has been loaded.
+//
+// A global list of all registered filters is also kept so all modules
+// loaded on a given context can be unloaded and removed from the list
+// when halide_device_release is called on a specific context.
+//
+// The registered_filters struct is not freed as it is pointed to by the
+// Halide generated code. The module_state structs are freed.
+
+struct module_state {
+    CUcontext context;   // context this module was loaded into
+    CUmodule module;     // handle filled in by cuModuleLoadDataEx
+    module_state *next;  // next module_state belonging to the same filter
+};
+
+struct registered_filters {
+    module_state *modules;     // head of this filter's per-context module list
+    registered_filters *next;  // next filter in the global filters_list
+};
+WEAK registered_filters *filters_list = nullptr;
+// This mutex protects the above filters_list.
+WEAK halide_mutex filters_list_lock;
+
+WEAK module_state *find_module_for_context(const registered_filters *filters, CUcontext ctx) {
+    // Walk this filter's module list looking for an entry loaded into ctx.
+    for (module_state *entry = filters->modules; entry != nullptr; entry = entry->next) {
+        if (entry->context == ctx) {
+            return entry;
+        }
+    }
+    // Nothing has been loaded into ctx for this filter.
+    return nullptr;
+}
 
 WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
     // Initialize CUDA
@@ -470,33 +505,6 @@ WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size
 #endif
 }
 
-WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size) {
-    debug(user_context) << "CUDA: compile_kernel cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
-
-    CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
-    unsigned int max_regs_per_thread = 64;
-
-    // A hack to enable control over max register count for
-    // testing. This should be surfaced in the schedule somehow
-    // instead.
-    char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
-    if (regs) {
-        max_regs_per_thread = atoi(regs);
-    }
-    void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
-    CUmodule loaded_module;
-    CUresult err = cuModuleLoadDataEx(&loaded_module, ptx_src, 1, options, optionValues);
-
-    if (err != CUDA_SUCCESS) {
-        error(user_context) << "CUDA: cuModuleLoadData failed: "
-                            << get_error_name(err);
-        return nullptr;
-    } else {
-        debug(user_context) << (void *)(loaded_module) << "\n";
-    }
-    return loaded_module;
-}
-
 }  // namespace Cuda
 }  // namespace Internal
 }  // namespace Runtime
@@ -518,12 +526,54 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    CUmodule loaded_module;
-    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, loaded_module,
-                                              compile_kernel, user_context, ptx_src, size)) {
-        return halide_error_code_generic_error;
-    }
-    halide_assert(user_context, loaded_module != nullptr);
+    halide_assert(user_context, &filters_list_lock != nullptr);
+    {
+        ScopedMutexLock spinlock(&filters_list_lock);
+
+        // Create the state object if necessary. This only happens once, regardless
+        // of how many times halide_initialize_kernels/halide_release is called.
+        // halide_release traverses this list and releases the module objects, but
+        // it does not modify the list nodes created/inserted here.
+        registered_filters **filters = (registered_filters **)state_ptr;
+        if (!(*filters)) {
+            *filters = (registered_filters *)malloc(sizeof(registered_filters));
+            (*filters)->modules = nullptr;
+            (*filters)->next = filters_list;
+            filters_list = *filters;
+        }
+
+        // Create the module itself if necessary.
+        module_state *loaded_module = find_module_for_context(*filters, ctx.context);
+        if (loaded_module == nullptr) {
+            loaded_module = (module_state *)malloc(sizeof(module_state));
+            debug(user_context) << "    cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
+
+            CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
+            unsigned int max_regs_per_thread = 64;
+
+            // A hack to enable control over max register count for
+            // testing. This should be surfaced in the schedule somehow
+            // instead.
+            char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
+            if (regs) {
+                max_regs_per_thread = atoi(regs);
+            }
+            void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
+            CUresult err = cuModuleLoadDataEx(&loaded_module->module, ptx_src, 1, options, optionValues);
+
+            if (err != CUDA_SUCCESS) {
+                free(loaded_module);
+                error(user_context) << "CUDA: cuModuleLoadData failed: "
+                                    << get_error_name(err);
+                return err;
+            } else {
+                debug(user_context) << (void *)(loaded_module->module) << "\n";
+            }
+            loaded_module->context = ctx.context;
+            loaded_module->next = (*filters)->modules;
+            (*filters)->modules = loaded_module;
+        }
+    }  // spinlock
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -654,7 +704,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
         << "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n";
 
     // If we haven't even loaded libcuda, don't load it just to quit.
-    if (!cuInit) {
+    if (!lib_cuda) {
         return 0;
     }
 
@@ -678,7 +728,34 @@ WEAK int halide_cuda_device_release(void *user_context) {
         // Dump the contents of the free list, ignoring errors.
         halide_cuda_release_unused_device_allocations(user_context);
 
-        compilation_cache.delete_context(user_context, ctx, cuModuleUnload);
+        {
+            ScopedMutexLock spinlock(&filters_list_lock);
+
+            // Unload every module that was loaded into this context, then
+            // unlink and free its module_state node. The registered_filters
+            // nodes are never freed (generated code still points at them), so
+            // a later halide_cuda_initialize_kernels call can attach a freshly
+            // loaded module to the same filter.
+            registered_filters *filters = filters_list;
+            while (filters) {
+                module_state **prev_ptr = &filters->modules;
+                module_state *loaded_module = filters->modules;
+                while (loaded_module != nullptr) {
+                    if (loaded_module->context == ctx) {
+                        debug(user_context) << "    cuModuleUnload " << loaded_module->module << "\n";
+                        err = cuModuleUnload(loaded_module->module);
+                        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
+                        *prev_ptr = loaded_module->next;
+                        free(loaded_module);
+                        loaded_module = *prev_ptr;
+                    } else {
+                        prev_ptr = &loaded_module->next;  // link to rewrite if the following node is removed
+                        loaded_module = loaded_module->next;
+                    }
+                }
+                filters = filters->next;
+            }
+        }  // spinlock
 
         CUcontext old_ctx;
         cuCtxPopCurrent(&old_ctx);
@@ -842,15 +919,12 @@ WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c,
                             << (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n";
         if (!from_host && to_host) {
             debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
-            copy_name = "cuMemcpyDtoH";
             err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size);
         } else if (from_host && !to_host) {
             debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
-            copy_name = "cuMemcpyHtoD";
             err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size);
         } else if (!from_host && !to_host) {
             debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
-            copy_name = "cuMemcpyDtoD";
             err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size);
         } else if (dst != src) {
             debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
@@ -1059,9 +1133,9 @@ WEAK int halide_cuda_run(void *user_context,
 #endif
 
     halide_assert(user_context, state_ptr);
-    CUmodule mod = nullptr;
-    bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod);
-    halide_assert(user_context, found_module && mod != nullptr);
+    module_state *loaded_module = find_module_for_context((registered_filters *)state_ptr, ctx.context);
+    halide_assert(user_context, loaded_module != nullptr);
+    CUmodule mod = loaded_module->module;
     debug(user_context) << "Got module " << mod << "\n";
     halide_assert(user_context, mod);
     CUfunction f;
@@ -1190,7 +1264,7 @@ WEAK const halide_device_interface_t *halide_cuda_device_interface() {
 }
 
 WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) {
-    if (!lib_cuda && !cuInit) {
+    if (!lib_cuda) {
         // If cuda can't be found, we want to return 0, 0 and it's not
         // considered an error. So we should be very careful about
         // looking for libcuda without tripping any errors in the rest
@@ -1239,7 +1313,6 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min
 
 namespace {
 WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
-    compilation_cache.release_all(nullptr, cuModuleUnload);
     halide_cuda_device_release(nullptr);
 }
 }  // namespace
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 3174c33f52a4..49ef68a0118a 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -45,7 +45,6 @@
 #include "HalideRuntimeD3D12Compute.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
-#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -2438,24 +2437,16 @@ static void *buffer_contents(d3d12_buffer *buffer) {
 
 volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0;
 
-WEAK Halide::Internal::GPUCompilationCache<d3d12_device *, d3d12_library *> compilation_cache;
-
-WEAK d3d12_library *compile_kernel(void *user_context, const char *source, int source_size, int *error_ret) {
-    D3D12ContextHolder d3d12_context(user_context, true);
-    if (d3d12_context.error != 0) {
-        *error_ret = d3d12_context.error;
-        return nullptr;
-    }
-
-    d3d12_library *library = new_library_with_source(d3d12_context.device, source, source_size);
-    if (library == nullptr) {
-        TRACEFATAL("D3D12Compute: new_library_with_source failed.");
-        *error_ret = halide_error_code_out_of_memory;
-        return nullptr;
-    }
-
-    return library;
-}
+// Structure to hold the state of a module attached to the context.
+// Also used as a linked-list to keep track of all the different
+// modules that are attached to a context in order to release them all
+// when the context is released.
+struct module_state {
+    d3d12_library *library;  // compiled library; nullptr until built and again after device release
+    module_state *next;      // next node in the global state_list
+};
+D3D12TYPENAME(module_state)
+WEAK module_state *state_list = nullptr;
 
 }  // namespace D3D12Compute
 }  // namespace Internal
@@ -2763,14 +2754,29 @@ WEAK int halide_d3d12compute_device_free(void *user_context, halide_buffer_t *bu
 WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) {
     TRACELOG;
 
+    // Create the state object if necessary. This only happens once, regardless
+    // of how many times halide_initialize_kernels/halide_release is called.
+    // halide_release traverses this list and releases the module objects, but
+    // it does not modify the list nodes created/inserted here.
+    module_state *&state = *(module_state **)state_ptr;
+    if (!state) {
+        state = malloct<module_state>();
+        state->library = nullptr;
+        state->next = state_list;
+        state_list = state;
+    }
+
     D3D12ContextHolder d3d12_context(user_context, true);
+    if (d3d12_context.error != 0) {
+        return d3d12_context.error;
+    }
 
-    int error = halide_error_code_generic_error;
-    d3d12_library *library;
-    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device,
-                                              library, compile_kernel, user_context,
-                                              source, source_size, &error)) {
-        return error;
+    if (state->library == nullptr) {
+        state->library = new_library_with_source(d3d12_context.device, source, source_size);
+        if (state->library == nullptr) {
+            TRACEFATAL("D3D12Compute: new_library_with_source failed.");
+            return halide_error_code_out_of_memory;
+        }
     }
 
     return 0;
@@ -2833,7 +2839,19 @@ WEAK int halide_d3d12compute_device_release(void *user_context) {
             release_object(buffer);
         }
 
-        compilation_cache.delete_context(user_context, device, release_object<d3d12_library>);
+        // Unload the modules attached to this device. Note that the list
+        // nodes themselves are not freed, only the program objects are
+        // released. Subsequent calls to halide_init_kernels might re-create
+        // the program object using the same list node to store the program
+        // object.
+        module_state *state = state_list;
+        while (state) {
+            if (state->library) {
+                release_object(state->library);
+                state->library = nullptr;
+            }
+            state = state->next;
+        }
 
         // Release the device itself, if we created it.
         if (acquired_device == device) {
@@ -3005,9 +3023,8 @@ WEAK int halide_d3d12compute_run(void *user_context,
     StartCapturingGPUActivity();
 #endif
 
-    d3d12_library *library = nullptr;
-    bool found_module = compilation_cache.lookup(device, state_ptr, library);
-    halide_assert(user_context, found_module && library != nullptr);
+    halide_assert(user_context, state_ptr);
+    module_state *state = (module_state *)state_ptr;
 
     d3d12_frame *frame = acquire_frame(device);
     d3d12_compute_command_list *cmdList = frame->cmd_list;
@@ -3019,7 +3036,7 @@ WEAK int halide_d3d12compute_run(void *user_context,
     d3d12_compute_pipeline_state *pipeline_state = nullptr;
     {
         TRACE_SCOPE("kernel shader selection");
-        function = new_function_with_name(device, library, entry_name, strlen(entry_name),
+        function = new_function_with_name(device, state->library, entry_name, strlen(entry_name),
                                           shared_mem_bytes, threadsX, threadsY, threadsZ);
         halide_assert(user_context, function);
         pipeline_state = function->pipeline_state;
@@ -3514,7 +3531,6 @@ WEAK const struct halide_device_interface_t *halide_d3d12compute_device_interfac
 namespace {
 WEAK __attribute__((destructor)) void halide_d3d12compute_cleanup() {
     TRACELOG;
-    compilation_cache.release_all(nullptr, release_object<d3d12_library>);
     halide_d3d12compute_device_release(nullptr);
 }
 }  // namespace
diff --git a/src/runtime/destructors.cpp b/src/runtime/destructors.cpp
index b7187c6c4d86..1fa7228a86e2 100644
--- a/src/runtime/destructors.cpp
+++ b/src/runtime/destructors.cpp
@@ -1,5 +1,4 @@
 #include "HalideRuntime.h"
-#include "printer.h"
 
 extern "C" {
 
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
deleted file mode 100644
index 9ef2662026a8..000000000000
--- a/src/runtime/gpu_context_common.h
+++ /dev/null
@@ -1,197 +0,0 @@
-#include "printer.h"
-#include "scoped_mutex_lock.h"
-
-namespace Halide {
-namespace Internal {
-
-template<typename ContextT, typename ModuleStateT>
-class GPUCompilationCache {
-    struct CachedCompilation {
-        ContextT context{};
-        ModuleStateT module_state{};
-        uint32_t kernel_id{};
-    };
-
-    halide_mutex mutex;
-
-    static constexpr float kLoadFactor{.5f};
-    static constexpr int kInitialTableBits{7};
-    int log2_compilations_size{0};  // number of bits in index into compilations table.
-    CachedCompilation *compilations{nullptr};
-    int count{0};
-
-    static constexpr uint32_t kInvalidId{0};
-    static constexpr uint32_t kDeletedId{1};
-
-    uint32_t unique_id{2};  // zero is an invalid id
-
-public:
-    static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) {
-        uintptr_t addr = (uintptr_t)context + id;
-        // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...
-        // in hexadecimal.
-        if (sizeof(uintptr_t) >= 8) {
-            return (addr * (uintptr_t)0x9E3779B97F4A7C15) >> (64 - bits);
-        } else {
-            return (addr * (uintptr_t)0x9E3779B9) >> (32 - bits);
-        }
-    }
-
-    HALIDE_MUST_USE_RESULT bool insert(ContextT context, uint32_t id, ModuleStateT module_state) {
-        if (log2_compilations_size == 0) {
-            if (!resize_table(kInitialTableBits)) {
-                return false;
-            }
-        }
-        if ((count + 1) > (1 << log2_compilations_size) * kLoadFactor) {
-            if (!resize_table(log2_compilations_size + 1)) {
-                return false;
-            }
-        }
-        count += 1;
-        uintptr_t index = kernel_hash(context, id, log2_compilations_size);
-        for (int i = 0; i < (1 << log2_compilations_size); i++) {
-            uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
-            if (compilations[effective_index].kernel_id <= kDeletedId) {
-                compilations[effective_index].context = context;
-                compilations[effective_index].module_state = module_state;
-                compilations[effective_index].kernel_id = id;
-                return true;
-            }
-        }
-        // This is a logic error that should never occur. It means the table is
-        // full, but it should have been resized.
-        halide_assert(nullptr, false);
-        return false;
-    }
-
-    HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) {
-        if (log2_compilations_size == 0) {
-            return false;
-        }
-        uintptr_t index = kernel_hash(context, id, log2_compilations_size);
-        for (int i = 0; i < (1 << log2_compilations_size); i++) {
-            uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
-
-            if (compilations[effective_index].kernel_id == kInvalidId) {
-                return false;
-            }
-            if (compilations[effective_index].context == context &&
-                compilations[effective_index].kernel_id == id) {
-                module_state = &compilations[effective_index].module_state;
-                return true;
-            }
-        }
-        return false;
-    }
-
-    HALIDE_MUST_USE_RESULT bool lookup(ContextT context, void *state_ptr, ModuleStateT &module_state) {
-        ScopedMutexLock lock_guard(&mutex);
-        uint32_t id = (uint32_t)(uintptr_t)state_ptr;
-        ModuleStateT *mod_ptr;
-        if (find_internal(context, id, mod_ptr)) {
-            module_state = *mod_ptr;
-            return true;
-        }
-        return false;
-    }
-
-    HALIDE_MUST_USE_RESULT bool resize_table(int size_bits) {
-        if (size_bits != log2_compilations_size) {
-            int new_size = (1 << size_bits);
-            int old_size = (1 << log2_compilations_size);
-            CachedCompilation *new_table = (CachedCompilation *)malloc(new_size * sizeof(CachedCompilation));
-            if (new_table == nullptr) {
-                // signal error.
-                return false;
-            }
-            memset(new_table, 0, new_size * sizeof(CachedCompilation));
-            CachedCompilation *old_table = compilations;
-            compilations = new_table;
-            log2_compilations_size = size_bits;
-
-            if (count > 0) {  // Mainly to catch empty initial table case
-                for (int32_t i = 0; i < old_size; i++) {
-                    if (old_table[i].kernel_id != kInvalidId &&
-                        old_table[i].kernel_id != kDeletedId) {
-                        bool result = insert(old_table[i].context, old_table[i].kernel_id,
-                                             old_table[i].module_state);
-                        halide_assert(nullptr, result);  // Resizing the table while resizing the table is a logic error.
-                    }
-                }
-            }
-            free(old_table);
-        }
-        return true;
-    }
-
-    template<typename FreeModuleT>
-    void release_context(void *user_context, bool all, ContextT context, FreeModuleT &f) {
-        if (count == 0) {
-            return;
-        }
-
-        for (int i = 0; i < (1 << log2_compilations_size); i++) {
-            if (compilations[i].kernel_id > kInvalidId &&
-                (all || (compilations[i].context == context))) {
-                f(compilations[i].module_state);
-                compilations[i].module_state = nullptr;
-                compilations[i].kernel_id = kDeletedId;
-                count--;
-            }
-        }
-    }
-
-    template<typename FreeModuleT>
-    void delete_context(void *user_context, ContextT context, FreeModuleT &f) {
-        ScopedMutexLock lock_guard(&mutex);
-
-        release_context(user_context, false, context, f);
-    }
-
-    template<typename FreeModuleT>
-    void release_all(void *user_context, FreeModuleT &f) {
-        ScopedMutexLock lock_guard(&mutex);
-
-        release_context(user_context, true, nullptr, f);
-        free(compilations);
-        compilations = nullptr;
-        log2_compilations_size = 0;
-    }
-
-    template<typename CompileModuleT, typename... Args>
-    HALIDE_MUST_USE_RESULT bool kernel_state_setup(void *user_context, void **state_ptr,
-                                                   ContextT context, ModuleStateT &result,
-                                                   CompileModuleT f,
-                                                   Args... args) {
-        ScopedMutexLock lock_guard(&mutex);
-
-        uint32_t *id_ptr = (uint32_t *)state_ptr;
-        if (*id_ptr == 0) {
-            *id_ptr = unique_id++;
-        }
-
-        ModuleStateT *mod;
-        if (find_internal(context, *id_ptr, mod)) {
-            result = *mod;
-            return true;
-        }
-
-        // TODO(zvookin): figure out the calling signature here...
-        ModuleStateT compiled_module = f(args...);
-        debug(user_context) << "Caching compiled kernel: " << compiled_module << " id " << *id_ptr << " context " << context << "\n";
-        if (compiled_module == nullptr) {
-            return false;
-        }
-
-        if (!insert(context, *id_ptr, compiled_module)) {
-            return false;
-        }
-        result = compiled_module;
-
-        return true;
-    }
-};
-
-}  // namespace Internal
-}  // namespace Halide
diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp
index 25fe29f3feda..1a2ba0ce52b4 100644
--- a/src/runtime/metal.cpp
+++ b/src/runtime/metal.cpp
@@ -1,7 +1,6 @@
 #include "HalideRuntimeMetal.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
-#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -283,7 +282,15 @@ struct device_handle {
     uint64_t offset;
 };
 
-WEAK Halide::Internal::GPUCompilationCache<mtl_device *, mtl_library *> compilation_cache;
+// Structure to hold the state of a module attached to the context.
+// Also used as a linked-list to keep track of all the different
+// modules that are attached to a context in order to release them all
+// when the context is released.
+struct module_state {
+    mtl_library *library;  // compiled library; nullptr until built and again after device release
+    module_state *next;    // next node in the global state_list
+};
+WEAK module_state *state_list = nullptr;
 
 // API Capabilities.  If more capabilities need to be checked,
 // this can be refactored to something more robust/general.
@@ -537,6 +544,18 @@ WEAK int halide_metal_device_free(void *user_context, halide_buffer_t *buf) {
 }
 
 WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) {
+    // Create the state object if necessary. This only happens once, regardless
+    // of how many times halide_initialize_kernels/halide_release is called.
+    // halide_release traverses this list and releases the module objects, but
+    // it does not modify the list nodes created/inserted here.
+    module_state **state = (module_state **)state_ptr;
+    if (!(*state)) {
+        *state = (module_state *)malloc(sizeof(module_state));
+        (*state)->library = nullptr;
+        (*state)->next = state_list;
+        state_list = *state;
+    }
+
     MetalContextHolder metal_context(user_context, true);
     if (metal_context.error != 0) {
         return metal_context.error;
@@ -546,13 +565,23 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    mtl_library *library;
-    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, metal_context.device, library,
-                                              new_library_with_source, metal_context.device,
-                                              source, source_size)) {
-        return halide_error_code_generic_error;
+    if ((*state)->library == nullptr) {
+#ifdef DEBUG_RUNTIME
+        uint64_t t_before_compile = halide_current_time_ns(user_context);
+#endif
+
+        debug(user_context) << "Metal - Allocating: new_library_with_source " << (*state)->library << "\n";
+        (*state)->library = new_library_with_source(metal_context.device, source, source_size);
+        if ((*state)->library == nullptr) {
+            error(user_context) << "Metal: new_library_with_source failed.\n";
+            return -1;
+        }
+
+#ifdef DEBUG_RUNTIME
+        uint64_t t_after_compile = halide_current_time_ns(user_context);
+        debug(user_context) << "Time for halide_metal_initialize_kernels compilation: " << (t_after_compile - t_before_compile) / 1.0e6 << " ms\n";
+#endif
     }
-    halide_assert(user_context, library != nullptr);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -615,7 +644,20 @@ WEAK int halide_metal_device_release(void *user_context) {
     if (device) {
         halide_metal_device_sync_internal(queue, nullptr);
 
-        compilation_cache.delete_context(user_context, device, release_ns_object);
+        // Unload the libraries attached to this device. Note that the list
+        // nodes themselves are not freed, only the library objects are
+        // released. Subsequent calls to halide_metal_initialize_kernels might
+        // re-create the library object using the same list node to store
+        // it.
+        module_state *state = state_list;
+        while (state) {
+            if (state->library) {
+                debug(user_context) << "Metal - Releasing: new_library_with_source " << state->library << "\n";
+                release_ns_object(state->library);
+                state->library = nullptr;
+            }
+            state = state->next;
+        }
 
         // Release the device itself, if we created it.
         if (acquired_device == device) {
@@ -740,11 +782,10 @@ WEAK int halide_metal_run(void *user_context,
         return -1;
     }
 
-    mtl_library *library;
-    bool found_library = compilation_cache.lookup(metal_context.device, state_ptr, library);
-    halide_assert(user_context, found_library && library != nullptr);
+    halide_assert(user_context, state_ptr);
+    module_state *state = (module_state *)state_ptr;
 
-    mtl_function *function = new_function_with_name(library, entry_name, strlen(entry_name));
+    mtl_function *function = new_function_with_name(state->library, entry_name, strlen(entry_name));
     if (function == nullptr) {
         error(user_context) << "Metal: Could not get function " << entry_name << "from Metal library.\n";
         return -1;
@@ -1106,7 +1147,6 @@ WEAK const struct halide_device_interface_t *halide_metal_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_metal_cleanup() {
-    compilation_cache.release_all(nullptr, release_ns_object);
     halide_metal_device_release(nullptr);
 }
 }  // namespace
diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp
index 7c7815b620d7..7c10033bde31 100644
--- a/src/runtime/opencl.cpp
+++ b/src/runtime/opencl.cpp
@@ -1,7 +1,6 @@
 #include "HalideRuntimeOpenCL.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
-#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -287,7 +286,15 @@ struct device_handle {
     cl_mem mem;
 };
 
-WEAK Halide::Internal::GPUCompilationCache<cl_context, cl_program> compilation_cache;
+// Structure to hold the state of a module attached to the context.
+// Also used as a linked-list to keep track of all the different
+// modules that are attached to a context in order to release them all
+// when the context is released.
+struct module_state {
+    cl_program program;  // nullptr until created; reset to nullptr when released
+    module_state *next;  // node that was the head of state_list at insertion time
+};
+WEAK module_state *state_list = nullptr;
 
 WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) {
     if (buf->device == 0) {
@@ -549,83 +556,6 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
     return err;
 }
 
-WEAK cl_program compile_kernel(void *user_context, cl_context ctx, const char *src, int size) {
-    cl_int err = 0;
-    cl_device_id dev;
-
-    err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr);
-    if (err != CL_SUCCESS) {
-        error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
-                            << get_opencl_error_name(err);
-        return nullptr;
-    }
-
-    cl_device_id devices[] = {dev};
-
-    // Get the max constant buffer size supported by this OpenCL implementation.
-    cl_ulong max_constant_buffer_size = 0;
-    err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr);
-    if (err != CL_SUCCESS) {
-        error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: "
-                            << get_opencl_error_name(err);
-        return nullptr;
-    }
-    // Get the max number of constant arguments supported by this OpenCL implementation.
-    cl_uint max_constant_args = 0;
-    err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr);
-    if (err != CL_SUCCESS) {
-        error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: "
-                            << get_opencl_error_name(err);
-        return nullptr;
-    }
-
-    // Build the compile argument options.
-    stringstream options(user_context);
-    options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size
-            << " -D MAX_CONSTANT_ARGS=" << max_constant_args;
-
-    const char *extra_options = halide_opencl_get_build_options(user_context);
-    options << " " << extra_options;
-
-    const char *sources[] = {src};
-    debug(user_context) << "    clCreateProgramWithSource -> ";
-    cl_program program = clCreateProgramWithSource(ctx, 1, &sources[0], nullptr, &err);
-    if (err != CL_SUCCESS) {
-        debug(user_context) << get_opencl_error_name(err) << "\n";
-        error(user_context) << "CL: clCreateProgramWithSource failed: "
-                            << get_opencl_error_name(err);
-        return nullptr;
-    } else {
-        debug(user_context) << (void *)program << "\n";
-    }
-
-    debug(user_context) << "    clBuildProgram " << (void *)program
-                        << " " << options.str() << "\n";
-    err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr);
-    if (err != CL_SUCCESS) {
-
-        {
-            // Allocate an appropriately sized buffer for the build log.
-            Printer<ErrorPrinter, 16384> p(user_context);
-
-            p << "CL: clBuildProgram failed: "
-              << get_opencl_error_name(err)
-              << "\nBuild Log:\n";
-
-            // Get build log
-            if (clGetProgramBuildInfo(program, dev,
-                                      CL_PROGRAM_BUILD_LOG,
-                                      p.capacity() - p.size() - 1, p.dst,
-                                      nullptr) != CL_SUCCESS) {
-                p << "clGetProgramBuildInfo failed (Printer buffer too small?)";
-            }
-        }
-
-        return nullptr;
-    }
-    return program;
-}
-
 }  // namespace OpenCL
 }  // namespace Internal
 }  // namespace Runtime
@@ -745,13 +675,97 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr,
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    debug(user_context) << "halide_cuda_initialize_kernels got compilation_cache mutex.\n";
-    cl_program program;
-    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, program,
-                                              compile_kernel, user_context, ctx.context, src, size)) {
-        return halide_error_code_generic_error;
+    // Create the state object if necessary. This only happens once, regardless
+    // of how many times the kernels are initialized or the device is released.
+    // halide_opencl_device_release traverses this list and releases the program
+    // objects, but it does not modify the list nodes created/inserted here.
+    module_state **state = (module_state **)state_ptr;
+    if (!(*state)) {
+        *state = (module_state *)malloc(sizeof(module_state));
+        (*state)->program = nullptr;
+        (*state)->next = state_list;
+        state_list = *state;
+    }
+
+    // Create the program if necessary. TODO: The program object needs to not
+    // only already exist, but be created for the same context/device as the
+    // calling context/device.
+    if (!(*state && (*state)->program) && size > 1) {
+        cl_int err = 0;
+        cl_device_id dev;
+
+        err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr);
+        if (err != CL_SUCCESS) {
+            error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
+                                << get_opencl_error_name(err);
+            return err;
+        }
+
+        cl_device_id devices[] = {dev};
+
+        // Get the max constant buffer size supported by this OpenCL implementation.
+        cl_ulong max_constant_buffer_size = 0;
+        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr);
+        if (err != CL_SUCCESS) {
+            error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: "
+                                << get_opencl_error_name(err);
+            return err;
+        }
+        // Get the max number of constant arguments supported by this OpenCL implementation.
+        cl_uint max_constant_args = 0;
+        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr);
+        if (err != CL_SUCCESS) {
+            error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: "
+                                << get_opencl_error_name(err);
+            return err;
+        }
+
+        // Build the compile argument options.
+        stringstream options(user_context);
+        options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size
+                << " -D MAX_CONSTANT_ARGS=" << max_constant_args;
+
+        const char *extra_options = halide_opencl_get_build_options(user_context);
+        options << " " << extra_options;
+
+        const char *sources[] = {src};
+        debug(user_context) << "    clCreateProgramWithSource -> ";
+        cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], nullptr, &err);
+        if (err != CL_SUCCESS) {
+            debug(user_context) << get_opencl_error_name(err) << "\n";
+            error(user_context) << "CL: clCreateProgramWithSource failed: "
+                                << get_opencl_error_name(err);
+            return err;
+        } else {
+            debug(user_context) << (void *)program << "\n";
+        }
+
+        (*state)->program = program;
+        debug(user_context) << "    clBuildProgram " << (void *)program
+                            << " " << options.str() << "\n";
+        err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr);
+        if (err != CL_SUCCESS) {
+
+            {
+                // Allocate an appropriately sized buffer for the build log.
+                Printer<ErrorPrinter, 16384> p(user_context);
+
+                p << "CL: clBuildProgram failed: "
+                  << get_opencl_error_name(err)
+                  << "\nBuild Log:\n";
+
+                // Get build log
+                if (clGetProgramBuildInfo(program, dev,
+                                          CL_PROGRAM_BUILD_LOG,
+                                          p.capacity() - p.size() - 1, p.dst,
+                                          nullptr) != CL_SUCCESS) {
+                    p << "clGetProgramBuildInfo failed (Printer buffer too small?)";
+                }
+            }
+
+            return err;
+        }
     }
-    halide_assert(user_context, program != nullptr);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -806,7 +820,21 @@ WEAK int halide_opencl_device_release(void *user_context) {
         err = clFinish(q);
         halide_assert(user_context, err == CL_SUCCESS);
 
-        compilation_cache.delete_context(user_context, ctx, clReleaseProgram);
+        // Unload the modules attached to this context. Note that the list
+        // nodes themselves are not freed, only the program objects are
+        // released. Subsequent calls to halide_opencl_initialize_kernels might
+        // re-create the program object using the same list node to store the
+        // program object.
+        module_state *state = state_list;
+        while (state) {
+            if (state->program) {
+                debug(user_context) << "    clReleaseProgram " << state->program << "\n";
+                err = clReleaseProgram(state->program);
+                halide_assert(user_context, err == CL_SUCCESS);
+                state->program = nullptr;
+            }
+            state = state->next;
+        }
 
         // Release the context itself, if we created it.
         if (ctx == context) {
@@ -1049,10 +1077,9 @@ WEAK int halide_opencl_run(void *user_context,
 
     // Create kernel object for entry_name from the program for this module.
     halide_assert(user_context, state_ptr);
-    cl_program program;
-    bool found_program = compilation_cache.lookup(ctx.context, state_ptr, program);
+    cl_program program = ((module_state *)state_ptr)->program;
 
-    halide_assert(user_context, found_program && program != nullptr);
+    halide_assert(user_context, program);
     debug(user_context) << "    clCreateKernel " << entry_name << " -> ";
     cl_kernel f = clCreateKernel(program, entry_name, &err);
     if (err != CL_SUCCESS) {
@@ -1339,7 +1366,6 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_opencl_cleanup() {
-    compilation_cache.release_all(nullptr, clReleaseProgram);
     halide_opencl_device_release(nullptr);
 }
 }  // namespace
@@ -1914,4 +1940,4 @@ extern "C" {
 WEAK const struct halide_device_interface_t *halide_opencl_image_device_interface() {
     return &opencl_image_device_interface;
 }
-}
+}
\ No newline at end of file
diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
deleted file mode 100644
index 4816ad205866..000000000000
--- a/test/common/gpu_context.h
+++ /dev/null
@@ -1,134 +0,0 @@
-#if defined(TEST_OPENCL)
-// Implement OpenCL custom context.
-
-#define CL_TARGET_OPENCL_VERSION 120
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
-
-// Just use a global context and queue, created and destroyed by main.
-cl_context cl_ctx = nullptr;
-cl_command_queue cl_q = nullptr;
-
-// Create the global context. This is just a helper function not called by Halide.
-bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) {
-    cl_int err = 0;
-
-    const cl_uint maxPlatforms = 4;
-    cl_platform_id platforms[maxPlatforms];
-    cl_uint platformCount = 0;
-
-    err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
-    if (err != CL_SUCCESS) {
-        printf("clGetPlatformIDs failed (%d)\n", err);
-        return false;
-    }
-
-    cl_platform_id platform = nullptr;
-
-    if (platformCount > 0) {
-        platform = platforms[0];
-    }
-    if (platform == nullptr) {
-        printf("Failed to get platform\n");
-        return false;
-    }
-
-    cl_device_type device_type = CL_DEVICE_TYPE_ALL;
-
-    // Make sure we have a device
-    const cl_uint maxDevices = 4;
-    cl_device_id devices[maxDevices];
-    cl_uint deviceCount = 0;
-    err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
-    if (err != CL_SUCCESS) {
-        printf("clGetDeviceIDs failed (%d)\n", err);
-        return false;
-    }
-    if (deviceCount == 0) {
-        printf("Failed to get device\n");
-        return false;
-    }
-
-    cl_device_id dev = devices[deviceCount - 1];
-
-    // Create context and command queue.
-    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
-                                          0};
-    cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
-    if (err != CL_SUCCESS) {
-        printf("clCreateContext failed (%d)\n", err);
-        return false;
-    }
-
-    cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
-    if (err != CL_SUCCESS) {
-        printf("clCreateCommandQueue failed (%d)\n", err);
-        return false;
-    }
-    return true;
-}
-
-void destroy_opencl_context(cl_context cl_ctx, cl_command_queue cl_q) {
-    clReleaseCommandQueue(cl_q);
-    clReleaseContext(cl_ctx);
-}
-
-#elif defined(TEST_CUDA)
-// Implement CUDA custom context.
-#include <cuda.h>
-
-bool create_cuda_context(CUcontext &cuda_ctx) {
-    // Initialize CUDA
-    CUresult err = cuInit(0);
-    if (err != CUDA_SUCCESS) {
-        printf("cuInit failed (%d)\n", err);
-        return false;
-    }
-
-    // Make sure we have a device
-    int deviceCount = 0;
-    err = cuDeviceGetCount(&deviceCount);
-    if (err != CUDA_SUCCESS) {
-        printf("cuGetDeviceCount failed (%d)\n", err);
-        return false;
-    }
-    if (deviceCount <= 0) {
-        printf("No CUDA devices available\n");
-        return false;
-    }
-
-    CUdevice dev;
-    // Get device
-    CUresult status;
-    // Try to get a device >0 first, since 0 should be our display device
-    // For now, don't try devices > 2 to maintain compatibility with previous behavior.
-    if (deviceCount > 2) deviceCount = 2;
-    for (int id = deviceCount - 1; id >= 0; id--) {
-        status = cuDeviceGet(&dev, id);
-        if (status == CUDA_SUCCESS) break;
-    }
-
-    if (status != CUDA_SUCCESS) {
-        printf("Failed to get CUDA device\n");
-        return status;
-    }
-
-    // Create context
-    err = cuCtxCreate(&cuda_ctx, 0, dev);
-    if (err != CUDA_SUCCESS) {
-        printf("cuCtxCreate failed (%d)\n", err);
-        return false;
-    }
-
-    return true;
-}
-
-void destroy_cuda_context(CUcontext cuda_ctx) {
-    cuCtxDestroy(cuda_ctx);
-}
-
-#endif
diff --git a/test/common/gpu_object_lifetime_tracker.h b/test/common/gpu_object_lifetime_tracker.h
index 1734a59f2f00..b17dd9118413 100644
--- a/test/common/gpu_object_lifetime_tracker.h
+++ b/test/common/gpu_object_lifetime_tracker.h
@@ -22,24 +22,25 @@ class GpuObjectLifetimeTracker {
         }
     };
 
-    std::array<ObjectType, 11> object_types = {{
-        {"Caching compiled kernel:", "Releasing cached compilation:"},
-
+    std::array<ObjectType, 13> object_types = {{
         // OpenCL objects
         {"clCreateContext", "clReleaseContext", true},
         {"clCreateCommandQueue", "clReleaseCommandQueue", true},
         // This handles both "clCreateProgramWithSource" and
         // "clCreateProgramWithBinary".
+        {"clCreateProgram", "clReleaseProgram"},
         {"clCreateBuffer", "clReleaseMemObject"},
         {"clCreateKernel", "clReleaseKernel"},
 
         // CUDA objects
         {"cuCtxCreate", "cuCtxDestroy", true},
+        {"cuModuleLoad", "cuModuleUnload"},
         {"cuMemAlloc", "cuMemFree"},
 
         // Metal objects
         {"Allocating: MTLCreateSystemDefaultDevice", "Releasing: MTLCreateSystemDefaultDevice", true},
         {"Allocating: new_command_queue", "Releasing: new_command_queue"},
+        {"Allocating: new_library_with_source", "Releasing: new_library_with_source"},
 
         // Hexagon objects
         {"halide_remote_load_library", "halide_remote_release_library"},
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index 68692f12f1ba..e1ba9d8e4eb3 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -134,7 +134,6 @@ tests(GROUPS correctness
       gpu_give_input_buffers_device_allocations.cpp
       gpu_jit_explicit_copy_to_device.cpp
       gpu_large_alloc.cpp
-      gpu_many_kernels.cpp
       gpu_mixed_dimensionality.cpp
       gpu_mixed_shared_mem_types.cpp
       gpu_multi_kernel.cpp
diff --git a/test/correctness/gpu_many_kernels.cpp b/test/correctness/gpu_many_kernels.cpp
deleted file mode 100644
index d9572b38c18c..000000000000
--- a/test/correctness/gpu_many_kernels.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-#include "Halide.h"
-#include <algorithm>
-
-#include "halide_benchmark.h"
-
-// This test makes sure GPU runtimes can handle many different small
-// kernels and can handle releasing a device context and making a new
-// one and still have many kernels work. This is needed due to kernel
-// compilation caching mechanisms in the GPU runtimes.
-
-using namespace Halide;
-
-constexpr size_t kNumKernels = 70;
-
-int main(int argc, char **argv) {
-    Var x, y, xi, yi;
-    Func adders[kNumKernels];
-    ImageParam input(Int(32), 2);
-
-    Target target = get_jit_target_from_environment();
-    int i = 1;
-    for (Func &f : adders) {
-        f(x, y) = input(x, y) + i;
-        if (target.has_gpu_feature()) {
-            f.compute_root().gpu_tile(x, y, xi, yi, 16, 16);
-        } else {
-            f.compute_root().vectorize(x, target.natural_vector_size<int32_t>());
-        }
-        i += 1;
-    }
-
-    auto start = Halide::Tools::benchmark_now();
-
-    Buffer<int32_t> buf_a_store(32, 32);
-    Buffer<int32_t> buf_b_store(32, 32);
-    Buffer<int32_t> *buf_in = &buf_a_store;
-    Buffer<int32_t> *buf_out = &buf_b_store;
-    buf_in->fill(0);
-    for (Func &f : adders) {
-        input.set(*buf_in);
-        f.realize(*buf_out);
-        std::swap(buf_in, buf_out);
-    }
-    buf_in->copy_to_host();
-
-    auto end = Halide::Tools::benchmark_now();
-    double initial_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
-
-    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
-
-    start = Halide::Tools::benchmark_now();
-
-    buf_in->fill(0);
-    for (Func &f : adders) {
-        input.set(*buf_in);
-        f.realize(*buf_out);
-        std::swap(buf_in, buf_out);
-    }
-    buf_in->copy_to_host();
-
-    end = Halide::Tools::benchmark_now();
-    double precompiled_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
-
-    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
-
-    buf_a_store.device_free();
-    buf_b_store.device_free();
-    const halide_device_interface_t *device = get_device_interface_for_device_api(DeviceAPI::Default_GPU, target);
-    if (device != nullptr) {
-        device->device_release(nullptr, device);
-    }
-
-    start = Halide::Tools::benchmark_now();
-
-    buf_in->fill(0);
-    for (Func &f : adders) {
-        input.set(*buf_in);
-        f.realize(*buf_out);
-        std::swap(buf_in, buf_out);
-    }
-    buf_in->copy_to_host();
-
-    end = Halide::Tools::benchmark_now();
-    double second_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
-
-    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
-
-    printf("Initial runtime %f, precompiled runtime %f, second runtime %f.\n", initial_runtime, precompiled_runtime, second_runtime);
-
-    printf("Success!\n");
-    return 0;
-}
diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt
index 130af49b7f93..2db1d3bc4f20 100644
--- a/test/generator/CMakeLists.txt
+++ b/test/generator/CMakeLists.txt
@@ -257,19 +257,6 @@ halide_define_aot_test(external_code GEN_DEPS external_code_generator_deps)
 # float16_t_generator.cpp
 halide_define_aot_test(float16_t)
 
-# gpu_multi_context_threaded_aottest.cpp
-# gpu_multi_context_threaded_generator.cpp
-halide_define_aot_test(gpu_multi_context_threaded
-                       OMIT_DEFAULT_GENERATOR
-                       EXTRA_LIBS
-                       gpu_multi_context_threaded_add
-                       gpu_multi_context_threaded_mul)
-
-add_halide_library(gpu_multi_context_threaded_add FROM gpu_multi_context_threaded.generator 
-                   FEATURES user_context)
-add_halide_library(gpu_multi_context_threaded_mul FROM gpu_multi_context_threaded.generator
-                   FEATURES user_context)
-
 # gpu_object_lifetime_aottest.cpp
 # gpu_object_lifetime_generator.cpp
 halide_define_aot_test(gpu_object_lifetime FEATURES debug)
diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp
index 9e4fba70afe6..f0fd8ef9cd04 100644
--- a/test/generator/acquire_release_aottest.cpp
+++ b/test/generator/acquire_release_aottest.cpp
@@ -14,25 +14,89 @@ int main(int argc, char **argv) {
 #include <string.h>
 
 #include "acquire_release.h"
-#include "gpu_context.h"
 
 using namespace Halide::Runtime;
 
 const int W = 256, H = 256;
 
 #if defined(TEST_OPENCL)
+// Implement OpenCL custom context.
+
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
 
 // Just use a global context and queue, created and destroyed by main.
 cl_context cl_ctx = nullptr;
 cl_command_queue cl_q = nullptr;
 
 // Create the global context. This is just a helper function not called by Halide.
-bool init_context() {
-    return create_opencl_context(cl_ctx, cl_q);
+int init_context() {
+    cl_int err = 0;
+
+    const cl_uint maxPlatforms = 4;
+    cl_platform_id platforms[maxPlatforms];
+    cl_uint platformCount = 0;
+
+    err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
+    if (err != CL_SUCCESS) {
+        printf("clGetPlatformIDs failed (%d)\n", err);
+        return err;
+    }
+
+    cl_platform_id platform = nullptr;
+
+    if (platformCount > 0) {
+        platform = platforms[0];
+    }
+    if (platform == nullptr) {
+        printf("Failed to get platform\n");
+        return CL_INVALID_PLATFORM;
+    }
+
+    cl_device_type device_type = CL_DEVICE_TYPE_ALL;
+
+    // Make sure we have a device
+    const cl_uint maxDevices = 4;
+    cl_device_id devices[maxDevices];
+    cl_uint deviceCount = 0;
+    err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
+    if (err != CL_SUCCESS) {
+        printf("clGetDeviceIDs failed (%d)\n", err);
+        return err;
+    }
+    if (deviceCount == 0) {
+        printf("Failed to get device\n");
+        return CL_DEVICE_NOT_FOUND;
+    }
+
+    cl_device_id dev = devices[deviceCount - 1];
+
+    // Create context and command queue.
+    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
+                                          0};
+    cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
+    if (err != CL_SUCCESS) {
+        printf("clCreateContext failed (%d)\n", err);
+        return err;
+    }
+
+    cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf("clCreateCommandQueue failed (%d)\n", err);
+        return err;
+    }
+    printf("Created CL context %p\n", cl_ctx);
+    return 0;
 }
 
 void destroy_context() {
-    destroy_opencl_context(cl_ctx, cl_q);
+    printf("Destroying CL context %p\n", cl_ctx);
+    clReleaseCommandQueue(cl_q);
+    clReleaseContext(cl_ctx);
     cl_q = nullptr;
     cl_ctx = nullptr;
 }
@@ -52,14 +116,61 @@ extern "C" int halide_release_cl_context(void *user_context) {
     return 0;
 }
 #elif defined(TEST_CUDA)
+// Implement CUDA custom context.
+#include <cuda.h>
+
 CUcontext cuda_ctx = nullptr;
 
-bool init_context() {
-    return create_cuda_context(cuda_ctx);
+int init_context() {
+    // Initialize CUDA
+    CUresult err = cuInit(0);
+    if (err != CUDA_SUCCESS) {
+        printf("cuInit failed (%d)\n", err);
+        return err;
+    }
+
+    // Make sure we have a device
+    int deviceCount = 0;
+    err = cuDeviceGetCount(&deviceCount);
+    if (err != CUDA_SUCCESS) {
+        printf("cuGetDeviceCount failed (%d)\n", err);
+        return err;
+    }
+    if (deviceCount <= 0) {
+        printf("No CUDA devices available\n");
+        return CUDA_ERROR_NO_DEVICE;
+    }
+
+    CUdevice dev;
+    // Get device
+    CUresult status;
+    // Try to get a device >0 first, since 0 should be our display device
+    // For now, don't try devices > 2 to maintain compatibility with previous behavior.
+    if (deviceCount > 2) deviceCount = 2;
+    for (int id = deviceCount - 1; id >= 0; id--) {
+        status = cuDeviceGet(&dev, id);
+        if (status == CUDA_SUCCESS) break;
+    }
+
+    if (status != CUDA_SUCCESS) {
+        printf("Failed to get CUDA device\n");
+        return status;
+    }
+
+    // Create context
+    err = cuCtxCreate(&cuda_ctx, 0, dev);
+    if (err != CUDA_SUCCESS) {
+        printf("cuCtxCreate failed (%d)\n", err);
+        return err;
+    }
+    printf("Created CUDA context %p\n", cuda_ctx);
+
+    return 0;
 }
 
 void destroy_context() {
-    destroy_cuda_context(cuda_ctx);
+    printf("Destroying CUDA context %p\n", cuda_ctx);
+    cuCtxDestroy(cuda_ctx);
     cuda_ctx = nullptr;
 }
 
@@ -78,18 +189,19 @@ extern "C" int halide_cuda_release_context(void *user_context) {
 }
 #else
 // Just use the default implementation of acquire/release.
-bool init_context() {
+int init_context() {
     printf("Using default implementation of acquire/release\n");
-    return true;
+    return 0;
 }
 void destroy_context() {
 }
 #endif
 
-bool run_test() {
+int main(int argc, char **argv) {
     // Initialize the runtime specific GPU context.
-    if (!init_context()) {
-        return false;
+    int ret = init_context();
+    if (ret != 0) {
+        return ret;
     }
 
     // Everything else is a normal Halide program. The GPU runtime will call
@@ -115,40 +227,19 @@ bool run_test() {
             if (input(x, y) * 2.0f + 1.0f != output(x, y)) {
                 printf("Error at (%d, %d): %f != %f\n", x, y, input(x, y) * 2.0f + 1.0f,
                        output(x, y));
-                return false;
+                return -1;
             }
         }
     }
 
-    const halide_device_interface_t *interface = output.raw_buffer()->device_interface;
-
     // We need to free our GPU buffers before destroying the context.
     input.device_free();
     output.device_free();
 
-    if (interface != nullptr) {
-        halide_device_release(nullptr, interface);
-    } else {
-        printf("Device interface is nullptr.\n");
-        return false;
-    }
-
     // Free the context we created.
     destroy_context();
 
     printf("Success!\n");
-    return true;
-}
-
-int main(int argc, char **argv) {
-    if (!run_test()) {
-        return -1;
-    }
-
-    if (!run_test()) {
-        return -1;
-    }
-
     return 0;
 }
 
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
deleted file mode 100644
index 05852909ba57..000000000000
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-#include <stdio.h>
-
-// This test demonstrates how to use more than one GPU context with
-// Halide generated GPU support, specifically in a multithreaded
-// program. It of course also tests that this works correctly with the
-// Halide GPU runtimes.
-
-#ifdef _WIN32
-int main(int argc, char **argv) {
-    printf("[SKIP] Test requires weak linkage, which is not available on Windows.\n");
-    return 0;
-}
-#else
-
-#include "HalideBuffer.h"
-#include "HalideRuntime.h"
-#include <assert.h>
-#include <math.h>
-#include <string.h>
-#include <thread>
-
-#include "gpu_context.h"
-
-#include "gpu_multi_context_threaded_add.h"
-#include "gpu_multi_context_threaded_mul.h"
-
-using namespace Halide::Runtime;
-
-const int W = 32, H = 32;
-
-#if defined(TEST_OPENCL)
-
-struct gpu_context {
-    cl_context cl_ctx;
-    cl_command_queue cl_q;
-};
-
-// Create the global context. This is just a helper function not called by Halide.
-bool init_context(gpu_context &context) {
-    return create_opencl_context(context.cl_ctx, context.cl_q);
-}
-
-void destroy_context(gpu_context &context) {
-    destroy_opencl_context(context.cl_ctx, context.cl_q);
-    cl_q = nullptr;
-    cl_ctx = nullptr;
-}
-
-// These functions replace the acquire/release implementation in src/runtime/opencl.cpp.
-// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization
-// in our implementation of these functions.
-extern "C" int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl_command_queue *q, bool create) {
-    if (user_context == nullptr) {
-        assert(!create);
-        *ctx = nullptr;
-        *q = nullptr;
-    } else {
-        const gpu_context *context = (const gpu_context *)user_context;
-        *ctx = context->cl_ctx;
-        *q = context->cl_q;
-    }
-    return 0;
-}
-
-extern "C" int halide_release_cl_context(void *user_context) {
-    return 0;
-}
-
-#define HAS_MULTIPLE_CONTEXTS true
-#elif defined(TEST_CUDA)
-
-typedef CUcontext gpu_context;
-
-bool init_context(CUcontext &cuda_ctx) {
-    return create_cuda_context(cuda_ctx);
-}
-
-void destroy_context(CUcontext &cuda_ctx) {
-    destroy_cuda_context(cuda_ctx);
-    cuda_ctx = nullptr;
-}
-
-// These functions replace the acquire/release implementation in src/runtime/cuda.cpp.
-// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization
-// in our implementation of these functions.
-extern "C" int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create) {
-    if (user_context == nullptr) {
-        assert(!create);
-        *ctx = nullptr;
-    } else {
-        *ctx = *(CUcontext *)user_context;
-    }
-    return 0;
-}
-
-extern "C" int halide_cuda_release_context(void *user_context) {
-    return 0;
-}
-
-#define HAS_MULTIPLE_CONTEXTS true
-#else
-typedef int gpu_context;
-
-// Just use the default implementation of acquire/release.
-bool init_context(int &context) {
-    printf("Using default implementation of acquire/release\n");
-    context = 0;
-    return true;
-}
-void destroy_context(int & /* context */) {
-
-#define HAS_MULTIPLE_CONTEXTS false
-#endif
-
-void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) {
-    gpu_context context2;
-
-    Buffer<int32_t> buf1_in(W, H);
-    Buffer<int32_t> buf1_result(W, H);
-    buf1_in.fill(0);
-
-    const halide_device_interface_t *device_interface;
-
-    int val = 0;
-    for (int i = 0; i < 10; i++) {
-        init_context(context2);
-
-        Buffer<int32_t> buf2_in(W, H);
-        Buffer<int32_t> buf2_result(W, H);
-        buf2_in.fill(0);
-
-        gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result);
-        gpu_multi_context_threaded_mul(&context1, buf1_result, buf1_in);
-        gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result);
-
-        gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result);
-        gpu_multi_context_threaded_mul(&context2, buf2_result, buf2_in);
-        gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result);
-
-        buf1_result.copy_to_host(&context1);
-        buf2_result.copy_to_host(&context2);
-
-        val += 2;
-        val *= 2;
-        assert(buf1_result.all_equal(val + 2));
-        assert(buf2_result.all_equal(6));
-
-        device_interface = buf1_result.raw_buffer()->device_interface;
-
-        // About to destroy context, so ensure allocations are freed first.
-        buf2_in.device_free(&context2);
-        buf2_result.device_free(&context2);
-
-        if (device_interface != nullptr) {
-            halide_device_release(&context2, device_interface);
-        }
-        destroy_context(context2);
-    }
-
-    // About to destroy context, so ensure allocations are freed first.
-    buf1_in.device_free(&context1);
-    buf1_result.device_free(&context1);
-
-    if (destroy_when_done && device_interface != nullptr) {
-        halide_device_release(&context1, device_interface);
-        destroy_context(context1);
-    }
-}
-
-int main(int argc, char **argv) {
-    gpu_context contexta;
-    init_context(contexta);
-
-    gpu_context contextb;
-    init_context(contextb);
-
-    std::thread thread1(run_kernels_on_thread, contexta, false);
-    std::thread thread2(run_kernels_on_thread, contextb, false);
-
-    thread1.join();
-    thread2.join();
-
-    // Make sure using the same context on different threads works.
-    std::thread thread3(run_kernels_on_thread, contexta, HAS_MULTIPLE_CONTEXTS);
-    std::thread thread4(run_kernels_on_thread, contextb, HAS_MULTIPLE_CONTEXTS);
-
-    thread3.join();
-    thread4.join();
-
-    printf("Success!\n");
-    return 0;
-}
-#endif  // !WIN32
diff --git a/test/generator/gpu_multi_context_threaded_generator.cpp b/test/generator/gpu_multi_context_threaded_generator.cpp
deleted file mode 100644
index 42f278b6379b..000000000000
--- a/test/generator/gpu_multi_context_threaded_generator.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "Halide.h"
-
-namespace {
-
-class GpuAdd : public Halide::Generator<GpuAdd> {
-public:
-    Input<Buffer<int32_t>> input{"input", 2};
-
-    Output<Buffer<int32_t>> output{"output", 2};
-
-    void generate() {
-        Var x("x"), y("y");
-
-        // Create a simple pipeline that scales pixel values by 2.
-        output(x, y) = input(x, y) + 2;
-
-        Target target = get_target();
-        if (target.has_gpu_feature()) {
-            Var xo, yo, xi, yi;
-            output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
-        }
-    }
-};
-
-class GpuMul : public Halide::Generator<GpuMul> {
-public:
-    Input<Buffer<int32_t>> input{"input", 2};
-
-    Output<Buffer<int32_t>> output{"output", 2};
-
-    void generate() {
-        Var x("x"), y("y");
-
-        // Create a simple pipeline that scales pixel values by 2.
-        output(x, y) = input(x, y) * 2;
-
-        Target target = get_target();
-        if (target.has_gpu_feature()) {
-            Var xo, yo, xi, yi;
-            output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
-        }
-    }
-};
-
-}  // namespace
-
-HALIDE_REGISTER_GENERATOR(GpuAdd, gpu_multi_context_threaded_add)
-HALIDE_REGISTER_GENERATOR(GpuMul, gpu_multi_context_threaded_mul)