From 18461ae9d9c1283db9158e1989942a1bc3843ced Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 18 Nov 2020 16:56:19 -0800
Subject: [PATCH 01/17] Initial commit of common code for caching GPU program
 (shader/kernel/etc.) compilations.

---
 src/runtime/gpu_context_common.h | 179 +++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 src/runtime/gpu_context_common.h
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
new file mode 100644
index 000000000000..eec2d727bfaa
--- /dev/null
+++ b/src/runtime/gpu_context_common.h
@@ -0,0 +1,179 @@
+#include "printer.h"
+#include "scoped_mutex_lock.h"
+
+namespace Halide { namespace Internal {
+  
+template <typename ContextT, typename ModuleStateT>
+struct GPUCompilationCache {
+    struct CachedCompilation {
+        ContextT context;
+        ModuleStateT module_state;
+        uint32_t kernel_id;
+    };
+
+    halide_mutex mutex;
+
+    static constexpr float kLoadFactor = .5f;
+    static constexpr int kInitialTableBits = 7;
+    int log2_compilations_size{0}; // number of bits in index into compilations table.
+    CachedCompilation *compilations{nullptr};
+    int count{0};
+
+    static constexpr uint32_t kInvalidId{0};
+    static constexpr uint32_t kDeletedId{1};
+
+    uint32_t unique_id{2}; // zero is an invalid id
+  
+    static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) {
+        uintptr_t addr = (uintptr_t)context + id;
+        // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...
+        // in hexadecimal.
+        if (sizeof(uintptr_t) >= 8) {
+            return (addr * (uintptr_t)0x9E3779B97F4A7C15) >> (64 - bits);
+        } else {
+            return (addr * (uintptr_t)0x9E3779B9) >> (32 - bits);
+        }
+    }
+
+    bool insert(ContextT context, uint32_t id, ModuleStateT module_state) {
+        debug(nullptr) << "In insert with log2_compilations_size == " << log2_compilations_size << " .\n";
+        if (log2_compilations_size == 0) {
+            if (!resize_table(kInitialTableBits)) {
+                return false;
+            }
+        }
+        if (count++ > (1 << log2_compilations_size) * kLoadFactor) {
+            if (!resize_table(log2_compilations_size + 1)) {
+                return false;
+            }
+        }
+        debug(nullptr) << "In insert(2) with log2_compilations_size == " << log2_compilations_size << " .\n";
+        uintptr_t index = kernel_hash(context, id, log2_compilations_size);
+        for (int i = 0; i < (1 << log2_compilations_size); i++) {
+            uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
+            debug(nullptr) << "In insert i is " << i << " effective_index is " << effective_index << ".\n";
+            if (compilations[effective_index].kernel_id <= kDeletedId) {
+                compilations[effective_index].context = context;
+                compilations[effective_index].module_state = module_state;
+                compilations[effective_index].kernel_id = id;
+                return true;
+            }
+        }
+        // This is a logic error that should never occur. It means the table is
+        // full, but it should have been resized.
+        halide_assert(nullptr, false);
+        return false;
+    }
+
+    bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) {
+        debug(nullptr) << "In find with log2_compilations_size == " << log2_compilations_size << " .\n";
+        if (log2_compilations_size == 0) {
+            return false;
+        }
+        uintptr_t index = kernel_hash(context, id, log2_compilations_size);
+        for (int i = 0; i < (1 << log2_compilations_size); i++) {
+            uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
+            if (compilations[effective_index].context == context &&
+                compilations[effective_index].kernel_id == id) {
+                module_state = &compilations[effective_index].module_state;
+                return true;
+            }
+        }
+        debug(nullptr) << "Exiting find find.\n";
+        return false;
+    }
+
+    bool lookup(ContextT context, void *state_ptr, ModuleStateT &module_state) {
+        ScopedMutexLock lock_guard(&mutex);
+        uint32_t id = (uint32_t)(uintptr_t)state_ptr;
+        ModuleStateT *mod_ptr;
+        if (find_internal(context, id, mod_ptr)) {
+            module_state = *mod_ptr;
+            return true;
+        }
+        return false;
+    }
+ 
+    bool resize_table(int size_bits) {
+        if (size_bits != log2_compilations_size) {
+            int new_size = (1 << size_bits);
+            int old_size = (1 << log2_compilations_size);
+            CachedCompilation *new_table = (CachedCompilation *)malloc(new_size * sizeof(CachedCompilation));
+            if (new_table == nullptr) {
+                // signal error.
+                return false;
+            }
+            memset(new_table, 0, (1 << size_bits) * sizeof(CachedCompilation));
+            CachedCompilation *old_table = compilations;
+            compilations = new_table;
+            log2_compilations_size = size_bits;
+ 
+            for (int32_t i = 0; i < old_size; i++) {
+                if (compilations[i].kernel_id != kInvalidId &&
+                    compilations[i].kernel_id != kDeletedId) {
+                    insert(compilations[i].context, compilations[i].kernel_id, compilations[i].module_state);
+                }
+            }
+            free(old_table);
+        }
+        return true;
+    }
+
+    template <typename FreeModuleT>
+    void delete_context(void *user_context, ContextT context, FreeModuleT &f) {
+        ScopedMutexLock lock_guard(&mutex);
+
+        for (int i = 0; i < (1 << log2_compilations_size); i++) {
+            if (compilations[i].kernel_id > kInvalidId &&
+                compilations[i].context == context) {
+                debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state << "\n";
+                f(compilations[i].module_state);
+                compilations[i].module_state = nullptr;
+                compilations[i].kernel_id = kInvalidId;
+                count--;
+            }
+        }
+    }
+
+    template <typename CompileModuleT, typename... Args>
+    bool kernel_state_setup(void * user_context, void **state_ptr,
+                            ContextT context, ModuleStateT &result,
+                            CompileModuleT f,
+                            Args... args) {
+        ScopedMutexLock lock_guard(&mutex);
+
+        uint32_t *id_ptr = (uint32_t *)state_ptr;
+        if (*id_ptr == 0) {
+            *id_ptr = unique_id++;
+        }
+
+        ModuleStateT *mod;
+        if (find_internal(context, *id_ptr, mod)) {
+            result = *mod;
+            return true;
+        }
+
+        // TODO(zvookin): figure out the calling signature here...
+        ModuleStateT compiled_module = f(args...);
+        debug(user_context) << "Caching compiled kernel: " << compiled_module << "\n";
+        if (compiled_module == nullptr) {
+            return false;
+        }
+
+        insert(context, *id_ptr, compiled_module);
+        result = compiled_module;
+        
+        return true;
+    }
+
+};
+
+
+// A call that takes the state pointer and looks for the module
+//     does it create the list?
+
+// A call to take the newly compiled module and add it
+
+// A call to remove everything for a context
+
+} }

From 05cf15e09db97b29cc350487a0acd1003fc494e2 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Thu, 19 Nov 2020 12:28:10 -0800
Subject: [PATCH 02/17] New GPU compilation cache.

---
 src/runtime/cuda.cpp                      | 156 +++++-----------
 src/runtime/d3d12compute.cpp              |  81 ++++-----
 src/runtime/destructors.cpp               |   3 +
 src/runtime/gpu_context_common.h          |  48 ++---
 src/runtime/metal.cpp                     |  68 ++-----
 src/runtime/opencl.cpp                    | 210 ++++++++++------------
 test/common/gpu_object_lifetime_tracker.h |   7 +-
 7 files changed, 213 insertions(+), 360 deletions(-)

diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 7c423e179d85..69deb691d586 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1,4 +1,5 @@
 #include "HalideRuntimeCuda.h"
+#include "gpu_context_common.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
 #include "mini_cuda.h"
@@ -239,43 +240,7 @@ class Context {
     }
 };
 
-// Halide allocates a device API controlled pointer slot as part of
-// each compiled module. The slot is used to store information to
-// avoid having to reload/recompile kernel code on each call into a
-// Halide filter. The cuda runtime uses this pointer to maintain a
-// linked list of contexts into which the module has been loaded.
-//
-// A global list of all registered filters is also kept so all modules
-// loaded on a given context can be unloaded and removed from the list
-// when halide_device_release is called on a specific context.
-//
-// The registered_filters struct is not freed as it is pointed to by the
-// Halide generated code. The module_state structs are freed.
-
-struct module_state {
-    CUcontext context;
-    CUmodule module;
-    module_state *next;
-};
-
-struct registered_filters {
-    module_state *modules;
-    registered_filters *next;
-};
-WEAK registered_filters *filters_list = nullptr;
-// This spinlock protects the above filters_list.
-WEAK halide_mutex filters_list_lock;
-
-WEAK module_state *find_module_for_context(const registered_filters *filters, CUcontext ctx) {
-    module_state *modules = filters->modules;
-    while (modules != nullptr) {
-        if (modules->context == ctx) {
-            return modules;
-        }
-        modules = modules->next;
-    }
-    return nullptr;
-}
+WEAK Halide::Internal::GPUCompilationCache<CUcontext, CUmodule> compilation_cache;
 
 WEAK CUresult create_cuda_context(void *user_context, CUcontext *ctx) {
     // Initialize CUDA
@@ -505,6 +470,33 @@ WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size
 #endif
 }
 
+WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size) {
+   debug(user_context) << "CUDA: compile_kernel cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
+
+    CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
+    unsigned int max_regs_per_thread = 64;
+
+    // A hack to enable control over max register count for
+    // testing. This should be surfaced in the schedule somehow
+    // instead.
+    char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
+    if (regs) {
+        max_regs_per_thread = atoi(regs);
+    }
+    void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
+    CUmodule loaded_module;
+    CUresult err = cuModuleLoadDataEx(&loaded_module, ptx_src, 1, options, optionValues);
+
+    if (err != CUDA_SUCCESS) {
+        error(user_context) << "CUDA: cuModuleLoadData failed: "
+                            << get_error_name(err);
+        return nullptr;
+    } else {
+        debug(user_context) << (void *)(loaded_module) << "\n";
+    }
+    return loaded_module;
+}
+  
 }  // namespace Cuda
 }  // namespace Internal
 }  // namespace Runtime
@@ -526,54 +518,12 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    halide_assert(user_context, &filters_list_lock != nullptr);
-    {
-        ScopedMutexLock spinlock(&filters_list_lock);
-
-        // Create the state object if necessary. This only happens once, regardless
-        // of how many times halide_initialize_kernels/halide_release is called.
-        // halide_release traverses this list and releases the module objects, but
-        // it does not modify the list nodes created/inserted here.
-        registered_filters **filters = (registered_filters **)state_ptr;
-        if (!(*filters)) {
-            *filters = (registered_filters *)malloc(sizeof(registered_filters));
-            (*filters)->modules = nullptr;
-            (*filters)->next = filters_list;
-            filters_list = *filters;
-        }
-
-        // Create the module itself if necessary.
-        module_state *loaded_module = find_module_for_context(*filters, ctx.context);
-        if (loaded_module == nullptr) {
-            loaded_module = (module_state *)malloc(sizeof(module_state));
-            debug(user_context) << "    cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
-
-            CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
-            unsigned int max_regs_per_thread = 64;
-
-            // A hack to enable control over max register count for
-            // testing. This should be surfaced in the schedule somehow
-            // instead.
-            char *regs = getenv("HL_CUDA_JIT_MAX_REGISTERS");
-            if (regs) {
-                max_regs_per_thread = atoi(regs);
-            }
-            void *optionValues[] = {(void *)(uintptr_t)max_regs_per_thread};
-            CUresult err = cuModuleLoadDataEx(&loaded_module->module, ptx_src, 1, options, optionValues);
-
-            if (err != CUDA_SUCCESS) {
-                free(loaded_module);
-                error(user_context) << "CUDA: cuModuleLoadData failed: "
-                                    << get_error_name(err);
-                return err;
-            } else {
-                debug(user_context) << (void *)(loaded_module->module) << "\n";
-            }
-            loaded_module->context = ctx.context;
-            loaded_module->next = (*filters)->modules;
-            (*filters)->modules = loaded_module;
-        }
-    }  // spinlock
+    CUmodule loaded_module;
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, context, loaded_module,
+                                              compile_kernel, user_context, ptx_src, size)) {
+        return halide_error_code_generic_error;
+    }
+    halide_assert(user_context, loaded_module != nullptr);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -728,34 +678,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
         // Dump the contents of the free list, ignoring errors.
         halide_cuda_release_unused_device_allocations(user_context);
 
-        {
-            ScopedMutexLock spinlock(&filters_list_lock);
-
-            // Unload the modules attached to this context. Note that the list
-            // nodes themselves are not freed, only the module objects are
-            // released. Subsequent calls to halide_init_kernels might re-create
-            // the program object using the same list node to store the module
-            // object.
-            registered_filters *filters = filters_list;
-            while (filters) {
-                module_state **prev_ptr = &filters->modules;
-                module_state *loaded_module = filters->modules;
-                while (loaded_module != nullptr) {
-                    if (loaded_module->context == ctx) {
-                        debug(user_context) << "    cuModuleUnload " << loaded_module->module << "\n";
-                        err = cuModuleUnload(loaded_module->module);
-                        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
-                        *prev_ptr = loaded_module->next;
-                        free(loaded_module);
-                        loaded_module = *prev_ptr;
-                    } else {
-                        loaded_module = loaded_module->next;
-                        prev_ptr = &loaded_module->next;
-                    }
-                }
-                filters = filters->next;
-            }
-        }  // spinlock
+        compilation_cache.delete_context(user_context, context, cuModuleUnload);
 
         CUcontext old_ctx;
         cuCtxPopCurrent(&old_ctx);
@@ -1133,9 +1056,9 @@ WEAK int halide_cuda_run(void *user_context,
 #endif
 
     halide_assert(user_context, state_ptr);
-    module_state *loaded_module = find_module_for_context((registered_filters *)state_ptr, ctx.context);
-    halide_assert(user_context, loaded_module != nullptr);
-    CUmodule mod = loaded_module->module;
+    CUmodule mod;
+    bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod);
+    halide_assert(user_context, found_module && mod != nullptr);
     debug(user_context) << "Got module " << mod << "\n";
     halide_assert(user_context, mod);
     CUfunction f;
@@ -1313,6 +1236,7 @@ WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *min
 
 namespace {
 WEAK __attribute__((destructor)) void halide_cuda_cleanup() {
+    compilation_cache.release_all(nullptr, cuModuleUnload);
     halide_cuda_device_release(nullptr);
 }
 }  // namespace
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 49ef68a0118a..dfdc70df87b4 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -45,6 +45,7 @@
 #include "HalideRuntimeD3D12Compute.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
+#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -2437,16 +2438,24 @@ static void *buffer_contents(d3d12_buffer *buffer) {
 
 volatile ScopedSpinLock::AtomicFlag WEAK thread_lock = 0;
 
-// Structure to hold the state of a module attached to the context.
-// Also used as a linked-list to keep track of all the different
-// modules that are attached to a context in order to release them all
-// when then context is released.
-struct module_state {
-    d3d12_library *library;
-    module_state *next;
-};
-D3D12TYPENAME(module_state)
-WEAK module_state *state_list = nullptr;
+WEAK Halide::Internal::GPUCompilationCache<d3d12_device *, d3d12_library *> compilation_cache;
+
+WEAK d3d12_library *compile_kernel(void *user_context, const char *source, int source_size, int *error_ret) {
+    D3D12ContextHolder d3d12_context(user_context, true);
+    if (d3d12_context.error != 0) {
+        *error_ret = d3d12_context.error;
+        return nullptr;
+    }
+
+    d3d12_library *library = new_library_with_source(d3d12_context.device, source, source_size);
+    if (library == nullptr) {
+        TRACEFATAL("D3D12Compute: new_library_with_source failed.");
+        *error_ret = halide_error_code_out_of_memory;
+        return nullptr;
+    }
+
+    return library;
+}
 
 }  // namespace D3D12Compute
 }  // namespace Internal
@@ -2754,29 +2763,14 @@ WEAK int halide_d3d12compute_device_free(void *user_context, halide_buffer_t *bu
 WEAK int halide_d3d12compute_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) {
     TRACELOG;
 
-    // Create the state object if necessary. This only happens once, regardless
-    // of how many times halide_initialize_kernels/halide_release is called.
-    // halide_release traverses this list and releases the module objects, but
-    // it does not modify the list nodes created/inserted here.
-    module_state *&state = *(module_state **)state_ptr;
-    if (!state) {
-        state = malloct<module_state>();
-        state->library = nullptr;
-        state->next = state_list;
-        state_list = state;
-    }
-
     D3D12ContextHolder d3d12_context(user_context, true);
-    if (d3d12_context.error != 0) {
-        return d3d12_context.error;
-    }
 
-    if (state->library == nullptr) {
-        state->library = new_library_with_source(d3d12_context.device, source, source_size);
-        if (state->library == nullptr) {
-            TRACEFATAL("D3D12Compute: new_library_with_source failed.");
-            return halide_error_code_out_of_memory;
-        }
+    int error = halide_error_code_generic_error;
+    d3d12_library *library;
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, d3d12_context.device,
+                                              library, compile_kernel, user_context,
+                                              source, source_size, &error)) {
+        return error;
     }
 
     return 0;
@@ -2839,20 +2833,8 @@ WEAK int halide_d3d12compute_device_release(void *user_context) {
             release_object(buffer);
         }
 
-        // Unload the modules attached to this device. Note that the list
-        // nodes themselves are not freed, only the program objects are
-        // released. Subsequent calls to halide_init_kernels might re-create
-        // the program object using the same list node to store the program
-        // object.
-        module_state *state = state_list;
-        while (state) {
-            if (state->library) {
-                release_object(state->library);
-                state->library = nullptr;
-            }
-            state = state->next;
-        }
-
+        compilation_cache.delete_context(user_context, device, release_object<d3d12_library>);
+        
         // Release the device itself, if we created it.
         if (acquired_device == device) {
             release_object(&upload);
@@ -3023,8 +3005,9 @@ WEAK int halide_d3d12compute_run(void *user_context,
     StartCapturingGPUActivity();
 #endif
 
-    halide_assert(user_context, state_ptr);
-    module_state *state = (module_state *)state_ptr;
+    d3d12_library *library;
+    bool found_module = compilation_cache.lookup(device, state_ptr, library);
+    halide_assert(user_context, found_module && library != nullptr);
 
     d3d12_frame *frame = acquire_frame(device);
     d3d12_compute_command_list *cmdList = frame->cmd_list;
@@ -3036,7 +3019,7 @@ WEAK int halide_d3d12compute_run(void *user_context,
     d3d12_compute_pipeline_state *pipeline_state = nullptr;
     {
         TRACE_SCOPE("kernel shader selection");
-        function = new_function_with_name(device, state->library, entry_name, strlen(entry_name),
+        function = new_function_with_name(device, library, entry_name, strlen(entry_name),
                                           shared_mem_bytes, threadsX, threadsY, threadsZ);
         halide_assert(user_context, function);
         pipeline_state = function->pipeline_state;
@@ -3531,6 +3514,7 @@ WEAK const struct halide_device_interface_t *halide_d3d12compute_device_interfac
 namespace {
 WEAK __attribute__((destructor)) void halide_d3d12compute_cleanup() {
     TRACELOG;
+    compilation_cache.release_all(nullptr, release_object<d3d12_library>);
     halide_d3d12compute_device_release(nullptr);
 }
 }  // namespace
@@ -3582,3 +3566,4 @@ WEAK halide_device_interface_t d3d12compute_device_interface = {
 }  // namespace Internal
 }  // namespace Runtime
 }  // namespace Halide
+
diff --git a/src/runtime/destructors.cpp b/src/runtime/destructors.cpp
index 1fa7228a86e2..b72f106d784f 100644
--- a/src/runtime/destructors.cpp
+++ b/src/runtime/destructors.cpp
@@ -1,13 +1,16 @@
 #include "HalideRuntime.h"
+#include "printer.h"
 
 extern "C" {
 
 ALWAYS_INLINE __attribute__((used)) void call_destructor(void *user_context, void (*fn)(void *user_context, void *object), void **object, bool should_call) {
+  debug(nullptr) << "call_destructor called.\n";
     void *o = *object;
     *object = nullptr;
     // Call the function
     if (o && should_call) {
         fn(user_context, o);
     }
+  debug(nullptr) << "call_destructor done.\n";
 }
 }
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index eec2d727bfaa..0b740d09a726 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -36,7 +36,6 @@ struct GPUCompilationCache {
     }
 
     bool insert(ContextT context, uint32_t id, ModuleStateT module_state) {
-        debug(nullptr) << "In insert with log2_compilations_size == " << log2_compilations_size << " .\n";
         if (log2_compilations_size == 0) {
             if (!resize_table(kInitialTableBits)) {
                 return false;
@@ -47,11 +46,9 @@ struct GPUCompilationCache {
                 return false;
             }
         }
-        debug(nullptr) << "In insert(2) with log2_compilations_size == " << log2_compilations_size << " .\n";
         uintptr_t index = kernel_hash(context, id, log2_compilations_size);
         for (int i = 0; i < (1 << log2_compilations_size); i++) {
             uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
-            debug(nullptr) << "In insert i is " << i << " effective_index is " << effective_index << ".\n";
             if (compilations[effective_index].kernel_id <= kDeletedId) {
                 compilations[effective_index].context = context;
                 compilations[effective_index].module_state = module_state;
@@ -66,7 +63,6 @@ struct GPUCompilationCache {
     }
 
     bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) {
-        debug(nullptr) << "In find with log2_compilations_size == " << log2_compilations_size << " .\n";
         if (log2_compilations_size == 0) {
             return false;
         }
@@ -79,7 +75,6 @@ struct GPUCompilationCache {
                 return true;
             }
         }
-        debug(nullptr) << "Exiting find find.\n";
         return false;
     }
 
@@ -108,10 +103,12 @@ struct GPUCompilationCache {
             compilations = new_table;
             log2_compilations_size = size_bits;
  
-            for (int32_t i = 0; i < old_size; i++) {
-                if (compilations[i].kernel_id != kInvalidId &&
-                    compilations[i].kernel_id != kDeletedId) {
-                    insert(compilations[i].context, compilations[i].kernel_id, compilations[i].module_state);
+            if (count > 0) { // MAinly to catch empty initial table case
+                for (int32_t i = 0; i < old_size; i++) {
+                    if (old_table[i].kernel_id != kInvalidId &&
+                        old_table[i].kernel_id != kDeletedId) {
+                        insert(old_table[i].context, old_table[i].kernel_id, old_table[i].module_state);
+                    }
                 }
             }
             free(old_table);
@@ -120,12 +117,14 @@ struct GPUCompilationCache {
     }
 
     template <typename FreeModuleT>
-    void delete_context(void *user_context, ContextT context, FreeModuleT &f) {
-        ScopedMutexLock lock_guard(&mutex);
+    void release_context(void *user_context, bool all, ContextT context, FreeModuleT &f) {
+        if (count == 0) {
+            return;
+        }
 
         for (int i = 0; i < (1 << log2_compilations_size); i++) {
             if (compilations[i].kernel_id > kInvalidId &&
-                compilations[i].context == context) {
+                (all || compilations[i].context == context)) {
                 debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state << "\n";
                 f(compilations[i].module_state);
                 compilations[i].module_state = nullptr;
@@ -135,6 +134,23 @@ struct GPUCompilationCache {
         }
     }
 
+    template <typename FreeModuleT>
+    void delete_context(void *user_context, ContextT context, FreeModuleT &f) {
+        ScopedMutexLock lock_guard(&mutex);
+
+        release_context(user_context, false, context, f);
+    }
+
+    template <typename FreeModuleT>
+    void release_all(void *user_context, FreeModuleT &f) {
+        ScopedMutexLock lock_guard(&mutex);
+
+        release_context(user_context, true, nullptr, f);
+        free(compilations);
+        compilations = nullptr;
+        log2_compilations_size = 0;
+    }
+
     template <typename CompileModuleT, typename... Args>
     bool kernel_state_setup(void * user_context, void **state_ptr,
                             ContextT context, ModuleStateT &result,
@@ -168,12 +184,4 @@ struct GPUCompilationCache {
 
 };
 
-
-// A call that takes the state pointer and looks for the module
-//     does it create the list?
-
-// A call to take the newly compiled module and add it
-
-// A call to remove everything for a context
-
 } }
diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp
index 1a2ba0ce52b4..0d34a3cf021b 100644
--- a/src/runtime/metal.cpp
+++ b/src/runtime/metal.cpp
@@ -1,6 +1,7 @@
 #include "HalideRuntimeMetal.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
+#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -282,15 +283,7 @@ struct device_handle {
     uint64_t offset;
 };
 
-// Structure to hold the state of a module attached to the context.
-// Also used as a linked-list to keep track of all the different
-// modules that are attached to a context in order to release them all
-// when then context is released.
-struct module_state {
-    mtl_library *library;
-    module_state *next;
-};
-WEAK module_state *state_list = nullptr;
+WEAK Halide::Internal::GPUCompilationCache<mtl_device *, mtl_library *> compilation_cache;
 
 // API Capabilities.  If more capabilities need to be checked,
 // this can be refactored to something more robust/general.
@@ -544,18 +537,6 @@ WEAK int halide_metal_device_free(void *user_context, halide_buffer_t *buf) {
 }
 
 WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, const char *source, int source_size) {
-    // Create the state object if necessary. This only happens once, regardless
-    // of how many times halide_initialize_kernels/halide_release is called.
-    // halide_release traverses this list and releases the module objects, but
-    // it does not modify the list nodes created/inserted here.
-    module_state **state = (module_state **)state_ptr;
-    if (!(*state)) {
-        *state = (module_state *)malloc(sizeof(module_state));
-        (*state)->library = nullptr;
-        (*state)->next = state_list;
-        state_list = *state;
-    }
-
     MetalContextHolder metal_context(user_context, true);
     if (metal_context.error != 0) {
         return metal_context.error;
@@ -565,23 +546,13 @@ WEAK int halide_metal_initialize_kernels(void *user_context, void **state_ptr, c
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    if ((*state)->library == nullptr) {
-#ifdef DEBUG_RUNTIME
-        uint64_t t_before_compile = halide_current_time_ns(user_context);
-#endif
-
-        debug(user_context) << "Metal - Allocating: new_library_with_source " << (*state)->library << "\n";
-        (*state)->library = new_library_with_source(metal_context.device, source, source_size);
-        if ((*state)->library == nullptr) {
-            error(user_context) << "Metal: new_library_with_source failed.\n";
-            return -1;
-        }
-
-#ifdef DEBUG_RUNTIME
-        uint64_t t_after_compile = halide_current_time_ns(user_context);
-        debug(user_context) << "Time for halide_metal_initialize_kernels compilation: " << (t_after_compile - t_before_compile) / 1.0e6 << " ms\n";
-#endif
+    mtl_library *library;
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, metal_context.device, library,
+                                              new_library_with_source, metal_context.device,
+                                              source, source_size)) {
+        return halide_error_code_generic_error;
     }
+    halide_assert(user_context, library != nullptr);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -644,20 +615,7 @@ WEAK int halide_metal_device_release(void *user_context) {
     if (device) {
         halide_metal_device_sync_internal(queue, nullptr);
 
-        // Unload the modules attached to this device. Note that the list
-        // nodes themselves are not freed, only the program objects are
-        // released. Subsequent calls to halide_init_kernels might re-create
-        // the program object using the same list node to store the program
-        // object.
-        module_state *state = state_list;
-        while (state) {
-            if (state->library) {
-                debug(user_context) << "Metal - Releasing: new_library_with_source " << state->library << "\n";
-                release_ns_object(state->library);
-                state->library = nullptr;
-            }
-            state = state->next;
-        }
+        compilation_cache.delete_context(user_context, device, release_ns_object);
 
         // Release the device itself, if we created it.
         if (acquired_device == device) {
@@ -782,10 +740,11 @@ WEAK int halide_metal_run(void *user_context,
         return -1;
     }
 
-    halide_assert(user_context, state_ptr);
-    module_state *state = (module_state *)state_ptr;
+    mtl_library *library;
+    bool found_libdary = compilation_cache.lookup(metal_context.device, state_ptr, library);
+    halide_assert(user_context, found_libdary && library != nullptr);
 
-    mtl_function *function = new_function_with_name(state->library, entry_name, strlen(entry_name));
+    mtl_function *function = new_function_with_name(library, entry_name, strlen(entry_name));
     if (function == nullptr) {
         error(user_context) << "Metal: Could not get function " << entry_name << "from Metal library.\n";
         return -1;
@@ -1147,6 +1106,7 @@ WEAK const struct halide_device_interface_t *halide_metal_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_metal_cleanup() {
+    compilation_cache.release_all(nullptr, release_ns_object);
     halide_metal_device_release(nullptr);
 }
 }  // namespace
diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp
index 7c10033bde31..722c9f12d5a9 100644
--- a/src/runtime/opencl.cpp
+++ b/src/runtime/opencl.cpp
@@ -1,6 +1,7 @@
 #include "HalideRuntimeOpenCL.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
+#include "gpu_context_common.h"
 #include "printer.h"
 #include "scoped_spin_lock.h"
 
@@ -286,15 +287,7 @@ struct device_handle {
     cl_mem mem;
 };
 
-// Structure to hold the state of a module attached to the context.
-// Also used as a linked-list to keep track of all the different
-// modules that are attached to a context in order to release them all
-// when then context is released.
-struct module_state {
-    cl_program program;
-    module_state *next;
-};
-WEAK module_state *state_list = nullptr;
+WEAK Halide::Internal::GPUCompilationCache<cl_context, cl_program> compilation_cache;
 
 WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size_t size = 0) {
     if (buf->device == 0) {
@@ -556,6 +549,83 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
     return err;
 }
 
+WEAK cl_program compile_kernel(void *user_context, ClContext ctx, const char*src, int size) {
+    cl_int err = 0;
+    cl_device_id dev;
+
+    err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr);
+    if (err != CL_SUCCESS) {
+        error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
+                            << get_opencl_error_name(err);
+        return nullptr;
+    }
+
+    cl_device_id devices[] = {dev};
+
+    // Get the max constant buffer size supported by this OpenCL implementation.
+    cl_ulong max_constant_buffer_size = 0;
+    err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr);
+    if (err != CL_SUCCESS) {
+        error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: "
+                            << get_opencl_error_name(err);
+        return nullptr;
+    }
+    // Get the max number of constant arguments supported by this OpenCL implementation.
+    cl_uint max_constant_args = 0;
+    err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr);
+    if (err != CL_SUCCESS) {
+        error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: "
+                            << get_opencl_error_name(err);
+        return nullptr;
+    }
+
+    // Build the compile argument options.
+    stringstream options(user_context);
+    options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size
+            << " -D MAX_CONSTANT_ARGS=" << max_constant_args;
+
+    const char *extra_options = halide_opencl_get_build_options(user_context);
+    options << " " << extra_options;
+
+    const char *sources[] = {src};
+    debug(user_context) << "    clCreateProgramWithSource -> ";
+    cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], nullptr, &err);
+    if (err != CL_SUCCESS) {
+        debug(user_context) << get_opencl_error_name(err) << "\n";
+        error(user_context) << "CL: clCreateProgramWithSource failed: "
+                            << get_opencl_error_name(err);
+        return nullptr;
+    } else {
+        debug(user_context) << (void *)program << "\n";
+    }
+
+    debug(user_context) << "    clBuildProgram " << (void *)program
+                        << " " << options.str() << "\n";
+    err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr);
+    if (err != CL_SUCCESS) {
+
+        {
+            // Allocate an appropriately sized buffer for the build log.
+            Printer<ErrorPrinter, 16384> p(user_context);
+
+            p << "CL: clBuildProgram failed: "
+              << get_opencl_error_name(err)
+              << "\nBuild Log:\n";
+
+            // Get build log
+            if (clGetProgramBuildInfo(program, dev,
+                                      CL_PROGRAM_BUILD_LOG,
+                                      p.capacity() - p.size() - 1, p.dst,
+                                      nullptr) != CL_SUCCESS) {
+                p << "clGetProgramBuildInfo failed (Printer buffer too small?)";
+            }
+        }
+
+        return nullptr;
+    }
+    return program;
+}
+
 }  // namespace OpenCL
 }  // namespace Internal
 }  // namespace Runtime
@@ -675,98 +745,14 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr,
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
-    // Create the state object if necessary. This only happens once, regardless
-    // of how many times halide_init_kernels/halide_release is called.
-    // halide_release traverses this list and releases the program objects, but
-    // it does not modify the list nodes created/inserted here.
-    module_state **state = (module_state **)state_ptr;
-    if (!(*state)) {
-        *state = (module_state *)malloc(sizeof(module_state));
-        (*state)->program = nullptr;
-        (*state)->next = state_list;
-        state_list = *state;
-    }
-
-    // Create the program if necessary. TODO: The program object needs to not
-    // only already exist, but be created for the same context/device as the
-    // calling context/device.
-    if (!(*state && (*state)->program) && size > 1) {
-        cl_int err = 0;
-        cl_device_id dev;
-
-        err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr);
-        if (err != CL_SUCCESS) {
-            error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
-                                << get_opencl_error_name(err);
-            return err;
-        }
-
-        cl_device_id devices[] = {dev};
-
-        // Get the max constant buffer size supported by this OpenCL implementation.
-        cl_ulong max_constant_buffer_size = 0;
-        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(max_constant_buffer_size), &max_constant_buffer_size, nullptr);
-        if (err != CL_SUCCESS) {
-            error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) failed: "
-                                << get_opencl_error_name(err);
-            return err;
-        }
-        // Get the max number of constant arguments supported by this OpenCL implementation.
-        cl_uint max_constant_args = 0;
-        err = clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(max_constant_args), &max_constant_args, nullptr);
-        if (err != CL_SUCCESS) {
-            error(user_context) << "CL: clGetDeviceInfo (CL_DEVICE_MAX_CONSTANT_ARGS) failed: "
-                                << get_opencl_error_name(err);
-            return err;
-        }
-
-        // Build the compile argument options.
-        stringstream options(user_context);
-        options << "-D MAX_CONSTANT_BUFFER_SIZE=" << max_constant_buffer_size
-                << " -D MAX_CONSTANT_ARGS=" << max_constant_args;
-
-        const char *extra_options = halide_opencl_get_build_options(user_context);
-        options << " " << extra_options;
-
-        const char *sources[] = {src};
-        debug(user_context) << "    clCreateProgramWithSource -> ";
-        cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], nullptr, &err);
-        if (err != CL_SUCCESS) {
-            debug(user_context) << get_opencl_error_name(err) << "\n";
-            error(user_context) << "CL: clCreateProgramWithSource failed: "
-                                << get_opencl_error_name(err);
-            return err;
-        } else {
-            debug(user_context) << (void *)program << "\n";
-        }
-
-        (*state)->program = program;
-        debug(user_context) << "    clBuildProgram " << (void *)program
-                            << " " << options.str() << "\n";
-        err = clBuildProgram(program, 1, devices, options.str(), nullptr, nullptr);
-        if (err != CL_SUCCESS) {
-
-            {
-                // Allocate an appropriately sized buffer for the build log.
-                Printer<ErrorPrinter, 16384> p(user_context);
-
-                p << "CL: clBuildProgram failed: "
-                  << get_opencl_error_name(err)
-                  << "\nBuild Log:\n";
-
-                // Get build log
-                if (clGetProgramBuildInfo(program, dev,
-                                          CL_PROGRAM_BUILD_LOG,
-                                          p.capacity() - p.size() - 1, p.dst,
-                                          nullptr) != CL_SUCCESS) {
-                    p << "clGetProgramBuildInfo failed (Printer buffer too small?)";
-                }
-            }
-
-            return err;
-        }
+    debug(user_context) << "halide_cuda_initialize_kernels got compilation_cache mutex.\n";
+    cl_program program;
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, program,
+                                              compile_kernel, user_context, ctx, src, size)) {
+        return halide_error_code_generic_error;
     }
-
+    halide_assert(user_context, program != nullptr);
+    
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
@@ -820,21 +806,7 @@ WEAK int halide_opencl_device_release(void *user_context) {
         err = clFinish(q);
         halide_assert(user_context, err == CL_SUCCESS);
 
-        // Unload the modules attached to this context. Note that the list
-        // nodes themselves are not freed, only the program objects are
-        // released. Subsequent calls to halide_init_kernels might re-create
-        // the program object using the same list node to store the program
-        // object.
-        module_state *state = state_list;
-        while (state) {
-            if (state->program) {
-                debug(user_context) << "    clReleaseProgram " << state->program << "\n";
-                err = clReleaseProgram(state->program);
-                halide_assert(user_context, err == CL_SUCCESS);
-                state->program = nullptr;
-            }
-            state = state->next;
-        }
+        compilation_cache.delete_context(user_context, ctx, clReleaseProgram);
 
         // Release the context itself, if we created it.
         if (ctx == context) {
@@ -1077,9 +1049,10 @@ WEAK int halide_opencl_run(void *user_context,
 
     // Create kernel object for entry_name from the program for this module.
     halide_assert(user_context, state_ptr);
-    cl_program program = ((module_state *)state_ptr)->program;
+    cl_program program;
+    bool found_program = compilation_cache.lookup(ctx.context, state_ptr, program);
 
-    halide_assert(user_context, program);
+    halide_assert(user_context, found_program && program != nullptr);
     debug(user_context) << "    clCreateKernel " << entry_name << " -> ";
     cl_kernel f = clCreateKernel(program, entry_name, &err);
     if (err != CL_SUCCESS) {
@@ -1366,6 +1339,7 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_opencl_cleanup() {
+  compilation_cache.release_all(nullptr, clReleaseProgram);
     halide_opencl_device_release(nullptr);
 }
 }  // namespace
@@ -1940,4 +1914,4 @@ extern "C" {
 WEAK const struct halide_device_interface_t *halide_opencl_image_device_interface() {
     return &opencl_image_device_interface;
 }
-}
\ No newline at end of file
+}
diff --git a/test/common/gpu_object_lifetime_tracker.h b/test/common/gpu_object_lifetime_tracker.h
index b17dd9118413..1734a59f2f00 100644
--- a/test/common/gpu_object_lifetime_tracker.h
+++ b/test/common/gpu_object_lifetime_tracker.h
@@ -22,25 +22,24 @@ class GpuObjectLifetimeTracker {
         }
     };
 
-    std::array<ObjectType, 13> object_types = {{
+    std::array<ObjectType, 11> object_types = {{
+        {"Caching compiled kernel:", "Releasing cached compilation:"},
+
         // OpenCL objects
         {"clCreateContext", "clReleaseContext", true},
         {"clCreateCommandQueue", "clReleaseCommandQueue", true},
         // This handles both "clCreateProgramWithSource" and
         // "clCreateProgramWithBinary".
-        {"clCreateProgram", "clReleaseProgram"},
         {"clCreateBuffer", "clReleaseMemObject"},
         {"clCreateKernel", "clReleaseKernel"},
 
         // CUDA objects
         {"cuCtxCreate", "cuCtxDestroy", true},
-        {"cuModuleLoad", "cuModuleUnload"},
         {"cuMemAlloc", "cuMemFree"},
 
         // Metal objects
         {"Allocating: MTLCreateSystemDefaultDevice", "Releasing: MTLCreateSystemDefaultDevice", true},
         {"Allocating: new_command_queue", "Releasing: new_command_queue"},
-        {"Allocating: new_library_with_source", "Releasing: new_library_with_source"},
 
         // Hexagon objects
         {"halide_remote_load_library", "halide_remote_release_library"},

From 6f7008b8dfb4d9431d93a2c6638644fdd3d6f8f0 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Tue, 24 Nov 2020 00:00:01 -0800
Subject: [PATCH 03/17] Make GPU kernel compilation caching more robust. This
 fixes issues in the intial support and adds tests. Currently the gpu_multi
 test only has context creation code fo CUDA and OpenCL. This shoulkd be added
 for other GPU runteims, but some coverage is provided via using the default
 context for these APIs.

Fixes a bug in CUDA runtime where some error message text in
cuda_do_multidimensional_copy was not initialized.

Fixes a bug in CUDA runtime where device release code did not run if
CUDA libraries are directly linked into the executable. (This would
have caused crashes due to the device allocation caching among other
issues.)
---
 Makefile                                      |  20 ++
 src/runtime/cuda.cpp                          |  11 +-
 src/runtime/gpu_context_common.h              |  18 +-
 test/common/gpu_context.h                     | 135 +++++++++++++
 test/correctness/gpu_many_kernels.cpp         |  87 +++++++++
 test/generator/acquire_release_aottest.cpp    | 161 ++++------------
 .../gpu_multi_context_threaded_aottest.cpp    | 177 ++++++++++++++++++
 test/generator/gpu_multi_generator.cpp        |  48 +++++
 8 files changed, 520 insertions(+), 137 deletions(-)
 create mode 100644 test/common/gpu_context.h
 create mode 100644 test/correctness/gpu_many_kernels.cpp
 create mode 100644 test/generator/gpu_multi_context_threaded_aottest.cpp
 create mode 100644 test/generator/gpu_multi_generator.cpp

diff --git a/Makefile b/Makefile
index ba9fcd3f9b4d..70e7e26b96bb 100644
--- a/Makefile
+++ b/Makefile
@@ -1527,6 +1527,12 @@ $(FILTERS_DIR)/nested_externs_%.a: $(BIN_DIR)/nested_externs.generator
 	@mkdir -p $(@D)
 	$(CURDIR)/$< -g nested_externs_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime
 
+# Similarly, gpu_multi needs two different kernels to test compilation caching.
+# Also requies user-context.
+$(FILTERS_DIR)/gpu_multi_%.a: $(BIN_DIR)/gpu_multi.generator
+	@mkdir -p $(@D)
+	$(CURDIR)/$< -g gpu_multi_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context
+
 GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas
 GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools
 GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS)
@@ -1622,11 +1628,25 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
 	HL_MULTITARGET_TEST_USE_NOBOUNDSQUERY_FEATURE=1 $(CURDIR)/$<
 	@-echo
 
+# gpu_multi_context_threaded has additional deps to link in
+$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp $(FILTERS_DIR)/gpu_multi_add.a $(FILTERS_DIR)/gpu_multi_mul.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
+	@mkdir -p $(@D)
+	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
+
+$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp $(FILTERS_DIR)/gpu_multi_add.halide_generated.cpp $(FILTERS_DIR)/gpu_multi_mul.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
+	@mkdir -p $(@D)
+	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
+
 # nested externs doesn't actually contain a generator named
 # "nested_externs", and has no internal tests in any case.
 test_generator_nested_externs:
 	@echo "Skipping"
 
+# gpu_multi actually contain a generator named
+# "gpu_multi", and has no internal tests in any case.
+test_generator_gpu_multi:
+	@echo "Skipping"
+
 $(BUILD_DIR)/RunGenMain.o: $(ROOT_DIR)/tools/RunGenMain.cpp $(RUNTIME_EXPORTED_INCLUDES) $(ROOT_DIR)/tools/RunGen.h
 	@mkdir -p $(@D)
 	$(CXX) -c $< $(filter-out -g, $(TEST_CXX_FLAGS)) $(OPTIMIZE) -Os $(IMAGE_IO_CXX_FLAGS) -I$(INCLUDE_DIR) -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools -o $@
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 69deb691d586..6c033a3394fc 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -519,7 +519,7 @@ WEAK int halide_cuda_initialize_kernels(void *user_context, void **state_ptr, co
 #endif
 
     CUmodule loaded_module;
-    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, context, loaded_module,
+    if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, loaded_module,
                                               compile_kernel, user_context, ptx_src, size)) {
         return halide_error_code_generic_error;
     }
@@ -654,7 +654,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
         << "CUDA: halide_cuda_device_release (user_context: " << user_context << ")\n";
 
     // If we haven't even loaded libcuda, don't load it just to quit.
-    if (!lib_cuda) {
+    if (!cuInit) {
         return 0;
     }
 
@@ -678,7 +678,7 @@ WEAK int halide_cuda_device_release(void *user_context) {
         // Dump the contents of the free list, ignoring errors.
         halide_cuda_release_unused_device_allocations(user_context);
 
-        compilation_cache.delete_context(user_context, context, cuModuleUnload);
+        compilation_cache.delete_context(user_context, ctx, cuModuleUnload);
 
         CUcontext old_ctx;
         cuCtxPopCurrent(&old_ctx);
@@ -842,12 +842,15 @@ WEAK int cuda_do_multidimensional_copy(void *user_context, const device_copy &c,
                             << (void *)src << " -> " << (void *)dst << ", " << c.chunk_size << " bytes\n";
         if (!from_host && to_host) {
             debug(user_context) << "cuMemcpyDtoH(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
+            copy_name = "cuMemcpyDtoH";
             err = cuMemcpyDtoH((void *)dst, (CUdeviceptr)src, c.chunk_size);
         } else if (from_host && !to_host) {
             debug(user_context) << "cuMemcpyHtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
+            copy_name = "cuMemcpyHtoD";
             err = cuMemcpyHtoD((CUdeviceptr)dst, (void *)src, c.chunk_size);
         } else if (!from_host && !to_host) {
             debug(user_context) << "cuMemcpyDtoD(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
+            copy_name = "cuMemcpyDtoD";
             err = cuMemcpyDtoD((CUdeviceptr)dst, (CUdeviceptr)src, c.chunk_size);
         } else if (dst != src) {
             debug(user_context) << "memcpy(" << (void *)dst << ", " << (void *)src << ", " << c.chunk_size << ")\n";
@@ -1187,7 +1190,7 @@ WEAK const halide_device_interface_t *halide_cuda_device_interface() {
 }
 
 WEAK int halide_cuda_compute_capability(void *user_context, int *major, int *minor) {
-    if (!lib_cuda) {
+    if (!lib_cuda && !cuInit) {
         // If cuda can't be found, we want to return 0, 0 and it's not
         // considered an error. So we should be very careful about
         // looking for libcuda without tripping any errors in the rest
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index 0b740d09a726..52ac4d8ca6cf 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -41,11 +41,12 @@ struct GPUCompilationCache {
                 return false;
             }
         }
-        if (count++ > (1 << log2_compilations_size) * kLoadFactor) {
+        if ((count + 1) > (1 << log2_compilations_size) * kLoadFactor) {
             if (!resize_table(log2_compilations_size + 1)) {
                 return false;
             }
         }
+        count += 1;
         uintptr_t index = kernel_hash(context, id, log2_compilations_size);
         for (int i = 0; i < (1 << log2_compilations_size); i++) {
             uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
@@ -69,6 +70,10 @@ struct GPUCompilationCache {
         uintptr_t index = kernel_hash(context, id, log2_compilations_size);
         for (int i = 0; i < (1 << log2_compilations_size); i++) {
             uintptr_t effective_index = (index + i) & ((1 << log2_compilations_size) - 1);
+
+            if (compilations[effective_index].kernel_id == kInvalidId) {
+                return false;
+            }
             if (compilations[effective_index].context == context &&
                 compilations[effective_index].kernel_id == id) {
                 module_state = &compilations[effective_index].module_state;
@@ -98,12 +103,12 @@ struct GPUCompilationCache {
                 // signal error.
                 return false;
             }
-            memset(new_table, 0, (1 << size_bits) * sizeof(CachedCompilation));
+            memset(new_table, 0, new_size * sizeof(CachedCompilation));
             CachedCompilation *old_table = compilations;
             compilations = new_table;
             log2_compilations_size = size_bits;
  
-            if (count > 0) { // MAinly to catch empty initial table case
+            if (count > 0) { // Mainly to catch empty initial table case
                 for (int32_t i = 0; i < old_size; i++) {
                     if (old_table[i].kernel_id != kInvalidId &&
                         old_table[i].kernel_id != kDeletedId) {
@@ -124,11 +129,10 @@ struct GPUCompilationCache {
 
         for (int i = 0; i < (1 << log2_compilations_size); i++) {
             if (compilations[i].kernel_id > kInvalidId &&
-                (all || compilations[i].context == context)) {
-                debug(user_context) << "Releasing cached compilation: " << compilations[i].module_state << "\n";
+                (all || (compilations[i].context == context))) {
                 f(compilations[i].module_state);
                 compilations[i].module_state = nullptr;
-                compilations[i].kernel_id = kInvalidId;
+                compilations[i].kernel_id = kDeletedId;
                 count--;
             }
         }
@@ -171,7 +175,7 @@ struct GPUCompilationCache {
 
         // TODO(zvookin): figure out the calling signature here...
         ModuleStateT compiled_module = f(args...);
-        debug(user_context) << "Caching compiled kernel: " << compiled_module << "\n";
+        debug(user_context) << "Caching compiled kernel: " << compiled_module << " id " << *id_ptr << " context " << context << "\n";
         if (compiled_module == nullptr) {
             return false;
         }
diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
new file mode 100644
index 000000000000..d62aeb0f1657
--- /dev/null
+++ b/test/common/gpu_context.h
@@ -0,0 +1,135 @@
+#if defined(TEST_OPENCL)
+// Implement OpenCL custom context.
+
+#define CL_TARGET_OPENCL_VERSION 120
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+// Just use a global context and queue, created and destroyed by main.
+cl_context cl_ctx = nullptr;
+cl_command_queue cl_q = nullptr;
+
+// Create the global context. This is just a helper function not called by Halide.
+bool create_opencl_context(cl_context &cl_ctx, cl_command_queue &cl_q) {
+    cl_int err = 0;
+
+    const cl_uint maxPlatforms = 4;
+    cl_platform_id platforms[maxPlatforms];
+    cl_uint platformCount = 0;
+
+    err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
+    if (err != CL_SUCCESS) {
+        printf("clGetPlatformIDs failed (%d)\n", err);
+        return false;
+    }
+
+    cl_platform_id platform = nullptr;
+
+    if (platformCount > 0) {
+        platform = platforms[0];
+    }
+    if (platform == nullptr) {
+        printf("Failed to get platform\n");
+        return false;
+    }
+
+    cl_device_type device_type = CL_DEVICE_TYPE_ALL;
+
+    // Make sure we have a device
+    const cl_uint maxDevices = 4;
+    cl_device_id devices[maxDevices];
+    cl_uint deviceCount = 0;
+    err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
+    if (err != CL_SUCCESS) {
+        printf("clGetDeviceIDs failed (%d)\n", err);
+        return false;
+    }
+    if (deviceCount == 0) {
+        printf("Failed to get device\n");
+        return false;
+    }
+
+    cl_device_id dev = devices[deviceCount - 1];
+
+    // Create context and command queue.
+    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
+                                          0};
+    cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
+    if (err != CL_SUCCESS) {
+        printf("clCreateContext failed (%d)\n", err);
+        return false;
+    }
+
+    cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
+    if (err != CL_SUCCESS) {
+        printf("clCreateCommandQueue failed (%d)\n", err);
+        return false;
+    }
+    return true;
+}
+
+void destroy_opencl_context(cl_context cl_ctx, cl_command_queue cl_q) {
+    clReleaseCommandQueue(cl_q);
+    clReleaseContext(cl_ctx);
+}
+
+#elif defined(TEST_CUDA)
+// Implement CUDA custom context.
+#include <cuda.h>
+
+bool create_cuda_context(CUcontext &cuda_ctx) {
+    // Initialize CUDA
+    CUresult err = cuInit(0);
+    if (err != CUDA_SUCCESS) {
+        printf("cuInit failed (%d)\n", err);
+        return false;
+    }
+
+    // Make sure we have a device
+    int deviceCount = 0;
+    err = cuDeviceGetCount(&deviceCount);
+    if (err != CUDA_SUCCESS) {
+        printf("cuGetDeviceCount failed (%d)\n", err);
+        return false;
+    }
+    if (deviceCount <= 0) {
+        printf("No CUDA devices available\n");
+        return false;
+    }
+
+    CUdevice dev;
+    // Get device
+    CUresult status;
+    // Try to get a device >0 first, since 0 should be our display device
+    // For now, don't try devices > 2 to maintain compatibility with previous behavior.
+    if (deviceCount > 2) deviceCount = 2;
+    for (int id = deviceCount - 1; id >= 0; id--) {
+        status = cuDeviceGet(&dev, id);
+        if (status == CUDA_SUCCESS) break;
+    }
+
+    if (status != CUDA_SUCCESS) {
+        printf("Failed to get CUDA device\n");
+        return status;
+    }
+
+    // Create context
+    err = cuCtxCreate(&cuda_ctx, 0, dev);
+    if (err != CUDA_SUCCESS) {
+        printf("cuCtxCreate failed (%d)\n", err);
+        return false;
+    }
+
+    return true;
+}
+
+void destroy_cuda_context(CUcontext cuda_ctx) {
+    cuCtxDestroy(cuda_ctx);
+}
+
+#endif
+
diff --git a/test/correctness/gpu_many_kernels.cpp b/test/correctness/gpu_many_kernels.cpp
new file mode 100644
index 000000000000..1944dc50b585
--- /dev/null
+++ b/test/correctness/gpu_many_kernels.cpp
@@ -0,0 +1,87 @@
+#include "Halide.h"
+#include <algorithm>
+
+#include "halide_benchmark.h"
+
+using namespace Halide;
+
+constexpr size_t kNumKernels = 70;
+
+int main(int argc, char **argv) {
+    Var x, y, xi, yi;
+    Func adders[kNumKernels];
+    ImageParam input(Int(32), 2);
+
+    Target target = get_jit_target_from_environment();
+    int i = 1;
+    for (Func &f : adders) {
+        f(x, y) = input(x, y) + i;
+        if (target.has_gpu_feature()) {
+            f.compute_root().gpu_tile(x, y, xi, yi, 16, 16);
+        } else {
+            f.compute_root().vectorize(x, target.natural_vector_size<int32_t>());
+        }
+        i += 1;
+    }
+
+    auto start = Halide::Tools::benchmark_now();
+
+    Buffer<int32_t> buf_a_store(32, 32);
+    Buffer<int32_t> buf_b_store(32, 32);
+    Buffer<int32_t> *buf_in = &buf_a_store;
+    Buffer<int32_t> *buf_out = &buf_b_store;
+    buf_in->fill(0);
+    for (Func &f : adders) {
+        input.set(*buf_in);
+        f.realize(*buf_out);
+        std::swap(buf_in, buf_out);
+    }
+    buf_in->copy_to_host();
+
+    auto end = Halide::Tools::benchmark_now();
+    double initial_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
+
+    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
+
+    start = Halide::Tools::benchmark_now();
+
+    buf_in->fill(0);
+    for (Func &f : adders) {
+        input.set(*buf_in);
+        f.realize(*buf_out);
+        std::swap(buf_in, buf_out);
+    }
+    buf_in->copy_to_host();
+
+    end = Halide::Tools::benchmark_now();
+    double precompiled_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
+
+    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
+
+    buf_a_store.device_free();
+    buf_b_store.device_free();
+    const halide_device_interface_t *device = get_device_interface_for_device_api(DeviceAPI::Default_GPU, target);
+    if (device != nullptr) {
+        device->device_release(nullptr, device);
+    }
+
+    start = Halide::Tools::benchmark_now();
+
+    buf_in->fill(0);
+    for (Func &f : adders) {
+        input.set(*buf_in);
+        f.realize(*buf_out);
+        std::swap(buf_in, buf_out);
+    }
+    buf_in->copy_to_host();
+
+    end = Halide::Tools::benchmark_now();
+    double second_runtime = Halide::Tools::benchmark_duration_seconds(start, end);
+
+    buf_in->for_each_value([](int32_t x) { assert(x == (kNumKernels * (kNumKernels + 1)) / 2); });
+
+    printf("Initial runtime %f, precompiled runtime %f, second runtime %f.\n", initial_runtime, precompiled_runtime, second_runtime);
+
+    printf("Success!\n");
+    return 0;
+}    
diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp
index f0fd8ef9cd04..6144426295dc 100644
--- a/test/generator/acquire_release_aottest.cpp
+++ b/test/generator/acquire_release_aottest.cpp
@@ -14,89 +14,25 @@ int main(int argc, char **argv) {
 #include <string.h>
 
 #include "acquire_release.h"
+#include "gpu_context.h"
 
 using namespace Halide::Runtime;
 
 const int W = 256, H = 256;
 
 #if defined(TEST_OPENCL)
-// Implement OpenCL custom context.
-
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
-#include <CL/cl.h>
-#endif
 
 // Just use a global context and queue, created and destroyed by main.
 cl_context cl_ctx = nullptr;
 cl_command_queue cl_q = nullptr;
 
 // Create the global context. This is just a helper function not called by Halide.
-int init_context() {
-    cl_int err = 0;
-
-    const cl_uint maxPlatforms = 4;
-    cl_platform_id platforms[maxPlatforms];
-    cl_uint platformCount = 0;
-
-    err = clGetPlatformIDs(maxPlatforms, platforms, &platformCount);
-    if (err != CL_SUCCESS) {
-        printf("clGetPlatformIDs failed (%d)\n", err);
-        return err;
-    }
-
-    cl_platform_id platform = nullptr;
-
-    if (platformCount > 0) {
-        platform = platforms[0];
-    }
-    if (platform == nullptr) {
-        printf("Failed to get platform\n");
-        return CL_INVALID_PLATFORM;
-    }
-
-    cl_device_type device_type = CL_DEVICE_TYPE_ALL;
-
-    // Make sure we have a device
-    const cl_uint maxDevices = 4;
-    cl_device_id devices[maxDevices];
-    cl_uint deviceCount = 0;
-    err = clGetDeviceIDs(platform, device_type, maxDevices, devices, &deviceCount);
-    if (err != CL_SUCCESS) {
-        printf("clGetDeviceIDs failed (%d)\n", err);
-        return err;
-    }
-    if (deviceCount == 0) {
-        printf("Failed to get device\n");
-        return CL_DEVICE_NOT_FOUND;
-    }
-
-    cl_device_id dev = devices[deviceCount - 1];
-
-    // Create context and command queue.
-    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
-                                          0};
-    cl_ctx = clCreateContext(properties, 1, &dev, nullptr, nullptr, &err);
-    if (err != CL_SUCCESS) {
-        printf("clCreateContext failed (%d)\n", err);
-        return err;
-    }
-
-    cl_q = clCreateCommandQueue(cl_ctx, dev, 0, &err);
-    if (err != CL_SUCCESS) {
-        printf("clCreateCommandQueue failed (%d)\n", err);
-        return err;
-    }
-    printf("Created CL context %p\n", cl_ctx);
-    return 0;
+bool init_context() {
+    return create_opencl_context(cl_ctx, cl_q);
 }
 
 void destroy_context() {
-    printf("Destroying CL context %p\n", cl_ctx);
-    clReleaseCommandQueue(cl_q);
-    clReleaseContext(cl_ctx);
+    destroy_opencl_context(cl_ctx, cl_q);
     cl_q = nullptr;
     cl_ctx = nullptr;
 }
@@ -116,61 +52,14 @@ extern "C" int halide_release_cl_context(void *user_context) {
     return 0;
 }
 #elif defined(TEST_CUDA)
-// Implement CUDA custom context.
-#include <cuda.h>
-
 CUcontext cuda_ctx = nullptr;
 
-int init_context() {
-    // Initialize CUDA
-    CUresult err = cuInit(0);
-    if (err != CUDA_SUCCESS) {
-        printf("cuInit failed (%d)\n", err);
-        return err;
-    }
-
-    // Make sure we have a device
-    int deviceCount = 0;
-    err = cuDeviceGetCount(&deviceCount);
-    if (err != CUDA_SUCCESS) {
-        printf("cuGetDeviceCount failed (%d)\n", err);
-        return err;
-    }
-    if (deviceCount <= 0) {
-        printf("No CUDA devices available\n");
-        return CUDA_ERROR_NO_DEVICE;
-    }
-
-    CUdevice dev;
-    // Get device
-    CUresult status;
-    // Try to get a device >0 first, since 0 should be our display device
-    // For now, don't try devices > 2 to maintain compatibility with previous behavior.
-    if (deviceCount > 2) deviceCount = 2;
-    for (int id = deviceCount - 1; id >= 0; id--) {
-        status = cuDeviceGet(&dev, id);
-        if (status == CUDA_SUCCESS) break;
-    }
-
-    if (status != CUDA_SUCCESS) {
-        printf("Failed to get CUDA device\n");
-        return status;
-    }
-
-    // Create context
-    err = cuCtxCreate(&cuda_ctx, 0, dev);
-    if (err != CUDA_SUCCESS) {
-        printf("cuCtxCreate failed (%d)\n", err);
-        return err;
-    }
-    printf("Created CUDA context %p\n", cuda_ctx);
-
-    return 0;
+bool init_context() {
+    return create_cuda_context(cuda_ctx);
 }
 
 void destroy_context() {
-    printf("Destroying CUDA context %p\n", cuda_ctx);
-    cuCtxDestroy(cuda_ctx);
+    destroy_cuda_context(cuda_ctx);
     cuda_ctx = nullptr;
 }
 
@@ -189,19 +78,18 @@ extern "C" int halide_cuda_release_context(void *user_context) {
 }
 #else
 // Just use the default implementation of acquire/release.
-int init_context() {
+bool init_context() {
     printf("Using default implementation of acquire/release\n");
-    return 0;
+    return true;
 }
 void destroy_context() {
 }
 #endif
 
-int main(int argc, char **argv) {
+bool run_test() {
     // Initialize the runtime specific GPU context.
-    int ret = init_context();
-    if (ret != 0) {
-        return ret;
+    if (!init_context()) {
+        return false;
     }
 
     // Everything else is a normal Halide program. The GPU runtime will call
@@ -227,20 +115,41 @@ int main(int argc, char **argv) {
             if (input(x, y) * 2.0f + 1.0f != output(x, y)) {
                 printf("Error at (%d, %d): %f != %f\n", x, y, input(x, y) * 2.0f + 1.0f,
                        output(x, y));
-                return -1;
+                return false;
             }
         }
     }
 
+    const halide_device_interface_t *interface = output.raw_buffer()->device_interface;
+
     // We need to free our GPU buffers before destroying the context.
     input.device_free();
     output.device_free();
 
+    if (interface != nullptr) {
+        halide_device_release(nullptr, interface);
+    } else {
+        printf("Device interface is nullptr.\n");
+        return false;
+    }
+
     // Free the context we created.
     destroy_context();
 
     printf("Success!\n");
-    return 0;
+    return true;
+}
+
+int main(int argc, char **argv) {
+  if (!run_test()) {
+    return -1;
+  }
+  
+  if (!run_test()) {
+    return -1;
+  }
+
+  return 0;
 }
 
 #endif
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
new file mode 100644
index 000000000000..93412c11dcdb
--- /dev/null
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -0,0 +1,177 @@
+#include <stdio.h>
+
+#ifdef _WIN32
+int main(int argc, char **argv) {
+    printf("[SKIP] Test requires weak linkage, which is not available on Windows.\n");
+    return 0;
+}
+#else
+
+#include "HalideBuffer.h"
+#include "HalideRuntime.h"
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+#include <thread>
+
+#include "gpu_multi_add.h"
+#include "gpu_multi_mul.h"
+#include "gpu_context.h"
+
+using namespace Halide::Runtime;
+
+const int W = 32, H = 32;
+
+#if defined(TEST_OPENCL)
+
+struct gpu_context {
+    cl_context cl_ctx;
+    cl_command_queue cl_q;
+};
+
+// Create the global context. This is just a helper function not called by Halide.
+bool init_context(gpu_context &context) {
+    return create_opencl_context(context.cl_ctx, context.cl_q);
+}
+
+void destroy_context(gpu_context &context) {
+    destroy_opencl_context(context.cl_ctx, context.cl_q);
+    cl_q = nullptr;
+    cl_ctx = nullptr;
+}
+
+// These functions replace the acquire/release implementation in src/runtime/opencl.cpp.
+// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization
+// in our implementation of these functions.
+extern "C" int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl_command_queue *q, bool create) {
+    if (user_context == nullptr) {
+        assert(!create);
+        *ctx = nullptr;
+        *q = nullptr;
+    } else {
+        const gpu_context *context = (const gpu_context *)user_context;
+        *ctx = context->cl_ctx;
+        *q = context->cl_q;
+    }
+    return 0;
+}
+
+extern "C" int halide_release_cl_context(void *user_context) {
+    return 0;
+}
+#elif defined(TEST_CUDA)
+
+typedef CUcontext gpu_context;
+
+bool init_context(CUcontext &cuda_ctx) {
+    return create_cuda_context(cuda_ctx);
+}
+
+void destroy_context(CUcontext &cuda_ctx) {
+    destroy_cuda_context(cuda_ctx);
+    cuda_ctx = nullptr;
+}
+
+// These functions replace the acquire/release implementation in src/runtime/cuda.cpp.
+// Since we don't parallelize access to the GPU in the schedule, we don't need synchronization
+// in our implementation of these functions.
+extern "C" int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, bool create) {
+    if (user_context == nullptr) {
+        assert(!create);
+        *ctx = nullptr;
+    } else {
+        *ctx = *(CUcontext *)user_context;
+    }
+    return 0;
+}
+
+extern "C" int halide_cuda_release_context(void *user_context) {
+    return 0;
+}
+#else
+// Just use the default implementation of acquire/release.
+bool init_context() {
+    printf("Using default implementation of acquire/release\n");
+    return true;
+}
+void destroy_context() {
+}
+#endif
+
+void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) {
+    gpu_context context2;
+
+    Buffer<int32_t> buf1_in(W, H);
+    Buffer<int32_t> buf1_result(W, H);
+    buf1_in.fill(0);
+
+    const halide_device_interface_t *device_interface;
+
+    int val = 0;
+    for (int i = 0; i < 10; i++) {
+        init_context(context2);
+
+        Buffer<int32_t> buf2_in(W, H);
+        Buffer<int32_t> buf2_result(W, H);
+        buf2_in.fill(0);
+
+        gpu_multi_add(&context1, buf1_in, buf1_result);
+        gpu_multi_mul(&context1, buf1_result, buf1_in);
+        gpu_multi_add(&context1, buf1_in, buf1_result);
+
+        gpu_multi_add(&context2, buf2_in, buf2_result);
+        gpu_multi_mul(&context2, buf2_result, buf2_in);
+        gpu_multi_add(&context2, buf2_in, buf2_result);
+
+        buf1_result.copy_to_host(&context1);
+        buf2_result.copy_to_host(&context2);
+
+        val += 2;
+        val *= 2;
+        assert(buf1_result.all_equal(val + 2));
+        assert(buf2_result.all_equal(6));
+
+        device_interface = buf1_result.raw_buffer()->device_interface;
+
+        // About to destroy context, so ensure allocations are freed first.
+        buf2_in.device_free(&context2);
+        buf2_result.device_free(&context2);
+
+        halide_device_release(&context2, device_interface);
+        destroy_context(context2);
+    }
+ 
+    // About to destroy context, so ensure allocations are freed first.
+    buf1_in.device_free(&context1);
+    buf1_result.device_free(&context1);
+
+    if (destroy_when_done) {
+        halide_device_release(&context1, device_interface);
+        destroy_context(context1);
+    }
+}
+
+int main(int argc, char **argv) {
+    gpu_context contexta;
+    init_context(contexta);
+
+    gpu_context contextb;
+    init_context(contextb);
+
+    std::thread thread1(run_kernels_on_thread, contexta, false);
+    std::thread thread2(run_kernels_on_thread, contextb, false);
+
+    thread1.join();
+    thread2.join();
+
+    // Make sure using the same context on different threads works.
+    std::thread thread3(run_kernels_on_thread, contexta, true);
+    std::thread thread4(run_kernels_on_thread, contextb, true);
+
+    thread3.join();
+    thread4.join();
+
+    printf("Success!\n");
+    return 0;
+}
+#endif // !WIN32
diff --git a/test/generator/gpu_multi_generator.cpp b/test/generator/gpu_multi_generator.cpp
new file mode 100644
index 000000000000..52ebb3f77320
--- /dev/null
+++ b/test/generator/gpu_multi_generator.cpp
@@ -0,0 +1,48 @@
+#include "Halide.h"
+
+namespace {
+
+class GpuAdd : public Halide::Generator<GpuAdd> {
+public:
+    Input<Buffer<int32_t>> input{"input", 2};
+
+    Output<Buffer<int32_t>> output{"output", 2};
+
+    void generate() {
+        Var x("x"), y("y");
+
+        // Create a simple pipeline that scales pixel values by 2.
+        output(x, y) = input(x, y) + 2;
+
+        Target target = get_target();
+        if (target.has_gpu_feature()) {
+            Var xo, yo, xi, yi;
+            output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
+        }
+    }
+};
+
+class GpuMul : public Halide::Generator<GpuMul> {
+public:
+    Input<Buffer<int32_t>> input{"input", 2};
+
+    Output<Buffer<int32_t>> output{"output", 2};
+
+    void generate() {
+        Var x("x"), y("y");
+
+        // Create a simple pipeline that scales pixel values by 2.
+        output(x, y) = input(x, y) * 2;
+
+        Target target = get_target();
+        if (target.has_gpu_feature()) {
+            Var xo, yo, xi, yi;
+            output.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
+        }
+    }
+};
+
+}  // namespace
+
+HALIDE_REGISTER_GENERATOR(GpuAdd, gpu_multi_add)
+HALIDE_REGISTER_GENERATOR(GpuMul, gpu_multi_mul)

From 48cf2bec3a296cf7e3b9aeb0aefe09fafed62cbd Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Tue, 24 Nov 2020 00:22:07 -0800
Subject: [PATCH 04/17] Add cmake support.

Add initial commits explaining what tests do.
---
 test/correctness/gpu_many_kernels.cpp               |  5 +++++
 test/generator/CMakeLists.txt                       | 13 +++++++++++++
 .../gpu_multi_context_threaded_aottest.cpp          |  5 +++++
 3 files changed, 23 insertions(+)

diff --git a/test/correctness/gpu_many_kernels.cpp b/test/correctness/gpu_many_kernels.cpp
index 1944dc50b585..91960254dd77 100644
--- a/test/correctness/gpu_many_kernels.cpp
+++ b/test/correctness/gpu_many_kernels.cpp
@@ -3,6 +3,11 @@
 
 #include "halide_benchmark.h"
 
+// This test makes sure GPU runtimes can handle many different small
+// kernels and can handle releasing a device context and making a new
+// one and still have many kernels work. This is needed due to kernel
+// compilation caching mechanisms int he GPU runtimes.
+
 using namespace Halide;
 
 constexpr size_t kNumKernels = 70;
diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt
index 2db1d3bc4f20..796cd04fe240 100644
--- a/test/generator/CMakeLists.txt
+++ b/test/generator/CMakeLists.txt
@@ -257,6 +257,19 @@ halide_define_aot_test(external_code GEN_DEPS external_code_generator_deps)
 # float16_t_generator.cpp
 halide_define_aot_test(float16_t)
 
+# gpu_multi_context_threaded_aottest.cpp
+# gpu_multi_generator.cpp
+halide_define_aot_test(gpu_multi_context_threaded
+                       OMIT_DEFAULT_GENERATOR
+                       EXTRA_LIBS
+                       gpu_multi_add
+                       gpu_multi_mul)
+
+add_halide_library(gpu_multi_add FROM gpu_multi.generator 
+                   FEATURES user_context)
+add_halide_library(gpu_multi_mul FROM gpu_multi.generator
+                   FEATURES user_context)
+
 # gpu_object_lifetime_aottest.cpp
 # gpu_object_lifetime_generator.cpp
 halide_define_aot_test(gpu_object_lifetime FEATURES debug)
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index 93412c11dcdb..c8a86b7b0bc8 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -1,5 +1,10 @@
 #include <stdio.h>
 
+// This test demonstrates how to use more than one GPU context with
+// Halide generated GPU support, specifically in a multithreaded
+// program. It of course also tests that this works correctly with the
+// Halide GPU runtimes.
+
 #ifdef _WIN32
 int main(int argc, char **argv) {
     printf("[SKIP] Test requires weak linkage, which is not available on Windows.\n");

From f9ba360cd29d3dae64f9538c1c03aef0459211d6 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Tue, 24 Nov 2020 01:01:12 -0800
Subject: [PATCH 05/17] Address clang format lint issues.

---
 src/runtime/cuda.cpp                          |  6 ++--
 src/runtime/d3d12compute.cpp                  |  2 +-
 src/runtime/destructors.cpp                   |  2 --
 src/runtime/gpu_context_common.h              | 33 ++++++++++---------
 src/runtime/opencl.cpp                        |  6 ++--
 test/correctness/gpu_many_kernels.cpp         |  2 +-
 test/generator/acquire_release_aottest.cpp    | 18 +++++-----
 .../gpu_multi_context_threaded_aottest.cpp    |  7 ++--
 8 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 6c033a3394fc..e3d76c815603 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1,7 +1,7 @@
 #include "HalideRuntimeCuda.h"
-#include "gpu_context_common.h"
 #include "device_buffer_utils.h"
 #include "device_interface.h"
+#include "gpu_context_common.h"
 #include "mini_cuda.h"
 #include "printer.h"
 #include "scoped_mutex_lock.h"
@@ -471,7 +471,7 @@ WEAK bool validate_device_pointer(void *user_context, halide_buffer_t *buf, size
 }
 
 WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size) {
-   debug(user_context) << "CUDA: compile_kernel cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
+    debug(user_context) << "CUDA: compile_kernel cuModuleLoadData " << (void *)ptx_src << ", " << size << " -> ";
 
     CUjit_option options[] = {CU_JIT_MAX_REGISTERS};
     unsigned int max_regs_per_thread = 64;
@@ -496,7 +496,7 @@ WEAK CUmodule compile_kernel(void *user_context, const char *ptx_src, int size)
     }
     return loaded_module;
 }
-  
+
 }  // namespace Cuda
 }  // namespace Internal
 }  // namespace Runtime
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index dfdc70df87b4..40e8c9dbcbb4 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -2834,7 +2834,7 @@ WEAK int halide_d3d12compute_device_release(void *user_context) {
         }
 
         compilation_cache.delete_context(user_context, device, release_object<d3d12_library>);
-        
+
         // Release the device itself, if we created it.
         if (acquired_device == device) {
             release_object(&upload);
diff --git a/src/runtime/destructors.cpp b/src/runtime/destructors.cpp
index b72f106d784f..b7187c6c4d86 100644
--- a/src/runtime/destructors.cpp
+++ b/src/runtime/destructors.cpp
@@ -4,13 +4,11 @@
 extern "C" {
 
 ALWAYS_INLINE __attribute__((used)) void call_destructor(void *user_context, void (*fn)(void *user_context, void *object), void **object, bool should_call) {
-  debug(nullptr) << "call_destructor called.\n";
     void *o = *object;
     *object = nullptr;
     // Call the function
     if (o && should_call) {
         fn(user_context, o);
     }
-  debug(nullptr) << "call_destructor done.\n";
 }
 }
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index 52ac4d8ca6cf..5d0d88ba8e07 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -1,9 +1,10 @@
 #include "printer.h"
 #include "scoped_mutex_lock.h"
 
-namespace Halide { namespace Internal {
-  
-template <typename ContextT, typename ModuleStateT>
+namespace Halide {
+namespace Internal {
+
+template<typename ContextT, typename ModuleStateT>
 struct GPUCompilationCache {
     struct CachedCompilation {
         ContextT context;
@@ -15,15 +16,15 @@ struct GPUCompilationCache {
 
     static constexpr float kLoadFactor = .5f;
     static constexpr int kInitialTableBits = 7;
-    int log2_compilations_size{0}; // number of bits in index into compilations table.
+    int log2_compilations_size{0};  // number of bits in index into compilations table.
     CachedCompilation *compilations{nullptr};
     int count{0};
 
     static constexpr uint32_t kInvalidId{0};
     static constexpr uint32_t kDeletedId{1};
 
-    uint32_t unique_id{2}; // zero is an invalid id
-  
+    uint32_t unique_id{2};  // zero is an invalid id
+
     static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) {
         uintptr_t addr = (uintptr_t)context + id;
         // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...
@@ -93,7 +94,7 @@ struct GPUCompilationCache {
         }
         return false;
     }
- 
+
     bool resize_table(int size_bits) {
         if (size_bits != log2_compilations_size) {
             int new_size = (1 << size_bits);
@@ -107,8 +108,8 @@ struct GPUCompilationCache {
             CachedCompilation *old_table = compilations;
             compilations = new_table;
             log2_compilations_size = size_bits;
- 
-            if (count > 0) { // Mainly to catch empty initial table case
+
+            if (count > 0) {  // Mainly to catch empty initial table case
                 for (int32_t i = 0; i < old_size; i++) {
                     if (old_table[i].kernel_id != kInvalidId &&
                         old_table[i].kernel_id != kDeletedId) {
@@ -121,7 +122,7 @@ struct GPUCompilationCache {
         return true;
     }
 
-    template <typename FreeModuleT>
+    template<typename FreeModuleT>
     void release_context(void *user_context, bool all, ContextT context, FreeModuleT &f) {
         if (count == 0) {
             return;
@@ -138,7 +139,7 @@ struct GPUCompilationCache {
         }
     }
 
-    template <typename FreeModuleT>
+    template<typename FreeModuleT>
     void delete_context(void *user_context, ContextT context, FreeModuleT &f) {
         ScopedMutexLock lock_guard(&mutex);
 
@@ -155,8 +156,8 @@ struct GPUCompilationCache {
         log2_compilations_size = 0;
     }
 
-    template <typename CompileModuleT, typename... Args>
-    bool kernel_state_setup(void * user_context, void **state_ptr,
+    template<typename CompileModuleT, typename... Args>
+    bool kernel_state_setup(void *user_context, void **state_ptr,
                             ContextT context, ModuleStateT &result,
                             CompileModuleT f,
                             Args... args) {
@@ -182,10 +183,10 @@ struct GPUCompilationCache {
 
         insert(context, *id_ptr, compiled_module);
         result = compiled_module;
-        
+
         return true;
     }
-
 };
 
-} }
+}  // namespace Internal
+}  // namespace Halide
diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp
index 722c9f12d5a9..5445125526fe 100644
--- a/src/runtime/opencl.cpp
+++ b/src/runtime/opencl.cpp
@@ -549,7 +549,7 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
     return err;
 }
 
-WEAK cl_program compile_kernel(void *user_context, ClContext ctx, const char*src, int size) {
+WEAK cl_program compile_kernel(void *user_context, ClContext ctx, const char *src, int size) {
     cl_int err = 0;
     cl_device_id dev;
 
@@ -752,7 +752,7 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr,
         return halide_error_code_generic_error;
     }
     halide_assert(user_context, program != nullptr);
-    
+
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
     debug(user_context) << "    Time: " << (t_after - t_before) / 1.0e6 << " ms\n";
@@ -1339,7 +1339,7 @@ WEAK const struct halide_device_interface_t *halide_opencl_device_interface() {
 
 namespace {
 WEAK __attribute__((destructor)) void halide_opencl_cleanup() {
-  compilation_cache.release_all(nullptr, clReleaseProgram);
+    compilation_cache.release_all(nullptr, clReleaseProgram);
     halide_opencl_device_release(nullptr);
 }
 }  // namespace
diff --git a/test/correctness/gpu_many_kernels.cpp b/test/correctness/gpu_many_kernels.cpp
index 91960254dd77..a5568f32d3a5 100644
--- a/test/correctness/gpu_many_kernels.cpp
+++ b/test/correctness/gpu_many_kernels.cpp
@@ -89,4 +89,4 @@ int main(int argc, char **argv) {
 
     printf("Success!\n");
     return 0;
-}    
+}
diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp
index 6144426295dc..2cbebe51f05a 100644
--- a/test/generator/acquire_release_aottest.cpp
+++ b/test/generator/acquire_release_aottest.cpp
@@ -141,15 +141,15 @@ bool run_test() {
 }
 
 int main(int argc, char **argv) {
-  if (!run_test()) {
-    return -1;
-  }
-  
-  if (!run_test()) {
-    return -1;
-  }
-
-  return 0;
+    if (!run_test()) {
+      return -1;
+    }
+
+    if (!run_test()) {
+        return -1;
+    }
+
+    return 0;
 }
 
 #endif
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index c8a86b7b0bc8..3b438eef57a0 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -19,9 +19,10 @@ int main(int argc, char **argv) {
 #include <string.h>
 #include <thread>
 
+#include "gpu_context.h"
+
 #include "gpu_multi_add.h"
 #include "gpu_multi_mul.h"
-#include "gpu_context.h"
 
 using namespace Halide::Runtime;
 
@@ -145,7 +146,7 @@ void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) {
         halide_device_release(&context2, device_interface);
         destroy_context(context2);
     }
- 
+
     // About to destroy context, so ensure allocations are freed first.
     buf1_in.device_free(&context1);
     buf1_result.device_free(&context1);
@@ -179,4 +180,4 @@ int main(int argc, char **argv) {
     printf("Success!\n");
     return 0;
 }
-#endif // !WIN32
+#endif  // !WIN32

From 1b6335767bc14f339eac8c9794f331e16bb0c460 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Tue, 24 Nov 2020 01:53:46 -0800
Subject: [PATCH 06/17] More formatting fixes.

---
 src/runtime/d3d12compute.cpp               | 1 -
 src/runtime/gpu_context_common.h           | 2 +-
 test/generator/acquire_release_aottest.cpp | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 40e8c9dbcbb4..7b5a60dd77be 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -3566,4 +3566,3 @@ WEAK halide_device_interface_t d3d12compute_device_interface = {
 }  // namespace Internal
 }  // namespace Runtime
 }  // namespace Halide
-
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index 5d0d88ba8e07..1060df4a9409 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -146,7 +146,7 @@ struct GPUCompilationCache {
         release_context(user_context, false, context, f);
     }
 
-    template <typename FreeModuleT>
+    template<typename FreeModuleT>
     void release_all(void *user_context, FreeModuleT &f) {
         ScopedMutexLock lock_guard(&mutex);
 
diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp
index 2cbebe51f05a..9e4fba70afe6 100644
--- a/test/generator/acquire_release_aottest.cpp
+++ b/test/generator/acquire_release_aottest.cpp
@@ -142,7 +142,7 @@ bool run_test() {
 
 int main(int argc, char **argv) {
     if (!run_test()) {
-      return -1;
+        return -1;
     }
 
     if (!run_test()) {

From a406376d9cd16b3a330eb380fd8b3e4b37c1d2c8 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Tue, 24 Nov 2020 09:31:13 -0800
Subject: [PATCH 07/17] Fix spelling error.

---
 src/runtime/metal.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp
index 0d34a3cf021b..25fe29f3feda 100644
--- a/src/runtime/metal.cpp
+++ b/src/runtime/metal.cpp
@@ -741,8 +741,8 @@ WEAK int halide_metal_run(void *user_context,
     }
 
     mtl_library *library;
-    bool found_libdary = compilation_cache.lookup(metal_context.device, state_ptr, library);
-    halide_assert(user_context, found_libdary && library != nullptr);
+    bool found_library = compilation_cache.lookup(metal_context.device, state_ptr, library);
+    halide_assert(user_context, found_library && library != nullptr);
 
     mtl_function *function = new_function_with_name(library, entry_name, strlen(entry_name));
     if (function == nullptr) {

From d5e60af6fa46c65083572b243fe65f8a36175557 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Tue, 24 Nov 2020 09:35:45 -0800
Subject: [PATCH 08/17] Clang format fix.

---
 test/common/gpu_context.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h
index d62aeb0f1657..4816ad205866 100644
--- a/test/common/gpu_context.h
+++ b/test/common/gpu_context.h
@@ -132,4 +132,3 @@ void destroy_cuda_context(CUcontext cuda_ctx) {
 }
 
 #endif
-

From 61cabe3d0f2cd6c2d14fc6c0660ab59e09db04f7 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Tue, 24 Nov 2020 10:43:22 -0800
Subject: [PATCH 09/17] Rename gpu_multi_generator.cpp to match the name of hte
 test that uses it to stick closer to naming pattern and work with CMake rules
 code.

---
 Makefile                                         |  8 ++++----
 test/generator/CMakeLists.txt                    |  8 ++++----
 .../gpu_multi_context_threaded_aottest.cpp       | 16 ++++++++--------
 ... => gpu_multi_context_threaded_generator.cpp} |  4 ++--
 4 files changed, 18 insertions(+), 18 deletions(-)
 rename test/generator/{gpu_multi_generator.cpp => gpu_multi_context_threaded_generator.cpp} (88%)

diff --git a/Makefile b/Makefile
index 70e7e26b96bb..7d66abf1b05a 100644
--- a/Makefile
+++ b/Makefile
@@ -1529,9 +1529,9 @@ $(FILTERS_DIR)/nested_externs_%.a: $(BIN_DIR)/nested_externs.generator
 
 # Similarly, gpu_multi needs two different kernels to test compilation caching.
 # Also requies user-context.
-$(FILTERS_DIR)/gpu_multi_%.a: $(BIN_DIR)/gpu_multi.generator
+$(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_threaded.generator
 	@mkdir -p $(@D)
-	$(CURDIR)/$< -g gpu_multi_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context
+	$(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context
 
 GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas
 GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools
@@ -1629,11 +1629,11 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
 	@-echo
 
 # gpu_multi_context_threaded has additional deps to link in
-$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp $(FILTERS_DIR)/gpu_multi_add.a $(FILTERS_DIR)/gpu_multi_mul.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
+$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp $(FILTERS_DIR)/gpu_multi_context_threaded_add.a $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
 	@mkdir -p $(@D)
 	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
 
-$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp $(FILTERS_DIR)/gpu_multi_add.halide_generated.cpp $(FILTERS_DIR)/gpu_multi_mul.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
+$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp $(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp $(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
 	@mkdir -p $(@D)
 	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
 
diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt
index 796cd04fe240..618fa60b4bae 100644
--- a/test/generator/CMakeLists.txt
+++ b/test/generator/CMakeLists.txt
@@ -262,12 +262,12 @@ halide_define_aot_test(float16_t)
 halide_define_aot_test(gpu_multi_context_threaded
                        OMIT_DEFAULT_GENERATOR
                        EXTRA_LIBS
-                       gpu_multi_add
-                       gpu_multi_mul)
+                       gpu_multi_context_threaded_add
+                       gpu_multi_context_threaded_mul)
 
-add_halide_library(gpu_multi_add FROM gpu_multi.generator 
+add_halide_library(gpu_multi_context_threaded_add FROM gpu_multi_context_threaded.generator 
                    FEATURES user_context)
-add_halide_library(gpu_multi_mul FROM gpu_multi.generator
+add_halide_library(gpu_multi_context_threaded_mul FROM gpu_multi_context_threaded.generator
                    FEATURES user_context)
 
 # gpu_object_lifetime_aottest.cpp
diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index 3b438eef57a0..9ed77ff77e82 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -21,8 +21,8 @@ int main(int argc, char **argv) {
 
 #include "gpu_context.h"
 
-#include "gpu_multi_add.h"
-#include "gpu_multi_mul.h"
+#include "gpu_multi_context_threaded_add.h"
+#include "gpu_multi_context_threaded_mul.h"
 
 using namespace Halide::Runtime;
 
@@ -121,13 +121,13 @@ void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) {
         Buffer<int32_t> buf2_result(W, H);
         buf2_in.fill(0);
 
-        gpu_multi_add(&context1, buf1_in, buf1_result);
-        gpu_multi_mul(&context1, buf1_result, buf1_in);
-        gpu_multi_add(&context1, buf1_in, buf1_result);
+        gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result);
+        gpu_multi_context_threaded_mul(&context1, buf1_result, buf1_in);
+        gpu_multi_context_threaded_add(&context1, buf1_in, buf1_result);
 
-        gpu_multi_add(&context2, buf2_in, buf2_result);
-        gpu_multi_mul(&context2, buf2_result, buf2_in);
-        gpu_multi_add(&context2, buf2_in, buf2_result);
+        gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result);
+        gpu_multi_context_threaded_mul(&context2, buf2_result, buf2_in);
+        gpu_multi_context_threaded_add(&context2, buf2_in, buf2_result);
 
         buf1_result.copy_to_host(&context1);
         buf2_result.copy_to_host(&context2);
diff --git a/test/generator/gpu_multi_generator.cpp b/test/generator/gpu_multi_context_threaded_generator.cpp
similarity index 88%
rename from test/generator/gpu_multi_generator.cpp
rename to test/generator/gpu_multi_context_threaded_generator.cpp
index 52ebb3f77320..42f278b6379b 100644
--- a/test/generator/gpu_multi_generator.cpp
+++ b/test/generator/gpu_multi_context_threaded_generator.cpp
@@ -44,5 +44,5 @@ class GpuMul : public Halide::Generator<GpuMul> {
 
 }  // namespace
 
-HALIDE_REGISTER_GENERATOR(GpuAdd, gpu_multi_add)
-HALIDE_REGISTER_GENERATOR(GpuMul, gpu_multi_mul)
+HALIDE_REGISTER_GENERATOR(GpuAdd, gpu_multi_context_threaded_add)
+HALIDE_REGISTER_GENERATOR(GpuMul, gpu_multi_context_threaded_mul)

From ffdcb356b96b76da80b1b6f602d6ba7f6ada6fea Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 25 Nov 2020 14:37:19 -0800
Subject: [PATCH 10/17] Was using the arong kind of context value. Worked fine,
 but was passing more than it needed to.

---
 src/runtime/opencl.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp
index 5445125526fe..7c7815b620d7 100644
--- a/src/runtime/opencl.cpp
+++ b/src/runtime/opencl.cpp
@@ -549,11 +549,11 @@ WEAK int create_opencl_context(void *user_context, cl_context *ctx, cl_command_q
     return err;
 }
 
-WEAK cl_program compile_kernel(void *user_context, ClContext ctx, const char *src, int size) {
+WEAK cl_program compile_kernel(void *user_context, cl_context ctx, const char *src, int size) {
     cl_int err = 0;
     cl_device_id dev;
 
-    err = clGetContextInfo(ctx.context, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr);
+    err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, sizeof(dev), &dev, nullptr);
     if (err != CL_SUCCESS) {
         error(user_context) << "CL: clGetContextInfo(CL_CONTEXT_DEVICES) failed: "
                             << get_opencl_error_name(err);
@@ -589,7 +589,7 @@ WEAK cl_program compile_kernel(void *user_context, ClContext ctx, const char *sr
 
     const char *sources[] = {src};
     debug(user_context) << "    clCreateProgramWithSource -> ";
-    cl_program program = clCreateProgramWithSource(ctx.context, 1, &sources[0], nullptr, &err);
+    cl_program program = clCreateProgramWithSource(ctx, 1, &sources[0], nullptr, &err);
     if (err != CL_SUCCESS) {
         debug(user_context) << get_opencl_error_name(err) << "\n";
         error(user_context) << "CL: clCreateProgramWithSource failed: "
@@ -748,7 +748,7 @@ WEAK int halide_opencl_initialize_kernels(void *user_context, void **state_ptr,
     debug(user_context) << "halide_cuda_initialize_kernels got compilation_cache mutex.\n";
     cl_program program;
     if (!compilation_cache.kernel_state_setup(user_context, state_ptr, ctx.context, program,
-                                              compile_kernel, user_context, ctx, src, size)) {
+                                              compile_kernel, user_context, ctx.context, src, size)) {
         return halide_error_code_generic_error;
     }
     halide_assert(user_context, program != nullptr);

From 4948453719d06e36e88fd196c733eff66c8725aa Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Mon, 30 Nov 2020 10:02:25 -0800
Subject: [PATCH 11/17] Address review comments, CMake mistake.

---
 Makefile                              | 9 +++++++--
 src/runtime/d3d12compute.cpp          | 2 +-
 test/correctness/CMakeLists.txt       | 3 ++-
 test/correctness/gpu_many_kernels.cpp | 2 +-
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 7d66abf1b05a..836c1462f6d1 100644
--- a/Makefile
+++ b/Makefile
@@ -1629,11 +1629,16 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
 	@-echo
 
 # gpu_multi_context_threaded has additional deps to link in
-$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp $(FILTERS_DIR)/gpu_multi_context_threaded_add.a $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
+$(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
+	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a
+	                                                       $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
 	@mkdir -p $(@D)
 	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
 
-$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp $(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp $(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
+$(BIN_DIR)/$(TARGET)/generator_aotcpp_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
+	                                                          $(FILTERS_DIR)/gpu_multi_context_threaded_add.halide_generated.cpp \
+	                                                          $(FILTERS_DIR)/gpu_multi_context_threaded_mul.halide_generated.cpp \
+	                                                          $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
 	@mkdir -p $(@D)
 	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
 
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 7b5a60dd77be..3174c33f52a4 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -3005,7 +3005,7 @@ WEAK int halide_d3d12compute_run(void *user_context,
     StartCapturingGPUActivity();
 #endif
 
-    d3d12_library *library;
+    d3d12_library *library = nullptr;
     bool found_module = compilation_cache.lookup(device, state_ptr, library);
     halide_assert(user_context, found_module && library != nullptr);
 
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index e1ba9d8e4eb3..a5a2ad7ac10f 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -1,5 +1,5 @@
 tests(GROUPS correctness_multi_gpu
-      SOURCES
+a      SOURCES
       gpu_multi_device.cpp
       )
 
@@ -134,6 +134,7 @@ tests(GROUPS correctness
       gpu_give_input_buffers_device_allocations.cpp
       gpu_jit_explicit_copy_to_device.cpp
       gpu_large_alloc.cpp
+      gpu_many_kernels.cpp
       gpu_mixed_dimensionality.cpp
       gpu_mixed_shared_mem_types.cpp
       gpu_multi_kernel.cpp
diff --git a/test/correctness/gpu_many_kernels.cpp b/test/correctness/gpu_many_kernels.cpp
index a5568f32d3a5..d9572b38c18c 100644
--- a/test/correctness/gpu_many_kernels.cpp
+++ b/test/correctness/gpu_many_kernels.cpp
@@ -6,7 +6,7 @@
 // This test makes sure GPU runtimes can handle many different small
 // kernels and can handle releasing a device context and making a new
 // one and still have many kernels work. This is needed due to kernel
-// compilation caching mechanisms int he GPU runtimes.
+// compilation caching mechanisms in the GPU runtimes.
 
 using namespace Halide;
 

From db39f4f5ad4371ba23d244166c228a39f0dfef3b Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Mon, 30 Nov 2020 17:42:48 -0800
Subject: [PATCH 12/17] Typo fix.

---
 test/correctness/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index a5a2ad7ac10f..68692f12f1ba 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -1,5 +1,5 @@
 tests(GROUPS correctness_multi_gpu
-a      SOURCES
+      SOURCES
       gpu_multi_device.cpp
       )
 

From 7cfb51eadabbd6fa81de79ed39412eafdfa89afd Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 2 Dec 2020 00:40:55 -0800
Subject: [PATCH 13/17] Address review feedback. Fix cmake semantic comment.

---
 src/runtime/gpu_context_common.h | 28 ++++++++++++++++------------
 test/generator/CMakeLists.txt    |  2 +-
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index 1060df4a9409..06186e3b838e 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -14,8 +14,8 @@ struct GPUCompilationCache {
 
     halide_mutex mutex;
 
-    static constexpr float kLoadFactor = .5f;
-    static constexpr int kInitialTableBits = 7;
+    static constexpr float kLoadFactor{.5f};
+    static constexpr int kInitialTableBits{7};
     int log2_compilations_size{0};  // number of bits in index into compilations table.
     CachedCompilation *compilations{nullptr};
     int count{0};
@@ -36,7 +36,7 @@ struct GPUCompilationCache {
         }
     }
 
-    bool insert(ContextT context, uint32_t id, ModuleStateT module_state) {
+    HALIDE_MUST_USE_RESULT bool insert(ContextT context, uint32_t id, ModuleStateT module_state) {
         if (log2_compilations_size == 0) {
             if (!resize_table(kInitialTableBits)) {
                 return false;
@@ -64,7 +64,7 @@ struct GPUCompilationCache {
         return false;
     }
 
-    bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) {
+    HALIDE_MUST_USE_RESULT bool find_internal(ContextT context, uint32_t id, ModuleStateT *&module_state) {
         if (log2_compilations_size == 0) {
             return false;
         }
@@ -84,7 +84,7 @@ struct GPUCompilationCache {
         return false;
     }
 
-    bool lookup(ContextT context, void *state_ptr, ModuleStateT &module_state) {
+    HALIDE_MUST_USE_RESULT bool lookup(ContextT context, void *state_ptr, ModuleStateT &module_state) {
         ScopedMutexLock lock_guard(&mutex);
         uint32_t id = (uint32_t)(uintptr_t)state_ptr;
         ModuleStateT *mod_ptr;
@@ -95,7 +95,7 @@ struct GPUCompilationCache {
         return false;
     }
 
-    bool resize_table(int size_bits) {
+    HALIDE_MUST_USE_RESULT bool resize_table(int size_bits) {
         if (size_bits != log2_compilations_size) {
             int new_size = (1 << size_bits);
             int old_size = (1 << log2_compilations_size);
@@ -113,7 +113,9 @@ struct GPUCompilationCache {
                 for (int32_t i = 0; i < old_size; i++) {
                     if (old_table[i].kernel_id != kInvalidId &&
                         old_table[i].kernel_id != kDeletedId) {
-                        insert(old_table[i].context, old_table[i].kernel_id, old_table[i].module_state);
+                        bool result = insert(old_table[i].context, old_table[i].kernel_id,
+                                             old_table[i].module_state);
+                        halide_assert(nullptr, result);  // Resizing the table while resizing the table is a logic error.
                     }
                 }
             }
@@ -157,10 +159,10 @@ struct GPUCompilationCache {
     }
 
     template<typename CompileModuleT, typename... Args>
-    bool kernel_state_setup(void *user_context, void **state_ptr,
-                            ContextT context, ModuleStateT &result,
-                            CompileModuleT f,
-                            Args... args) {
+    HALIDE_MUST_USE_RESULT bool kernel_state_setup(void *user_context, void **state_ptr,
+                                                   ContextT context, ModuleStateT &result,
+                                                   CompileModuleT f,
+                                                   Args... args) {
         ScopedMutexLock lock_guard(&mutex);
 
         uint32_t *id_ptr = (uint32_t *)state_ptr;
@@ -181,7 +183,9 @@ struct GPUCompilationCache {
             return false;
         }
 
-        insert(context, *id_ptr, compiled_module);
+        if (!insert(context, *id_ptr, compiled_module)) {
+            return false;
+        }
         result = compiled_module;
 
         return true;
diff --git a/test/generator/CMakeLists.txt b/test/generator/CMakeLists.txt
index 618fa60b4bae..130af49b7f93 100644
--- a/test/generator/CMakeLists.txt
+++ b/test/generator/CMakeLists.txt
@@ -258,7 +258,7 @@ halide_define_aot_test(external_code GEN_DEPS external_code_generator_deps)
 halide_define_aot_test(float16_t)
 
 # gpu_multi_context_threaded_aottest.cpp
-# gpu_multi_generator.cpp
+# gpu_multi_context_threaded_generator.cpp
 halide_define_aot_test(gpu_multi_context_threaded
                        OMIT_DEFAULT_GENERATOR
                        EXTRA_LIBS

From 0a57c4bee90f2e06d0a00c2ad042820fca7867a4 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 2 Dec 2020 13:14:07 -0800
Subject: [PATCH 14/17] Address review feedback.

---
 Makefile                         |  3 ++-
 apps/fft/funct.h                 |  2 +-
 src/BoundaryConditions.h         | 14 ++++++-------
 src/Buffer.h                     | 34 ++++++++++++++++----------------
 src/Func.h                       | 16 +++++++--------
 src/Generator.h                  | 28 +++++++++++++-------------
 src/IRMutator.h                  |  2 +-
 src/IROperator.h                 | 22 ++++++++++-----------
 src/IRVisitor.h                  | 12 +++++------
 src/ImageParam.h                 |  2 +-
 src/Pipeline.h                   |  4 ++--
 src/RDom.h                       |  4 ++--
 src/Realization.h                |  2 +-
 src/Tuple.h                      |  2 +-
 src/runtime/HalideBuffer.h       | 10 +++++-----
 src/runtime/cuda.cpp             |  2 +-
 src/runtime/gpu_context_common.h |  3 ++-
 tools/RunGen.h                   |  2 +-
 18 files changed, 83 insertions(+), 81 deletions(-)

diff --git a/Makefile b/Makefile
index 836c1462f6d1..6ce06a94b568 100644
--- a/Makefile
+++ b/Makefile
@@ -1630,7 +1630,8 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
 
 # gpu_multi_context_threaded has additional deps to link in
 $(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
-	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a
+	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a \
+	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \
 	                                                       $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
 	@mkdir -p $(@D)
 	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
diff --git a/apps/fft/funct.h b/apps/fft/funct.h
index 79d558d480ab..a6d5a9036a43 100644
--- a/apps/fft/funct.h
+++ b/apps/fft/funct.h
@@ -59,7 +59,7 @@ class FuncT : public Halide::Func {
     }
 
     template<typename... Args>
-    FuncRefT<T> operator()(Args &&... args) const {
+    FuncRefT<T> operator()(Args &&...args) const {
         return Func::operator()(std::forward<Args>(args)...);
     }
 
diff --git a/src/BoundaryConditions.h b/src/BoundaryConditions.h
index 3c1697643db3..1187c5b15a8e 100644
--- a/src/BoundaryConditions.h
+++ b/src/BoundaryConditions.h
@@ -57,7 +57,7 @@ inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
 
 template<typename... Args>
 inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
-                                                      const Expr &a1, const Expr &a2, Args &&... args) {
+                                                      const Expr &a1, const Expr &a2, Args &&...args) {
     collected_args.push_back(Range(a1, a2));
     collect_region(collected_args, std::forward<Args>(args)...);
 }
@@ -124,7 +124,7 @@ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value,
-                                                  Bounds &&... bounds) {
+                                                  Bounds &&...bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return constant_exterior(Internal::func_like_to_func(func_like), value, collected_bounds);
@@ -132,7 +132,7 @@ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tupl
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value,
-                                                  Bounds &&... bounds) {
+                                                  Bounds &&...bounds) {
     return constant_exterior(func_like, Tuple(value), std::forward<Bounds>(bounds)...);
 }
 // @}
@@ -170,7 +170,7 @@ HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like) {
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_ATTRIBUTE_DEPRECATED("Add braces around the bounds like so: {{a, b}, {c, d}}")
-HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like, Bounds &&... bounds) {
+HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like, Bounds &&...bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return repeat_edge(Internal::func_like_to_func(func_like), collected_bounds);
@@ -210,7 +210,7 @@ HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like) {
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_ATTRIBUTE_DEPRECATED("Add braces around the bounds like so: {{a, b}, {c, d}}")
-HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like, Bounds &&... bounds) {
+HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like, Bounds &&...bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return repeat_image(Internal::func_like_to_func(func_like), collected_bounds);
@@ -250,7 +250,7 @@ HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like) {
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_ATTRIBUTE_DEPRECATED("Add braces around the bounds like so: {{a, b}, {c, d}}")
-HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like, Bounds &&... bounds) {
+HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like, Bounds &&...bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return mirror_image(Internal::func_like_to_func(func_like), collected_bounds);
@@ -293,7 +293,7 @@ HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like) {
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_ATTRIBUTE_DEPRECATED("Add braces around the bounds like so: {{a, b}, {c, d}}")
-HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like, Bounds &&... bounds) {
+HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like, Bounds &&...bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return mirror_interior(Internal::func_like_to_func(func_like), collected_bounds);
diff --git a/src/Buffer.h b/src/Buffer.h
index 477d6a893995..da4c19b2c027 100644
--- a/src/Buffer.h
+++ b/src/Buffer.h
@@ -52,7 +52,7 @@ inline std::string get_name_from_end_of_parameter_pack() {
 template<typename First,
          typename Second,
          typename... Args>
-std::string get_name_from_end_of_parameter_pack(First first, Second second, Args &&... rest) {
+std::string get_name_from_end_of_parameter_pack(First first, Second second, Args &&...rest) {
     return get_name_from_end_of_parameter_pack(second, std::forward<Args>(rest)...);
 }
 
@@ -63,13 +63,13 @@ inline void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &) {
 }
 
 template<typename... Args>
-void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &result, int x, Args &&... rest) {
+void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &result, int x, Args &&...rest) {
     result.push_back(x);
     get_shape_from_start_of_parameter_pack_helper(result, std::forward<Args>(rest)...);
 }
 
 template<typename... Args>
-std::vector<int> get_shape_from_start_of_parameter_pack(Args &&... args) {
+std::vector<int> get_shape_from_start_of_parameter_pack(Args &&...args) {
     std::vector<int> result;
     get_shape_from_start_of_parameter_pack_helper(result, std::forward<Args>(args)...);
     return result;
@@ -248,7 +248,7 @@ class Buffer {
              typename = typename std::enable_if<Internal::all_ints_and_optional_name<Args...>::value>::type>
     explicit Buffer(Type t,
                     Internal::add_const_if_T_is_const<T, void> *data,
-                    int first, Args &&... rest)
+                    int first, Args &&...rest)
         : Buffer(Runtime::Buffer<T>(t, data, Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
                  Internal::get_name_from_end_of_parameter_pack(rest...)) {
     }
@@ -265,7 +265,7 @@ class Buffer {
     template<typename... Args,
              typename = typename std::enable_if<Internal::all_ints_and_optional_name<Args...>::value>::type>
     explicit Buffer(T *data,
-                    int first, Args &&... rest)
+                    int first, Args &&...rest)
         : Buffer(Runtime::Buffer<T>(data, Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
                  Internal::get_name_from_end_of_parameter_pack(rest...)) {
     }
@@ -387,18 +387,18 @@ class Buffer {
     // @}
 
     // We forward numerous methods from the underlying Buffer
-#define HALIDE_BUFFER_FORWARD_CONST(method)                                                                                      \
-    template<typename... Args>                                                                                                   \
-    auto method(Args &&... args) const->decltype(std::declval<const Runtime::Buffer<T>>().method(std::forward<Args>(args)...)) { \
-        user_assert(defined()) << "Undefined buffer calling const method " #method "\n";                                         \
-        return get()->method(std::forward<Args>(args)...);                                                                       \
+#define HALIDE_BUFFER_FORWARD_CONST(method)                                                                                     \
+    template<typename... Args>                                                                                                  \
+    auto method(Args &&...args) const->decltype(std::declval<const Runtime::Buffer<T>>().method(std::forward<Args>(args)...)) { \
+        user_assert(defined()) << "Undefined buffer calling const method " #method "\n";                                        \
+        return get()->method(std::forward<Args>(args)...);                                                                      \
     }
 
-#define HALIDE_BUFFER_FORWARD(method)                                                                                \
-    template<typename... Args>                                                                                       \
-    auto method(Args &&... args)->decltype(std::declval<Runtime::Buffer<T>>().method(std::forward<Args>(args)...)) { \
-        user_assert(defined()) << "Undefined buffer calling method " #method "\n";                                   \
-        return get()->method(std::forward<Args>(args)...);                                                           \
+#define HALIDE_BUFFER_FORWARD(method)                                                                               \
+    template<typename... Args>                                                                                      \
+    auto method(Args &&...args)->decltype(std::declval<Runtime::Buffer<T>>().method(std::forward<Args>(args)...)) { \
+        user_assert(defined()) << "Undefined buffer calling method " #method "\n";                                  \
+        return get()->method(std::forward<Args>(args)...);                                                          \
     }
 
 // This is a weird-looking but effective workaround for a deficiency in "perfect forwarding":
@@ -536,12 +536,12 @@ class Buffer {
     }
 
     template<typename... Args>
-    auto operator()(int first, Args &&... args) -> decltype(std::declval<Runtime::Buffer<T>>()(first, std::forward<Args>(args)...)) {
+    auto operator()(int first, Args &&...args) -> decltype(std::declval<Runtime::Buffer<T>>()(first, std::forward<Args>(args)...)) {
         return (*get())(first, std::forward<Args>(args)...);
     }
 
     template<typename... Args>
-    auto operator()(int first, Args &&... args) const -> decltype(std::declval<const Runtime::Buffer<T>>()(first, std::forward<Args>(args)...)) {
+    auto operator()(int first, Args &&...args) const -> decltype(std::declval<const Runtime::Buffer<T>>()(first, std::forward<Args>(args)...)) {
         return (*get())(first, std::forward<Args>(args)...);
     }
 
diff --git a/src/Func.h b/src/Func.h
index f893f02e4a37..85b60b47f018 100644
--- a/src/Func.h
+++ b/src/Func.h
@@ -376,7 +376,7 @@ class Stage {
 
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
-    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&... args) {
+    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
         std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
         return reorder(collected_args);
     }
@@ -1285,7 +1285,7 @@ class Func {
 
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
-    operator()(Args &&... args) const {
+    operator()(Args &&...args) const {
         std::vector<Var> collected_args{std::forward<Args>(args)...};
         return this->operator()(collected_args);
     }
@@ -1302,7 +1302,7 @@ class Func {
 
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
-    operator()(const Expr &x, Args &&... args) const {
+    operator()(const Expr &x, Args &&...args) const {
         std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
         return (*this)(collected_args);
     }
@@ -1601,7 +1601,7 @@ class Func {
 
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
-    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&... args) {
+    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
         std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
         return reorder(collected_args);
     }
@@ -2037,7 +2037,7 @@ class Func {
     Func &reorder_storage(const Var &x, const Var &y);
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
-    reorder_storage(const Var &x, const Var &y, Args &&... args) {
+    reorder_storage(const Var &x, const Var &y, Args &&...args) {
         std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
         return reorder_storage(collected_args);
     }
@@ -2472,7 +2472,7 @@ inline void assign_results(Realization &r, int idx, Last last) {
 }
 
 template<typename First, typename Second, typename... Rest>
-inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&... rest) {
+inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
     assign_results<First>(r, idx, first);
     assign_results<Second, Rest...>(r, idx + 1, second, rest...);
 }
@@ -2496,7 +2496,7 @@ HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
 
 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
 template<typename First, typename... Rest>
-HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&... rest) {
+HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
     Internal::check_types<First, Rest...>(t, 0);
 
     Func f;
@@ -2541,7 +2541,7 @@ HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
  *  use GPU if jit target from environment specifies one. */
 // @{
 template<typename First, typename... Rest>
-HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&... rest) {
+HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
     Internal::check_types<First, Rest...>(t, 0);
 
     Func f;
diff --git a/src/Generator.h b/src/Generator.h
index 4e2181f1849b..300230aae2b2 100644
--- a/src/Generator.h
+++ b/src/Generator.h
@@ -1324,7 +1324,7 @@ class StubOutputBufferBase {
     }
 
     template<typename... Args>
-    Realization realize(Args &&... args) {
+    Realization realize(Args &&...args) {
         check_scheduled("realize");
         return f.realize(std::forward<Args>(args)..., get_target());
     }
@@ -1626,15 +1626,15 @@ class GeneratorInputImpl : public GeneratorInputBase {
 // types in question satisfy the property of copies referring to the same underlying
 // structure (returning references is just an optimization). Since this is verbose
 // and used in several places, we'll use a helper macro:
-#define HALIDE_FORWARD_METHOD(Class, Method)                                                                                                         \
-    template<typename... Args>                                                                                                                       \
-    inline auto Method(Args &&... args)->typename std::remove_reference<decltype(std::declval<Class>().Method(std::forward<Args>(args)...))>::type { \
-        return this->template as<Class>().Method(std::forward<Args>(args)...);                                                                       \
+#define HALIDE_FORWARD_METHOD(Class, Method)                                                                                                        \
+    template<typename... Args>                                                                                                                      \
+    inline auto Method(Args &&...args)->typename std::remove_reference<decltype(std::declval<Class>().Method(std::forward<Args>(args)...))>::type { \
+        return this->template as<Class>().Method(std::forward<Args>(args)...);                                                                      \
     }
 
 #define HALIDE_FORWARD_METHOD_CONST(Class, Method)                                                                  \
     template<typename... Args>                                                                                      \
-    inline auto Method(Args &&... args) const->                                                                     \
+    inline auto Method(Args &&...args) const->                                                                      \
         typename std::remove_reference<decltype(std::declval<Class>().Method(std::forward<Args>(args)...))>::type { \
         this->check_gio_access();                                                                                   \
         return this->template as<Class>().Method(std::forward<Args>(args)...);                                      \
@@ -1683,7 +1683,7 @@ class GeneratorInput_Buffer : public GeneratorInputImpl<T, Func> {
     }
 
     template<typename... Args>
-    Expr operator()(Args &&... args) const {
+    Expr operator()(Args &&...args) const {
         this->check_gio_access();
         return Func(*this)(std::forward<Args>(args)...);
     }
@@ -1854,7 +1854,7 @@ class GeneratorInput_Func : public GeneratorInputImpl<T, Func> {
     }
 
     template<typename... Args>
-    Expr operator()(Args &&... args) const {
+    Expr operator()(Args &&...args) const {
         this->check_gio_access();
         return this->funcs().at(0)(std::forward<Args>(args)...);
     }
@@ -2328,7 +2328,7 @@ class GeneratorOutputImpl : public GeneratorOutputBase {
 
 public:
     template<typename... Args, typename T2 = T, typename std::enable_if<!std::is_array<T2>::value>::type * = nullptr>
-    FuncRef operator()(Args &&... args) const {
+    FuncRef operator()(Args &&...args) const {
         this->check_gio_access();
         return get_values<ValueType>().at(0)(std::forward<Args>(args)...);
     }
@@ -2896,7 +2896,7 @@ class GeneratorContext {
     }
 
     template<typename T, typename... Args>
-    inline std::unique_ptr<T> apply(const Args &... args) const {
+    inline std::unique_ptr<T> apply(const Args &...args) const {
         auto t = this->create<T>();
         t->apply(args...);
         return t;
@@ -3102,7 +3102,7 @@ class GeneratorBase : public NamesInterface, public GeneratorContext {
      * will assert-fail at Halide compile time.
      */
     template<typename... Args>
-    void set_inputs(const Args &... args) {
+    void set_inputs(const Args &...args) {
         // set_inputs_vector() checks this too, but checking it here allows build_inputs() to avoid out-of-range checks.
         GeneratorParamInfo &pi = this->param_info();
         user_assert(sizeof...(args) == pi.inputs().size())
@@ -3119,7 +3119,7 @@ class GeneratorBase : public NamesInterface, public GeneratorContext {
     // Only enable if none of the args are Realization; otherwise we can incorrectly
     // select this method instead of the Realization-as-outparam variant
     template<typename... Args, typename std::enable_if<NoRealizations<Args...>::value>::type * = nullptr>
-    Realization realize(Args &&... args) {
+    Realization realize(Args &&...args) {
         this->check_scheduled("realize");
         return get_pipeline().realize(std::forward<Args>(args)..., get_target());
     }
@@ -3197,7 +3197,7 @@ class GeneratorBase : public NamesInterface, public GeneratorContext {
     }
 
     template<typename... Args>
-    HALIDE_NO_USER_CODE_INLINE void add_requirement(Expr condition, Args &&... args) {
+    HALIDE_NO_USER_CODE_INLINE void add_requirement(Expr condition, Args &&...args) {
         get_pipeline().add_requirement(condition, std::forward<Args>(args)...);
     }
 
@@ -3514,7 +3514,7 @@ class Generator : public Internal::GeneratorBase {
     using Internal::GeneratorBase::create;
 
     template<typename... Args>
-    void apply(const Args &... args) {
+    void apply(const Args &...args) {
 #ifndef _MSC_VER
         // VS2015 apparently has some SFINAE issues, so this can inappropriately
         // trigger there. (We'll still fail when generate() is called, just
diff --git a/src/IRMutator.h b/src/IRMutator.h
index 0e587b035721..abadff03b6d2 100644
--- a/src/IRMutator.h
+++ b/src/IRMutator.h
@@ -104,7 +104,7 @@ class IRGraphMutator : public IRMutator {
 
 /** A helper function for mutator-like things to mutate regions */
 template<typename Mutator, typename... Args>
-std::pair<Region, bool> mutate_region(Mutator *mutator, const Region &bounds, Args &&... args) {
+std::pair<Region, bool> mutate_region(Mutator *mutator, const Region &bounds, Args &&...args) {
     Region new_bounds(bounds.size());
     bool bounds_changed = false;
 
diff --git a/src/IROperator.h b/src/IROperator.h
index fc2dffffcc7a..7912d945910b 100644
--- a/src/IROperator.h
+++ b/src/IROperator.h
@@ -312,13 +312,13 @@ inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &arg
 }
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, const char *arg, Args &&... more_args) {
+inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, const char *arg, Args &&...more_args) {
     args.emplace_back(std::string(arg));
     collect_print_args(args, std::forward<Args>(more_args)...);
 }
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, Expr arg, Args &&... more_args) {
+inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, Expr arg, Args &&...more_args) {
     args.push_back(std::move(arg));
     collect_print_args(args, std::forward<Args>(more_args)...);
 }
@@ -624,7 +624,7 @@ inline Expr max(Expr a, float b) {
  * The arguments can be any mix of types but must all be convertible to Expr. */
 template<typename A, typename B, typename C, typename... Rest,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Rest...>::value>::type * = nullptr>
-inline Expr max(A &&a, B &&b, C &&c, Rest &&... rest) {
+inline Expr max(A &&a, B &&b, C &&c, Rest &&...rest) {
     return max(std::forward<A>(a), max(std::forward<B>(b), std::forward<C>(c), std::forward<Rest>(rest)...));
 }
 
@@ -659,7 +659,7 @@ inline Expr min(Expr a, float b) {
  * The arguments can be any mix of types but must all be convertible to Expr. */
 template<typename A, typename B, typename C, typename... Rest,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Rest...>::value>::type * = nullptr>
-inline Expr min(A &&a, B &&b, C &&c, Rest &&... rest) {
+inline Expr min(A &&a, B &&b, C &&c, Rest &&...rest) {
     return min(std::forward<A>(a), min(std::forward<B>(b), std::forward<C>(c), std::forward<Rest>(rest)...));
 }
 
@@ -764,7 +764,7 @@ Expr select(Expr condition, Expr true_value, Expr false_value);
  * final value if all conditions are false. */
 template<typename... Args,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Args...>::value>::type * = nullptr>
-inline Expr select(Expr c0, Expr v0, Expr c1, Expr v1, Args &&... args) {
+inline Expr select(Expr c0, Expr v0, Expr c1, Expr v1, Args &&...args) {
     return select(std::move(c0), std::move(v0), select(std::move(c1), std::move(v1), std::forward<Args>(args)...));
 }
 
@@ -779,12 +779,12 @@ Tuple tuple_select(const Expr &condition, const Tuple &true_value, const Tuple &
  * a Tuple, it must match the size of the true and false Tuples. */
 // @{
 template<typename... Args>
-inline Tuple tuple_select(const Tuple &c0, const Tuple &v0, const Tuple &c1, const Tuple &v1, Args &&... args) {
+inline Tuple tuple_select(const Tuple &c0, const Tuple &v0, const Tuple &c1, const Tuple &v1, Args &&...args) {
     return tuple_select(c0, v0, tuple_select(c1, v1, std::forward<Args>(args)...));
 }
 
 template<typename... Args>
-inline Tuple tuple_select(const Expr &c0, const Tuple &v0, const Expr &c1, const Tuple &v1, Args &&... args) {
+inline Tuple tuple_select(const Expr &c0, const Tuple &v0, const Expr &c1, const Tuple &v1, Args &&...args) {
     return tuple_select(c0, v0, tuple_select(c1, v1, std::forward<Args>(args)...));
 }
 // @}
@@ -1209,7 +1209,7 @@ Expr random_int(Expr seed = Expr());
 Expr print(const std::vector<Expr> &values);
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE Expr print(Expr a, Args &&... args) {
+inline HALIDE_NO_USER_CODE_INLINE Expr print(Expr a, Args &&...args) {
     std::vector<Expr> collected_args = {std::move(a)};
     Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
     return print(collected_args);
@@ -1222,7 +1222,7 @@ inline HALIDE_NO_USER_CODE_INLINE Expr print(Expr a, Args &&... args) {
 Expr print_when(Expr condition, const std::vector<Expr> &values);
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE Expr print_when(Expr condition, Expr a, Args &&... args) {
+inline HALIDE_NO_USER_CODE_INLINE Expr print_when(Expr condition, Expr a, Args &&...args) {
     std::vector<Expr> collected_args = {std::move(a)};
     Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
     return print_when(std::move(condition), collected_args);
@@ -1255,7 +1255,7 @@ inline HALIDE_NO_USER_CODE_INLINE Expr print_when(Expr condition, Expr a, Args &
 Expr require(Expr condition, const std::vector<Expr> &values);
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE Expr require(Expr condition, Expr value, Args &&... args) {
+inline HALIDE_NO_USER_CODE_INLINE Expr require(Expr condition, Expr value, Args &&...args) {
     std::vector<Expr> collected_args = {std::move(value)};
     Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
     return require(std::move(condition), collected_args);
@@ -1315,7 +1315,7 @@ inline Expr undef() {
  * on the digest. */
 // @{
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE Expr memoize_tag(Expr result, Args &&... args) {
+inline HALIDE_NO_USER_CODE_INLINE Expr memoize_tag(Expr result, Args &&...args) {
     std::vector<Expr> collected_args{std::forward<Args>(args)...};
     return Internal::memoize_tag_helper(std::move(result), collected_args);
 }
diff --git a/src/IRVisitor.h b/src/IRVisitor.h
index ce33e80ac318..1db022c6f6a5 100644
--- a/src/IRVisitor.h
+++ b/src/IRVisitor.h
@@ -159,7 +159,7 @@ template<typename T, typename ExprRet, typename StmtRet>
 class VariadicVisitor {
 private:
     template<typename... Args>
-    ExprRet dispatch_expr(const BaseExprNode *node, Args &&... args) {
+    ExprRet dispatch_expr(const BaseExprNode *node, Args &&...args) {
         if (node == nullptr) {
             return ExprRet{};
         };
@@ -249,7 +249,7 @@ class VariadicVisitor {
     }
 
     template<typename... Args>
-    StmtRet dispatch_stmt(const BaseStmtNode *node, Args &&... args) {
+    StmtRet dispatch_stmt(const BaseStmtNode *node, Args &&...args) {
         if (node == nullptr) {
             return StmtRet{};
         };
@@ -324,22 +324,22 @@ class VariadicVisitor {
 
 public:
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE StmtRet dispatch(const Stmt &s, Args &&... args) {
+    HALIDE_ALWAYS_INLINE StmtRet dispatch(const Stmt &s, Args &&...args) {
         return dispatch_stmt(s.get(), std::forward<Args>(args)...);
     }
 
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE StmtRet dispatch(Stmt &&s, Args &&... args) {
+    HALIDE_ALWAYS_INLINE StmtRet dispatch(Stmt &&s, Args &&...args) {
         return dispatch_stmt(s.get(), std::forward<Args>(args)...);
     }
 
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE ExprRet dispatch(const Expr &e, Args &&... args) {
+    HALIDE_ALWAYS_INLINE ExprRet dispatch(const Expr &e, Args &&...args) {
         return dispatch_expr(e.get(), std::forward<Args>(args)...);
     }
 
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE ExprRet dispatch(Expr &&e, Args &&... args) {
+    HALIDE_ALWAYS_INLINE ExprRet dispatch(Expr &&e, Args &&...args) {
         return dispatch_expr(e.get(), std::forward<Args>(args)...);
     }
 };
diff --git a/src/ImageParam.h b/src/ImageParam.h
index d4383bf4ed7f..d294b731df2f 100644
--- a/src/ImageParam.h
+++ b/src/ImageParam.h
@@ -64,7 +64,7 @@ class ImageParam : public OutputImageParam {
      */
     // @{
     template<typename... Args>
-    HALIDE_NO_USER_CODE_INLINE Expr operator()(Args &&... args) const {
+    HALIDE_NO_USER_CODE_INLINE Expr operator()(Args &&...args) const {
         return func(std::forward<Args>(args)...);
     }
     Expr operator()(std::vector<Expr>) const;
diff --git a/src/Pipeline.h b/src/Pipeline.h
index 5c833fcfabcf..bccd78516a65 100644
--- a/src/Pipeline.h
+++ b/src/Pipeline.h
@@ -121,7 +121,7 @@ class Pipeline {
         }
         template<typename T, typename... Args,
                  typename = typename std::enable_if<Internal::all_are_convertible<Buffer<>, Args...>::value>::type>
-        RealizationArg(Buffer<T> &a, Args &&... args) {
+        RealizationArg(Buffer<T> &a, Args &&...args) {
             buffer_list.reset(new std::vector<Buffer<>>({a, args...}));
         }
         RealizationArg(RealizationArg &&from) = default;
@@ -569,7 +569,7 @@ class Pipeline {
     void trace_pipeline();
 
     template<typename... Args>
-    inline HALIDE_NO_USER_CODE_INLINE void add_requirement(const Expr &condition, Args &&... args) {
+    inline HALIDE_NO_USER_CODE_INLINE void add_requirement(const Expr &condition, Args &&...args) {
         std::vector<Expr> collected_args;
         Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
         add_requirement(condition, collected_args);
diff --git a/src/RDom.h b/src/RDom.h
index 32fc1955782d..b425e1c9d32f 100644
--- a/src/RDom.h
+++ b/src/RDom.h
@@ -198,7 +198,7 @@ class RDom {
     void initialize_from_region(const Region &region, std::string name = "");
 
     template<typename... Args>
-    HALIDE_NO_USER_CODE_INLINE void initialize_from_region(Region &region, const Expr &min, const Expr &extent, Args &&... args) {
+    HALIDE_NO_USER_CODE_INLINE void initialize_from_region(Region &region, const Expr &min, const Expr &extent, Args &&...args) {
         region.push_back({min, extent});
         initialize_from_region(region, std::forward<Args>(args)...);
     }
@@ -215,7 +215,7 @@ class RDom {
     }
 
     template<typename... Args>
-    HALIDE_NO_USER_CODE_INLINE RDom(Expr min, Expr extent, Args &&... args) {
+    HALIDE_NO_USER_CODE_INLINE RDom(Expr min, Expr extent, Args &&...args) {
         // This should really just be a delegating constructor, but I couldn't make
         // that work with variadic template unpacking in visual studio 2013
         Region region;
diff --git a/src/Realization.h b/src/Realization.h
index 77355811abe4..bc7f227b254c 100644
--- a/src/Realization.h
+++ b/src/Realization.h
@@ -44,7 +44,7 @@ class Realization {
     template<typename T,
              typename... Args,
              typename = typename std::enable_if<Internal::all_are_convertible<Buffer<void>, Args...>::value>::type>
-    Realization(Buffer<T> &a, Args &&... args) {
+    Realization(Buffer<T> &a, Args &&...args) {
         images = std::vector<Buffer<void>>({a, args...});
     }
 
diff --git a/src/Tuple.h b/src/Tuple.h
index 759935a6fb70..7f775c578d14 100644
--- a/src/Tuple.h
+++ b/src/Tuple.h
@@ -45,7 +45,7 @@ class Tuple {
     /** Construct a Tuple from some Exprs. */
     //@{
     template<typename... Args>
-    Tuple(const Expr &a, const Expr &b, Args &&... args) {
+    Tuple(const Expr &a, const Expr &b, Args &&...args) {
         exprs = std::vector<Expr>{a, b, std::forward<Args>(args)...};
     }
     //@}
diff --git a/src/runtime/HalideBuffer.h b/src/runtime/HalideBuffer.h
index 24d2908fb62a..32e241f41a3d 100644
--- a/src/runtime/HalideBuffer.h
+++ b/src/runtime/HalideBuffer.h
@@ -928,7 +928,7 @@ class Buffer {
      * host_dirty flag. */
     template<typename... Args,
              typename = typename std::enable_if<AllInts<Args...>::value>::type>
-    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&... rest) {
+    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
         if (!T_is_void) {
             assert(static_halide_type() == t);
         }
@@ -945,7 +945,7 @@ class Buffer {
      * take ownership of the data and does not set the host_dirty flag. */
     template<typename... Args,
              typename = typename std::enable_if<AllInts<Args...>::value>::type>
-    explicit Buffer(T *data, int first, Args &&... rest) {
+    explicit Buffer(T *data, int first, Args &&...rest) {
         int extents[] = {first, (int)rest...};
         buf.type = static_halide_type();
         constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
@@ -2075,7 +2075,7 @@ class Buffer {
     }
 
     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
-    void for_each_value_impl(Fn &&f, Args &&... other_buffers) const {
+    void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
         Buffer<>::for_each_value_task_dim<N> *t =
             (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA((dimensions() + 1) * sizeof(for_each_value_task_dim<N>));
         // Move the preparatory code into a non-templated helper to
@@ -2107,7 +2107,7 @@ class Buffer {
      * will result in a compilation error. */
     // @{
     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
-    HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_value(Fn &&f, Args &&... other_buffers) const {
+    HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_value(Fn &&f, Args &&...other_buffers) const {
         for_each_value_impl(f, std::forward<Args>(other_buffers)...);
         return *this;
     }
@@ -2115,7 +2115,7 @@ class Buffer {
     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
     HALIDE_ALWAYS_INLINE
         Buffer<T, D> &
-        for_each_value(Fn &&f, Args &&... other_buffers) {
+        for_each_value(Fn &&f, Args &&...other_buffers) {
         for_each_value_impl(f, std::forward<Args>(other_buffers)...);
         return *this;
     }
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index e3d76c815603..1dc1cf27086e 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1059,7 +1059,7 @@ WEAK int halide_cuda_run(void *user_context,
 #endif
 
     halide_assert(user_context, state_ptr);
-    CUmodule mod;
+    CUmodule mod = nullptr;
     bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod);
     halide_assert(user_context, found_module && mod != nullptr);
     debug(user_context) << "Got module " << mod << "\n";
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index 06186e3b838e..5979a61eccdd 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -5,7 +5,7 @@ namespace Halide {
 namespace Internal {
 
 template<typename ContextT, typename ModuleStateT>
-struct GPUCompilationCache {
+class GPUCompilationCache {
     struct CachedCompilation {
         ContextT context;
         ModuleStateT module_state;
@@ -25,6 +25,7 @@ struct GPUCompilationCache {
 
     uint32_t unique_id{2};  // zero is an invalid id
 
+public:
     static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) {
         uintptr_t addr = (uintptr_t)context + id;
         // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...
diff --git a/tools/RunGen.h b/tools/RunGen.h
index 5a2a64086538..7ffd9b23eef4 100644
--- a/tools/RunGen.h
+++ b/tools/RunGen.h
@@ -203,7 +203,7 @@ inline constexpr int halide_type_code(halide_type_code_t code, int bits) {
 // variants *will* be instantiated (increasing code size), so this approach
 // should only be used when strictly necessary.
 template<template<typename> class Functor, typename... Args>
-auto dynamic_type_dispatch(const halide_type_t &type, Args &&... args) -> decltype(std::declval<Functor<uint8_t>>()(std::forward<Args>(args)...)) {
+auto dynamic_type_dispatch(const halide_type_t &type, Args &&...args) -> decltype(std::declval<Functor<uint8_t>>()(std::forward<Args>(args)...)) {
 
 #define HANDLE_CASE(CODE, BITS, TYPE)  \
     case halide_type_code(CODE, BITS): \

From 0ccd159f27b0e45bdc1ad5df763ab9a78e121dc3 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 2 Dec 2020 13:26:35 -0800
Subject: [PATCH 15/17] Revert "Address review feedback."

This reverts commit 0a57c4bee90f2e06d0a00c2ad042820fca7867a4.
---
 Makefile                         |  3 +--
 apps/fft/funct.h                 |  2 +-
 src/BoundaryConditions.h         | 14 ++++++-------
 src/Buffer.h                     | 34 ++++++++++++++++----------------
 src/Func.h                       | 16 +++++++--------
 src/Generator.h                  | 28 +++++++++++++-------------
 src/IRMutator.h                  |  2 +-
 src/IROperator.h                 | 22 ++++++++++-----------
 src/IRVisitor.h                  | 12 +++++------
 src/ImageParam.h                 |  2 +-
 src/Pipeline.h                   |  4 ++--
 src/RDom.h                       |  4 ++--
 src/Realization.h                |  2 +-
 src/Tuple.h                      |  2 +-
 src/runtime/HalideBuffer.h       | 10 +++++-----
 src/runtime/cuda.cpp             |  2 +-
 src/runtime/gpu_context_common.h |  3 +--
 tools/RunGen.h                   |  2 +-
 18 files changed, 81 insertions(+), 83 deletions(-)

diff --git a/Makefile b/Makefile
index 6ce06a94b568..836c1462f6d1 100644
--- a/Makefile
+++ b/Makefile
@@ -1630,8 +1630,7 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
 
 # gpu_multi_context_threaded has additional deps to link in
 $(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
-	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a \
-	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \
+	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a
 	                                                       $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
 	@mkdir -p $(@D)
 	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
diff --git a/apps/fft/funct.h b/apps/fft/funct.h
index a6d5a9036a43..79d558d480ab 100644
--- a/apps/fft/funct.h
+++ b/apps/fft/funct.h
@@ -59,7 +59,7 @@ class FuncT : public Halide::Func {
     }
 
     template<typename... Args>
-    FuncRefT<T> operator()(Args &&...args) const {
+    FuncRefT<T> operator()(Args &&... args) const {
         return Func::operator()(std::forward<Args>(args)...);
     }
 
diff --git a/src/BoundaryConditions.h b/src/BoundaryConditions.h
index 1187c5b15a8e..3c1697643db3 100644
--- a/src/BoundaryConditions.h
+++ b/src/BoundaryConditions.h
@@ -57,7 +57,7 @@ inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
 
 template<typename... Args>
 inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
-                                                      const Expr &a1, const Expr &a2, Args &&...args) {
+                                                      const Expr &a1, const Expr &a2, Args &&... args) {
     collected_args.push_back(Range(a1, a2));
     collect_region(collected_args, std::forward<Args>(args)...);
 }
@@ -124,7 +124,7 @@ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value,
-                                                  Bounds &&...bounds) {
+                                                  Bounds &&... bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return constant_exterior(Internal::func_like_to_func(func_like), value, collected_bounds);
@@ -132,7 +132,7 @@ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tupl
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value,
-                                                  Bounds &&...bounds) {
+                                                  Bounds &&... bounds) {
     return constant_exterior(func_like, Tuple(value), std::forward<Bounds>(bounds)...);
 }
 // @}
@@ -170,7 +170,7 @@ HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like) {
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_ATTRIBUTE_DEPRECATED("Add braces around the bounds like so: {{a, b}, {c, d}}")
-HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like, Bounds &&...bounds) {
+HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like, Bounds &&... bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return repeat_edge(Internal::func_like_to_func(func_like), collected_bounds);
@@ -210,7 +210,7 @@ HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like) {
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_ATTRIBUTE_DEPRECATED("Add braces around the bounds like so: {{a, b}, {c, d}}")
-HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like, Bounds &&...bounds) {
+HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like, Bounds &&... bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return repeat_image(Internal::func_like_to_func(func_like), collected_bounds);
@@ -250,7 +250,7 @@ HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like) {
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_ATTRIBUTE_DEPRECATED("Add braces around the bounds like so: {{a, b}, {c, d}}")
-HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like, Bounds &&...bounds) {
+HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like, Bounds &&... bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return mirror_image(Internal::func_like_to_func(func_like), collected_bounds);
@@ -293,7 +293,7 @@ HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like) {
 template<typename T, typename... Bounds,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
 HALIDE_ATTRIBUTE_DEPRECATED("Add braces around the bounds like so: {{a, b}, {c, d}}")
-HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like, Bounds &&...bounds) {
+HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like, Bounds &&... bounds) {
     Region collected_bounds;
     Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
     return mirror_interior(Internal::func_like_to_func(func_like), collected_bounds);
diff --git a/src/Buffer.h b/src/Buffer.h
index da4c19b2c027..477d6a893995 100644
--- a/src/Buffer.h
+++ b/src/Buffer.h
@@ -52,7 +52,7 @@ inline std::string get_name_from_end_of_parameter_pack() {
 template<typename First,
          typename Second,
          typename... Args>
-std::string get_name_from_end_of_parameter_pack(First first, Second second, Args &&...rest) {
+std::string get_name_from_end_of_parameter_pack(First first, Second second, Args &&... rest) {
     return get_name_from_end_of_parameter_pack(second, std::forward<Args>(rest)...);
 }
 
@@ -63,13 +63,13 @@ inline void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &) {
 }
 
 template<typename... Args>
-void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &result, int x, Args &&...rest) {
+void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &result, int x, Args &&... rest) {
     result.push_back(x);
     get_shape_from_start_of_parameter_pack_helper(result, std::forward<Args>(rest)...);
 }
 
 template<typename... Args>
-std::vector<int> get_shape_from_start_of_parameter_pack(Args &&...args) {
+std::vector<int> get_shape_from_start_of_parameter_pack(Args &&... args) {
     std::vector<int> result;
     get_shape_from_start_of_parameter_pack_helper(result, std::forward<Args>(args)...);
     return result;
@@ -248,7 +248,7 @@ class Buffer {
              typename = typename std::enable_if<Internal::all_ints_and_optional_name<Args...>::value>::type>
     explicit Buffer(Type t,
                     Internal::add_const_if_T_is_const<T, void> *data,
-                    int first, Args &&...rest)
+                    int first, Args &&... rest)
         : Buffer(Runtime::Buffer<T>(t, data, Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
                  Internal::get_name_from_end_of_parameter_pack(rest...)) {
     }
@@ -265,7 +265,7 @@ class Buffer {
     template<typename... Args,
              typename = typename std::enable_if<Internal::all_ints_and_optional_name<Args...>::value>::type>
     explicit Buffer(T *data,
-                    int first, Args &&...rest)
+                    int first, Args &&... rest)
         : Buffer(Runtime::Buffer<T>(data, Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
                  Internal::get_name_from_end_of_parameter_pack(rest...)) {
     }
@@ -387,18 +387,18 @@ class Buffer {
     // @}
 
     // We forward numerous methods from the underlying Buffer
-#define HALIDE_BUFFER_FORWARD_CONST(method)                                                                                     \
-    template<typename... Args>                                                                                                  \
-    auto method(Args &&...args) const->decltype(std::declval<const Runtime::Buffer<T>>().method(std::forward<Args>(args)...)) { \
-        user_assert(defined()) << "Undefined buffer calling const method " #method "\n";                                        \
-        return get()->method(std::forward<Args>(args)...);                                                                      \
+#define HALIDE_BUFFER_FORWARD_CONST(method)                                                                                      \
+    template<typename... Args>                                                                                                   \
+    auto method(Args &&... args) const->decltype(std::declval<const Runtime::Buffer<T>>().method(std::forward<Args>(args)...)) { \
+        user_assert(defined()) << "Undefined buffer calling const method " #method "\n";                                         \
+        return get()->method(std::forward<Args>(args)...);                                                                       \
     }
 
-#define HALIDE_BUFFER_FORWARD(method)                                                                               \
-    template<typename... Args>                                                                                      \
-    auto method(Args &&...args)->decltype(std::declval<Runtime::Buffer<T>>().method(std::forward<Args>(args)...)) { \
-        user_assert(defined()) << "Undefined buffer calling method " #method "\n";                                  \
-        return get()->method(std::forward<Args>(args)...);                                                          \
+#define HALIDE_BUFFER_FORWARD(method)                                                                                \
+    template<typename... Args>                                                                                       \
+    auto method(Args &&... args)->decltype(std::declval<Runtime::Buffer<T>>().method(std::forward<Args>(args)...)) { \
+        user_assert(defined()) << "Undefined buffer calling method " #method "\n";                                   \
+        return get()->method(std::forward<Args>(args)...);                                                           \
     }
 
 // This is a weird-looking but effective workaround for a deficiency in "perfect forwarding":
@@ -536,12 +536,12 @@ class Buffer {
     }
 
     template<typename... Args>
-    auto operator()(int first, Args &&...args) -> decltype(std::declval<Runtime::Buffer<T>>()(first, std::forward<Args>(args)...)) {
+    auto operator()(int first, Args &&... args) -> decltype(std::declval<Runtime::Buffer<T>>()(first, std::forward<Args>(args)...)) {
         return (*get())(first, std::forward<Args>(args)...);
     }
 
     template<typename... Args>
-    auto operator()(int first, Args &&...args) const -> decltype(std::declval<const Runtime::Buffer<T>>()(first, std::forward<Args>(args)...)) {
+    auto operator()(int first, Args &&... args) const -> decltype(std::declval<const Runtime::Buffer<T>>()(first, std::forward<Args>(args)...)) {
         return (*get())(first, std::forward<Args>(args)...);
     }
 
diff --git a/src/Func.h b/src/Func.h
index 85b60b47f018..f893f02e4a37 100644
--- a/src/Func.h
+++ b/src/Func.h
@@ -376,7 +376,7 @@ class Stage {
 
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
-    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
+    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&... args) {
         std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
         return reorder(collected_args);
     }
@@ -1285,7 +1285,7 @@ class Func {
 
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
-    operator()(Args &&...args) const {
+    operator()(Args &&... args) const {
         std::vector<Var> collected_args{std::forward<Args>(args)...};
         return this->operator()(collected_args);
     }
@@ -1302,7 +1302,7 @@ class Func {
 
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
-    operator()(const Expr &x, Args &&...args) const {
+    operator()(const Expr &x, Args &&... args) const {
         std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
         return (*this)(collected_args);
     }
@@ -1601,7 +1601,7 @@ class Func {
 
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
-    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
+    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&... args) {
         std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
         return reorder(collected_args);
     }
@@ -2037,7 +2037,7 @@ class Func {
     Func &reorder_storage(const Var &x, const Var &y);
     template<typename... Args>
     HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
-    reorder_storage(const Var &x, const Var &y, Args &&...args) {
+    reorder_storage(const Var &x, const Var &y, Args &&... args) {
         std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
         return reorder_storage(collected_args);
     }
@@ -2472,7 +2472,7 @@ inline void assign_results(Realization &r, int idx, Last last) {
 }
 
 template<typename First, typename Second, typename... Rest>
-inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
+inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&... rest) {
     assign_results<First>(r, idx, first);
     assign_results<Second, Rest...>(r, idx + 1, second, rest...);
 }
@@ -2496,7 +2496,7 @@ HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
 
 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
 template<typename First, typename... Rest>
-HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
+HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&... rest) {
     Internal::check_types<First, Rest...>(t, 0);
 
     Func f;
@@ -2541,7 +2541,7 @@ HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
  *  use GPU if jit target from environment specifies one. */
 // @{
 template<typename First, typename... Rest>
-HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
+HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&... rest) {
     Internal::check_types<First, Rest...>(t, 0);
 
     Func f;
diff --git a/src/Generator.h b/src/Generator.h
index 300230aae2b2..4e2181f1849b 100644
--- a/src/Generator.h
+++ b/src/Generator.h
@@ -1324,7 +1324,7 @@ class StubOutputBufferBase {
     }
 
     template<typename... Args>
-    Realization realize(Args &&...args) {
+    Realization realize(Args &&... args) {
         check_scheduled("realize");
         return f.realize(std::forward<Args>(args)..., get_target());
     }
@@ -1626,15 +1626,15 @@ class GeneratorInputImpl : public GeneratorInputBase {
 // types in question satisfy the property of copies referring to the same underlying
 // structure (returning references is just an optimization). Since this is verbose
 // and used in several places, we'll use a helper macro:
-#define HALIDE_FORWARD_METHOD(Class, Method)                                                                                                        \
-    template<typename... Args>                                                                                                                      \
-    inline auto Method(Args &&...args)->typename std::remove_reference<decltype(std::declval<Class>().Method(std::forward<Args>(args)...))>::type { \
-        return this->template as<Class>().Method(std::forward<Args>(args)...);                                                                      \
+#define HALIDE_FORWARD_METHOD(Class, Method)                                                                                                         \
+    template<typename... Args>                                                                                                                       \
+    inline auto Method(Args &&... args)->typename std::remove_reference<decltype(std::declval<Class>().Method(std::forward<Args>(args)...))>::type { \
+        return this->template as<Class>().Method(std::forward<Args>(args)...);                                                                       \
     }
 
 #define HALIDE_FORWARD_METHOD_CONST(Class, Method)                                                                  \
     template<typename... Args>                                                                                      \
-    inline auto Method(Args &&...args) const->                                                                      \
+    inline auto Method(Args &&... args) const->                                                                     \
         typename std::remove_reference<decltype(std::declval<Class>().Method(std::forward<Args>(args)...))>::type { \
         this->check_gio_access();                                                                                   \
         return this->template as<Class>().Method(std::forward<Args>(args)...);                                      \
@@ -1683,7 +1683,7 @@ class GeneratorInput_Buffer : public GeneratorInputImpl<T, Func> {
     }
 
     template<typename... Args>
-    Expr operator()(Args &&...args) const {
+    Expr operator()(Args &&... args) const {
         this->check_gio_access();
         return Func(*this)(std::forward<Args>(args)...);
     }
@@ -1854,7 +1854,7 @@ class GeneratorInput_Func : public GeneratorInputImpl<T, Func> {
     }
 
     template<typename... Args>
-    Expr operator()(Args &&...args) const {
+    Expr operator()(Args &&... args) const {
         this->check_gio_access();
         return this->funcs().at(0)(std::forward<Args>(args)...);
     }
@@ -2328,7 +2328,7 @@ class GeneratorOutputImpl : public GeneratorOutputBase {
 
 public:
     template<typename... Args, typename T2 = T, typename std::enable_if<!std::is_array<T2>::value>::type * = nullptr>
-    FuncRef operator()(Args &&...args) const {
+    FuncRef operator()(Args &&... args) const {
         this->check_gio_access();
         return get_values<ValueType>().at(0)(std::forward<Args>(args)...);
     }
@@ -2896,7 +2896,7 @@ class GeneratorContext {
     }
 
     template<typename T, typename... Args>
-    inline std::unique_ptr<T> apply(const Args &...args) const {
+    inline std::unique_ptr<T> apply(const Args &... args) const {
         auto t = this->create<T>();
         t->apply(args...);
         return t;
@@ -3102,7 +3102,7 @@ class GeneratorBase : public NamesInterface, public GeneratorContext {
      * will assert-fail at Halide compile time.
      */
     template<typename... Args>
-    void set_inputs(const Args &...args) {
+    void set_inputs(const Args &... args) {
         // set_inputs_vector() checks this too, but checking it here allows build_inputs() to avoid out-of-range checks.
         GeneratorParamInfo &pi = this->param_info();
         user_assert(sizeof...(args) == pi.inputs().size())
@@ -3119,7 +3119,7 @@ class GeneratorBase : public NamesInterface, public GeneratorContext {
     // Only enable if none of the args are Realization; otherwise we can incorrectly
     // select this method instead of the Realization-as-outparam variant
     template<typename... Args, typename std::enable_if<NoRealizations<Args...>::value>::type * = nullptr>
-    Realization realize(Args &&...args) {
+    Realization realize(Args &&... args) {
         this->check_scheduled("realize");
         return get_pipeline().realize(std::forward<Args>(args)..., get_target());
     }
@@ -3197,7 +3197,7 @@ class GeneratorBase : public NamesInterface, public GeneratorContext {
     }
 
     template<typename... Args>
-    HALIDE_NO_USER_CODE_INLINE void add_requirement(Expr condition, Args &&...args) {
+    HALIDE_NO_USER_CODE_INLINE void add_requirement(Expr condition, Args &&... args) {
         get_pipeline().add_requirement(condition, std::forward<Args>(args)...);
     }
 
@@ -3514,7 +3514,7 @@ class Generator : public Internal::GeneratorBase {
     using Internal::GeneratorBase::create;
 
     template<typename... Args>
-    void apply(const Args &...args) {
+    void apply(const Args &... args) {
 #ifndef _MSC_VER
         // VS2015 apparently has some SFINAE issues, so this can inappropriately
         // trigger there. (We'll still fail when generate() is called, just
diff --git a/src/IRMutator.h b/src/IRMutator.h
index abadff03b6d2..0e587b035721 100644
--- a/src/IRMutator.h
+++ b/src/IRMutator.h
@@ -104,7 +104,7 @@ class IRGraphMutator : public IRMutator {
 
 /** A helper function for mutator-like things to mutate regions */
 template<typename Mutator, typename... Args>
-std::pair<Region, bool> mutate_region(Mutator *mutator, const Region &bounds, Args &&...args) {
+std::pair<Region, bool> mutate_region(Mutator *mutator, const Region &bounds, Args &&... args) {
     Region new_bounds(bounds.size());
     bool bounds_changed = false;
 
diff --git a/src/IROperator.h b/src/IROperator.h
index 7912d945910b..fc2dffffcc7a 100644
--- a/src/IROperator.h
+++ b/src/IROperator.h
@@ -312,13 +312,13 @@ inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &arg
 }
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, const char *arg, Args &&...more_args) {
+inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, const char *arg, Args &&... more_args) {
     args.emplace_back(std::string(arg));
     collect_print_args(args, std::forward<Args>(more_args)...);
 }
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, Expr arg, Args &&...more_args) {
+inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, Expr arg, Args &&... more_args) {
     args.push_back(std::move(arg));
     collect_print_args(args, std::forward<Args>(more_args)...);
 }
@@ -624,7 +624,7 @@ inline Expr max(Expr a, float b) {
  * The arguments can be any mix of types but must all be convertible to Expr. */
 template<typename A, typename B, typename C, typename... Rest,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Rest...>::value>::type * = nullptr>
-inline Expr max(A &&a, B &&b, C &&c, Rest &&...rest) {
+inline Expr max(A &&a, B &&b, C &&c, Rest &&... rest) {
     return max(std::forward<A>(a), max(std::forward<B>(b), std::forward<C>(c), std::forward<Rest>(rest)...));
 }
 
@@ -659,7 +659,7 @@ inline Expr min(Expr a, float b) {
  * The arguments can be any mix of types but must all be convertible to Expr. */
 template<typename A, typename B, typename C, typename... Rest,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Rest...>::value>::type * = nullptr>
-inline Expr min(A &&a, B &&b, C &&c, Rest &&...rest) {
+inline Expr min(A &&a, B &&b, C &&c, Rest &&... rest) {
     return min(std::forward<A>(a), min(std::forward<B>(b), std::forward<C>(c), std::forward<Rest>(rest)...));
 }
 
@@ -764,7 +764,7 @@ Expr select(Expr condition, Expr true_value, Expr false_value);
  * final value if all conditions are false. */
 template<typename... Args,
          typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Args...>::value>::type * = nullptr>
-inline Expr select(Expr c0, Expr v0, Expr c1, Expr v1, Args &&...args) {
+inline Expr select(Expr c0, Expr v0, Expr c1, Expr v1, Args &&... args) {
     return select(std::move(c0), std::move(v0), select(std::move(c1), std::move(v1), std::forward<Args>(args)...));
 }
 
@@ -779,12 +779,12 @@ Tuple tuple_select(const Expr &condition, const Tuple &true_value, const Tuple &
  * a Tuple, it must match the size of the true and false Tuples. */
 // @{
 template<typename... Args>
-inline Tuple tuple_select(const Tuple &c0, const Tuple &v0, const Tuple &c1, const Tuple &v1, Args &&...args) {
+inline Tuple tuple_select(const Tuple &c0, const Tuple &v0, const Tuple &c1, const Tuple &v1, Args &&... args) {
     return tuple_select(c0, v0, tuple_select(c1, v1, std::forward<Args>(args)...));
 }
 
 template<typename... Args>
-inline Tuple tuple_select(const Expr &c0, const Tuple &v0, const Expr &c1, const Tuple &v1, Args &&...args) {
+inline Tuple tuple_select(const Expr &c0, const Tuple &v0, const Expr &c1, const Tuple &v1, Args &&... args) {
     return tuple_select(c0, v0, tuple_select(c1, v1, std::forward<Args>(args)...));
 }
 // @}
@@ -1209,7 +1209,7 @@ Expr random_int(Expr seed = Expr());
 Expr print(const std::vector<Expr> &values);
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE Expr print(Expr a, Args &&...args) {
+inline HALIDE_NO_USER_CODE_INLINE Expr print(Expr a, Args &&... args) {
     std::vector<Expr> collected_args = {std::move(a)};
     Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
     return print(collected_args);
@@ -1222,7 +1222,7 @@ inline HALIDE_NO_USER_CODE_INLINE Expr print(Expr a, Args &&...args) {
 Expr print_when(Expr condition, const std::vector<Expr> &values);
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE Expr print_when(Expr condition, Expr a, Args &&...args) {
+inline HALIDE_NO_USER_CODE_INLINE Expr print_when(Expr condition, Expr a, Args &&... args) {
     std::vector<Expr> collected_args = {std::move(a)};
     Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
     return print_when(std::move(condition), collected_args);
@@ -1255,7 +1255,7 @@ inline HALIDE_NO_USER_CODE_INLINE Expr print_when(Expr condition, Expr a, Args &
 Expr require(Expr condition, const std::vector<Expr> &values);
 
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE Expr require(Expr condition, Expr value, Args &&...args) {
+inline HALIDE_NO_USER_CODE_INLINE Expr require(Expr condition, Expr value, Args &&... args) {
     std::vector<Expr> collected_args = {std::move(value)};
     Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
     return require(std::move(condition), collected_args);
@@ -1315,7 +1315,7 @@ inline Expr undef() {
  * on the digest. */
 // @{
 template<typename... Args>
-inline HALIDE_NO_USER_CODE_INLINE Expr memoize_tag(Expr result, Args &&...args) {
+inline HALIDE_NO_USER_CODE_INLINE Expr memoize_tag(Expr result, Args &&... args) {
     std::vector<Expr> collected_args{std::forward<Args>(args)...};
     return Internal::memoize_tag_helper(std::move(result), collected_args);
 }
diff --git a/src/IRVisitor.h b/src/IRVisitor.h
index 1db022c6f6a5..ce33e80ac318 100644
--- a/src/IRVisitor.h
+++ b/src/IRVisitor.h
@@ -159,7 +159,7 @@ template<typename T, typename ExprRet, typename StmtRet>
 class VariadicVisitor {
 private:
     template<typename... Args>
-    ExprRet dispatch_expr(const BaseExprNode *node, Args &&...args) {
+    ExprRet dispatch_expr(const BaseExprNode *node, Args &&... args) {
         if (node == nullptr) {
             return ExprRet{};
         };
@@ -249,7 +249,7 @@ class VariadicVisitor {
     }
 
     template<typename... Args>
-    StmtRet dispatch_stmt(const BaseStmtNode *node, Args &&...args) {
+    StmtRet dispatch_stmt(const BaseStmtNode *node, Args &&... args) {
         if (node == nullptr) {
             return StmtRet{};
         };
@@ -324,22 +324,22 @@ class VariadicVisitor {
 
 public:
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE StmtRet dispatch(const Stmt &s, Args &&...args) {
+    HALIDE_ALWAYS_INLINE StmtRet dispatch(const Stmt &s, Args &&... args) {
         return dispatch_stmt(s.get(), std::forward<Args>(args)...);
     }
 
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE StmtRet dispatch(Stmt &&s, Args &&...args) {
+    HALIDE_ALWAYS_INLINE StmtRet dispatch(Stmt &&s, Args &&... args) {
         return dispatch_stmt(s.get(), std::forward<Args>(args)...);
     }
 
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE ExprRet dispatch(const Expr &e, Args &&...args) {
+    HALIDE_ALWAYS_INLINE ExprRet dispatch(const Expr &e, Args &&... args) {
         return dispatch_expr(e.get(), std::forward<Args>(args)...);
     }
 
     template<typename... Args>
-    HALIDE_ALWAYS_INLINE ExprRet dispatch(Expr &&e, Args &&...args) {
+    HALIDE_ALWAYS_INLINE ExprRet dispatch(Expr &&e, Args &&... args) {
         return dispatch_expr(e.get(), std::forward<Args>(args)...);
     }
 };
diff --git a/src/ImageParam.h b/src/ImageParam.h
index d294b731df2f..d4383bf4ed7f 100644
--- a/src/ImageParam.h
+++ b/src/ImageParam.h
@@ -64,7 +64,7 @@ class ImageParam : public OutputImageParam {
      */
     // @{
     template<typename... Args>
-    HALIDE_NO_USER_CODE_INLINE Expr operator()(Args &&...args) const {
+    HALIDE_NO_USER_CODE_INLINE Expr operator()(Args &&... args) const {
         return func(std::forward<Args>(args)...);
     }
     Expr operator()(std::vector<Expr>) const;
diff --git a/src/Pipeline.h b/src/Pipeline.h
index bccd78516a65..5c833fcfabcf 100644
--- a/src/Pipeline.h
+++ b/src/Pipeline.h
@@ -121,7 +121,7 @@ class Pipeline {
         }
         template<typename T, typename... Args,
                  typename = typename std::enable_if<Internal::all_are_convertible<Buffer<>, Args...>::value>::type>
-        RealizationArg(Buffer<T> &a, Args &&...args) {
+        RealizationArg(Buffer<T> &a, Args &&... args) {
             buffer_list.reset(new std::vector<Buffer<>>({a, args...}));
         }
         RealizationArg(RealizationArg &&from) = default;
@@ -569,7 +569,7 @@ class Pipeline {
     void trace_pipeline();
 
     template<typename... Args>
-    inline HALIDE_NO_USER_CODE_INLINE void add_requirement(const Expr &condition, Args &&...args) {
+    inline HALIDE_NO_USER_CODE_INLINE void add_requirement(const Expr &condition, Args &&... args) {
         std::vector<Expr> collected_args;
         Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
         add_requirement(condition, collected_args);
diff --git a/src/RDom.h b/src/RDom.h
index b425e1c9d32f..32fc1955782d 100644
--- a/src/RDom.h
+++ b/src/RDom.h
@@ -198,7 +198,7 @@ class RDom {
     void initialize_from_region(const Region &region, std::string name = "");
 
     template<typename... Args>
-    HALIDE_NO_USER_CODE_INLINE void initialize_from_region(Region &region, const Expr &min, const Expr &extent, Args &&...args) {
+    HALIDE_NO_USER_CODE_INLINE void initialize_from_region(Region &region, const Expr &min, const Expr &extent, Args &&... args) {
         region.push_back({min, extent});
         initialize_from_region(region, std::forward<Args>(args)...);
     }
@@ -215,7 +215,7 @@ class RDom {
     }
 
     template<typename... Args>
-    HALIDE_NO_USER_CODE_INLINE RDom(Expr min, Expr extent, Args &&...args) {
+    HALIDE_NO_USER_CODE_INLINE RDom(Expr min, Expr extent, Args &&... args) {
         // This should really just be a delegating constructor, but I couldn't make
         // that work with variadic template unpacking in visual studio 2013
         Region region;
diff --git a/src/Realization.h b/src/Realization.h
index bc7f227b254c..77355811abe4 100644
--- a/src/Realization.h
+++ b/src/Realization.h
@@ -44,7 +44,7 @@ class Realization {
     template<typename T,
              typename... Args,
              typename = typename std::enable_if<Internal::all_are_convertible<Buffer<void>, Args...>::value>::type>
-    Realization(Buffer<T> &a, Args &&...args) {
+    Realization(Buffer<T> &a, Args &&... args) {
         images = std::vector<Buffer<void>>({a, args...});
     }
 
diff --git a/src/Tuple.h b/src/Tuple.h
index 7f775c578d14..759935a6fb70 100644
--- a/src/Tuple.h
+++ b/src/Tuple.h
@@ -45,7 +45,7 @@ class Tuple {
     /** Construct a Tuple from some Exprs. */
     //@{
     template<typename... Args>
-    Tuple(const Expr &a, const Expr &b, Args &&...args) {
+    Tuple(const Expr &a, const Expr &b, Args &&... args) {
         exprs = std::vector<Expr>{a, b, std::forward<Args>(args)...};
     }
     //@}
diff --git a/src/runtime/HalideBuffer.h b/src/runtime/HalideBuffer.h
index 32e241f41a3d..24d2908fb62a 100644
--- a/src/runtime/HalideBuffer.h
+++ b/src/runtime/HalideBuffer.h
@@ -928,7 +928,7 @@ class Buffer {
      * host_dirty flag. */
     template<typename... Args,
              typename = typename std::enable_if<AllInts<Args...>::value>::type>
-    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
+    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&... rest) {
         if (!T_is_void) {
             assert(static_halide_type() == t);
         }
@@ -945,7 +945,7 @@ class Buffer {
      * take ownership of the data and does not set the host_dirty flag. */
     template<typename... Args,
              typename = typename std::enable_if<AllInts<Args...>::value>::type>
-    explicit Buffer(T *data, int first, Args &&...rest) {
+    explicit Buffer(T *data, int first, Args &&... rest) {
         int extents[] = {first, (int)rest...};
         buf.type = static_halide_type();
         constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
@@ -2075,7 +2075,7 @@ class Buffer {
     }
 
     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
-    void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
+    void for_each_value_impl(Fn &&f, Args &&... other_buffers) const {
         Buffer<>::for_each_value_task_dim<N> *t =
             (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA((dimensions() + 1) * sizeof(for_each_value_task_dim<N>));
         // Move the preparatory code into a non-templated helper to
@@ -2107,7 +2107,7 @@ class Buffer {
      * will result in a compilation error. */
     // @{
     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
-    HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_value(Fn &&f, Args &&...other_buffers) const {
+    HALIDE_ALWAYS_INLINE const Buffer<T, D> &for_each_value(Fn &&f, Args &&... other_buffers) const {
         for_each_value_impl(f, std::forward<Args>(other_buffers)...);
         return *this;
     }
@@ -2115,7 +2115,7 @@ class Buffer {
     template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
     HALIDE_ALWAYS_INLINE
         Buffer<T, D> &
-        for_each_value(Fn &&f, Args &&...other_buffers) {
+        for_each_value(Fn &&f, Args &&... other_buffers) {
         for_each_value_impl(f, std::forward<Args>(other_buffers)...);
         return *this;
     }
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index 1dc1cf27086e..e3d76c815603 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1059,7 +1059,7 @@ WEAK int halide_cuda_run(void *user_context,
 #endif
 
     halide_assert(user_context, state_ptr);
-    CUmodule mod = nullptr;
+    CUmodule mod;
     bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod);
     halide_assert(user_context, found_module && mod != nullptr);
     debug(user_context) << "Got module " << mod << "\n";
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index 5979a61eccdd..06186e3b838e 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -5,7 +5,7 @@ namespace Halide {
 namespace Internal {
 
 template<typename ContextT, typename ModuleStateT>
-class GPUCompilationCache {
+struct GPUCompilationCache {
     struct CachedCompilation {
         ContextT context;
         ModuleStateT module_state;
@@ -25,7 +25,6 @@ class GPUCompilationCache {
 
     uint32_t unique_id{2};  // zero is an invalid id
 
-public:
     static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) {
         uintptr_t addr = (uintptr_t)context + id;
         // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...
diff --git a/tools/RunGen.h b/tools/RunGen.h
index 7ffd9b23eef4..5a2a64086538 100644
--- a/tools/RunGen.h
+++ b/tools/RunGen.h
@@ -203,7 +203,7 @@ inline constexpr int halide_type_code(halide_type_code_t code, int bits) {
 // variants *will* be instantiated (increasing code size), so this approach
 // should only be used when strictly necessary.
 template<template<typename> class Functor, typename... Args>
-auto dynamic_type_dispatch(const halide_type_t &type, Args &&...args) -> decltype(std::declval<Functor<uint8_t>>()(std::forward<Args>(args)...)) {
+auto dynamic_type_dispatch(const halide_type_t &type, Args &&... args) -> decltype(std::declval<Functor<uint8_t>>()(std::forward<Args>(args)...)) {
 
 #define HANDLE_CASE(CODE, BITS, TYPE)  \
     case halide_type_code(CODE, BITS): \

From 9c1a4b829be7c1ba6e0054af454ca38e41da55da Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 2 Dec 2020 13:36:12 -0800
Subject: [PATCH 16/17] Revert clang-format damage. Address review feedback.

---
 Makefile                         | 3 ++-
 src/runtime/cuda.cpp             | 2 +-
 src/runtime/gpu_context_common.h | 9 +++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 836c1462f6d1..6ce06a94b568 100644
--- a/Makefile
+++ b/Makefile
@@ -1630,7 +1630,8 @@ generator_aot_multitarget: $(BIN_DIR)/$(TARGET)/generator_aot_multitarget
 
 # gpu_multi_context_threaded has additional deps to link in
 $(BIN_DIR)/$(TARGET)/generator_aot_gpu_multi_context_threaded: $(ROOT_DIR)/test/generator/gpu_multi_context_threaded_aottest.cpp \
-	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a
+	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_add.a \
+	                                                       $(FILTERS_DIR)/gpu_multi_context_threaded_mul.a \
 	                                                       $(RUNTIME_EXPORTED_INCLUDES) $(BIN_DIR)/$(TARGET)/runtime.a
 	@mkdir -p $(@D)
 	$(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) $(CUDA_LD_FLAGS) -o $@
diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp
index e3d76c815603..1dc1cf27086e 100644
--- a/src/runtime/cuda.cpp
+++ b/src/runtime/cuda.cpp
@@ -1059,7 +1059,7 @@ WEAK int halide_cuda_run(void *user_context,
 #endif
 
     halide_assert(user_context, state_ptr);
-    CUmodule mod;
+    CUmodule mod = nullptr;
     bool found_module = compilation_cache.lookup(ctx.context, state_ptr, mod);
     halide_assert(user_context, found_module && mod != nullptr);
     debug(user_context) << "Got module " << mod << "\n";
diff --git a/src/runtime/gpu_context_common.h b/src/runtime/gpu_context_common.h
index 06186e3b838e..9ef2662026a8 100644
--- a/src/runtime/gpu_context_common.h
+++ b/src/runtime/gpu_context_common.h
@@ -5,11 +5,11 @@ namespace Halide {
 namespace Internal {
 
 template<typename ContextT, typename ModuleStateT>
-struct GPUCompilationCache {
+class GPUCompilationCache {
     struct CachedCompilation {
-        ContextT context;
-        ModuleStateT module_state;
-        uint32_t kernel_id;
+        ContextT context{};
+        ModuleStateT module_state{};
+        uint32_t kernel_id{};
     };
 
     halide_mutex mutex;
@@ -25,6 +25,7 @@ struct GPUCompilationCache {
 
     uint32_t unique_id{2};  // zero is an invalid id
 
+public:
     static ALWAYS_INLINE uintptr_t kernel_hash(ContextT context, uint32_t id, uint32_t bits) {
         uintptr_t addr = (uintptr_t)context + id;
         // Fibonacci hashing. The golden ratio is 1.9E3779B97F4A7C15F39...

From b2d0bca79359501c10ad00a02bc6343ccf5ed0c9 Mon Sep 17 00:00:00 2001
From: Z Stern <zalman@google.com>
Date: Wed, 2 Dec 2020 14:20:32 -0800
Subject: [PATCH 17/17] Make test work without GPU support.

---
 .../gpu_multi_context_threaded_aottest.cpp    | 24 +++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/test/generator/gpu_multi_context_threaded_aottest.cpp b/test/generator/gpu_multi_context_threaded_aottest.cpp
index 9ed77ff77e82..05852909ba57 100644
--- a/test/generator/gpu_multi_context_threaded_aottest.cpp
+++ b/test/generator/gpu_multi_context_threaded_aottest.cpp
@@ -65,6 +65,8 @@ extern "C" int halide_acquire_cl_context(void *user_context, cl_context *ctx, cl
 extern "C" int halide_release_cl_context(void *user_context) {
     return 0;
 }
+
+#define HAS_MULTIPLE_CONTEXTS true
 #elif defined(TEST_CUDA)
 
 typedef CUcontext gpu_context;
@@ -94,14 +96,20 @@ extern "C" int halide_cuda_acquire_context(void *user_context, CUcontext *ctx, b
 extern "C" int halide_cuda_release_context(void *user_context) {
     return 0;
 }
+
+#define HAS_MULTIPLE_CONTEXTS true
 #else
+typedef int gpu_context;
+
 // Just use the default implementation of acquire/release.
-bool init_context() {
+bool init_context(int &context) {
     printf("Using default implementation of acquire/release\n");
+    context = 0;
     return true;
 }
-void destroy_context() {
-}
+void destroy_context(int & /* context */) {
+
+#define HAS_MULTIPLE_CONTEXTS false
 #endif
 
 void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) {
@@ -143,7 +151,9 @@ void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) {
         buf2_in.device_free(&context2);
         buf2_result.device_free(&context2);
 
-        halide_device_release(&context2, device_interface);
+        if (device_interface != nullptr) {
+            halide_device_release(&context2, device_interface);
+        }
         destroy_context(context2);
     }
 
@@ -151,7 +161,7 @@ void run_kernels_on_thread(gpu_context context1, bool destroy_when_done) {
     buf1_in.device_free(&context1);
     buf1_result.device_free(&context1);
 
-    if (destroy_when_done) {
+    if (destroy_when_done && device_interface != nullptr) {
         halide_device_release(&context1, device_interface);
         destroy_context(context1);
     }
@@ -171,8 +181,8 @@ int main(int argc, char **argv) {
     thread2.join();
 
     // Make sure using the same context on different threads works.
-    std::thread thread3(run_kernels_on_thread, contexta, true);
-    std::thread thread4(run_kernels_on_thread, contextb, true);
+    std::thread thread3(run_kernels_on_thread, contexta, HAS_MULTIPLE_CONTEXTS);
+    std::thread thread4(run_kernels_on_thread, contextb, HAS_MULTIPLE_CONTEXTS);
 
     thread3.join();
     thread4.join();