From 7d70656a4390453c6331001a01953213eb914cb8 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 25 Feb 2021 19:02:05 -0700 Subject: [PATCH 01/19] Remove unused vertex buffer parameters. --- src/CodeGen_GPU_Host.cpp | 12 ------------ src/runtime/HalideRuntimeCuda.h | 6 +----- src/runtime/HalideRuntimeD3D12Compute.h | 6 +----- src/runtime/HalideRuntimeMetal.h | 6 +----- src/runtime/HalideRuntimeOpenCL.h | 6 +----- src/runtime/HalideRuntimeOpenGLCompute.h | 6 +----- src/runtime/cuda.cpp | 6 +----- src/runtime/d3d12compute.cpp | 6 +----- src/runtime/metal.cpp | 6 +----- src/runtime/opencl.cpp | 6 +----- src/runtime/openglcompute.cpp | 9 ++------- 11 files changed, 11 insertions(+), 64 deletions(-) diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp index dea5a586c4ad..0ea02b28a04c 100644 --- a/src/CodeGen_GPU_Host.cpp +++ b/src/CodeGen_GPU_Host.cpp @@ -269,14 +269,6 @@ void CodeGen_GPU_Host::visit(const For *loop) { } } - Value *null_float_ptr = ConstantPointerNull::get(CodeGen_LLVM::f32_t->getPointerTo()); - Value *zero_int32 = codegen(Expr(cast(0))); - - Value *gpu_num_padded_attributes = zero_int32; - Value *gpu_vertex_buffer = null_float_ptr; - Value *gpu_num_coords_dim0 = zero_int32; - Value *gpu_num_coords_dim1 = zero_int32; - // compute a closure over the state passed into the kernel HostClosure c(loop->body, loop->name); @@ -500,10 +492,6 @@ void CodeGen_GPU_Host::visit(const For *loop) { 0, 0, "gpu_arg_is_buffer_ref" + api_unique_name), - gpu_num_padded_attributes, - gpu_vertex_buffer, - gpu_num_coords_dim0, - gpu_num_coords_dim1, }; std::string run_fn_name = "halide_" + api_unique_name + "_run"; llvm::Function *dev_run_fn = module->getFunction(run_fn_name); diff --git a/src/runtime/HalideRuntimeCuda.h b/src/runtime/HalideRuntimeCuda.h index 7f8481642ba2..3eff7d834543 100644 --- a/src/runtime/HalideRuntimeCuda.h +++ b/src/runtime/HalideRuntimeCuda.h @@ -33,11 +33,7 @@ extern int halide_cuda_run(void *user_context, int shared_mem_bytes, 
size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + int8_t arg_is_buffer[]); extern void halide_cuda_finalize_kernels(void *user_context, void *state_ptr); // @} diff --git a/src/runtime/HalideRuntimeD3D12Compute.h b/src/runtime/HalideRuntimeD3D12Compute.h index ef6d618b1b5d..5814f85a8de2 100644 --- a/src/runtime/HalideRuntimeD3D12Compute.h +++ b/src/runtime/HalideRuntimeD3D12Compute.h @@ -30,11 +30,7 @@ extern int halide_d3d12compute_run(void *user_context, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, - halide_type_t arg_types[], void *args[], int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + halide_type_t arg_types[], void *args[], int8_t arg_is_buffer[]); extern void halide_d3d12compute_finalize_kernels(void *user_context, void *state_ptr); // @} diff --git a/src/runtime/HalideRuntimeMetal.h b/src/runtime/HalideRuntimeMetal.h index 6e795416e442..802d6659317f 100644 --- a/src/runtime/HalideRuntimeMetal.h +++ b/src/runtime/HalideRuntimeMetal.h @@ -35,11 +35,7 @@ extern int halide_metal_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + int8_t arg_is_buffer[]); // @} /** Set the underlying MTLBuffer for a halide_buffer_t. 
This memory should be diff --git a/src/runtime/HalideRuntimeOpenCL.h b/src/runtime/HalideRuntimeOpenCL.h index 54b8b4157489..510dc6f1ba8e 100644 --- a/src/runtime/HalideRuntimeOpenCL.h +++ b/src/runtime/HalideRuntimeOpenCL.h @@ -34,11 +34,7 @@ extern int halide_opencl_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + int8_t arg_is_buffer[]); extern void halide_opencl_finalize_kernels(void *user_context, void *state_ptr); // @} diff --git a/src/runtime/HalideRuntimeOpenGLCompute.h b/src/runtime/HalideRuntimeOpenGLCompute.h index 2fd76c2fd697..decca61124f1 100644 --- a/src/runtime/HalideRuntimeOpenGLCompute.h +++ b/src/runtime/HalideRuntimeOpenGLCompute.h @@ -44,11 +44,7 @@ extern int halide_openglcompute_run(void *user_context, int shared_mem_bytes, struct halide_type_t arg_types[], void *args[], - int8_t is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + int8_t is_buffer[]); extern void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr); // @} diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 5260e2847b9d..0846bd9b5c40 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1045,11 +1045,7 @@ WEAK int halide_cuda_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1) { + int8_t arg_is_buffer[]) { debug(user_context) << "CUDA: halide_cuda_run (" << "user_context: " << user_context << ", " diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index d190c164ad8e..eab19a6aacd5 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -2986,11 +2986,7 @@ WEAK int halide_d3d12compute_run(void *user_context, int blocksX, int blocksY, int 
blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, - halide_type_t arg_types[], void *args[], int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1) { + halide_type_t arg_types[], void *args[], int8_t arg_is_buffer[]) { TRACELOG; D3D12ContextHolder d3d12_context(user_context, true); diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index 9450ab5eee00..4d8e6d093b8d 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -728,11 +728,7 @@ WEAK int halide_metal_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1) { + int8_t arg_is_buffer[]) { #ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); #endif diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index c62b957f0e3b..04fce0178cb3 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -1043,11 +1043,7 @@ WEAK int halide_opencl_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1) { + int8_t arg_is_buffer[]) { debug(user_context) << "CL: halide_opencl_run (user_context: " << user_context << ", " << "entry: " << entry_name << ", " diff --git a/src/runtime/openglcompute.cpp b/src/runtime/openglcompute.cpp index 22b3ac15aa7d..99707ab02888 100644 --- a/src/runtime/openglcompute.cpp +++ b/src/runtime/openglcompute.cpp @@ -592,9 +592,7 @@ WEAK int halide_openglcompute_run(void *user_context, void *state_ptr, const char *entry_name, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, halide_type_t arg_types[], void *args[], - int8_t arg_is_buffer[], int num_attributes, - float *vertex_buffer, int num_coords_dim0, - int num_coords_dim1) { + 
int8_t arg_is_buffer[]) { #ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); #endif @@ -604,10 +602,7 @@ WEAK int halide_openglcompute_run(void *user_context, void *state_ptr, << "entry: " << entry_name << ", " << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", " << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", " - << "shmem: " << shared_mem_bytes << ", " - << "num_attributes: " << num_attributes << ", " - << "num_coords_dim0: " << num_coords_dim0 << ", " - << "num_coords_dim1: " << num_coords_dim1 << "\n"; + << "shmem: " << shared_mem_bytes << "\n"; if (!global_state.initialized) { error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_run)."; From 041b9fe09865bf25fa26f90db72a5d7b631899a6 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 25 Feb 2021 19:48:06 -0700 Subject: [PATCH 02/19] Offload GPU code in a lowering pass instead of via CodeGen_GPU_Host. Fixes #5650, fixes #2797, fixes #2084, now #1971 is more relevant. 
--- Makefile | 4 +- src/CMakeLists.txt | 4 +- src/CodeGen_GPU_Host.cpp | 537 --------------------------------------- src/CodeGen_GPU_Host.h | 75 ------ src/CodeGen_LLVM.cpp | 43 +--- src/Lower.cpp | 10 + src/OffloadGPULoops.cpp | 344 +++++++++++++++++++++++++ src/OffloadGPULoops.h | 25 ++ 8 files changed, 384 insertions(+), 658 deletions(-) delete mode 100644 src/CodeGen_GPU_Host.cpp delete mode 100644 src/CodeGen_GPU_Host.h create mode 100644 src/OffloadGPULoops.cpp create mode 100644 src/OffloadGPULoops.h diff --git a/Makefile b/Makefile index edc66e4178ca..c685c299940c 100644 --- a/Makefile +++ b/Makefile @@ -422,7 +422,6 @@ SOURCE_FILES = \ CodeGen_C.cpp \ CodeGen_D3D12Compute_Dev.cpp \ CodeGen_GPU_Dev.cpp \ - CodeGen_GPU_Host.cpp \ CodeGen_Hexagon.cpp \ CodeGen_Internal.cpp \ CodeGen_LLVM.cpp \ @@ -498,6 +497,7 @@ SOURCE_FILES = \ ModulusRemainder.cpp \ Monotonic.cpp \ ObjectInstanceRegistry.cpp \ + OffloadGPULoops.cpp \ OutputImageParam.cpp \ ParallelRVar.cpp \ Parameter.cpp \ @@ -595,7 +595,6 @@ HEADER_FILES = \ CodeGen_C.h \ CodeGen_D3D12Compute_Dev.h \ CodeGen_GPU_Dev.h \ - CodeGen_GPU_Host.h \ CodeGen_Internal.h \ CodeGen_LLVM.h \ CodeGen_Metal_Dev.h \ @@ -679,6 +678,7 @@ HEADER_FILES = \ ModulusRemainder.h \ Monotonic.h \ ObjectInstanceRegistry.h \ + OffloadGPULoops.h \ OutputImageParam.h \ ParallelRVar.h \ Param.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 289b7bc9447d..b74006aa3528 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,7 +27,6 @@ set(HEADER_FILES CodeGen_C.h CodeGen_D3D12Compute_Dev.h CodeGen_GPU_Dev.h - CodeGen_GPU_Host.h CodeGen_Internal.h CodeGen_LLVM.h CodeGen_Metal_Dev.h @@ -110,6 +109,7 @@ set(HEADER_FILES ModulusRemainder.h Monotonic.h ObjectInstanceRegistry.h + OffloadGPULoops.h OutputImageParam.h ParallelRVar.h Param.h @@ -190,7 +190,6 @@ set(SOURCE_FILES CodeGen_C.cpp CodeGen_D3D12Compute_Dev.cpp CodeGen_GPU_Dev.cpp - CodeGen_GPU_Host.cpp CodeGen_Hexagon.cpp CodeGen_Internal.cpp CodeGen_LLVM.cpp @@ 
-266,6 +265,7 @@ set(SOURCE_FILES ModulusRemainder.cpp Monotonic.cpp ObjectInstanceRegistry.cpp + OffloadGPULoops.cpp OutputImageParam.cpp ParallelRVar.cpp Parameter.cpp diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp deleted file mode 100644 index 0ea02b28a04c..000000000000 --- a/src/CodeGen_GPU_Host.cpp +++ /dev/null @@ -1,537 +0,0 @@ -#include - -#include "CodeGen_ARM.h" -#include "CodeGen_D3D12Compute_Dev.h" -#include "CodeGen_GPU_Host.h" -#include "CodeGen_Internal.h" -#include "CodeGen_MIPS.h" -#include "CodeGen_Metal_Dev.h" -#include "CodeGen_OpenCL_Dev.h" -#include "CodeGen_OpenGLCompute_Dev.h" -#include "CodeGen_PTX_Dev.h" -#include "CodeGen_PowerPC.h" -#include "CodeGen_RISCV.h" -#include "CodeGen_WebAssembly.h" -#include "CodeGen_X86.h" -#include "Debug.h" -#include "DeviceArgument.h" -#include "ExprUsesVar.h" -#include "IROperator.h" -#include "IRPrinter.h" -#include "LLVM_Headers.h" -#include "Simplify.h" -#include "Util.h" - -namespace Halide { -namespace Internal { - -using std::map; -using std::string; -using std::vector; - -using namespace llvm; - -namespace { - -// Sniff the contents of a kernel to extracts the bounds of all the -// thread indices (so we know how many threads to launch), and the -// amount of shared memory to allocate. 
-class ExtractBounds : public IRVisitor { -public: - Expr num_threads[4]; - Expr num_blocks[4]; - Expr shared_mem_size; - - ExtractBounds() - : shared_mem_size(0) { - for (int i = 0; i < 4; i++) { - num_threads[i] = num_blocks[i] = 1; - } - } - -private: - bool found_shared = false; - - using IRVisitor::visit; - - void visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { - internal_assert(is_const_zero(op->min)); - } - - if (ends_with(op->name, ".__thread_id_x")) { - num_threads[0] = op->extent; - } else if (ends_with(op->name, ".__thread_id_y")) { - num_threads[1] = op->extent; - } else if (ends_with(op->name, ".__thread_id_z")) { - num_threads[2] = op->extent; - } else if (ends_with(op->name, ".__thread_id_w")) { - num_threads[3] = op->extent; - } else if (ends_with(op->name, ".__block_id_x")) { - num_blocks[0] = op->extent; - } else if (ends_with(op->name, ".__block_id_y")) { - num_blocks[1] = op->extent; - } else if (ends_with(op->name, ".__block_id_z")) { - num_blocks[2] = op->extent; - } else if (ends_with(op->name, ".__block_id_w")) { - num_blocks[3] = op->extent; - } - - op->body.accept(this); - } - - void visit(const LetStmt *op) override { - if (expr_uses_var(shared_mem_size, op->name)) { - shared_mem_size = Let::make(op->name, op->value, shared_mem_size); - } - op->body.accept(this); - } - - void visit(const Allocate *allocate) override { - user_assert(!allocate->new_expr.defined()) << "Allocate node inside GPU kernel has custom new expression.\n" - << "(Memoization is not supported inside GPU kernels at present.)\n"; - - if (allocate->memory_type == MemoryType::GPUShared) { - internal_assert(allocate->extents.size() == 1); - shared_mem_size += allocate->extents[0] * allocate->type.bytes(); - found_shared = true; - } - allocate->body.accept(this); - } -}; - -Value *get_module_state(llvm::Module *module, const std::string &function_name, - const std::string &api_unique_name, bool create = true) { - std::string name = 
"module_state_" + function_name + "_" + api_unique_name; - GlobalVariable *module_state = module->getGlobalVariable(name, true); - if (!module_state && create) { - // Create a global variable to hold the module state - PointerType *void_ptr_type = llvm::Type::getInt8PtrTy(module->getContext()); - module_state = new GlobalVariable(*module, void_ptr_type, - false, GlobalVariable::InternalLinkage, - ConstantPointerNull::get(void_ptr_type), - name); - debug(4) << "Created device module state global variable\n"; - } - - return module_state; -} - -} // namespace - -template -CodeGen_GPU_Host::CodeGen_GPU_Host(const Target &target) - : CodeGen_CPU(target) { - // For the default GPU, the order of preferences is: Metal, - // OpenCL, CUDA, OpenGLCompute last. - // The code is in reverse order to allow later tests to override - // earlier ones. - if (target.has_feature(Target::OpenGLCompute)) { - debug(1) << "Constructing OpenGL Compute device codegen\n"; - cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(target); - } - if (target.has_feature(Target::CUDA)) { - debug(1) << "Constructing CUDA device codegen\n"; - cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(target); - } - if (target.has_feature(Target::OpenCL)) { - debug(1) << "Constructing OpenCL device codegen\n"; - cgdev[DeviceAPI::OpenCL] = new_CodeGen_OpenCL_Dev(target); - } - if (target.has_feature(Target::Metal)) { - debug(1) << "Constructing Metal device codegen\n"; - cgdev[DeviceAPI::Metal] = new_CodeGen_Metal_Dev(target); - } - if (target.has_feature(Target::D3D12Compute)) { - debug(1) << "Constructing Direct3D 12 Compute device codegen\n"; - cgdev[DeviceAPI::D3D12Compute] = new_CodeGen_D3D12Compute_Dev(target); - } - - if (cgdev.empty()) { - internal_error << "Requested unknown GPU target: " << target.to_string() << "\n"; - } -} - -template -void CodeGen_GPU_Host::compile_func(const LoweredFunc &f, - const std::string &simple_name, - const std::string &extern_name) { - function_name = simple_name; - - 
// Create a new module for all of the kernels we find in this function. - for (auto &i : cgdev) { - i.second->init_module(); - } - - // Call the base implementation to create the function. - CodeGen_CPU::compile_func(f, simple_name, extern_name); - - // We need to insert code after the existing entry block, so that - // the destructor stack slots exist before we do the assertions - // involved in initializing gpu kernels. - - // Split the entry block just before its end. - BasicBlock *entry = &function->getEntryBlock(); - llvm::Instruction *terminator = entry->getTerminator(); - internal_assert(terminator); - BasicBlock *post_entry = entry->splitBasicBlock(terminator); - - // Create some code that does the GPU initialization. - BasicBlock *init_kernels_bb = BasicBlock::Create(*context, "init_kernels", - function, post_entry); - - // The entry block should go to the init kernels block instead of - // the post entry block. - entry->getTerminator()->eraseFromParent(); - builder->SetInsertPoint(entry); - builder->CreateBr(init_kernels_bb); - - // Fill out the init kernels block - builder->SetInsertPoint(init_kernels_bb); - - for (auto &i : cgdev) { - CodeGen_GPU_Dev *gpu_codegen = i.second.get(); - std::string api_unique_name = gpu_codegen->api_unique_name(); - - // If the module state for this API/function did not get created, there were - // no kernels using this API. - llvm::Value *module_state = get_module_state(module.get(), function_name, api_unique_name, false); - if (!module_state) { - continue; - } - - debug(2) << "Generating init_kernels for " << api_unique_name << "\n"; - - std::vector kernel_src = gpu_codegen->compile_to_src(); - - Value *kernel_src_ptr = - CodeGen_CPU::create_binary_blob(kernel_src, - "halide_" + function_name + "_" + api_unique_name + "_kernel_src"); - - if (f.args[0].name == "__user_context") { - // The user context is first argument of the function. 
- // We retrieve it here so it's available for subsequent calls of - // get_user_context(). - sym_push("__user_context", iterator_to_pointer(function->arg_begin())); - } - - Value *user_context = get_user_context(); - Value *kernel_size = ConstantInt::get(i32_t, kernel_src.size()); - std::string init_kernels_name = "halide_" + api_unique_name + "_initialize_kernels"; - llvm::Function *init = module->getFunction(init_kernels_name); - internal_assert(init) << "Could not find function " + init_kernels_name + " in initial module\n"; - vector init_kernels_args = {user_context, module_state, kernel_src_ptr, kernel_size}; - Value *result = builder->CreateCall(init, init_kernels_args); - Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0)); - CodeGen_CPU::create_assertion(did_succeed, Expr(), result); - - // Generate a finalizer call as well to relase any refcounts or other resource usage - // specific to this filter call. - std::string finalize_kernels_name = "halide_" + api_unique_name + "_finalize_kernels"; - llvm::Function *finalize = module->getFunction(finalize_kernels_name); - Value *module_state_value = builder->CreateLoad(module_state); - register_destructor(finalize, module_state_value, CodeGen_CPU::Always); - } - - // the init kernels block should branch to the post-entry block - builder->CreateBr(post_entry); - - function_name = ""; -} - -template -void CodeGen_GPU_Host::visit(const For *loop) { - if (CodeGen_GPU_Dev::is_gpu_var(loop->name)) { - // We're in the loop over outermost block dimension - debug(2) << "Kernel launch: " << loop->name << "\n"; - - internal_assert(loop->device_api != DeviceAPI::Default_GPU) - << "A concrete device API should have been selected before codegen."; - - ExtractBounds bounds; - loop->accept(&bounds); - - debug(2) << "Kernel bounds: (" - << bounds.num_threads[0] << ", " - << bounds.num_threads[1] << ", " - << bounds.num_threads[2] << ", " - << bounds.num_threads[3] << ") threads, (" - << 
bounds.num_blocks[0] << ", " - << bounds.num_blocks[1] << ", " - << bounds.num_blocks[2] << ", " - << bounds.num_blocks[3] << ") blocks\n"; - - // compile the kernel - string kernel_name = unique_name("kernel_" + loop->name); - for (size_t i = 0; i < kernel_name.size(); i++) { - if (!isalnum(kernel_name[i])) { - kernel_name[i] = '_'; - } - } - - // compute a closure over the state passed into the kernel - HostClosure c(loop->body, loop->name); - - // Determine the arguments that must be passed into the halide function - vector closure_args = c.arguments(); - - // Sort the args by the size of the underlying type. This is - // helpful for avoiding struct-packing ambiguities in metal, - // which passes the scalar args as a struct. - std::sort(closure_args.begin(), closure_args.end(), - [](const DeviceArgument &a, const DeviceArgument &b) { - if (a.is_buffer == b.is_buffer) { - return a.type.bits() > b.type.bits(); - } else { - // Ensure that buffer arguments come first: - // for many OpenGL/Compute systems, the - // legal indices for buffer args are much - // more restrictive than for scalar args, - // and scalar args can be 'grown' by - // LICM. Putting buffers first makes it much - // more likely we won't fail on some - // hardware. 
- return a.is_buffer > b.is_buffer; - } - }); - - for (size_t i = 0; i < closure_args.size(); i++) { - if (closure_args[i].is_buffer && allocations.contains(closure_args[i].name)) { - closure_args[i].size = allocations.get(closure_args[i].name).constant_bytes; - } - } - - CodeGen_GPU_Dev *gpu_codegen = cgdev[loop->device_api].get(); - user_assert(gpu_codegen != nullptr) - << "Loop is scheduled on device " << loop->device_api - << " which does not appear in target " << target.to_string() << "\n"; - gpu_codegen->add_kernel(loop, kernel_name, closure_args); - - // get the actual name of the generated kernel for this loop - kernel_name = gpu_codegen->get_current_kernel_name(); - debug(2) << "Compiled launch to kernel \"" << kernel_name << "\"\n"; - Value *entry_name_str = builder->CreateGlobalStringPtr(kernel_name, "entry_name"); - - llvm::Type *target_size_t_type = (target.bits == 32) ? i32_t : i64_t; - - // build the kernel arguments array - llvm::PointerType *arg_t = i8_t->getPointerTo(); // void* - int num_args = (int)closure_args.size(); - - // nullptr-terminated list - llvm::Type *gpu_args_arr_type = ArrayType::get(arg_t, num_args + 1); - Value *gpu_args_arr = - create_alloca_at_entry( - gpu_args_arr_type, - 1, false, - kernel_name + "_args"); - - // nullptr-terminated list of size_t's - llvm::Type *gpu_arg_sizes_arr_type = ArrayType::get(target_size_t_type, num_args + 1); - llvm::ArrayType *gpu_arg_types_arr_type = ArrayType::get(type_t_type, num_args + 1); - vector arg_types_array_entries; - - std::string api_unique_name = gpu_codegen->api_unique_name(); - - Value *gpu_arg_sizes_arr = nullptr; - bool runtime_run_takes_types = gpu_codegen->kernel_run_takes_types(); - - if (!runtime_run_takes_types) { - gpu_arg_sizes_arr = - create_alloca_at_entry( - gpu_arg_sizes_arr_type, - 1, false, - kernel_name + "_arg_sizes"); - } - - llvm::Type *gpu_arg_is_buffer_arr_type = ArrayType::get(i8_t, num_args + 1); - Value *gpu_arg_is_buffer_arr = - create_alloca_at_entry( - 
gpu_arg_is_buffer_arr_type, - 1, false, - kernel_name + "_arg_is_buffer"); - - for (int i = 0; i < num_args; i++) { - // get the closure argument - string name = closure_args[i].name; - Value *val; - - if (closure_args[i].is_buffer) { - // If it's a buffer, get the .buffer symbol - val = sym_get(name + ".buffer"); - } else if (ends_with(name, ".varying")) { - // Expressions for varying attributes are passed in the - // expression mesh. Pass a non-nullptr value in the argument array - // to keep it in sync with the argument names encoded in the - // shader header - val = ConstantInt::get(target_size_t_type, 1); - } else { - // Otherwise just look up the symbol - val = sym_get(name); - } - - if (!closure_args[i].is_buffer) { - // allocate stack space to mirror the closure element. It - // might be in a register and we need a pointer to it for - // the gpu args array. - Value *ptr = create_alloca_at_entry(val->getType(), 1, false, name + ".stack"); - // store the closure value into the stack space - builder->CreateStore(val, ptr); - val = ptr; - } - - // store a void * pointer to the argument into the gpu_args_arr - Value *bits = builder->CreateBitCast(val, arg_t); - builder->CreateStore(bits, - builder->CreateConstGEP2_32( - gpu_args_arr_type, - gpu_args_arr, - 0, - i)); - - if (runtime_run_takes_types) { - Constant *arg_type_fields[] = { - ConstantInt::get(i8_t, closure_args[i].type.code()), - ConstantInt::get(i8_t, closure_args[i].type.bits()), - ConstantInt::get(i16_t, 1)}; - arg_types_array_entries.push_back(ConstantStruct::get(type_t_type, arg_type_fields)); - } else { - // store the size of the argument. - int size_bytes = (closure_args[i].is_buffer) ? 
8 : closure_args[i].type.bytes(); - builder->CreateStore(ConstantInt::get(target_size_t_type, size_bytes), - builder->CreateConstGEP2_32( - gpu_arg_sizes_arr_type, - gpu_arg_sizes_arr, - 0, - i)); - } - - builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer), - builder->CreateConstGEP2_32( - gpu_arg_is_buffer_arr_type, - gpu_arg_is_buffer_arr, - 0, - i)); - } - // nullptr-terminate the lists - builder->CreateStore(ConstantPointerNull::get(arg_t), - builder->CreateConstGEP2_32( - gpu_args_arr_type, - gpu_args_arr, - 0, - num_args)); - if (runtime_run_takes_types) { - Constant *arg_type_fields[] = { - ConstantInt::get(i8_t, 0), - ConstantInt::get(i8_t, 0), - ConstantInt::get(i16_t, 0)}; - arg_types_array_entries.push_back(ConstantStruct::get(type_t_type, arg_type_fields)); - } else { - builder->CreateStore(ConstantInt::get(target_size_t_type, 0), - builder->CreateConstGEP2_32( - gpu_arg_sizes_arr_type, - gpu_arg_sizes_arr, - 0, - num_args)); - } - builder->CreateStore(ConstantInt::get(i8_t, 0), - builder->CreateConstGEP2_32( - gpu_arg_is_buffer_arr_type, - gpu_arg_is_buffer_arr, - 0, - num_args)); - - GlobalVariable *arg_types_array_storage = nullptr; - if (runtime_run_takes_types) { - arg_types_array_storage = new GlobalVariable( - *module, - gpu_arg_types_arr_type, - /*isConstant*/ true, - GlobalValue::PrivateLinkage, - ConstantArray::get(gpu_arg_types_arr_type, arg_types_array_entries)); - } - - // TODO: only three dimensions can be passed to - // cuLaunchKernel. How should we handle blkid[3]? 
- internal_assert(is_const_one(bounds.num_threads[3]) && is_const_one(bounds.num_blocks[3])) - << bounds.num_threads[3] << ", " << bounds.num_blocks[3] << "\n"; - debug(4) << "CodeGen_GPU_Host get_user_context returned " << get_user_context() << "\n"; - debug(3) << "bounds.num_blocks[0] = " << bounds.num_blocks[0] << "\n"; - debug(3) << "bounds.num_blocks[1] = " << bounds.num_blocks[1] << "\n"; - debug(3) << "bounds.num_blocks[2] = " << bounds.num_blocks[2] << "\n"; - debug(3) << "bounds.num_threads[0] = " << bounds.num_threads[0] << "\n"; - debug(3) << "bounds.num_threads[1] = " << bounds.num_threads[1] << "\n"; - debug(3) << "bounds.num_threads[2] = " << bounds.num_threads[2] << "\n"; - - Constant *zero = ConstantInt::get(i32_t, 0); - Value *zeros[] = {zero, zero}; - - // Order-of-evaluation is guaranteed to be in order in brace-init-lists, - // so the multiple calls to codegen here are fine - Value *launch_args[] = { - get_user_context(), - builder->CreateLoad(get_module_state(module.get(), function_name, api_unique_name)), - entry_name_str, - codegen(bounds.num_blocks[0]), - codegen(bounds.num_blocks[1]), - codegen(bounds.num_blocks[2]), - codegen(bounds.num_threads[0]), - codegen(bounds.num_threads[1]), - codegen(bounds.num_threads[2]), - codegen(bounds.shared_mem_size), - runtime_run_takes_types ? 
ConstantExpr::getInBoundsGetElementPtr(gpu_arg_types_arr_type, arg_types_array_storage, zeros) : builder->CreateConstGEP2_32(gpu_arg_sizes_arr_type, gpu_arg_sizes_arr, 0, 0, "gpu_arg_sizes_ar_ref" + api_unique_name), - builder->CreateConstGEP2_32( - gpu_args_arr_type, - gpu_args_arr, - 0, - 0, - "gpu_args_arr_ref" + api_unique_name), - builder->CreateConstGEP2_32( - gpu_arg_is_buffer_arr_type, - gpu_arg_is_buffer_arr, - 0, - 0, - "gpu_arg_is_buffer_ref" + api_unique_name), - }; - std::string run_fn_name = "halide_" + api_unique_name + "_run"; - llvm::Function *dev_run_fn = module->getFunction(run_fn_name); - internal_assert(dev_run_fn) << "Could not find " << run_fn_name << " in module\n"; - Value *result = builder->CreateCall(dev_run_fn, launch_args); - Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0)); - - CodeGen_CPU::create_assertion(did_succeed, - // Should have already called halide_error inside the gpu runtime - halide_error_code_device_run_failed, - result); - } else { - CodeGen_CPU::visit(loop); - } -} - -// Force template instantiation. 
-#ifdef WITH_X86 -template class CodeGen_GPU_Host; -#endif - -#if defined(WITH_ARM) || defined(WITH_AARCH64) -template class CodeGen_GPU_Host; -#endif - -#ifdef WITH_MIPS -template class CodeGen_GPU_Host; -#endif - -#ifdef WITH_POWERPC -template class CodeGen_GPU_Host; -#endif - -#ifdef WITH_WEBASSEMBLY -template class CodeGen_GPU_Host; -#endif - -#ifdef WITH_RISCV -template class CodeGen_GPU_Host; -#endif - -} // namespace Internal -} // namespace Halide diff --git a/src/CodeGen_GPU_Host.h b/src/CodeGen_GPU_Host.h deleted file mode 100644 index 6b867fb2a2c3..000000000000 --- a/src/CodeGen_GPU_Host.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef HALIDE_CODEGEN_GPU_HOST_H -#define HALIDE_CODEGEN_GPU_HOST_H - -/** \file - * Defines the code-generator for producing GPU host code - */ - -#include -#include - -#include "CodeGen_GPU_Dev.h" -#include "IR.h" - -namespace Halide { - -struct Target; - -namespace Internal { - -/** A code generator that emits GPU code from a given Halide stmt. */ -template -class CodeGen_GPU_Host : public CodeGen_CPU { -public: - /** Create a GPU code generator. GPU target is selected via - * CodeGen_GPU_Options. Processor features can be enabled using the - * appropriate flags from Target */ - CodeGen_GPU_Host(const Target &); - -protected: - void compile_func(const LoweredFunc &func, const std::string &simple_name, const std::string &extern_name) override; - - /** Declare members of the base class that must exist to help the - * compiler do name lookup. Annoying but necessary, because the - * compiler doesn't know that CodeGen_CPU will in fact inherit - * from CodeGen for every instantiation of this template. 
*/ - using CodeGen_CPU::allocations; - using CodeGen_CPU::builder; - using CodeGen_CPU::codegen; - using CodeGen_CPU::context; - using CodeGen_CPU::create_alloca_at_entry; - using CodeGen_CPU::function; - using CodeGen_CPU::get_user_context; - using CodeGen_CPU::halide_buffer_t_type; - using CodeGen_CPU::i16_t; - using CodeGen_CPU::i32_t; - using CodeGen_CPU::i64_t; - using CodeGen_CPU::i8_t; - using CodeGen_CPU::init_module; - using CodeGen_CPU::llvm_type_of; - using CodeGen_CPU::module; - using CodeGen_CPU::register_destructor; - using CodeGen_CPU::sym_exists; - using CodeGen_CPU::sym_get; - using CodeGen_CPU::sym_pop; - using CodeGen_CPU::sym_push; - using CodeGen_CPU::target; - using CodeGen_CPU::type_t_type; - using CodeGen_CPU::visit; - - /** Nodes for which we need to override default behavior for the GPU runtime */ - // @{ - void visit(const For *) override; - // @} - - std::string function_name; - -private: - /** Child code generator for device kernels. */ - std::map> cgdev; -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 6ae5e9a74547..97dbd8c225da 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -7,7 +7,6 @@ #include "CPlusPlusMangle.h" #include "CSE.h" #include "CodeGen_ARM.h" -#include "CodeGen_GPU_Host.h" #include "CodeGen_Hexagon.h" #include "CodeGen_Internal.h" #include "CodeGen_LLVM.h" @@ -239,47 +238,7 @@ void CodeGen_LLVM::set_context(llvm::LLVMContext &context) { } std::unique_ptr CodeGen_LLVM::new_for_target(const Target &target, llvm::LLVMContext &context) { - // The awkward mapping from targets to code generators - if (target.features_any_of({Target::CUDA, - Target::OpenCL, - Target::OpenGLCompute, - Target::Metal, - Target::D3D12Compute})) { -#ifdef WITH_X86 - if (target.arch == Target::X86) { - return make_codegen>(target, context); - } -#endif -#if defined(WITH_ARM) || defined(WITH_AARCH64) - if (target.arch == Target::ARM) { - return 
make_codegen>(target, context); - } -#endif -#ifdef WITH_MIPS - if (target.arch == Target::MIPS) { - return make_codegen>(target, context); - } -#endif -#ifdef WITH_POWERPC - if (target.arch == Target::POWERPC) { - return make_codegen>(target, context); - } -#endif -#ifdef WITH_WEBASSEMBLY - if (target.arch == Target::WebAssembly) { - return make_codegen>(target, context); - } -#endif -#ifdef WITH_RISCV - if (target.arch == Target::RISCV) { - return make_codegen>(target, context); - } -#endif - user_error << "Invalid target architecture for GPU backend: " - << target.to_string() << "\n"; - return nullptr; - - } else if (target.arch == Target::X86) { + if (target.arch == Target::X86) { return make_codegen(target, context); } else if (target.arch == Target::ARM) { return make_codegen(target, context); diff --git a/src/Lower.cpp b/src/Lower.cpp index d8a3b12a9b34..6855aed4ec61 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -40,6 +40,7 @@ #include "LoopCarry.h" #include "LowerWarpShuffles.h" #include "Memoization.h" +#include "OffloadGPULoops.h" #include "PartitionLoops.h" #include "Prefetch.h" #include "Profiling.h" @@ -440,6 +441,15 @@ Module lower(const vector &output_funcs, debug(1) << "Skipping Hexagon offload...\n"; } + if (t.has_gpu_feature()) { + debug(1) << "Offloading GPU loops...\n"; + s = inject_gpu_offload(s, t); + debug(2) << "Lowering after splitting off GPU loops:\n" + << s << "\n\n"; + } else { + debug(1) << "Skipping GPU offload...\n"; + } + if (!custom_passes.empty()) { for (size_t i = 0; i < custom_passes.size(); i++) { debug(1) << "Running custom lowering pass " << i << "...\n"; diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp new file mode 100644 index 000000000000..cd80ebb94d92 --- /dev/null +++ b/src/OffloadGPULoops.cpp @@ -0,0 +1,344 @@ +#include + +#include "Closure.h" +#include "CodeGen_D3D12Compute_Dev.h" +#include "CodeGen_GPU_Dev.h" +#include "CodeGen_Metal_Dev.h" +#include "CodeGen_OpenCL_Dev.h" +#include 
"CodeGen_OpenGLCompute_Dev.h" +#include "CodeGen_PTX_Dev.h" +#include "ExprUsesVar.h" +#include "InjectHostDevBufferCopies.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "IRPrinter.h" +#include "OffloadGPULoops.h" +#include "Util.h" + +namespace Halide { +namespace Internal { + +using std::map; +using std::string; +using std::unique_ptr; +using std::vector; + +namespace { + +// Sniff the contents of a kernel to extracts the bounds of all the +// thread indices (so we know how many threads to launch), and the +// amount of shared memory to allocate. +class ExtractBounds : public IRVisitor { +public: + Expr num_threads[4]; + Expr num_blocks[4]; + Expr shared_mem_size; + + ExtractBounds() + : shared_mem_size(0) { + for (int i = 0; i < 4; i++) { + num_threads[i] = num_blocks[i] = 1; + } + } + +private: + bool found_shared = false; + + using IRVisitor::visit; + + void visit(const For *op) override { + if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + internal_assert(is_const_zero(op->min)); + } + + if (ends_with(op->name, ".__thread_id_x")) { + num_threads[0] = op->extent; + } else if (ends_with(op->name, ".__thread_id_y")) { + num_threads[1] = op->extent; + } else if (ends_with(op->name, ".__thread_id_z")) { + num_threads[2] = op->extent; + } else if (ends_with(op->name, ".__thread_id_w")) { + num_threads[3] = op->extent; + } else if (ends_with(op->name, ".__block_id_x")) { + num_blocks[0] = op->extent; + } else if (ends_with(op->name, ".__block_id_y")) { + num_blocks[1] = op->extent; + } else if (ends_with(op->name, ".__block_id_z")) { + num_blocks[2] = op->extent; + } else if (ends_with(op->name, ".__block_id_w")) { + num_blocks[3] = op->extent; + } + + op->body.accept(this); + } + + void visit(const LetStmt *op) override { + if (expr_uses_var(shared_mem_size, op->name)) { + shared_mem_size = Let::make(op->name, op->value, shared_mem_size); + } + op->body.accept(this); + } + + void visit(const Allocate *allocate) override { + 
user_assert(!allocate->new_expr.defined()) << "Allocate node inside GPU kernel has custom new expression.\n" + << "(Memoization is not supported inside GPU kernels at present.)\n"; + + if (allocate->memory_type == MemoryType::GPUShared) { + internal_assert(allocate->extents.size() == 1); + shared_mem_size += allocate->extents[0] * allocate->type.bytes(); + found_shared = true; + } + allocate->body.accept(this); + } +}; + +Expr make_type_arg(const Type &t) { + vector args = { + cast(t.code()), + cast(t.bits()), + cast(t.lanes()), + }; + return Call::make(type_of(), Call::make_struct, args, Call::Intrinsic); +} + +class InjectGpuOffload : public IRMutator { + /** Child code generator for device kernels. */ + map> cgdev; + + map state_bufs; + + const Target ⌖ + + Expr state_var(const string &name, Type type, bool create) { + Expr ptr = state_var_ptr(name, type, create); + if (!ptr.defined()) { + return Expr(); + } + return Let::make(name, ptr, + Load::make(type_of(), name, 0, + Buffer<>(), Parameter(), const_true(), ModulusRemainder())); + } + + Expr state_var_ptr(const string &name, Type type, bool create) { + Expr &buf = state_bufs[name]; + if (!buf.defined() && create) { + auto storage = Buffer::make_scalar(name + "_buf"); + storage() = nullptr; + buf = Variable::make(type_of(), storage.name() + ".buffer", storage); + } + if (buf.defined()) { + return Call::make(Handle(), Call::buffer_get_host, {buf}, Call::Extern); + } else { + return Expr(); + } + } + + Expr module_state(const string &api_name, bool create = true) { + return state_var(api_name, type_of(), create); + } + + Expr module_state_ptr(const string &api_name, bool create = true) { + return state_var_ptr(api_name, type_of(), create); + } + + // Create a Buffer containing the given vector, and return an + // expression for a pointer to the first element. 
+ Expr make_buffer_ptr(const vector &data, const string &name) { + Buffer code((int)data.size(), name); + memcpy(code.data(), data.data(), (int)data.size()); + Expr buf = Variable::make(type_of(), name + ".buffer", code); + return Call::make(Handle(), Call::buffer_get_host, {buf}, Call::Extern); + } + + using IRMutator::visit; + + Stmt visit(const For *loop) override { + if (!CodeGen_GPU_Dev::is_gpu_var(loop->name)) { + return IRMutator::visit(loop); + } + + // We're in the loop over outermost block dimension + debug(2) << "Kernel launch: " << loop->name << "\n"; + + internal_assert(loop->device_api != DeviceAPI::Default_GPU) + << "A concrete device API should have been selected before codegen."; + + ExtractBounds bounds; + loop->accept(&bounds); + debug(2) << "Kernel bounds: (" + << bounds.num_threads[0] << ", " + << bounds.num_threads[1] << ", " + << bounds.num_threads[2] << ", " + << bounds.num_threads[3] << ") threads, (" + << bounds.num_blocks[0] << ", " + << bounds.num_blocks[1] << ", " + << bounds.num_blocks[2] << ", " + << bounds.num_blocks[3] << ") blocks\n"; + + // compute a closure over the state passed into the kernel + HostClosure c(loop->body, loop->name); + + // Determine the arguments that must be passed into the halide function + vector closure_args = c.arguments(); + + // Sort the args by the size of the underlying type. This is + // helpful for avoiding struct-packing ambiguities in metal, + // which passes the scalar args as a struct. + sort(closure_args.begin(), closure_args.end(), + [](const DeviceArgument &a, const DeviceArgument &b) { + if (a.is_buffer == b.is_buffer) { + return a.type.bits() > b.type.bits(); + } else { + // Ensure that buffer arguments come first: + // for many OpenGL/Compute systems, the + // legal indices for buffer args are much + // more restrictive than for scalar args, + // and scalar args can be 'grown' by + // LICM. Putting buffers first makes it much + // more likely we won't fail on some + // hardware. 
+ return a.is_buffer > b.is_buffer; + } + }); + + // compile the kernel + string kernel_name = c_print_name(unique_name("kernel_" + loop->name)); + + CodeGen_GPU_Dev *gpu_codegen = cgdev[loop->device_api].get(); + user_assert(gpu_codegen != nullptr) + << "Loop is scheduled on device " << loop->device_api + << " which does not appear in target " << target.to_string() << "\n"; + gpu_codegen->add_kernel(loop, kernel_name, closure_args); + + // get the actual name of the generated kernel for this loop + kernel_name = gpu_codegen->get_current_kernel_name(); + debug(2) << "Compiled launch to kernel \"" << kernel_name << "\"\n"; + + bool runtime_run_takes_types = gpu_codegen->kernel_run_takes_types(); + Type target_size_t_type = target.bits == 32 ? Int(32) : Int(64); + + vector args, arg_types_or_sizes, arg_is_buffer; + for (const DeviceArgument &i : closure_args) { + Expr val; + if (i.is_buffer) { + val = Variable::make(Handle(), i.name + ".buffer"); + } else { + val = Variable::make(i.type, i.name); + val = Call::make(type_of(), Call::make_struct, {val}, Call::Intrinsic); + } + args.push_back(val); + + if (runtime_run_takes_types) { + arg_types_or_sizes.push_back(make_type_arg(i.type.with_lanes(1))); + } else { + arg_types_or_sizes.push_back(cast(target_size_t_type, i.is_buffer ? 8 : i.type.bytes())); + } + + arg_is_buffer.push_back(cast(i.is_buffer)); + } + + // nullptr-terminate the lists + args.push_back(reinterpret(Handle(), cast(0))); + if (runtime_run_takes_types) { + internal_assert(halide_type_int == 0); + arg_types_or_sizes.push_back(make_type_arg(Type(halide_type_int, 0, 0))); + } else { + arg_types_or_sizes.push_back(cast(target_size_t_type, 0)); + } + arg_is_buffer.push_back(cast(0)); + + // TODO: only three dimensions can be passed to + // cuLaunchKernel. How should we handle blkid[3]? 
+ internal_assert(is_const_one(bounds.num_threads[3]) && is_const_one(bounds.num_blocks[3])) + << bounds.num_threads[3] << ", " << bounds.num_blocks[3] << "\n"; + debug(3) << "bounds.num_blocks[0] = " << bounds.num_blocks[0] << "\n"; + debug(3) << "bounds.num_blocks[1] = " << bounds.num_blocks[1] << "\n"; + debug(3) << "bounds.num_blocks[2] = " << bounds.num_blocks[2] << "\n"; + debug(3) << "bounds.num_threads[0] = " << bounds.num_threads[0] << "\n"; + debug(3) << "bounds.num_threads[1] = " << bounds.num_threads[1] << "\n"; + debug(3) << "bounds.num_threads[2] = " << bounds.num_threads[2] << "\n"; + + string api_unique_name = gpu_codegen->api_unique_name(); + vector run_args = { + module_state(api_unique_name), + kernel_name, + Expr(bounds.num_blocks[0]), + Expr(bounds.num_blocks[1]), + Expr(bounds.num_blocks[2]), + Expr(bounds.num_threads[0]), + Expr(bounds.num_threads[1]), + Expr(bounds.num_threads[2]), + Expr(bounds.shared_mem_size), + Call::make(Handle(), Call::make_struct, arg_types_or_sizes, Call::Intrinsic), + Call::make(Handle(), Call::make_struct, args, Call::Intrinsic), + Call::make(Handle(), Call::make_struct, arg_is_buffer, Call::Intrinsic), + }; + return call_extern_and_assert("halide_" + api_unique_name + "_run", run_args); + } + +public: + InjectGpuOffload(const Target &target) : target(target) { + if (target.has_feature(Target::OpenGLCompute)) { + cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(target); + } + if (target.has_feature(Target::CUDA)) { + cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(target); + } + if (target.has_feature(Target::OpenCL)) { + cgdev[DeviceAPI::OpenCL] = new_CodeGen_OpenCL_Dev(target); + } + if (target.has_feature(Target::Metal)) { + cgdev[DeviceAPI::Metal] = new_CodeGen_Metal_Dev(target); + } + if (target.has_feature(Target::D3D12Compute)) { + cgdev[DeviceAPI::D3D12Compute] = new_CodeGen_D3D12Compute_Dev(target); + } + + internal_assert(!cgdev.empty()) << "Requested unknown GPU target: " << 
target.to_string() << "\n"; + } + + Stmt inject(Stmt s) { + // Create a new module for all of the kernels we find in this function. + for (auto &i : cgdev) { + i.second->init_module(); + } + + Stmt result = mutate(s); + + for (auto &i : cgdev) { + string api_unique_name = i.second->api_unique_name(); + + // If the module state for this API/function did not get created, there were + // no kernels using this API. + Expr state_ptr = module_state_ptr(api_unique_name, false); + if (!state_ptr.defined()) { + continue; + } + + debug(2) << "Generating init_kernels for " << api_unique_name << "\n"; + vector kernel_src = i.second->compile_to_src(); + Expr kernel_src_buf = make_buffer_ptr(kernel_src, api_unique_name + "_kernels"); + + string init_kernels_name = "halide_" + api_unique_name + "_initialize_kernels"; + vector init_args = {state_ptr, kernel_src_buf, Expr((int)kernel_src.size())}; + Stmt init_kernels = call_extern_and_assert(init_kernels_name, init_args); + + string destructor_name = "halide_" + api_unique_name + "_finalize_kernels"; + vector finalize_args = {Expr(destructor_name), module_state(api_unique_name)}; + Stmt register_destructor = Evaluate::make( + Call::make(Handle(), Call::register_destructor, finalize_args, Call::Intrinsic)); + + result = Block::make({init_kernels, register_destructor, result}); + } + return result; + } +}; + +} // namespace + +Stmt inject_gpu_offload(Stmt s, const Target &host_target) { + return InjectGpuOffload(host_target).inject(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/OffloadGPULoops.h b/src/OffloadGPULoops.h new file mode 100644 index 000000000000..c513a35fd8d5 --- /dev/null +++ b/src/OffloadGPULoops.h @@ -0,0 +1,25 @@ +#ifndef HALIDE_OFFLOAD_GPU_LOOPS_H +#define HALIDE_OFFLOAD_GPU_LOOPS_H + +/** \file + * Defines a lowering pass to pull loops marked with + * GPU device APIs to a separate module, and call them through the + * appropriate host runtime module. 
+ */ + +#include "Expr.h" + +namespace Halide { + +struct Target; + +namespace Internal { + +/** Pull loops marked with GPU device APIs to a separate + * module, and call them through the appropriate host runtime module. */ +Stmt inject_gpu_offload(Stmt s, const Target &host_target); + +} // namespace Internal +} // namespace Halide + +#endif From c87f37e84384a05e61e506fa458d31ac6b01e5a4 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:06:00 -0700 Subject: [PATCH 03/19] clang-format. --- src/OffloadGPULoops.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index cd80ebb94d92..04d4d5631437 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -187,7 +187,7 @@ class InjectGpuOffload : public IRMutator { sort(closure_args.begin(), closure_args.end(), [](const DeviceArgument &a, const DeviceArgument &b) { if (a.is_buffer == b.is_buffer) { - return a.type.bits() > b.type.bits(); + return a.type.bits() > b.type.bits(); } else { // Ensure that buffer arguments come first: // for many OpenGL/Compute systems, the @@ -277,7 +277,8 @@ class InjectGpuOffload : public IRMutator { } public: - InjectGpuOffload(const Target &target) : target(target) { + InjectGpuOffload(const Target &target) + : target(target) { if (target.has_feature(Target::OpenGLCompute)) { cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(target); } From 797e43e9660cf37ac8e3c32e318b5d61b7a9fa06 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:08:25 -0700 Subject: [PATCH 04/19] clang-format sorting is case sensitive!? 
--- src/OffloadGPULoops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 04d4d5631437..187c735dbb97 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -8,10 +8,10 @@ #include "CodeGen_OpenGLCompute_Dev.h" #include "CodeGen_PTX_Dev.h" #include "ExprUsesVar.h" -#include "InjectHostDevBufferCopies.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" +#include "InjectHostDevBufferCopies.h" #include "OffloadGPULoops.h" #include "Util.h" From da6effb095b3825c6ff18917c68d72f441ebdfcb Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:33:34 -0700 Subject: [PATCH 05/19] clang-tidy --- src/OffloadGPULoops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 187c735dbb97..ed9743a9ac05 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -298,7 +298,7 @@ class InjectGpuOffload : public IRMutator { internal_assert(!cgdev.empty()) << "Requested unknown GPU target: " << target.to_string() << "\n"; } - Stmt inject(Stmt s) { + Stmt inject(const Stmt &s) { // Create a new module for all of the kernels we find in this function. for (auto &i : cgdev) { i.second->init_module(); From ebbb9a5554aad9dfab133ebc0eb33a16aac88a4f Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:31:39 -0700 Subject: [PATCH 06/19] Move codegen backends into anonymous namespaces in source files. 
--- Makefile | 9 +-- src/CMakeLists.txt | 7 +-- src/CodeGen_ARM.cpp | 106 ++++++++++++++++++++++++++---------- src/CodeGen_ARM.h | 72 ------------------------ src/CodeGen_Hexagon.cpp | 17 +----- src/CodeGen_Hexagon.h | 29 ---------- src/CodeGen_LLVM.cpp | 58 +++++--------------- src/CodeGen_LLVM.h | 15 ----- src/CodeGen_MIPS.cpp | 43 ++++++++++++--- src/CodeGen_MIPS.h | 34 ------------ src/CodeGen_PTX_Dev.cpp | 24 ++++---- src/CodeGen_PowerPC.cpp | 67 ++++++++++++++++------- src/CodeGen_PowerPC.h | 42 -------------- src/CodeGen_RISCV.cpp | 43 ++++++++++++--- src/CodeGen_RISCV.h | 33 ----------- src/CodeGen_Targets.h | 30 ++++++++++ src/CodeGen_WebAssembly.cpp | 59 +++++++++++++------- src/CodeGen_WebAssembly.h | 35 ------------ src/CodeGen_X86.cpp | 84 ++++++++++++++++++++-------- src/CodeGen_X86.h | 58 -------------------- src/OffloadGPULoops.cpp | 2 +- src/WasmExecutor.cpp | 4 +- 22 files changed, 359 insertions(+), 512 deletions(-) delete mode 100644 src/CodeGen_ARM.h delete mode 100644 src/CodeGen_Hexagon.h delete mode 100644 src/CodeGen_MIPS.h delete mode 100644 src/CodeGen_PowerPC.h delete mode 100644 src/CodeGen_RISCV.h create mode 100644 src/CodeGen_Targets.h delete mode 100644 src/CodeGen_WebAssembly.h delete mode 100644 src/CodeGen_X86.h diff --git a/Makefile b/Makefile index c685c299940c..f97386f093da 100644 --- a/Makefile +++ b/Makefile @@ -591,23 +591,18 @@ HEADER_FILES = \ Buffer.h \ CanonicalizeGPUVars.h \ Closure.h \ - CodeGen_ARM.h \ CodeGen_C.h \ CodeGen_D3D12Compute_Dev.h \ CodeGen_GPU_Dev.h \ CodeGen_Internal.h \ CodeGen_LLVM.h \ CodeGen_Metal_Dev.h \ - CodeGen_MIPS.h \ CodeGen_OpenCL_Dev.h \ CodeGen_OpenGLCompute_Dev.h \ CodeGen_Posix.h \ - CodeGen_PowerPC.h \ CodeGen_PTX_Dev.h \ CodeGen_PyTorch.h \ - CodeGen_RISCV.h \ - CodeGen_WebAssembly.h \ - CodeGen_X86.h \ + CodeGen_Targets.h \ CompilerLogger.h \ ConciseCasts.h \ CPlusPlusMangle.h \ @@ -1091,7 +1086,7 @@ $(BUILD_DIR)/initmod_ptx.%_ll.o: $(BUILD_DIR)/initmod_ptx.%_ll.cpp 
$(BUILD_DIR)/initmod.%.o: $(BUILD_DIR)/initmod.%.cpp $(CXX) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o -$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp $(SRC_DIR)/%.h $(BUILD_DIR)/llvm_ok +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp $(BUILD_DIR)/llvm_ok @mkdir -p $(@D) $(CXX) $(CXX_FLAGS) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b74006aa3528..8bc68eeeb26c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,23 +23,18 @@ set(HEADER_FILES Buffer.h CanonicalizeGPUVars.h Closure.h - CodeGen_ARM.h CodeGen_C.h CodeGen_D3D12Compute_Dev.h CodeGen_GPU_Dev.h CodeGen_Internal.h CodeGen_LLVM.h CodeGen_Metal_Dev.h - CodeGen_MIPS.h CodeGen_OpenCL_Dev.h CodeGen_OpenGLCompute_Dev.h CodeGen_Posix.h - CodeGen_PowerPC.h CodeGen_PTX_Dev.h CodeGen_PyTorch.h - CodeGen_RISCV.h - CodeGen_WebAssembly.h - CodeGen_X86.h + CodeGen_Targets.h CompilerLogger.h ConciseCasts.h CPlusPlusMangle.h diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 89260d758cc7..31cd8d7d8e7f 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1,9 +1,8 @@ -#include #include #include "CSE.h" -#include "CodeGen_ARM.h" #include "CodeGen_Internal.h" +#include "CodeGen_Posix.h" #include "ConciseCasts.h" #include "Debug.h" #include "IREquality.h" @@ -25,6 +24,8 @@ using std::vector; using namespace Halide::ConciseCasts; using namespace llvm; +#if defined(WITH_ARM) || defined(WITH_AARCH64) + namespace { // Broadcast to an unknown number of lanes, for making patterns. @@ -32,21 +33,59 @@ Expr bc(Expr x) { return Broadcast::make(std::move(x), 0); } -} // namespace +/** A code generator that emits ARM code from a given Halide stmt. */ +class CodeGen_ARM : public CodeGen_Posix { +public: + /** Create an ARM code generator for the given arm target. 
*/ + CodeGen_ARM(const Target &); + +protected: + using CodeGen_Posix::visit; + + /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that + * takes one vector argument and splits it into two to call inner. */ + llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name); + void init_module() override; + + /** Nodes for which we want to emit specific neon intrinsics */ + // @{ + void visit(const Cast *) override; + void visit(const Sub *) override; + void visit(const Mul *) override; + void visit(const Min *) override; + void visit(const Max *) override; + void visit(const Store *) override; + void visit(const Load *) override; + void visit(const Call *) override; + void visit(const LT *) override; + void visit(const LE *) override; + void codegen_vector_reduce(const VectorReduce *, const Expr &) override; + // @} + + /** Various patterns to peephole match against */ + struct Pattern { + string intrin; ///< Name of the intrinsic + Expr pattern; ///< The pattern to match against + Pattern() = default; + Pattern(const string &intrin, Expr p) + : intrin(intrin), pattern(std::move(p)) { + } + }; + vector casts, averagings, negations; + + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; + + // NEON can be disabled for older processors. 
+ bool neon_intrinsics_disabled() { + return target.has_feature(Target::NoNEON); + } +}; CodeGen_ARM::CodeGen_ARM(const Target &target) : CodeGen_Posix(target) { - if (target.bits == 32) { -#if !defined(WITH_ARM) - user_error << "arm not enabled for this build of Halide."; -#endif - user_assert(llvm_ARM_enabled) << "llvm build not configured with ARM target enabled\n."; - } else { -#if !defined(WITH_AARCH64) - user_error << "aarch64 not enabled for this build of Halide."; -#endif - user_assert(llvm_AArch64_enabled) << "llvm build not configured with AArch64 target enabled.\n"; - } // RADDHN - Add and narrow with rounding // These must come before other narrowing rounding shift patterns @@ -162,8 +201,6 @@ CodeGen_ARM::CodeGen_ARM(const Target &target) // clang-format on } -namespace { - constexpr int max_intrinsic_args = 4; struct ArmIntrinsic { @@ -512,9 +549,7 @@ const ArmIntrinsic intrinsic_defs[] = { }; // clang-format on -} // namespace - -llvm::Function *CodeGen_ARM::define_concat_args_wrapper(llvm::Function *inner, const std::string &name) { +llvm::Function *CodeGen_ARM::define_concat_args_wrapper(llvm::Function *inner, const string &name) { llvm::FunctionType *inner_ty = inner->getFunctionType(); internal_assert(inner_ty->getNumParams() == 2); @@ -558,7 +593,7 @@ void CodeGen_ARM::init_module() { return; } - std::string prefix = target.bits == 32 ? "llvm.arm.neon." : "llvm.aarch64.neon."; + string prefix = target.bits == 32 ? "llvm.arm.neon." : "llvm.aarch64.neon."; for (const ArmIntrinsic &intrin : intrinsic_defs) { // Get the name of the intrinsic with the appropriate prefix. const char *intrin_name = nullptr; @@ -570,13 +605,13 @@ void CodeGen_ARM::init_module() { if (!intrin_name) { continue; } - std::string full_name = intrin_name; + string full_name = intrin_name; if (!starts_with(full_name, "llvm.")) { full_name = prefix + full_name; } // We might have to generate versions of this intrinsic with multiple widths. 
- std::vector width_factors = {1}; + vector width_factors = {1}; if (intrin.flags & ArmIntrinsic::HalfWidth) { width_factors.push_back(2); } @@ -585,7 +620,7 @@ void CodeGen_ARM::init_module() { Type ret_type = intrin.ret_type; ret_type = ret_type.with_lanes(ret_type.lanes() * width_factor); internal_assert(ret_type.bits() * ret_type.lanes() <= 128) << full_name << "\n"; - std::vector arg_types; + vector arg_types; arg_types.reserve(4); for (halide_type_t i : intrin.arg_types) { if (i.bits == 0) { @@ -603,7 +638,7 @@ void CodeGen_ARM::init_module() { mangled_name_builder << full_name; if (starts_with(full_name, "llvm.") && (intrin.flags & ArmIntrinsic::NoMangle) == 0) { // Append LLVM name mangling for either the return type or the arguments, or both. - std::vector types; + vector types; if (intrin.flags & ArmIntrinsic::MangleArgs) { types = arg_types; } else if (intrin.flags & ArmIntrinsic::MangleRetArgs) { @@ -622,12 +657,12 @@ void CodeGen_ARM::init_module() { mangled_name_builder << t.bits(); } } - std::string mangled_name = mangled_name_builder.str(); + string mangled_name = mangled_name_builder.str(); llvm::Function *intrin_impl = nullptr; if (intrin.flags & ArmIntrinsic::SplitArg0) { // This intrinsic needs a wrapper to split the argument. 
- std::string wrapper_name = intrin.name + unique_name("_wrapper"); + string wrapper_name = intrin.name + unique_name("_wrapper"); Type split_arg_type = arg_types[0].with_lanes(arg_types[0].lanes() / 2); llvm::Function *to_wrap = get_llvm_intrin(ret_type, mangled_name, {split_arg_type, split_arg_type}); intrin_impl = define_concat_args_wrapper(to_wrap, wrapper_name); @@ -1178,7 +1213,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init // clang-format on int factor = op->value.type().lanes() / op->type.lanes(); - std::vector matches; + vector matches; for (const Pattern &p : patterns) { if (op->op != p.reduce_op || factor % p.factor != 0) { continue; @@ -1208,7 +1243,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init // TODO: Move this to be patterns? The patterns are pretty trivial, but some // of the other logic is tricky. const char *intrin = nullptr; - std::vector intrin_args; + vector intrin_args; Expr accumulator = init; if (op->op == VectorReduce::Add && factor == 2) { Type narrow_type = op->type.narrow().with_lanes(op->value.type().lanes()); @@ -1340,5 +1375,20 @@ int CodeGen_ARM::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_ARM(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_ARM || WITH_AARCH64 + +std::unique_ptr new_CodeGen_ARM(const Target &target) { + user_error << "ARM not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_ARM || WITH_AARCH64 + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_ARM.h b/src/CodeGen_ARM.h deleted file mode 100644 index 39d1a39d7a0f..000000000000 --- a/src/CodeGen_ARM.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef HALIDE_CODEGEN_ARM_H -#define HALIDE_CODEGEN_ARM_H - -/** \file - * Defines the code-generator for producing ARM machine code - */ - -#include - -#include "CodeGen_Posix.h" - -namespace Halide { - -struct Target; - -namespace 
Internal { - -/** A code generator that emits ARM code from a given Halide stmt. */ -class CodeGen_ARM : public CodeGen_Posix { -public: - /** Create an ARM code generator for the given arm target. */ - CodeGen_ARM(const Target &); - -protected: - using CodeGen_Posix::visit; - - /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that - * takes one vector argument and splits it into two to call inner. */ - llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const std::string &name); - void init_module() override; - - /** Nodes for which we want to emit specific neon intrinsics */ - // @{ - void visit(const Cast *) override; - void visit(const Sub *) override; - void visit(const Mul *) override; - void visit(const Min *) override; - void visit(const Max *) override; - void visit(const Store *) override; - void visit(const Load *) override; - void visit(const Call *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void codegen_vector_reduce(const VectorReduce *, const Expr &) override; - // @} - - /** Various patterns to peephole match against */ - struct Pattern { - std::string intrin; ///< Name of the intrinsic - Expr pattern; ///< The pattern to match against - Pattern() = default; - Pattern(const std::string &intrin, Expr p) - : intrin(intrin), pattern(std::move(p)) { - } - }; - std::vector casts, averagings, negations; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - - // NEON can be disabled for older processors. 
- bool neon_intrinsics_disabled() { - return target.has_feature(Target::NoNEON); - } -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 223a5231da66..1f033fdcc33c 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1,7 +1,3 @@ -#include "CodeGen_Hexagon.h" - -#include -#include #include #include @@ -12,14 +8,11 @@ #include "Debug.h" #include "HexagonOptimize.h" #include "IREquality.h" -#include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" -#include "LICM.h" #include "LLVM_Headers.h" #include "LoopCarry.h" -#include "Monotonic.h" #include "Simplify.h" #include "Substitute.h" #include "Target.h" @@ -138,8 +131,6 @@ class CodeGen_Hexagon : public CodeGen_Posix { CodeGen_Hexagon::CodeGen_Hexagon(const Target &t) : CodeGen_Posix(t) { - user_assert(llvm_Hexagon_enabled) - << "llvm build not configured with Hexagon target enabled.\n"; if (target.has_feature(Halide::Target::HVX_v66)) { isa_version = 66; } else if (target.has_feature(Halide::Target::HVX_v65)) { @@ -2326,15 +2317,13 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { } // namespace -std::unique_ptr new_CodeGen_Hexagon(const Target &target, llvm::LLVMContext &context) { - std::unique_ptr ret(std::make_unique(target)); - ret->set_context(context); - return ret; +std::unique_ptr new_CodeGen_Hexagon(const Target &target) { + return std::make_unique(target); } #else // WITH_HEXAGON -std::unique_ptr new_CodeGen_Hexagon(const Target &target, llvm::LLVMContext &context) { +std::unique_ptr new_CodeGen_Hexagon(const Target &target) { user_error << "hexagon not enabled for this build of Halide.\n"; return nullptr; } diff --git a/src/CodeGen_Hexagon.h b/src/CodeGen_Hexagon.h deleted file mode 100644 index a844c594e6c7..000000000000 --- a/src/CodeGen_Hexagon.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef HALIDE_CODEGEN_HEXAGON_H -#define HALIDE_CODEGEN_HEXAGON_H - -/** \file - * Defines 
the code-generator for producing Hexagon machine code - */ - -#include - -namespace llvm { - -class LLVMContext; - -} - -namespace Halide { - -struct Target; - -namespace Internal { - -class CodeGen_Posix; - -std::unique_ptr new_CodeGen_Hexagon(const Target &target, llvm::LLVMContext &context); - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 97dbd8c225da..913cf0c6f786 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1,20 +1,13 @@ -#include #include #include -#include #include #include "CPlusPlusMangle.h" #include "CSE.h" -#include "CodeGen_ARM.h" -#include "CodeGen_Hexagon.h" #include "CodeGen_Internal.h" #include "CodeGen_LLVM.h" -#include "CodeGen_MIPS.h" -#include "CodeGen_PowerPC.h" -#include "CodeGen_RISCV.h" -#include "CodeGen_WebAssembly.h" -#include "CodeGen_X86.h" +#include "CodeGen_Posix.h" +#include "CodeGen_Targets.h" #include "CompilerLogger.h" #include "Debug.h" #include "Deinterleave.h" @@ -81,8 +74,7 @@ using std::vector; #define InitializeTarget(target) \ LLVMInitialize##target##Target(); \ LLVMInitialize##target##TargetInfo(); \ - LLVMInitialize##target##TargetMC(); \ - llvm_##target##_enabled = true; + LLVMInitialize##target##TargetMC(); #define InitializeAsmParser(target) \ LLVMInitialize##target##AsmParser(); @@ -222,41 +214,30 @@ CodeGen_LLVM::CodeGen_LLVM(const Target &t) initialize_llvm(); } -namespace { - -template -std::unique_ptr make_codegen(const Target &target, llvm::LLVMContext &context) { - std::unique_ptr ret = std::make_unique(target); - ret->set_context(context); - return ret; -} - -} // namespace - void CodeGen_LLVM::set_context(llvm::LLVMContext &context) { this->context = &context; } std::unique_ptr CodeGen_LLVM::new_for_target(const Target &target, llvm::LLVMContext &context) { + std::unique_ptr result; if (target.arch == Target::X86) { - return make_codegen(target, context); + result = new_CodeGen_X86(target); } else if (target.arch == 
Target::ARM) { - return make_codegen(target, context); + result = new_CodeGen_ARM(target); } else if (target.arch == Target::MIPS) { - return make_codegen(target, context); + result = new_CodeGen_MIPS(target); } else if (target.arch == Target::POWERPC) { - return make_codegen(target, context); + result = new_CodeGen_PowerPC(target); } else if (target.arch == Target::Hexagon) { - return new_CodeGen_Hexagon(target, context); + result = new_CodeGen_Hexagon(target); } else if (target.arch == Target::WebAssembly) { - return make_codegen(target, context); + result = new_CodeGen_WebAssembly(target); } else if (target.arch == Target::RISCV) { - return make_codegen(target, context); + result = new_CodeGen_RISCV(target); } - - user_error << "Unknown target architecture: " - << target.to_string() << "\n"; - return nullptr; + user_assert(result) << "Unknown target architecture: " << target.to_string() << "\n"; + result->set_context(context); + return result; } void CodeGen_LLVM::initialize_llvm() { @@ -352,17 +333,6 @@ CodeGen_LLVM::~CodeGen_LLVM() { delete builder; } -bool CodeGen_LLVM::llvm_X86_enabled = false; -bool CodeGen_LLVM::llvm_ARM_enabled = false; -bool CodeGen_LLVM::llvm_Hexagon_enabled = false; -bool CodeGen_LLVM::llvm_AArch64_enabled = false; -bool CodeGen_LLVM::llvm_NVPTX_enabled = false; -bool CodeGen_LLVM::llvm_Mips_enabled = false; -bool CodeGen_LLVM::llvm_PowerPC_enabled = false; -bool CodeGen_LLVM::llvm_AMDGPU_enabled = false; -bool CodeGen_LLVM::llvm_WebAssembly_enabled = false; -bool CodeGen_LLVM::llvm_RISCV_enabled = false; - namespace { struct MangledNames { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 060fe9ee10ce..092bc7713b5b 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -138,21 +138,6 @@ class CodeGen_LLVM : public IRVisitor { * of functions as. 
*/ virtual Type upgrade_type_for_argument_passing(const Type &) const; - /** State needed by llvm for code generation, including the - * current module, function, context, builder, and most recently - * generated llvm value. */ - //@{ - static bool llvm_X86_enabled; - static bool llvm_ARM_enabled; - static bool llvm_Hexagon_enabled; - static bool llvm_AArch64_enabled; - static bool llvm_NVPTX_enabled; - static bool llvm_Mips_enabled; - static bool llvm_PowerPC_enabled; - static bool llvm_AMDGPU_enabled; - static bool llvm_WebAssembly_enabled; - static bool llvm_RISCV_enabled; - std::unique_ptr module; llvm::Function *function; llvm::LLVMContext *context; diff --git a/src/CodeGen_MIPS.cpp b/src/CodeGen_MIPS.cpp index ddc0b9d9c9ea..4118a12b684f 100644 --- a/src/CodeGen_MIPS.cpp +++ b/src/CodeGen_MIPS.cpp @@ -1,20 +1,32 @@ -#include "CodeGen_MIPS.h" -#include "LLVM_Headers.h" -#include "Util.h" +#include "CodeGen_Posix.h" namespace Halide { namespace Internal { using std::string; -using namespace llvm; +#if defined(WITH_MIPS) + +namespace { + +/** A code generator that emits mips code from a given Halide stmt. */ +class CodeGen_MIPS : public CodeGen_Posix { +public: + /** Create a mips code generator. Processor features can be + * enabled using the appropriate flags in the target struct. 
*/ + CodeGen_MIPS(const Target &); + +protected: + using CodeGen_Posix::visit; + + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; +}; CodeGen_MIPS::CodeGen_MIPS(const Target &t) : CodeGen_Posix(t) { -#if !defined(WITH_MIPS) - user_error << "llvm build not configured with MIPS target enabled.\n"; -#endif - user_assert(llvm_Mips_enabled) << "llvm build not configured with MIPS target enabled.\n"; } string CodeGen_MIPS::mcpu() const { @@ -41,5 +53,20 @@ int CodeGen_MIPS::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_MIPS(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_MIPS + +std::unique_ptr new_CodeGen_MIPS(const Target &target) { + user_error << "MIPS not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_MIPS + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_MIPS.h b/src/CodeGen_MIPS.h deleted file mode 100644 index fe5e2d2cb12e..000000000000 --- a/src/CodeGen_MIPS.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef HALIDE_CODEGEN_MIPS_H -#define HALIDE_CODEGEN_MIPS_H - -/** \file - * Defines the code-generator for producing MIPS machine code. - */ - -#include "CodeGen_Posix.h" - -namespace Halide { -namespace Internal { - -/** A code generator that emits mips code from a given Halide stmt. */ -class CodeGen_MIPS : public CodeGen_Posix { -public: - /** Create a mips code generator. Processor features can be - * enabled using the appropriate flags in the target struct. 
*/ - CodeGen_MIPS(const Target &); - - static void test(); - -protected: - using CodeGen_Posix::visit; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index efe0b25bd2c0..26822cda2296 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -37,6 +37,8 @@ using namespace Halide::ConciseCasts; using namespace llvm; +#ifdef WITH_NVPTX + namespace { /** A code generator that emits GPU code from a given Halide stmt. */ @@ -111,11 +113,6 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev { CodeGen_PTX_Dev::CodeGen_PTX_Dev(const Target &host) : CodeGen_LLVM(host) { -#if !defined(WITH_NVPTX) - user_error << "ptx not enabled for this build of Halide.\n"; -#endif - user_assert(llvm_NVPTX_enabled) << "llvm build not configured with nvptx target enabled\n."; - context = new llvm::LLVMContext(); } @@ -229,9 +226,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt, void CodeGen_PTX_Dev::init_module() { init_context(); -#ifdef WITH_NVPTX module = get_initial_module_for_ptx_device(target, context); -#endif declare_intrin_overload("dp4a", Int(32), "dp4a_s32_s32", {Int(8, 4), Int(8, 4), Int(32)}); declare_intrin_overload("dp4a", Int(32), "dp4a_s32_u32", {Int(8, 4), UInt(8, 4), Int(32)}); @@ -587,9 +582,6 @@ bool CodeGen_PTX_Dev::use_soft_float_abi() const { } vector CodeGen_PTX_Dev::compile_to_src() { - -#ifdef WITH_NVPTX - debug(2) << "In CodeGen_PTX_Dev::compile_to_src"; // DISABLED - hooked in here to force PrintBeforeAll option - seems to be the only way? 
@@ -755,9 +747,6 @@ vector CodeGen_PTX_Dev::compile_to_src() { // Null-terminate the ptx source buffer.push_back(0); return buffer; -#else // WITH_NVPTX - return vector(); -#endif } int CodeGen_PTX_Dev::native_vector_bits() const { @@ -801,5 +790,14 @@ std::unique_ptr new_CodeGen_PTX_Dev(const Target &target) { return std::make_unique(target); } +#else // WITH_PTX + +std::unique_ptr new_CodeGen_PTX_Dev(const Target &target) { + user_error << "PTX not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_PTX + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp index e81d6f281b93..312649280f62 100644 --- a/src/CodeGen_PowerPC.cpp +++ b/src/CodeGen_PowerPC.cpp @@ -1,9 +1,4 @@ -#include "CodeGen_PowerPC.h" -#include "ConciseCasts.h" -#include "IRMatch.h" -#include "IROperator.h" -#include "LLVM_Headers.h" -#include "Util.h" +#include "CodeGen_Posix.h" namespace Halide { namespace Internal { @@ -11,19 +6,38 @@ namespace Internal { using std::string; using std::vector; -using namespace Halide::ConciseCasts; -using namespace llvm; +#if defined(WITH_POWERPC) + +namespace { + +/** A code generator that emits mips code from a given Halide stmt. */ +class CodeGen_PowerPC : public CodeGen_Posix { +public: + /** Create a powerpc code generator. Processor features can be + * enabled using the appropriate flags in the target struct. 
*/ + CodeGen_PowerPC(const Target &); + +protected: + void init_module() override; + + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; + + using CodeGen_Posix::visit; + + /** Nodes for which we want to emit specific PowerPC intrinsics */ + // @{ + void visit(const Min *) override; + void visit(const Max *) override; + // @} +}; CodeGen_PowerPC::CodeGen_PowerPC(const Target &t) : CodeGen_Posix(t) { -#if !defined(WITH_POWERPC) - user_error << "llvm build not configured with PowerPC target enabled.\n"; -#endif - user_assert(llvm_PowerPC_enabled) << "llvm build not configured with PowerPC target enabled.\n"; } -namespace { - const int max_intrinsic_args = 4; struct PowerPCIntrinsic { @@ -81,8 +95,6 @@ const PowerPCIntrinsic intrinsic_defs[] = { }; // clang-format on -} // namespace - void CodeGen_PowerPC::init_module() { CodeGen_Posix::init_module(); @@ -92,7 +104,7 @@ void CodeGen_PowerPC::init_module() { } Type ret_type = i.ret_type; - std::vector arg_types; + vector arg_types; arg_types.reserve(max_intrinsic_args); for (halide_type_t j : i.arg_types) { if (j.bits == 0) { @@ -140,9 +152,9 @@ string CodeGen_PowerPC::mcpu() const { } string CodeGen_PowerPC::mattrs() const { - std::string features; - std::string separator; - std::string enable; + string features; + string separator; + string enable; features += "+altivec"; separator = ","; @@ -172,5 +184,20 @@ int CodeGen_PowerPC::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_PowerPC(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_POWERPC + +std::unique_ptr new_CodeGen_PowerPC(const Target &target) { + user_error << "PowerPC not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_POWERPC + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_PowerPC.h b/src/CodeGen_PowerPC.h deleted file mode 100644 
index c23cc5011e68..000000000000 --- a/src/CodeGen_PowerPC.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef HALIDE_CODEGEN_POWERPC_H -#define HALIDE_CODEGEN_POWERPC_H - -/** \file - * Defines the code-generator for producing POWERPC machine code. - */ - -#include "CodeGen_Posix.h" - -namespace Halide { -namespace Internal { - -/** A code generator that emits mips code from a given Halide stmt. */ -class CodeGen_PowerPC : public CodeGen_Posix { -public: - /** Create a powerpc code generator. Processor features can be - * enabled using the appropriate flags in the target struct. */ - CodeGen_PowerPC(const Target &); - - static void test(); - -protected: - void init_module() override; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - - using CodeGen_Posix::visit; - - /** Nodes for which we want to emit specific PowerPC intrinsics */ - // @{ - void visit(const Min *) override; - void visit(const Max *) override; - // @} -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index 3fa9e3a529c3..01395f596b91 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -1,19 +1,33 @@ -#include "CodeGen_RISCV.h" -#include "LLVM_Headers.h" -#include "Util.h" +#include "CodeGen_Posix.h" namespace Halide { namespace Internal { using std::string; -using namespace llvm; +#if defined(WITH_RISCV) + +namespace { + +/** A code generator that emits mips code from a given Halide stmt. */ +class CodeGen_RISCV : public CodeGen_Posix { +public: + /** Create a mips code generator. Processor features can be + * enabled using the appropriate flags in the target struct. 
*/ + CodeGen_RISCV(const Target &); + +protected: + using CodeGen_Posix::visit; + + string mcpu() const override; + string mattrs() const override; + string mabi() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; +}; CodeGen_RISCV::CodeGen_RISCV(const Target &t) : CodeGen_Posix(t) { -#if !defined(WITH_RISCV) - user_error << "llvm build not configured with RISCV target enabled.\n"; -#endif } string CodeGen_RISCV::mcpu() const { @@ -57,5 +71,20 @@ int CodeGen_RISCV::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_RISCV(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_RISCV + +std::unique_ptr new_CodeGen_RISCV(const Target &target) { + user_error << "RISCV not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_RISCV + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_RISCV.h b/src/CodeGen_RISCV.h deleted file mode 100644 index d6cb8328dec5..000000000000 --- a/src/CodeGen_RISCV.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef HALIDE_CODEGEN_RISCV_H -#define HALIDE_CODEGEN_RISCV_H - -/** \file - * Defines the code-generator for producing RISCV machine code. - */ - -#include "CodeGen_Posix.h" - -namespace Halide { -namespace Internal { - -/** A code generator that emits mips code from a given Halide stmt. */ -class CodeGen_RISCV : public CodeGen_Posix { -public: - /** Create a mips code generator. Processor features can be - * enabled using the appropriate flags in the target struct. 
*/ - CodeGen_RISCV(const Target &); - -protected: - using CodeGen_Posix::visit; - - std::string mcpu() const override; - std::string mattrs() const override; - std::string mabi() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_Targets.h b/src/CodeGen_Targets.h new file mode 100644 index 000000000000..1667703fe5e3 --- /dev/null +++ b/src/CodeGen_Targets.h @@ -0,0 +1,30 @@ +#ifndef HALIDE_CODEGEN_TARGETS_H +#define HALIDE_CODEGEN_TARGETS_H + +/** \file + * Provides constructors for code generators for various targets. + */ + +#include + +namespace Halide { + +struct Target; + +namespace Internal { + +class CodeGen_Posix; + +/** Construct CodeGen object for a variety of targets. */ +std::unique_ptr new_CodeGen_ARM(const Target &target); +std::unique_ptr new_CodeGen_Hexagon(const Target &target); +std::unique_ptr new_CodeGen_MIPS(const Target &target); +std::unique_ptr new_CodeGen_PowerPC(const Target &target); +std::unique_ptr new_CodeGen_RISCV(const Target &target); +std::unique_ptr new_CodeGen_X86(const Target &target); +std::unique_ptr new_CodeGen_WebAssembly(const Target &target); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 6d5292195abe..fe7ee2fe6db0 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -1,33 +1,37 @@ -#include "CodeGen_WebAssembly.h" - -#include "ConciseCasts.h" -#include "IRMatch.h" -#include "IROperator.h" -#include "LLVM_Headers.h" -#include "Util.h" +#include "CodeGen_Posix.h" #include namespace Halide { namespace Internal { -using namespace Halide::ConciseCasts; -using namespace llvm; using std::string; -using std::vector; + +#if defined(WITH_WEBASSEMBLY) + +namespace { + +/** A code generator that emits WebAssembly code from a given Halide stmt. 
*/ +class CodeGen_WebAssembly : public CodeGen_Posix { +public: + CodeGen_WebAssembly(const Target &); + +protected: + using CodeGen_Posix::visit; + + void init_module() override; + + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; + bool use_pic() const override; +}; CodeGen_WebAssembly::CodeGen_WebAssembly(const Target &t) : CodeGen_Posix(t) { -#if !defined(WITH_WEBASSEMBLY) - user_error << "llvm build not configured with WebAssembly target enabled.\n"; -#endif - user_assert(LLVM_VERSION >= 110) << "Generating WebAssembly is only supported under LLVM 11+."; - user_assert(llvm_WebAssembly_enabled) << "llvm build not configured with WebAssembly target enabled.\n"; - user_assert(target.bits == 32) << "Only wasm32 is supported."; } -namespace { - constexpr int max_intrinsic_args = 4; struct WasmIntrinsic { @@ -65,8 +69,6 @@ const WasmIntrinsic intrinsic_defs[] = { }; // clang-format on -} // namespace - void CodeGen_WebAssembly::init_module() { CodeGen_Posix::init_module(); @@ -140,5 +142,22 @@ int CodeGen_WebAssembly::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_WebAssembly(const Target &target) { + user_assert(LLVM_VERSION >= 110) << "Generating WebAssembly is only supported under LLVM 11+."; + user_assert(target.bits == 32) << "Only wasm32 is supported."; + return std::make_unique(target); +} + +#else // WITH_WEBASSEMBLY + +std::unique_ptr new_CodeGen_WebAssembly(const Target &target) { + user_error << "WebAssembly not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_WEBASSEMBLY + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_WebAssembly.h b/src/CodeGen_WebAssembly.h deleted file mode 100644 index ffbedae8d907..000000000000 --- a/src/CodeGen_WebAssembly.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef HALIDE_CODEGEN_WEBASSEMBLY_H -#define HALIDE_CODEGEN_WEBASSEMBLY_H - -/** 
\file - * Defines the code-generator for producing WebAssembly machine code. - */ - -#include "CodeGen_Posix.h" - -namespace Halide { -namespace Internal { - -/** A code generator that emits WebAssembly code from a given Halide stmt. */ -class CodeGen_WebAssembly : public CodeGen_Posix { -public: - CodeGen_WebAssembly(const Target &); - - static void test(); - -protected: - using CodeGen_Posix::visit; - - void init_module() override; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - bool use_pic() const override; -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 83cdae86dbdd..fbfc4881e70d 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1,17 +1,12 @@ -#include - -#include "CodeGen_X86.h" +#include "CodeGen_Posix.h" #include "ConciseCasts.h" #include "Debug.h" #include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" -#include "JITModule.h" #include "LLVM_Headers.h" -#include "Param.h" #include "Simplify.h" #include "Util.h" -#include "Var.h" namespace Halide { namespace Internal { @@ -22,7 +17,10 @@ using std::vector; using namespace Halide::ConciseCasts; using namespace llvm; +#if defined(WITH_X86) + namespace { + // Populate feature flags in a target according to those implied by // existing flags, so that instruction patterns can just check for the // oldest feature flag that supports an instruction. @@ -46,20 +44,49 @@ Target complete_x86_target(Target t) { } return t; } -} // namespace + +/** A code generator that emits x86 code from a given Halide stmt. */ +class CodeGen_X86 : public CodeGen_Posix { +public: + /** Create an x86 code generator. Processor features can be + * enabled using the appropriate flags in the target struct. 
*/ + CodeGen_X86(Target); + +protected: + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; + + int vector_lanes_for_slice(const Type &t) const; + + llvm::Type *llvm_type_of(const Type &t) const override; + + using CodeGen_Posix::visit; + + void init_module() override; + + /** Nodes for which we want to emit specific sse/avx intrinsics */ + // @{ + void visit(const Add *) override; + void visit(const Sub *) override; + void visit(const Cast *) override; + void visit(const Call *) override; + void visit(const GT *) override; + void visit(const LT *) override; + void visit(const LE *) override; + void visit(const GE *) override; + void visit(const EQ *) override; + void visit(const NE *) override; + void visit(const Select *) override; + void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; + // @} +}; CodeGen_X86::CodeGen_X86(Target t) : CodeGen_Posix(complete_x86_target(t)) { - -#if !defined(WITH_X86) - user_error << "x86 not enabled for this build of Halide.\n"; -#endif - - user_assert(llvm_X86_enabled) << "llvm build not configured with X86 target enabled.\n"; } -namespace { - const int max_intrinsic_args = 4; struct x86Intrinsic { @@ -160,8 +187,6 @@ const x86Intrinsic intrinsic_defs[] = { }; // clang-format on -} // namespace - void CodeGen_X86::init_module() { CodeGen_Posix::init_module(); @@ -171,7 +196,7 @@ void CodeGen_X86::init_module() { } Type ret_type = i.ret_type; - std::vector arg_types; + vector arg_types; arg_types.reserve(max_intrinsic_args); for (halide_type_t j : i.arg_types) { if (j.bits == 0) { @@ -184,8 +209,6 @@ void CodeGen_X86::init_module() { } } -namespace { - // i32(i16_a)*i32(i16_b) +/- i32(i16_c)*i32(i16_d) can be done by // interleaving a, c, and b, d, and then using pmaddwd. We // recognize it here, and implement it in the initial module. 
@@ -230,8 +253,6 @@ bool should_use_pmaddwd(const Expr &a, const Expr &b, vector &result) { return false; } -} // namespace - void CodeGen_X86::visit(const Add *op) { vector matches; if (should_use_pmaddwd(op->a, op->b, matches)) { @@ -581,8 +602,8 @@ string CodeGen_X86::mcpu() const { } string CodeGen_X86::mattrs() const { - std::string features; - std::string separator; + string features; + string separator; if (target.has_feature(Target::FMA)) { features += "+fma"; separator = ","; @@ -669,5 +690,20 @@ llvm::Type *CodeGen_X86::llvm_type_of(const Type &t) const { } } +} // namespace + +std::unique_ptr new_CodeGen_X86(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_X86 + +std::unique_ptr new_CodeGen_X86(const Target &target) { + user_error << "x86 not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_X86 + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_X86.h b/src/CodeGen_X86.h deleted file mode 100644 index 5d1d503df6a0..000000000000 --- a/src/CodeGen_X86.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef HALIDE_CODEGEN_X86_H -#define HALIDE_CODEGEN_X86_H - -/** \file - * Defines the code-generator for producing x86 machine code - */ - -#include "CodeGen_Posix.h" - -namespace llvm { -class JITEventListener; -} - -namespace Halide { -namespace Internal { - -/** A code generator that emits x86 code from a given Halide stmt. */ -class CodeGen_X86 : public CodeGen_Posix { -public: - /** Create an x86 code generator. Processor features can be - * enabled using the appropriate flags in the target struct. 
*/ - CodeGen_X86(Target); - -protected: - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - - int vector_lanes_for_slice(const Type &t) const; - - llvm::Type *llvm_type_of(const Type &t) const override; - - using CodeGen_Posix::visit; - - void init_module() override; - - /** Nodes for which we want to emit specific sse/avx intrinsics */ - // @{ - void visit(const Add *) override; - void visit(const Sub *) override; - void visit(const Cast *) override; - void visit(const Call *) override; - void visit(const GT *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void visit(const GE *) override; - void visit(const EQ *) override; - void visit(const NE *) override; - void visit(const Select *) override; - void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; - // @} -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 187c735dbb97..ed9743a9ac05 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -298,7 +298,7 @@ class InjectGpuOffload : public IRMutator { internal_assert(!cgdev.empty()) << "Requested unknown GPU target: " << target.to_string() << "\n"; } - Stmt inject(Stmt s) { + Stmt inject(const Stmt &s) { // Create a new module for all of the kernels we find in this function. for (auto &i : cgdev) { i.second->init_module(); diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index 8415e0eca39d..71136eb02be8 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -1,6 +1,6 @@ #include "WasmExecutor.h" -#include "CodeGen_WebAssembly.h" +#include "CodeGen_Targets.h" #include "Error.h" #include "Float16.h" #include "Func.h" @@ -285,7 +285,7 @@ std::vector compile_to_wasm(const Module &module, const std::string &fn_na // for the alloca usage. 
size_t stack_size = 65536; { - std::unique_ptr cg(new CodeGen_WebAssembly(module.target())); + std::unique_ptr cg(new_CodeGen_WebAssembly(module.target())); cg->set_context(context); fn_module = cg->compile(module); stack_size += cg->get_requested_alloca_total(); From 2ab52cb2ba88e3cb33f669e70ba8a60f963fa66e Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:43:44 -0700 Subject: [PATCH 07/19] clang-format --- src/CodeGen_ARM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 31cd8d7d8e7f..e43097b95180 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -65,7 +65,7 @@ class CodeGen_ARM : public CodeGen_Posix { /** Various patterns to peephole match against */ struct Pattern { string intrin; ///< Name of the intrinsic - Expr pattern; ///< The pattern to match against + Expr pattern; ///< The pattern to match against Pattern() = default; Pattern(const string &intrin, Expr p) : intrin(intrin), pattern(std::move(p)) { From 7c0c5ddee2463ee87d9abb1e8f5aedd8b4097455 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 18:15:21 -0700 Subject: [PATCH 08/19] Pass type arguments correctly. --- src/OffloadGPULoops.cpp | 18 +++++------------- src/OffloadGPULoops.h | 2 +- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index ed9743a9ac05..f25b3ded42bd 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -92,15 +92,6 @@ class ExtractBounds : public IRVisitor { } }; -Expr make_type_arg(const Type &t) { - vector args = { - cast(t.code()), - cast(t.bits()), - cast(t.lanes()), - }; - return Call::make(type_of(), Call::make_struct, args, Call::Intrinsic); -} - class InjectGpuOffload : public IRMutator { /** Child code generator for device kernels. 
*/ map> cgdev; @@ -229,7 +220,8 @@ class InjectGpuOffload : public IRMutator { args.push_back(val); if (runtime_run_takes_types) { - arg_types_or_sizes.push_back(make_type_arg(i.type.with_lanes(1))); + internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); + arg_types_or_sizes.push_back(Expr(*(const uint32_t*)&i.type)); } else { arg_types_or_sizes.push_back(cast(target_size_t_type, i.is_buffer ? 8 : i.type.bytes())); } @@ -240,8 +232,8 @@ class InjectGpuOffload : public IRMutator { // nullptr-terminate the lists args.push_back(reinterpret(Handle(), cast(0))); if (runtime_run_takes_types) { - internal_assert(halide_type_int == 0); - arg_types_or_sizes.push_back(make_type_arg(Type(halide_type_int, 0, 0))); + internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); + arg_types_or_sizes.push_back(cast(0)); } else { arg_types_or_sizes.push_back(cast(target_size_t_type, 0)); } @@ -337,7 +329,7 @@ class InjectGpuOffload : public IRMutator { } // namespace -Stmt inject_gpu_offload(Stmt s, const Target &host_target) { +Stmt inject_gpu_offload(const Stmt &s, const Target &host_target) { return InjectGpuOffload(host_target).inject(s); } diff --git a/src/OffloadGPULoops.h b/src/OffloadGPULoops.h index c513a35fd8d5..d927f1a8b780 100644 --- a/src/OffloadGPULoops.h +++ b/src/OffloadGPULoops.h @@ -17,7 +17,7 @@ namespace Internal { /** Pull loops marked with GPU device APIs to a separate * module, and call them through the appropriate host runtime module. 
*/ -Stmt inject_gpu_offload(Stmt s, const Target &host_target); +Stmt inject_gpu_offload(const Stmt &s, const Target &host_target); } // namespace Internal } // namespace Halide From 902453332ad17dd560f6f4ae638864a5ab929dc7 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 26 Feb 2021 17:23:22 -0800 Subject: [PATCH 09/19] Update OffloadGPULoops.cpp --- src/OffloadGPULoops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index f25b3ded42bd..cc34469c9cbe 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -221,7 +221,7 @@ class InjectGpuOffload : public IRMutator { if (runtime_run_takes_types) { internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); - arg_types_or_sizes.push_back(Expr(*(const uint32_t*)&i.type)); + arg_types_or_sizes.push_back(Expr(*(const uint32_t *)&i.type)); } else { arg_types_or_sizes.push_back(cast(target_size_t_type, i.is_buffer ? 8 : i.type.bytes())); } From 25107fb9d3a6f3a6c0551d39076c99d18197d02a Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sun, 28 Feb 2021 09:51:00 -0800 Subject: [PATCH 10/19] trigger buildbots From 571fda3bbb056120af33528b003f364271a4138c Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sun, 28 Feb 2021 09:51:13 -0800 Subject: [PATCH 11/19] trigger buildbots From bf62dc4b723e939a1d843a1cebb40a89eb24eeb7 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 15:04:56 -0700 Subject: [PATCH 12/19] Hack around tests that rely on the IR for offloaded GPU loops. 
--- src/Lower.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/Lower.cpp b/src/Lower.cpp index 6855aed4ec61..07cb56f3556c 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -441,15 +441,9 @@ Module lower(const vector &output_funcs, debug(1) << "Skipping Hexagon offload...\n"; } - if (t.has_gpu_feature()) { - debug(1) << "Offloading GPU loops...\n"; - s = inject_gpu_offload(s, t); - debug(2) << "Lowering after splitting off GPU loops:\n" - << s << "\n\n"; - } else { - debug(1) << "Skipping GPU offload...\n"; - } - + // TODO: Several tests depend on these custom passes running before + // inject_gpu_offload. We should either make this consistent with + // inject_hexagon_rpc above, or find a way to avoid this dependency. if (!custom_passes.empty()) { for (size_t i = 0; i < custom_passes.size(); i++) { debug(1) << "Running custom lowering pass " << i << "...\n"; @@ -459,6 +453,15 @@ Module lower(const vector &output_funcs, } } + if (t.has_gpu_feature()) { + debug(1) << "Offloading GPU loops...\n"; + s = inject_gpu_offload(s, t); + debug(2) << "Lowering after splitting off GPU loops:\n" + << s << "\n\n"; + } else { + debug(1) << "Skipping GPU offload...\n"; + } + vector public_args = args; for (const auto &out : outputs) { for (const Parameter &buf : out.output_buffers()) { From 5ee9236161d149c5087e475d7eb2dd5a205bf9e7 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 15:22:51 -0700 Subject: [PATCH 13/19] Fix missing include. 
--- src/WasmExecutor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index 71136eb02be8..6a6208d47ba7 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -1,5 +1,6 @@ #include "WasmExecutor.h" +#include "CodeGen_Posix.h" #include "CodeGen_Targets.h" #include "Error.h" #include "Float16.h" From dc7e61dbbc7d04999c5e7f8aee7993f68ba6477d Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 15:26:14 -0700 Subject: [PATCH 14/19] Remove unused include. --- test/internal.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/internal.cpp b/test/internal.cpp index bb50f45390d8..35e9fa4efcbc 100644 --- a/test/internal.cpp +++ b/test/internal.cpp @@ -5,7 +5,6 @@ #include "CSE.h" #include "CodeGen_C.h" #include "CodeGen_PyTorch.h" -#include "CodeGen_X86.h" #include "Deinterleave.h" #include "Func.h" #include "Generator.h" From 4a590959f377395a3c32e2ad57a145fd159eb36e Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 15:28:34 -0700 Subject: [PATCH 15/19] clang-tidy --- src/OffloadGPULoops.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index cc34469c9cbe..1c5b38f4738c 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -217,27 +217,27 @@ class InjectGpuOffload : public IRMutator { val = Variable::make(i.type, i.name); val = Call::make(type_of(), Call::make_struct, {val}, Call::Intrinsic); } - args.push_back(val); + args.emplace_back(val); if (runtime_run_takes_types) { internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); - arg_types_or_sizes.push_back(Expr(*(const uint32_t *)&i.type)); + arg_types_or_sizes.emplace_back(Expr(*(const uint32_t *)&i.type)); } else { - arg_types_or_sizes.push_back(cast(target_size_t_type, i.is_buffer ? 8 : i.type.bytes())); + arg_types_or_sizes.emplace_back(cast(target_size_t_type, i.is_buffer ? 
8 : i.type.bytes())); } - arg_is_buffer.push_back(cast<uint8_t>(i.is_buffer)); + arg_is_buffer.emplace_back(cast<uint8_t>(i.is_buffer)); } // nullptr-terminate the lists - args.push_back(reinterpret(Handle(), cast<uint64_t>(0))); + args.emplace_back(reinterpret(Handle(), cast<uint64_t>(0))); if (runtime_run_takes_types) { internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); - arg_types_or_sizes.push_back(cast<uint32_t>(0)); + arg_types_or_sizes.emplace_back(cast<uint32_t>(0)); } else { - arg_types_or_sizes.push_back(cast(target_size_t_type, 0)); + arg_types_or_sizes.emplace_back(cast(target_size_t_type, 0)); } - arg_is_buffer.push_back(cast<uint8_t>(0)); + arg_is_buffer.emplace_back(cast<uint8_t>(0)); // TODO: only three dimensions can be passed to // cuLaunchKernel. How should we handle blkid[3]? From db8841186f6d3dc09086096022c1b04e9f6ce040 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 21:49:42 -0700 Subject: [PATCH 16/19] Use custom lowering pass to see code before GPU offloading --- test/correctness/trim_no_ops.cpp | 50 +++++++++++++++----------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/test/correctness/trim_no_ops.cpp b/test/correctness/trim_no_ops.cpp index 3453cc74981e..c2773bd0b635 100644 --- a/test/correctness/trim_no_ops.cpp +++ b/test/correctness/trim_no_ops.cpp @@ -2,7 +2,7 @@ using namespace Halide; -class CountConditionals : public Internal::IRVisitor { +class CountConditionals : public Internal::IRMutator { public: int count = 0; int count_if = 0; @@ -10,33 +10,27 @@ class CountConditionals : public Internal::IRVisitor { bool in_produce = false; private: - using Internal::IRVisitor::visit; + using Internal::IRMutator::visit; - void visit(const Internal::Select *op) override { + Expr visit(const Internal::Select *op) override { if (in_produce) { count++; count_select++; } - Internal::IRVisitor::visit(op); + return Internal::IRMutator::visit(op); } - void visit(const Internal::IfThenElse *op) override { + Internal::Stmt visit(const Internal::IfThenElse *op) override { 
if (in_produce) { count++; count_if++; } - Internal::IRVisitor::visit(op); + return Internal::IRMutator::visit(op); } - void visit(const Internal::ProducerConsumer *op) override { - if (op->is_producer) { - bool old_in_produce = in_produce; - in_produce = true; - Internal::IRVisitor::visit(op); - in_produce = old_in_produce; - } else { - IRVisitor::visit(op); - } + Internal::Stmt visit(const Internal::ProducerConsumer *op) override { + Internal::ScopedValue<bool> v(in_produce, op->is_producer); + return Internal::IRMutator::visit(op); } }; @@ -52,10 +46,10 @@ int main(int argc, char **argv) { f(x) *= select(x > 20 && x < 30, 2, 1); f(x) = select(x >= 60 && x <= 100, 100 - f(x), f(x)); - // There should be no selects or ifs after trim_no_ops runs - Module m = f.compile_to_module({}); CountConditionals s; - m.functions().front().body.accept(&s); + f.add_custom_lowering_pass(&s, []() {}); + Module m = f.compile_to_module({}); + if (s.count != 0) { std::cerr << "There were conditionals in the lowered code: \n" << m.functions().front().body << "\n"; @@ -86,11 +80,12 @@ int main(int argc, char **argv) { Var x, y; f(x, y) = x + y; f(x, y) += select((x == 10) && (x < y), 1, 0); - Module m = f.compile_to_module({}); // There should be no selects after trim_no_ops runs CountConditionals s; - m.functions().front().body.accept(&s); + f.add_custom_lowering_pass(&s, []() {}); + Module m = f.compile_to_module({}); + if (s.count != 0) { std::cerr << "There were selects in the lowered code: \n" << m.functions().front().body << "\n"; @@ -128,9 +123,10 @@ int main(int argc, char **argv) { hist(f(clamp(xi, 0, 73), clamp(yi, 0, 73))) += select(xi >= 0 && xi <= 73 && yi >= 0 && yi <= 73, 1, 0); - Module m = hist.compile_to_module({}); CountConditionals s; - m.functions().front().body.accept(&s); + hist.add_custom_lowering_pass(&s, []() {}); + Module m = hist.compile_to_module({}); + if (s.count != 0) { std::cerr << "There were selects in the lowered code: \n" << m.functions().front().body << 
"\n"; @@ -169,9 +165,10 @@ int main(int argc, char **argv) { f.tile(x, y, xi, yi, 4, 4); // Check there are no if statements. - Module m = f.compile_to_module({}); CountConditionals s; - m.functions().front().body.accept(&s); + f.add_custom_lowering_pass(&s, []() {}); + Module m = f.compile_to_module({}); + if (s.count != 0) { std::cerr << "There were selects or ifs in the lowered code: \n" << m.functions().front().body << "\n"; @@ -207,9 +204,10 @@ int main(int argc, char **argv) { // if condition since it depends on gpu outer loop r.y Target gpu_target(get_host_target()); gpu_target.set_feature(Target::CUDA); - Module m = f.compile_to_module({}, "", gpu_target); CountConditionals s; - m.functions().front().body.accept(&s); + f.add_custom_lowering_pass(&s, []() {}); + Module m = f.compile_to_module({}, "", gpu_target); + if (s.count_select != 0) { std::cerr << "There were selects in the lowered code: \n" << m.functions().front().body << "\n"; From 18b459ba2f863ee576019f21aaefeaf643cd2fb8 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 1 Mar 2021 10:44:39 -0700 Subject: [PATCH 17/19] Speculative fix for segfault --- test/correctness/trim_no_ops.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/correctness/trim_no_ops.cpp b/test/correctness/trim_no_ops.cpp index c2773bd0b635..c989a3a5c8f9 100644 --- a/test/correctness/trim_no_ops.cpp +++ b/test/correctness/trim_no_ops.cpp @@ -116,6 +116,7 @@ int main(int argc, char **argv) { f.compute_root(); Func hist; + Buffer<int> hist_result; { RDom r(0, 10, 0, 10, 0, 10, 0, 10); Expr xi = r[0] + r[2] * 10, yi = r[1] + r[3] * 10; @@ -132,8 +133,8 @@ int main(int argc, char **argv) { << m.functions().front().body << "\n"; return -1; } + hist_result = hist.realize({256}); } - Buffer<int> hist_result = hist.realize({256}); // Also check the output is correct. 
Func true_hist; From 04d4a1f870579d2a9bdb9b8b6c6a611fce4c4c32 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 1 Mar 2021 12:51:43 -0700 Subject: [PATCH 18/19] Fix const correctness --- src/CodeGen_C.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index ac490283738e..2516fe26a71f 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -1728,7 +1728,8 @@ string CodeGen_C::print_assignment(Type t, const std::string &rhs) { auto cached = cache.find(rhs); if (cached == cache.end()) { id = unique_name('_'); - stream << get_indent() << print_type(t, AppendSpace) << (output_kind == CPlusPlusImplementation ? "const " : "") << id << " = " << rhs << ";\n"; + const char *const_flag = output_kind == CPlusPlusImplementation ? "const " : ""; + stream << get_indent() << print_type(t, AppendSpace) << const_flag << id << " = " << rhs << ";\n"; cache[rhs] = id; } else { id = cached->second; @@ -2316,7 +2317,8 @@ void CodeGen_C::visit(const Load *op) { bool type_cast_needed = !(allocations.contains(op->name) && allocations.get(op->name).type.element_of() == t.element_of()); if (type_cast_needed) { - rhs << "((const " << print_type(t.element_of()) << " *)" << name << ")"; + const char *const_flag = output_kind == CPlusPlusImplementation ? "const " : ""; + rhs << "((" << const_flag << " " << print_type(t.element_of()) << " *)" << name << ")"; } else { rhs << name; } From 7abf29deefcaf2d6596a3639d07ea1523a8ae01f Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 1 Mar 2021 12:51:57 -0700 Subject: [PATCH 19/19] Fix error on unused variables in generated code. 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c685c299940c..8c3f6d77bde3 100644 --- a/Makefile +++ b/Makefile @@ -1568,7 +1568,7 @@ $(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_thre @mkdir -p $(@D) $(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context -GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas +GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas -Wno-unused-variable GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS)