From 8b74f72f6af35582f91887ac4090457e57d1e9aa Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Fri, 11 Dec 2020 10:48:04 -0700
Subject: [PATCH 1/3] Hide inaccessible symbols in internal linkage.

---
 src/AddImageChecks.cpp            |   4 +
 src/AddParameterChecks.cpp        |   4 +
 src/AllocationBoundsInference.cpp |   4 +
 src/AsyncProducers.cpp            |   4 +
 src/BoundSmallAllocations.cpp     |   4 +
 src/Bounds.cpp                    |   8 +
 src/BoundsInference.cpp           |   3 +-
 src/CodeGen_C.cpp                 |   3 +-
 src/CodeGen_D3D12Compute_Dev.cpp  |  94 +++++++
 src/CodeGen_D3D12Compute_Dev.h    |  91 +------
 src/CodeGen_GPU_Host.cpp          |  10 +-
 src/CodeGen_Hexagon.cpp           |   3 +
 src/CodeGen_Metal_Dev.cpp         |  89 +++++++
 src/CodeGen_Metal_Dev.h           |  86 +------
 src/CodeGen_OpenCL_Dev.cpp        |  91 +++++++
 src/CodeGen_OpenCL_Dev.h          |  88 +------
 src/CodeGen_OpenGLCompute_Dev.cpp |  69 +++++
 src/CodeGen_OpenGLCompute_Dev.h   |  68 +----
 src/CodeGen_PTX_Dev.cpp           |  80 ++++++
 src/CodeGen_PTX_Dev.h             |  82 +-----
 src/DebugToFile.cpp               |   4 +
 src/DerivativeUtils.cpp           |  28 +++
 src/EliminateBoolVectors.cpp      |   4 +
 src/FastIntegerDivide.cpp         |  16 +-
 src/FastIntegerDivide.h           |  13 -
 src/Function.cpp                  |   6 +-
 src/FuseGPUThreadLoops.cpp        |   8 +-
 src/IRMatch.cpp                   |   4 +
 src/InjectOpenGLIntrinsics.cpp    |   4 +
 src/InlineReductions.cpp          |   3 +
 src/Introspection.cpp             |   6 +-
 src/LICM.cpp                      |   4 +
 src/LLVM_Output.cpp               |   4 +
 src/LLVM_Runtime_Linker.cpp       |   4 -
 src/Memoization.cpp               |   8 +-
 src/ModulusRemainder.cpp          | 402 +++++++++++++++---------------
 src/Monotonic.cpp                 |   4 +
 src/PrintLoopNest.cpp             |   4 +
 src/Profiling.cpp                 |   4 +
 src/PurifyIndexMath.cpp           |   4 +
 src/PythonExtensionGen.cpp        |  10 +-
 src/Qualify.cpp                   |   4 +
 src/Random.cpp                    |   4 +
 src/RemoveDeadAllocations.cpp     |   4 +
 src/RemoveExternLoops.cpp         |   4 +
 src/RemoveUndef.cpp               |   4 +
 src/ScheduleFunctions.cpp         |   5 +-
 src/ScheduleFunctions.h           |   4 +-
 src/SelectGPUAPI.cpp              |   4 +
 src/SkipStages.cpp                |   4 +-
 src/SlidingWindow.cpp             |   4 +-
 src/StorageFolding.cpp            |   4 +-
 src/StrictifyFloat.cpp            |   4 +
 src/Substitute.cpp                |  16 ++
 src/Tracing.cpp                   |   4 +
 src/UnifyDuplicateLets.cpp        |   4 +
 src/UnrollLoops.cpp               |   4 +
 src/VaryingAttributes.cpp         |  22 +-
 58 files changed, 878 insertions(+), 650 deletions(-)

diff --git a/src/AddImageChecks.cpp b/src/AddImageChecks.cpp
index a2345339b645..b98fd62ff86d 100644
--- a/src/AddImageChecks.cpp
+++ b/src/AddImageChecks.cpp
@@ -16,6 +16,8 @@ using std::pair;
 using std::string;
 using std::vector;
 
+namespace {
+
 /* Find all the externally referenced buffers in a stmt */
 class FindBuffers : public IRGraphVisitor {
 public:
@@ -710,6 +712,8 @@ Stmt add_image_checks_inner(Stmt s,
     return s;
 }
 
+}  // namespace
+
 // The following function repeats the arguments list it just passes
 // through six times. Surely there is a better way?
 Stmt add_image_checks(const Stmt &s,
diff --git a/src/AddParameterChecks.cpp b/src/AddParameterChecks.cpp
index 8706485430c0..e1024dd5caa8 100644
--- a/src/AddParameterChecks.cpp
+++ b/src/AddParameterChecks.cpp
@@ -12,6 +12,8 @@ using std::pair;
 using std::string;
 using std::vector;
 
+namespace {
+
 // Find all the externally referenced scalar parameters
 class FindParameters : public IRGraphVisitor {
 public:
@@ -26,6 +28,8 @@ class FindParameters : public IRGraphVisitor {
     }
 };
 
+}  // namespace
+
 // Insert checks to make sure that parameters are within their
 // declared range.
 Stmt add_parameter_checks(const vector<Stmt> &preconditions, Stmt s, const Target &t) {
diff --git a/src/AllocationBoundsInference.cpp b/src/AllocationBoundsInference.cpp
index 3a3874f1a35e..076cb74f86af 100644
--- a/src/AllocationBoundsInference.cpp
+++ b/src/AllocationBoundsInference.cpp
@@ -14,6 +14,8 @@ using std::set;
 using std::string;
 using std::vector;
 
+namespace {
+
 // Figure out the region touched of each buffer, and deposit them as
 // let statements outside of each realize node, or at the top level if
 // they're not internal allocations.
@@ -154,6 +156,8 @@ class StripDeclareBoxTouched : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt allocation_bounds_inference(Stmt s,
                                  const map<string, Function> &env,
                                  const FuncValueBounds &fb) {
diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp
index ff7356409c58..40ae31c8021f 100644
--- a/src/AsyncProducers.cpp
+++ b/src/AsyncProducers.cpp
@@ -14,6 +14,8 @@ using std::set;
 using std::string;
 using std::vector;
 
+namespace {
+
 /** A mutator which eagerly folds no-op stmts */
 class NoOpCollapsingMutator : public IRMutator {
 protected:
@@ -651,6 +653,8 @@ class TightenForkNodes : public IRMutator {
 
 // TODO: merge semaphores?
 
+}  // namespace
+
 Stmt fork_async_producers(Stmt s, const map<string, Function> &env) {
     s = TightenProducerConsumerNodes(env).mutate(s);
     s = ForkAsyncProducers(env).mutate(s);
diff --git a/src/BoundSmallAllocations.cpp b/src/BoundSmallAllocations.cpp
index b2ad80d78de5..6dff63c25d3a 100644
--- a/src/BoundSmallAllocations.cpp
+++ b/src/BoundSmallAllocations.cpp
@@ -8,6 +8,8 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 // Find a constant upper bound on the size of each thread-local allocation
 class BoundSmallAllocations : public IRMutator {
     using IRMutator::visit;
@@ -147,6 +149,8 @@ class BoundSmallAllocations : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt bound_small_allocations(const Stmt &s) {
     return BoundSmallAllocations().mutate(s);
 }
diff --git a/src/Bounds.cpp b/src/Bounds.cpp
index ba1a51d3c23b..3749f5006c59 100644
--- a/src/Bounds.cpp
+++ b/src/Bounds.cpp
@@ -98,6 +98,8 @@ std::ostream &operator<<(std::ostream &stream, const Box &b) {
     return stream;
 }
 
+namespace {
+
 class Bounds : public IRVisitor {
 public:
     Interval interval;
@@ -1577,6 +1579,8 @@ class Bounds : public IRVisitor {
     }
 };
 
+}  // namespace
+
 Interval bounds_of_expr_in_scope(const Expr &expr, const Scope<Interval> &scope, const FuncValueBounds &fb, bool const_bound) {
     //debug(3) << "computing bounds_of_expr_in_scope " << expr << "\n";
     Bounds b(&scope, fb, const_bound);
@@ -1782,6 +1786,8 @@ bool box_contains(const Box &outer, const Box &inner) {
     return can_prove(condition);
 }
 
+namespace {
+
 class FindInnermostVar : public IRVisitor {
 public:
     const Scope<int> &vars_depth;
@@ -2630,6 +2636,8 @@ class BoxesTouched : public IRGraphVisitor {
     }
 };
 
+}  // namespace
+
 map<string, Box> boxes_touched(const Expr &e, Stmt s, bool consider_calls, bool consider_provides,
                                const string &fn, const Scope<Interval> &scope, const FuncValueBounds &fb) {
     if (!fn.empty() && s.defined()) {
diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp
index 414c4e2ada9a..71ef6ef5221d 100644
--- a/src/BoundsInference.cpp
+++ b/src/BoundsInference.cpp
@@ -198,7 +198,6 @@ bool is_fused_with_others(const vector<vector<Function>> &fused_groups,
     }
     return false;
 }
-}  // namespace
 
 class BoundsInference : public IRMutator {
 public:
@@ -1255,6 +1254,8 @@ class BoundsInference : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt bounds_inference(Stmt s,
                       const vector<Function> &outputs,
                       const vector<string> &order,
diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp
index 77a294abdb4f..809556873fa0 100644
--- a/src/CodeGen_C.cpp
+++ b/src/CodeGen_C.cpp
@@ -220,7 +220,6 @@ class HalideFreeHelper {
 };
 } // namespace
 )INLINE_CODE";
-}  // namespace
 
 class TypeInfoGatherer : public IRGraphVisitor {
 private:
@@ -311,6 +310,8 @@ class TypeInfoGatherer : public IRGraphVisitor {
     std::set<Type> vector_types_used;
 };
 
+}  // namespace
+
 CodeGen_C::CodeGen_C(ostream &s, Target t, OutputKind output_kind, const std::string &guard)
     : IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind),
       extern_c_open(false), inside_atomic_mutex_node(false), emit_atomic_stores(false), using_vector_typedefs(false) {
diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp
index 10675ad546d1..3335bf0ca482 100644
--- a/src/CodeGen_D3D12Compute_Dev.cpp
+++ b/src/CodeGen_D3D12Compute_Dev.cpp
@@ -3,7 +3,9 @@
 #include <sstream>
 #include <utility>
 
+#include "CodeGen_C.h"
 #include "CodeGen_D3D12Compute_Dev.h"
+#include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
 #include "Debug.h"
 #include "DeviceArgument.h"
@@ -23,6 +25,92 @@ using std::vector;
 
 static ostringstream nil;
 
+namespace {
+
+class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
+public:
+    CodeGen_D3D12Compute_Dev(Target target);
+
+    /** Compile a GPU kernel into the module. This may be called many times
+     * with different kernels, which will all be accumulated into a single
+     * source module shared by a given Halide pipeline. */
+    void add_kernel(Stmt stmt,
+                    const std::string &name,
+                    const std::vector<DeviceArgument> &args) override;
+
+    /** (Re)initialize the GPU kernel module. This is separate from compile,
+     * since a GPU device module will often have many kernels compiled into it
+     * for a single pipeline. */
+    void init_module() override;
+
+    std::vector<char> compile_to_src() override;
+
+    std::string get_current_kernel_name() override;
+
+    void dump() override;
+
+    std::string print_gpu_name(const std::string &name) override;
+
+    std::string api_unique_name() override {
+        return "d3d12compute";
+    }
+
+protected:
+    friend struct StoragePackUnpack;
+
+    class CodeGen_D3D12Compute_C : public CodeGen_C {
+    public:
+        CodeGen_D3D12Compute_C(std::ostream &s, Target t)
+            : CodeGen_C(s, t) {
+            integer_suffix_style = IntegerSuffixStyle::HLSL;
+        }
+        void add_kernel(Stmt stmt,
+                        const std::string &name,
+                        const std::vector<DeviceArgument> &args);
+
+    protected:
+        friend struct StoragePackUnpack;
+
+        std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override;
+        std::string print_storage_type(Type type);
+        std::string print_type_maybe_storage(Type type, bool storage, AppendSpaceIfNeeded space);
+        std::string print_reinterpret(Type type, const Expr &e) override;
+        std::string print_extern_call(const Call *op) override;
+
+        std::string print_vanilla_cast(Type type, const std::string &value_expr);
+        std::string print_reinforced_cast(Type type, const std::string &value_expr);
+        std::string print_cast(Type target_type, Type source_type, const std::string &value_expr);
+        std::string print_reinterpret_cast(Type type, const std::string &value_expr);
+
+        std::string print_assignment(Type t, const std::string &rhs) override;
+
+        using CodeGen_C::visit;
+        void visit(const Evaluate *op) override;
+        void visit(const Min *) override;
+        void visit(const Max *) override;
+        void visit(const Div *) override;
+        void visit(const Mod *) override;
+        void visit(const For *) override;
+        void visit(const Ramp *op) override;
+        void visit(const Broadcast *op) override;
+        void visit(const Call *op) override;
+        void visit(const Load *op) override;
+        void visit(const Store *op) override;
+        void visit(const Select *op) override;
+        void visit(const Allocate *op) override;
+        void visit(const Free *op) override;
+        void visit(const Cast *op) override;
+        void visit(const Atomic *op) override;
+        void visit(const FloatImm *op) override;
+
+        Scope<> groupshared_allocations;
+    };
+
+    std::ostringstream src_stream;
+    std::string cur_kernel_name;
+    CodeGen_D3D12Compute_C d3d12compute_c;
+};
+
 CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_Dev(Target t)
     : d3d12compute_c(src_stream, t) {
 }
@@ -1279,5 +1367,11 @@ std::string CodeGen_D3D12Compute_Dev::print_gpu_name(const std::string &name) {
     return name;
 }
 
+}  // namespace
+
+CodeGen_GPU_Dev *new_CodeGen_D3D12Compute_Dev(const Target &target) {
+    return new CodeGen_D3D12Compute_Dev(target);
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_D3D12Compute_Dev.h b/src/CodeGen_D3D12Compute_Dev.h
index e08f9f5b6aaa..295efeff05e1 100644
--- a/src/CodeGen_D3D12Compute_Dev.h
+++ b/src/CodeGen_D3D12Compute_Dev.h
@@ -5,98 +5,15 @@
  * Defines the code-generator for producing D3D12-compatible HLSL kernel code
  */
 
-#include <sstream>
-
-#include "CodeGen_C.h"
-#include "CodeGen_GPU_Dev.h"
-#include "Target.h"
-
 namespace Halide {
-namespace Internal {
-
-class CodeGen_D3D12Compute_Dev : public CodeGen_GPU_Dev {
-public:
-    CodeGen_D3D12Compute_Dev(Target target);
-
-    /** Compile a GPU kernel into the module. This may be called many times
-     * with different kernels, which will all be accumulated into a single
-     * source module shared by a given Halide pipeline. */
-    void add_kernel(Stmt stmt,
-                    const std::string &name,
-                    const std::vector<DeviceArgument> &args) override;
-
-    /** (Re)initialize the GPU kernel module. This is separate from compile,
-     * since a GPU device module will often have many kernels compiled into it
-     * for a single pipeline. */
-    void init_module() override;
-
-    std::vector<char> compile_to_src() override;
-
-    std::string get_current_kernel_name() override;
-
-    void dump() override;
 
-    std::string print_gpu_name(const std::string &name) override;
+struct Target;
 
-    std::string api_unique_name() override {
-        return "d3d12compute";
-    }
-
-protected:
-    friend struct StoragePackUnpack;
-
-    class CodeGen_D3D12Compute_C : public CodeGen_C {
-    public:
-        CodeGen_D3D12Compute_C(std::ostream &s, Target t)
-            : CodeGen_C(s, t) {
-            integer_suffix_style = IntegerSuffixStyle::HLSL;
-        }
-        void add_kernel(Stmt stmt,
-                        const std::string &name,
-                        const std::vector<DeviceArgument> &args);
-
-    protected:
-        friend struct StoragePackUnpack;
-
-        std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override;
-        std::string print_storage_type(Type type);
-        std::string print_type_maybe_storage(Type type, bool storage, AppendSpaceIfNeeded space);
-        std::string print_reinterpret(Type type, const Expr &e) override;
-        std::string print_extern_call(const Call *op) override;
-
-        std::string print_vanilla_cast(Type type, const std::string &value_expr);
-        std::string print_reinforced_cast(Type type, const std::string &value_expr);
-        std::string print_cast(Type target_type, Type source_type, const std::string &value_expr);
-        std::string print_reinterpret_cast(Type type, const std::string &value_expr);
-
-        std::string print_assignment(Type t, const std::string &rhs) override;
-
-        using CodeGen_C::visit;
-        void visit(const Evaluate *op) override;
-        void visit(const Min *) override;
-        void visit(const Max *) override;
-        void visit(const Div *) override;
-        void visit(const Mod *) override;
-        void visit(const For *) override;
-        void visit(const Ramp *op) override;
-        void visit(const Broadcast *op) override;
-        void visit(const Call *op) override;
-        void visit(const Load *op) override;
-        void visit(const Store *op) override;
-        void visit(const Select *op) override;
-        void visit(const Allocate *op) override;
-        void visit(const Free *op) override;
-        void visit(const Cast *op) override;
-        void visit(const Atomic *op) override;
-        void visit(const FloatImm *op) override;
+namespace Internal {
 
-        Scope<> groupshared_allocations;
-    };
+struct CodeGen_GPU_Dev;
 
-    std::ostringstream src_stream;
-    std::string cur_kernel_name;
-    CodeGen_D3D12Compute_C d3d12compute_c;
-};
+CodeGen_GPU_Dev *new_CodeGen_D3D12Compute_Dev(const Target &target);
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp
index 4cb09da4712d..8c5a1e061197 100644
--- a/src/CodeGen_GPU_Host.cpp
+++ b/src/CodeGen_GPU_Host.cpp
@@ -108,23 +108,23 @@ CodeGen_GPU_Host<CodeGen_CPU>::CodeGen_GPU_Host(Target target)
     }
     if (target.has_feature(Target::OpenGLCompute)) {
         debug(1) << "Constructing OpenGL Compute device codegen\n";
-        cgdev[DeviceAPI::OpenGLCompute] = new CodeGen_OpenGLCompute_Dev(target);
+        cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(target);
     }
     if (target.has_feature(Target::CUDA)) {
         debug(1) << "Constructing CUDA device codegen\n";
-        cgdev[DeviceAPI::CUDA] = new CodeGen_PTX_Dev(target);
+        cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(target);
     }
     if (target.has_feature(Target::OpenCL)) {
         debug(1) << "Constructing OpenCL device codegen\n";
-        cgdev[DeviceAPI::OpenCL] = new CodeGen_OpenCL_Dev(target);
+        cgdev[DeviceAPI::OpenCL] = new_CodeGen_OpenCL_Dev(target);
     }
     if (target.has_feature(Target::Metal)) {
         debug(1) << "Constructing Metal device codegen\n";
-        cgdev[DeviceAPI::Metal] = new CodeGen_Metal_Dev(target);
+        cgdev[DeviceAPI::Metal] = new_CodeGen_Metal_Dev(target);
     }
     if (target.has_feature(Target::D3D12Compute)) {
         debug(1) << "Constructing Direct3D 12 Compute device codegen\n";
-        cgdev[DeviceAPI::D3D12Compute] = new CodeGen_D3D12Compute_Dev(target);
+        cgdev[DeviceAPI::D3D12Compute] = new_CodeGen_D3D12Compute_Dev(target);
     }
 
     if (cgdev.empty()) {
diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
index 55783989d897..1103c18d973b 100644
--- a/src/CodeGen_Hexagon.cpp
+++ b/src/CodeGen_Hexagon.cpp
@@ -62,6 +62,7 @@ Stmt call_halide_qurt_hvx_lock(const Target &target) {
         AssertStmt::make(EQ::make(hvx_lock_result_var, 0), hvx_lock_result_var));
     return check_hvx_lock;
 }
+
 Stmt call_halide_qurt_hvx_unlock() {
     Expr hvx_unlock =
         Call::make(Int(32), "halide_qurt_hvx_unlock", {}, Call::Extern);
@@ -73,6 +74,7 @@ Stmt call_halide_qurt_hvx_unlock() {
                                        hvx_unlock_result_var));
     return check_hvx_unlock;
 }
+
 // Wrap the stmt in a call to qurt_hvx_lock, calling qurt_hvx_unlock
 // as a destructor if successful.
 Stmt acquire_hvx_context(Stmt stmt, const Target &target) {
@@ -89,6 +91,7 @@ Stmt acquire_hvx_context(Stmt stmt, const Target &target) {
     stmt = Block::make(check_hvx_lock, stmt);
     return stmt;
 }
+
 bool is_dense_ramp(const Expr &x) {
     const Ramp *r = x.as<Ramp>();
     if (!r) {
diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp
index 87195e72bb69..1b465791f9a8 100644
--- a/src/CodeGen_Metal_Dev.cpp
+++ b/src/CodeGen_Metal_Dev.cpp
@@ -2,6 +2,8 @@
 #include <sstream>
 #include <utility>
 
+#include "CodeGen_C.h"
+#include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
 #include "CodeGen_Metal_Dev.h"
 #include "Debug.h"
@@ -17,6 +19,87 @@ using std::vector;
 
 static ostringstream nil;
 
+namespace {
+
+class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
+public:
+    CodeGen_Metal_Dev(Target target);
+
+    /** Compile a GPU kernel into the module. This may be called many times
+     * with different kernels, which will all be accumulated into a single
+     * source module shared by a given Halide pipeline. */
+    void add_kernel(Stmt stmt,
+                    const std::string &name,
+                    const std::vector<DeviceArgument> &args) override;
+
+    /** (Re)initialize the GPU kernel module. This is separate from compile,
+     * since a GPU device module will often have many kernels compiled into it
+     * for a single pipeline. */
+    void init_module() override;
+
+    std::vector<char> compile_to_src() override;
+
+    std::string get_current_kernel_name() override;
+
+    void dump() override;
+
+    std::string print_gpu_name(const std::string &name) override;
+
+    std::string api_unique_name() override {
+        return "metal";
+    }
+
+protected:
+    class CodeGen_Metal_C : public CodeGen_C {
+    public:
+        CodeGen_Metal_C(std::ostream &s, Target t)
+            : CodeGen_C(s, t) {
+        }
+        void add_kernel(const Stmt &stmt,
+                        const std::string &name,
+                        const std::vector<DeviceArgument> &args);
+
+    protected:
+        using CodeGen_C::visit;
+        std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override;
+        // Vectors in Metal come in two varieties, regular and packed.
+        // For storage allocations and pointers used in address arithmetic,
+        // packed types must be used. For temporaries, constructors, etc.
+        // regular types must be used.
+        // This concept also potentially applies to half types, which are
+        // often only supported for storage, not arithmetic,
+        // hence the method name.
+        std::string print_storage_type(Type type);
+        std::string print_type_maybe_storage(Type type, bool storage, AppendSpaceIfNeeded space);
+        std::string print_reinterpret(Type type, const Expr &e) override;
+        std::string print_extern_call(const Call *op) override;
+
+        std::string get_memory_space(const std::string &);
+
+        std::string shared_name;
+
+        void visit(const Min *) override;
+        void visit(const Max *) override;
+        void visit(const Div *) override;
+        void visit(const Mod *) override;
+        void visit(const For *) override;
+        void visit(const Ramp *op) override;
+        void visit(const Broadcast *op) override;
+        void visit(const Call *op) override;
+        void visit(const Load *op) override;
+        void visit(const Store *op) override;
+        void visit(const Select *op) override;
+        void visit(const Allocate *op) override;
+        void visit(const Free *op) override;
+        void visit(const Cast *op) override;
+        void visit(const Atomic *op) override;
+    };
+
+    std::ostringstream src_stream;
+    std::string cur_kernel_name;
+    CodeGen_Metal_C metal_c;
+};
+
 CodeGen_Metal_Dev::CodeGen_Metal_Dev(Target t)
     : metal_c(src_stream, t) {
 }
@@ -726,5 +809,11 @@ std::string CodeGen_Metal_Dev::print_gpu_name(const std::string &name) {
     return name;
 }
 
+}  // namespace
+
+CodeGen_GPU_Dev *new_CodeGen_Metal_Dev(const Target &target) {
+    return new CodeGen_Metal_Dev(target);
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_Metal_Dev.h b/src/CodeGen_Metal_Dev.h
index a8dcce1ed30f..02d09cb80e05 100644
--- a/src/CodeGen_Metal_Dev.h
+++ b/src/CodeGen_Metal_Dev.h
@@ -5,93 +5,15 @@
  * Defines the code-generator for producing Apple Metal shading language kernel code
  */
 
-#include <sstream>
-
-#include "CodeGen_C.h"
-#include "CodeGen_GPU_Dev.h"
-#include "Target.h"
-
 namespace Halide {
-namespace Internal {
-
-class CodeGen_Metal_Dev : public CodeGen_GPU_Dev {
-public:
-    CodeGen_Metal_Dev(Target target);
-
-    /** Compile a GPU kernel into the module. This may be called many times
-     * with different kernels, which will all be accumulated into a single
-     * source module shared by a given Halide pipeline. */
-    void add_kernel(Stmt stmt,
-                    const std::string &name,
-                    const std::vector<DeviceArgument> &args) override;
-
-    /** (Re)initialize the GPU kernel module. This is separate from compile,
-     * since a GPU device module will often have many kernels compiled into it
-     * for a single pipeline. */
-    void init_module() override;
-
-    std::vector<char> compile_to_src() override;
 
-    std::string get_current_kernel_name() override;
+struct Target;
 
-    void dump() override;
-
-    std::string print_gpu_name(const std::string &name) override;
-
-    std::string api_unique_name() override {
-        return "metal";
-    }
-
-protected:
-    class CodeGen_Metal_C : public CodeGen_C {
-    public:
-        CodeGen_Metal_C(std::ostream &s, Target t)
-            : CodeGen_C(s, t) {
-        }
-        void add_kernel(const Stmt &stmt,
-                        const std::string &name,
-                        const std::vector<DeviceArgument> &args);
-
-    protected:
-        using CodeGen_C::visit;
-        std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override;
-        // Vectors in Metal come in two varieties, regular and packed.
-        // For storage allocations and pointers used in address arithmetic,
-        // packed types must be used. For temporaries, constructors, etc.
-        // regular types must be used.
-        // This concept also potentially applies to half types, which are
-        // often only supported for storage, not arithmetic,
-        // hence the method name.
-        std::string print_storage_type(Type type);
-        std::string print_type_maybe_storage(Type type, bool storage, AppendSpaceIfNeeded space);
-        std::string print_reinterpret(Type type, const Expr &e) override;
-        std::string print_extern_call(const Call *op) override;
-
-        std::string get_memory_space(const std::string &);
-
-        std::string shared_name;
+namespace Internal {
 
-        void visit(const Min *) override;
-        void visit(const Max *) override;
-        void visit(const Div *) override;
-        void visit(const Mod *) override;
-        void visit(const For *) override;
-        void visit(const Ramp *op) override;
-        void visit(const Broadcast *op) override;
-        void visit(const Call *op) override;
-        void visit(const Load *op) override;
-        void visit(const Store *op) override;
-        void visit(const Select *op) override;
-        void visit(const Allocate *op) override;
-        void visit(const Free *op) override;
-        void visit(const Cast *op) override;
-        void visit(const Atomic *op) override;
-    };
+struct CodeGen_GPU_Dev;
 
-    std::ostringstream src_stream;
-    std::string cur_kernel_name;
-    CodeGen_Metal_C metal_c;
-};
+CodeGen_GPU_Dev *new_CodeGen_Metal_Dev(const Target &target);
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp
index d6e26894a14d..856eae556be5 100644
--- a/src/CodeGen_OpenCL_Dev.cpp
+++ b/src/CodeGen_OpenCL_Dev.cpp
@@ -4,6 +4,8 @@
 #include <utility>
 
 #include "CSE.h"
+#include "CodeGen_C.h"
+#include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
 #include "CodeGen_OpenCL_Dev.h"
 #include "Debug.h"
@@ -22,6 +24,89 @@ using std::sort;
 using std::string;
 using std::vector;
 
+namespace {
+
+class CodeGen_OpenCL_Dev : public CodeGen_GPU_Dev {
+public:
+    CodeGen_OpenCL_Dev(Target target);
+
+    /** Compile a GPU kernel into the module. This may be called many times
+     * with different kernels, which will all be accumulated into a single
+     * source module shared by a given Halide pipeline. */
+    void add_kernel(Stmt stmt,
+                    const std::string &name,
+                    const std::vector<DeviceArgument> &args) override;
+
+    /** (Re)initialize the GPU kernel module. This is separate from compile,
+     * since a GPU device module will often have many kernels compiled into it
+     * for a single pipeline. */
+    void init_module() override;
+
+    std::vector<char> compile_to_src() override;
+
+    std::string get_current_kernel_name() override;
+
+    void dump() override;
+
+    std::string print_gpu_name(const std::string &name) override;
+
+    std::string api_unique_name() override {
+        return "opencl";
+    }
+
+protected:
+    class CodeGen_OpenCL_C : public CodeGen_C {
+    public:
+        CodeGen_OpenCL_C(std::ostream &s, Target t)
+            : CodeGen_C(s, t) {
+            integer_suffix_style = IntegerSuffixStyle::OpenCL;
+        }
+        void add_kernel(Stmt stmt,
+                        const std::string &name,
+                        const std::vector<DeviceArgument> &args);
+
+    protected:
+        using CodeGen_C::visit;
+        std::string print_type(Type type, AppendSpaceIfNeeded append_space = DoNotAppendSpace) override;
+        std::string print_reinterpret(Type type, const Expr &e) override;
+        std::string print_extern_call(const Call *op) override;
+        std::string print_array_access(const std::string &name,
+                                       const Type &type,
+                                       const std::string &id_index);
+        void add_vector_typedefs(const std::set<Type> &vector_types) override;
+
+        std::string get_memory_space(const std::string &);
+
+        std::string shared_name;
+
+        void visit(const For *) override;
+        void visit(const Ramp *op) override;
+        void visit(const Broadcast *op) override;
+        void visit(const Call *op) override;
+        void visit(const Load *op) override;
+        void visit(const Store *op) override;
+        void visit(const Cast *op) override;
+        void visit(const Select *op) override;
+        void visit(const EQ *) override;
+        void visit(const NE *) override;
+        void visit(const LT *) override;
+        void visit(const LE *) override;
+        void visit(const GT *) override;
+        void visit(const GE *) override;
+        void visit(const Allocate *op) override;
+        void visit(const Free *op) override;
+        void visit(const AssertStmt *op) override;
+        void visit(const Shuffle *op) override;
+        void visit(const Min *op) override;
+        void visit(const Max *op) override;
+        void visit(const Atomic *op) override;
+    };
+
+    std::ostringstream src_stream;
+    std::string cur_kernel_name;
+    CodeGen_OpenCL_C clc;
+};
+
 CodeGen_OpenCL_Dev::CodeGen_OpenCL_Dev(Target t)
     : clc(src_stream, t) {
 }
@@ -1133,5 +1218,11 @@ std::string CodeGen_OpenCL_Dev::print_gpu_name(const std::string &name) {
     return name;
 }
 
+}  // namespace
+
+CodeGen_GPU_Dev *new_CodeGen_OpenCL_Dev(const Target &target) {
+    return new CodeGen_OpenCL_Dev(target);
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_OpenCL_Dev.h b/src/CodeGen_OpenCL_Dev.h
index dd20efec1e88..5743e20343ed 100644
--- a/src/CodeGen_OpenCL_Dev.h
+++ b/src/CodeGen_OpenCL_Dev.h
@@ -5,95 +5,15 @@
  * Defines the code-generator for producing OpenCL C kernel code
  */
 
-#include <sstream>
-
-#include "CodeGen_C.h"
-#include "CodeGen_GPU_Dev.h"
-#include "Target.h"
-
 namespace Halide {
-namespace Internal {
-
-class CodeGen_OpenCL_Dev : public CodeGen_GPU_Dev {
-public:
-    CodeGen_OpenCL_Dev(Target target);
-
-    /** Compile a GPU kernel into the module. This may be called many times
-     * with different kernels, which will all be accumulated into a single
-     * source module shared by a given Halide pipeline. */
-    void add_kernel(Stmt stmt,
-                    const std::string &name,
-                    const std::vector<DeviceArgument> &args) override;
-
-    /** (Re)initialize the GPU kernel module. This is separate from compile,
-     * since a GPU device module will often have many kernels compiled into it
-     * for a single pipeline. */
-    void init_module() override;
-
-    std::vector<char> compile_to_src() override;
 
-    std::string get_current_kernel_name() override;
+struct Target;
 
-    void dump() override;
-
-    std::string print_gpu_name(const std::string &name) override;
-
-    std::string api_unique_name() override {
-        return "opencl";
-    }
-
-protected:
-    class CodeGen_OpenCL_C : public CodeGen_C {
-    public:
-        CodeGen_OpenCL_C(std::ostream &s, Target t)
-            : CodeGen_C(s, t) {
-            integer_suffix_style = IntegerSuffixStyle::OpenCL;
-        }
-        void add_kernel(Stmt stmt,
-                        const std::string &name,
-                        const std::vector<DeviceArgument> &args);
-
-    protected:
-        using CodeGen_C::visit;
-        std::string print_type(Type type, AppendSpaceIfNeeded append_space = DoNotAppendSpace) override;
-        std::string print_reinterpret(Type type, const Expr &e) override;
-        std::string print_extern_call(const Call *op) override;
-        std::string print_array_access(const std::string &name,
-                                       const Type &type,
-                                       const std::string &id_index);
-        void add_vector_typedefs(const std::set<Type> &vector_types) override;
-
-        std::string get_memory_space(const std::string &);
-
-        std::string shared_name;
+namespace Internal {
 
-        void visit(const For *) override;
-        void visit(const Ramp *op) override;
-        void visit(const Broadcast *op) override;
-        void visit(const Call *op) override;
-        void visit(const Load *op) override;
-        void visit(const Store *op) override;
-        void visit(const Cast *op) override;
-        void visit(const Select *op) override;
-        void visit(const EQ *) override;
-        void visit(const NE *) override;
-        void visit(const LT *) override;
-        void visit(const LE *) override;
-        void visit(const GT *) override;
-        void visit(const GE *) override;
-        void visit(const Allocate *op) override;
-        void visit(const Free *op) override;
-        void visit(const AssertStmt *op) override;
-        void visit(const Shuffle *op) override;
-        void visit(const Min *op) override;
-        void visit(const Max *op) override;
-        void visit(const Atomic *op) override;
-    };
+struct CodeGen_GPU_Dev;
 
-    std::ostringstream src_stream;
-    std::string cur_kernel_name;
-    CodeGen_OpenCL_C clc;
-};
+CodeGen_GPU_Dev *new_CodeGen_OpenCL_Dev(const Target &target);
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_OpenGLCompute_Dev.cpp b/src/CodeGen_OpenGLCompute_Dev.cpp
index 8bafed22cbd3..7e5033668f88 100644
--- a/src/CodeGen_OpenGLCompute_Dev.cpp
+++ b/src/CodeGen_OpenGLCompute_Dev.cpp
@@ -1,3 +1,5 @@
+#include "CodeGen_GPU_Dev.h"
+#include "CodeGen_OpenGL_Dev.h"
 #include "CodeGen_OpenGLCompute_Dev.h"
 #include "Debug.h"
 #include "Deinterleave.h"
@@ -17,6 +19,67 @@ using std::ostringstream;
 using std::string;
 using std::vector;
 
+namespace {
+
+class CodeGen_OpenGLCompute_Dev : public CodeGen_GPU_Dev {
+public:
+    CodeGen_OpenGLCompute_Dev(Target target);
+
+    // CodeGen_GPU_Dev interface
+    void add_kernel(Stmt stmt,
+                    const std::string &name,
+                    const std::vector<DeviceArgument> &args) override;
+
+    void init_module() override;
+
+    std::vector<char> compile_to_src() override;
+
+    std::string get_current_kernel_name() override;
+
+    void dump() override;
+
+    std::string print_gpu_name(const std::string &name) override;
+
+    std::string api_unique_name() override {
+        return "openglcompute";
+    }
+    bool kernel_run_takes_types() const override {
+        return true;
+    }
+
+protected:
+    class CodeGen_OpenGLCompute_C : public CodeGen_GLSLBase {
+    public:
+        CodeGen_OpenGLCompute_C(std::ostream &s, Target t);
+        void add_kernel(const Stmt &stmt,
+                        const std::string &name,
+                        const std::vector<DeviceArgument> &args);
+
+    protected:
+        std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override;
+
+        using CodeGen_GLSLBase::visit;
+        void visit(const For *) override;
+        void visit(const Ramp *op) override;
+        void visit(const Broadcast *op) override;
+        void visit(const Load *op) override;
+        void visit(const Store *op) override;
+        void visit(const Call *op) override;
+        void visit(const Allocate *op) override;
+        void visit(const Free *op) override;
+        void visit(const Select *op) override;
+        void visit(const Evaluate *op) override;
+        void visit(const IntImm *op) override;
+
+    public:
+        int workgroup_size[3];
+    };
+
+    std::ostringstream src_stream;
+    std::string cur_kernel_name;
+    CodeGen_OpenGLCompute_C glc;
+};
+
 CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_Dev(Target target)
     : glc(src_stream, target) {
 }
@@ -392,5 +455,11 @@ std::string CodeGen_OpenGLCompute_Dev::print_gpu_name(const std::string &name) {
     return name;
 }
 
+}  // namespace
+
+CodeGen_GPU_Dev *new_CodeGen_OpenGLCompute_Dev(const Target &target) {
+    return new CodeGen_OpenGLCompute_Dev(target);
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_OpenGLCompute_Dev.h b/src/CodeGen_OpenGLCompute_Dev.h
index 141e9e1ae1e9..e91e3a4134b1 100644
--- a/src/CodeGen_OpenGLCompute_Dev.h
+++ b/src/CodeGen_OpenGLCompute_Dev.h
@@ -5,75 +5,15 @@
  * Defines the code-generator for producing GLSL kernel code for OpenGL Compute.
  */
 
-#include <map>
-#include <sstream>
-
-#include "CodeGen_C.h"
-#include "CodeGen_GPU_Dev.h"
-#include "CodeGen_OpenGL_Dev.h"
-#include "Target.h"
-
 namespace Halide {
-namespace Internal {
-
-class CodeGen_OpenGLCompute_Dev : public CodeGen_GPU_Dev {
-public:
-    CodeGen_OpenGLCompute_Dev(Target target);
-
-    // CodeGen_GPU_Dev interface
-    void add_kernel(Stmt stmt,
-                    const std::string &name,
-                    const std::vector<DeviceArgument> &args) override;
-
-    void init_module() override;
-
-    std::vector<char> compile_to_src() override;
 
-    std::string get_current_kernel_name() override;
+struct Target;
 
-    void dump() override;
-
-    std::string print_gpu_name(const std::string &name) override;
-
-    std::string api_unique_name() override {
-        return "openglcompute";
-    }
-    bool kernel_run_takes_types() const override {
-        return true;
-    }
-
-protected:
-    class CodeGen_OpenGLCompute_C : public CodeGen_GLSLBase {
-    public:
-        CodeGen_OpenGLCompute_C(std::ostream &s, Target t);
-        void add_kernel(const Stmt &stmt,
-                        const std::string &name,
-                        const std::vector<DeviceArgument> &args);
-
-    protected:
-        std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override;
-
-        using CodeGen_GLSLBase::visit;
-        void visit(const For *) override;
-        void visit(const Ramp *op) override;
-        void visit(const Broadcast *op) override;
-        void visit(const Load *op) override;
-        void visit(const Store *op) override;
-        void visit(const Call *op) override;
-        void visit(const Allocate *op) override;
-        void visit(const Free *op) override;
-        void visit(const Select *op) override;
-        void visit(const Evaluate *op) override;
-        void visit(const IntImm *op) override;
+namespace Internal {
 
-    public:
-        int workgroup_size[3];
-    };
+struct CodeGen_GPU_Dev;
 
-    std::ostringstream src_stream;
-    std::string cur_kernel_name;
-    CodeGen_OpenGLCompute_C glc;
-};
+CodeGen_GPU_Dev *new_CodeGen_OpenGLCompute_Dev(const Target &target);
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index 7c3dd2405b25..d0032581008b 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -1,3 +1,5 @@
+#include "CodeGen_GPU_Dev.h"
+#include "CodeGen_LLVM.h"
 #include "CodeGen_PTX_Dev.h"
 #include "CSE.h"
 #include "CodeGen_Internal.h"
@@ -32,6 +34,78 @@ using std::vector;
 
 using namespace llvm;
 
+namespace {
+
+/** A code generator that emits GPU code from a given Halide stmt. */
+class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
+public:
+    /** Create a PTX device code generator. */
+    CodeGen_PTX_Dev(Target host);
+    ~CodeGen_PTX_Dev() override;
+
+    void add_kernel(Stmt stmt,
+                    const std::string &name,
+                    const std::vector<DeviceArgument> &args) override;
+
+    static void test();
+
+    std::vector<char> compile_to_src() override;
+    std::string get_current_kernel_name() override;
+
+    void dump() override;
+
+    std::string print_gpu_name(const std::string &name) override;
+
+    std::string api_unique_name() override {
+        return "cuda";
+    }
+
+protected:
+    using CodeGen_LLVM::visit;
+
+    /** (Re)initialize the PTX module. This is separate from compile, since
+     * a PTX device module will often have many kernels compiled into it for
+     * a single pipeline. */
+    /* override */ void init_module() override;
+
+    /** We hold onto the basic block at the start of the device
+     * function in order to inject allocas */
+    llvm::BasicBlock *entry_block;
+
+    /** Nodes for which we need to override default behavior for the GPU runtime */
+    // @{
+    void visit(const Call *) override;
+    void visit(const For *) override;
+    void visit(const Allocate *) override;
+    void visit(const Free *) override;
+    void visit(const AssertStmt *) override;
+    void visit(const Load *) override;
+    void visit(const Store *) override;
+    void visit(const Atomic *) override;
+    void codegen_vector_reduce(const VectorReduce *op, const Expr &init) override;
+    // @}
+
+    std::string march() const;
+    std::string mcpu() const override;
+    std::string mattrs() const override;
+    bool use_soft_float_abi() const override;
+    int native_vector_bits() const override;
+    bool promote_indices() const override {
+        return false;
+    }
+
+    Type upgrade_type_for_arithmetic(const Type &t) const override {
+        return t;
+    }
+    Type upgrade_type_for_storage(const Type &t) const override;
+
+    /** Map from simt variable names (e.g. foo.__block_id_x) to the llvm
+     * ptx intrinsic functions to call to get them. */
+    std::string simt_intrinsic(const std::string &name);
+
+    bool supports_atomic_add(const Type &t) const override;
+};
+
 CodeGen_PTX_Dev::CodeGen_PTX_Dev(Target host)
     : CodeGen_LLVM(host) {
 #if !defined(WITH_NVPTX)
@@ -723,5 +797,11 @@ bool CodeGen_PTX_Dev::supports_atomic_add(const Type &t) const {
     return false;
 }
 
+}  // namespace
+
+CodeGen_GPU_Dev *new_CodeGen_PTX_Dev(const Target &target) {
+    return new CodeGen_PTX_Dev(target);
+}
+
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/CodeGen_PTX_Dev.h b/src/CodeGen_PTX_Dev.h
index 7f7c80669f47..eabf4d9cc2c4 100644
--- a/src/CodeGen_PTX_Dev.h
+++ b/src/CodeGen_PTX_Dev.h
@@ -5,89 +5,15 @@
  * Defines the code-generator for producing CUDA host code
  */
 
-#include "CodeGen_GPU_Dev.h"
-#include "CodeGen_GPU_Host.h"
-#include "CodeGen_LLVM.h"
-
-namespace llvm {
-class BasicBlock;
-}
-
 namespace Halide {
-namespace Internal {
-
-/** A code generator that emits GPU code from a given Halide stmt. */
-class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev {
-public:
-    friend class CodeGen_GPU_Host<CodeGen_X86>;
-    friend class CodeGen_GPU_Host<CodeGen_ARM>;
-
-    /** Create a PTX device code generator. */
-    CodeGen_PTX_Dev(Target host);
-    ~CodeGen_PTX_Dev() override;
-
-    void add_kernel(Stmt stmt,
-                    const std::string &name,
-                    const std::vector<DeviceArgument> &args) override;
-
-    static void test();
-
-    std::vector<char> compile_to_src() override;
-    std::string get_current_kernel_name() override;
 
-    void dump() override;
+struct Target;
 
-    std::string print_gpu_name(const std::string &name) override;
-
-    std::string api_unique_name() override {
-        return "cuda";
-    }
-
-protected:
-    using CodeGen_LLVM::visit;
-
-    /** (Re)initialize the PTX module. This is separate from compile, since
-     * a PTX device module will often have many kernels compiled into it for
-     * a single pipeline. */
-    /* override */ void init_module() override;
-
-    /** We hold onto the basic block at the start of the device
-     * function in order to inject allocas */
-    llvm::BasicBlock *entry_block;
-
-    /** Nodes for which we need to override default behavior for the GPU runtime */
-    // @{
-    void visit(const Call *) override;
-    void visit(const For *) override;
-    void visit(const Allocate *) override;
-    void visit(const Free *) override;
-    void visit(const AssertStmt *) override;
-    void visit(const Load *) override;
-    void visit(const Store *) override;
-    void visit(const Atomic *) override;
-    void codegen_vector_reduce(const VectorReduce *op, const Expr &init) override;
-    // @}
-
-    std::string march() const;
-    std::string mcpu() const override;
-    std::string mattrs() const override;
-    bool use_soft_float_abi() const override;
-    int native_vector_bits() const override;
-    bool promote_indices() const override {
-        return false;
-    }
-
-    Type upgrade_type_for_arithmetic(const Type &t) const override {
-        return t;
-    }
-    Type upgrade_type_for_storage(const Type &t) const override;
+namespace Internal {
 
-    /** Map from simt variable names (e.g. foo.__block_id_x) to the llvm
-     * ptx intrinsic functions to call to get them. */
-    std::string simt_intrinsic(const std::string &name);
+struct CodeGen_GPU_Dev;
 
-    bool supports_atomic_add(const Type &t) const override;
-};
+CodeGen_GPU_Dev *new_CodeGen_PTX_Dev(const Target &target);
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/DebugToFile.cpp b/src/DebugToFile.cpp
index 1a8ff962a4fb..bbc04ea3b11b 100644
--- a/src/DebugToFile.cpp
+++ b/src/DebugToFile.cpp
@@ -15,6 +15,8 @@ using std::ostringstream;
 using std::string;
 using std::vector;
 
+namespace {
+
 class DebugToFile : public IRMutator {
     const map<string, Function> &env;
 
@@ -146,6 +148,8 @@ class AddDummyRealizations : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt debug_to_file(Stmt s, const vector<Function> &outputs, const map<string, Function> &env) {
     // Temporarily wrap the produce nodes for the output functions in
     // realize nodes so that we know when to write the debug outputs.
diff --git a/src/DerivativeUtils.cpp b/src/DerivativeUtils.cpp
index b5113e6062d0..c9028679a1af 100644
--- a/src/DerivativeUtils.cpp
+++ b/src/DerivativeUtils.cpp
@@ -28,6 +28,8 @@ using std::set;
 using std::string;
 using std::vector;
 
+namespace {
+
 class StripLets : public IRGraphMutator {
 public:
     using IRGraphMutator::visit;
@@ -36,6 +38,8 @@ class StripLets : public IRGraphMutator {
     }
 };
 
+}  // namespace
+
 vector<int> gather_variables(const Expr &expr,
                              const vector<string> &filter) {
 
@@ -130,6 +134,8 @@ Expr add_let_expression(const Expr &expr,
     return ret;
 }
 
+namespace {
+
 /** Gather the expression DAG and sort them in topological order
  */
 class ExpressionSorter : public IRGraphVisitor {
@@ -198,6 +204,8 @@ void ExpressionSorter::include(const Expr &e) {
     }
 }
 
+}  // namespace
+
 vector<Expr> sort_expressions(const Expr &expr) {
     ExpressionSorter sorter;
     return sorter.sort(expr);
@@ -330,6 +338,8 @@ vector<string> vars_to_strings(const vector<Var> &vars) {
     return ret;
 }
 
+namespace {
+
 class RDomExtractor : public IRGraphVisitor {
 public:
     using IRGraphVisitor::visit;
@@ -348,6 +358,8 @@ class RDomExtractor : public IRGraphVisitor {
     ReductionDomain rdom;
 };
 
+}  // namespace
+
 ReductionDomain extract_rdom(const Expr &expr) {
     RDomExtractor extractor;
     return extractor.gather(expr);
@@ -383,6 +395,8 @@ pair<bool, Expr> solve_inverse(Expr expr,
     return {true, rmin + r.x};
 }
 
+namespace {
+
 struct BufferDimensionsFinder : public IRGraphVisitor {
 public:
     using IRGraphVisitor::visit;
@@ -429,11 +443,15 @@ struct BufferDimensionsFinder : public IRGraphVisitor {
     map<string, BufferInfo> buffer_calls;
 };
 
+}  // namespace
+
 map<string, BufferInfo> find_buffer_param_calls(const Func &func) {
     BufferDimensionsFinder finder;
     return finder.find(func);
 }
 
+namespace {
+
 struct ImplicitVariablesFinder : public IRGraphVisitor {
 public:
     using IRGraphVisitor::visit;
@@ -453,6 +471,8 @@ struct ImplicitVariablesFinder : public IRGraphVisitor {
     set<string> implicit_variables;
 };
 
+}  // namespace
+
 set<string> find_implicit_variables(const Expr &expr) {
     ImplicitVariablesFinder finder;
     return finder.find(expr);
@@ -477,6 +497,8 @@ Expr substitute_rdom_predicate(
     return substituted;
 }
 
+namespace {
+
 struct FunctionCallFinder : public IRGraphVisitor {
 public:
     using IRGraphVisitor::visit;
@@ -524,6 +546,8 @@ struct FunctionCallFinder : public IRGraphVisitor {
     bool found;
 };
 
+}  // namespace
+
 bool is_calling_function(
     const string &func_name, const Expr &expr,
     const map<string, Expr> &let_var_mapping) {
@@ -538,6 +562,8 @@ bool is_calling_function(
     return finder.find(expr, let_var_mapping);
 }
 
+namespace {
+
 struct SubstituteCallArgWithPureArg : public IRMutator {
 public:
     SubstituteCallArgWithPureArg(Func f, int variable_id)
@@ -560,6 +586,8 @@ struct SubstituteCallArgWithPureArg : public IRMutator {
     int variable_id;
 };
 
+}  // namespace
+
 Expr substitute_call_arg_with_pure_arg(Func f, int variable_id, const Expr &e) {
     return simplify(SubstituteCallArgWithPureArg(std::move(f), variable_id).mutate(e));
 }
diff --git a/src/EliminateBoolVectors.cpp b/src/EliminateBoolVectors.cpp
index 16e6814e3423..f2f042f348da 100644
--- a/src/EliminateBoolVectors.cpp
+++ b/src/EliminateBoolVectors.cpp
@@ -6,6 +6,8 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 class EliminateBoolVectors : public IRMutator {
 private:
     using IRMutator::visit;
@@ -314,6 +316,8 @@ class EliminateBoolVectors : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt eliminate_bool_vectors(const Stmt &s) {
     return EliminateBoolVectors().mutate(s);
 }
diff --git a/src/FastIntegerDivide.cpp b/src/FastIntegerDivide.cpp
index 09f581d19c8c..e5fbdf9bd2c6 100644
--- a/src/FastIntegerDivide.cpp
+++ b/src/FastIntegerDivide.cpp
@@ -8,7 +8,7 @@ namespace Halide {
 
 using namespace Halide::Internal::IntegerDivision;
 
-namespace IntegerDivideTable {
+namespace {
 
 Buffer<uint8_t> integer_divide_table_u8() {
     static std::mutex initialize_lock;
@@ -111,7 +111,7 @@ Buffer<uint32_t> integer_divide_table_s32() {
         return im;
     }
 }
-}  // namespace IntegerDivideTable
+}  // namespace
 
 Expr fast_integer_divide(Expr numerator, Expr denominator) {
     if (is_const(denominator)) {
@@ -133,20 +133,20 @@ Expr fast_integer_divide(Expr numerator, Expr denominator) {
         Expr mul, shift;
         switch (t.bits()) {
         case 8: {
-            Buffer<uint8_t> table = IntegerDivideTable::integer_divide_table_u8();
+            Buffer<uint8_t> table = integer_divide_table_u8();
             mul = table(denominator, 0);
             shift = table(denominator, 1);
             break;
         }
         case 16: {
-            Buffer<uint16_t> table = IntegerDivideTable::integer_divide_table_u16();
+            Buffer<uint16_t> table = integer_divide_table_u16();
             mul = table(denominator, 0);
             shift = table(denominator, 1);
             break;
         }
         default:  // 32
         {
-            Buffer<uint32_t> table = IntegerDivideTable::integer_divide_table_u32();
+            Buffer<uint32_t> table = integer_divide_table_u32();
             mul = table(denominator, 0);
             shift = table(denominator, 1);
             break;
@@ -175,20 +175,20 @@ Expr fast_integer_divide(Expr numerator, Expr denominator) {
         Expr mul, shift;
         switch (t.bits()) {
         case 8: {
-            Buffer<uint8_t> table = IntegerDivideTable::integer_divide_table_s8();
+            Buffer<uint8_t> table = integer_divide_table_s8();
             mul = table(denominator, 0);
             shift = table(denominator, 1);
             break;
         }
         case 16: {
-            Buffer<uint16_t> table = IntegerDivideTable::integer_divide_table_s16();
+            Buffer<uint16_t> table = integer_divide_table_s16();
             mul = table(denominator, 0);
             shift = table(denominator, 1);
             break;
         }
         default:  // 32
         {
-            Buffer<uint32_t> table = IntegerDivideTable::integer_divide_table_s32();
+            Buffer<uint32_t> table = integer_divide_table_s32();
             mul = table(denominator, 0);
             shift = table(denominator, 1);
             break;
diff --git a/src/FastIntegerDivide.h b/src/FastIntegerDivide.h
index 9ed10c0105ad..7a802ababa75 100644
--- a/src/FastIntegerDivide.h
+++ b/src/FastIntegerDivide.h
@@ -6,19 +6,6 @@
 
 namespace Halide {
 
-/** Built-in images used for fast_integer_divide below. Use of
- * fast_integer_divide will automatically embed the appropriate tables
- * in your object file. They are declared here in case you want to do
- * something non-default with them. */
-namespace IntegerDivideTable {
-Buffer<uint8_t> integer_divide_table_u8();
-Buffer<uint8_t> integer_divide_table_s8();
-Buffer<uint16_t> integer_divide_table_u16();
-Buffer<uint16_t> integer_divide_table_s16();
-Buffer<uint32_t> integer_divide_table_u32();
-Buffer<uint32_t> integer_divide_table_s32();
-}  // namespace IntegerDivideTable
-
 /** Integer division by small values can be done exactly as multiplies
  * and shifts. This function does integer division for numerators of
  * various integer types (8, 16, 32 bit signed and unsigned)
diff --git a/src/Function.cpp b/src/Function.cpp
index 4b6a437a181d..74ab2723a55e 100644
--- a/src/Function.cpp
+++ b/src/Function.cpp
@@ -175,6 +175,8 @@ void destroy<FunctionGroup>(const FunctionGroup *f) {
     delete f;
 }
 
+namespace {
+
 // All variables present in any part of a function definition must
 // either be pure args, elements of the reduction domain, parameters
 // (i.e. attached to some Parameter object), or part of a let node
@@ -280,9 +282,9 @@ class FreezeFunctions : public IRGraphVisitor {
 };
 
 // A counter to use in tagging random variables
-namespace {
 std::atomic<int> rand_counter{0};
-}
+
+}  // namespace
 
 Function::Function(const FunctionPtr &ptr)
     : contents(ptr) {
diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp
index 7a678a92b5b4..9faf0d7a41df 100644
--- a/src/FuseGPUThreadLoops.cpp
+++ b/src/FuseGPUThreadLoops.cpp
@@ -28,9 +28,9 @@ using std::string;
 using std::vector;
 
 namespace {
+
 string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"};
 string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"};
-}  // namespace
 
 class ExtractBlockSize : public IRVisitor {
     Expr block_extent[4], block_count[4];
@@ -1543,11 +1543,15 @@ class ValidateGPULoopNesting : public IRVisitor {
     }
 };
 
+}  // namespace
+
 // Also used by InjectImageIntrinsics
 Stmt zero_gpu_loop_mins(const Stmt &s) {
     return ZeroGPULoopMins().mutate(s);
 }
 
+namespace {
+
 // Find the inner most GPU block of a statement.
 class FindInnermostGPUBlock : public IRVisitor {
     using IRVisitor::visit;
@@ -1615,6 +1619,8 @@ class NormalizeIfStatements : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt fuse_gpu_thread_loops(Stmt s) {
     ValidateGPULoopNesting validate;
     s.accept(&validate);
diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp
index ba7f59b1ab83..d990f7c3b2dc 100644
--- a/src/IRMatch.cpp
+++ b/src/IRMatch.cpp
@@ -50,6 +50,8 @@ void expr_match_test() {
     std::cout << "expr_match test passed" << std::endl;
 }
 
+namespace {
+
 class IRMatch : public IRVisitor {
 public:
     bool result;
@@ -294,6 +296,8 @@ class IRMatch : public IRVisitor {
     }
 };
 
+}  // namespace
+
 bool expr_match(const Expr &pattern, const Expr &expr, vector<Expr> &matches) {
     matches.clear();
     if (!pattern.defined() && !expr.defined()) {
diff --git a/src/InjectOpenGLIntrinsics.cpp b/src/InjectOpenGLIntrinsics.cpp
index 9319c211558a..1a96cb6bff35 100644
--- a/src/InjectOpenGLIntrinsics.cpp
+++ b/src/InjectOpenGLIntrinsics.cpp
@@ -12,6 +12,8 @@ namespace Internal {
 using std::string;
 using std::vector;
 
+namespace {
+
 /** Normalizes image loads/stores and produces glsl_texture_load/stores. */
 class InjectOpenGLIntrinsics : public IRMutator {
 public:
@@ -90,6 +92,8 @@ class InjectOpenGLIntrinsics : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt inject_opengl_intrinsics(const Stmt &s) {
     InjectOpenGLIntrinsics gl;
     return gl.mutate(s);
diff --git a/src/InlineReductions.cpp b/src/InlineReductions.cpp
index 515cca1d09d5..10714a4a1358 100644
--- a/src/InlineReductions.cpp
+++ b/src/InlineReductions.cpp
@@ -15,6 +15,7 @@ using std::string;
 using std::vector;
 
 namespace Internal {
+namespace {
 
 class FindFreeVars : public IRMutator {
 public:
@@ -103,6 +104,8 @@ class FindFreeVars : public IRMutator {
         return expr;
     }
 };
+
+}  // namespace
 }  // namespace Internal
 
 Expr sum(Expr e, const std::string &name) {
diff --git a/src/Introspection.cpp b/src/Introspection.cpp
index 5b34a2471cb9..18218b550496 100644
--- a/src/Introspection.cpp
+++ b/src/Introspection.cpp
@@ -51,8 +51,6 @@ inline T load_misaligned(const T *p) {
 
 typedef uint64_t llvm_offset_t;
 
-}  // namespace
-
 class DebugSections {
 
     bool calibrated;
@@ -2215,9 +2213,9 @@ class DebugSections {
     }
 };
 
-namespace {
 DebugSections *debug_sections = nullptr;
-}
+
+}  // namespace
 
 bool dump_stack_frame() {
     if (!debug_sections || !debug_sections->working) {
diff --git a/src/LICM.cpp b/src/LICM.cpp
index 20f4a73da4d9..64360fd64b9d 100644
--- a/src/LICM.cpp
+++ b/src/LICM.cpp
@@ -17,6 +17,8 @@ using std::set;
 using std::string;
 using std::vector;
 
+namespace {
+
 // Is it safe to lift an Expr out of a loop (and potentially across a device boundary)
 class CanLift : public IRVisitor {
     using IRVisitor::visit;
@@ -527,6 +529,8 @@ class GroupLoopInvariants : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt hoist_loop_invariant_values(Stmt s) {
     s = GroupLoopInvariants().mutate(s);
     s = common_subexpression_elimination(s);
diff --git a/src/LLVM_Output.cpp b/src/LLVM_Output.cpp
index 0adab9e01f41..88ddb8f7e87e 100644
--- a/src/LLVM_Output.cpp
+++ b/src/LLVM_Output.cpp
@@ -25,6 +25,8 @@ namespace Halide {
 namespace Internal {
 namespace Archive {
 
+namespace {
+
 // This is a bare-bones Windows .lib file writer, based on inspection
 // of the LLVM ArchiveWriter class and the documentation at
 // https://www.microsoft.com/msj/0498/hood0498.aspx and
@@ -300,6 +302,8 @@ void write_coff_archive(std::ostream &out,
     }
 }
 
+}  // namespace
+
 }  // namespace Archive
 }  // namespace Internal
 
diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp
index 048d2495c757..d6216fb7b67a 100644
--- a/src/LLVM_Runtime_Linker.cpp
+++ b/src/LLVM_Runtime_Linker.cpp
@@ -25,8 +25,6 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
     return result;
 }
 
-}  // namespace
-
 #define DECLARE_INITMOD(mod)                                                              \
     extern "C" unsigned char halide_internal_initmod_##mod[];                             \
     extern "C" int halide_internal_initmod_##mod##_length;                                \
@@ -259,8 +257,6 @@ DECLARE_CPP_INITMOD(riscv_cpu_features)
 DECLARE_NO_INITMOD(riscv_cpu_features)
 #endif  // WITH_RISCV
 
-namespace {
-
 llvm::DataLayout get_data_layout_for_target(Target target) {
     if (target.arch == Target::X86) {
         if (target.bits == 32) {
diff --git a/src/Memoization.cpp b/src/Memoization.cpp
index 164452684bfb..64e41978cb2f 100644
--- a/src/Memoization.cpp
+++ b/src/Memoization.cpp
@@ -311,8 +311,6 @@ class KeyInfo {
     }
 };
 
-}  // namespace
-
 // Inject caching structure around memoized realizations.
 class InjectMemoization : public IRMutator {
 public:
@@ -436,6 +434,8 @@ class InjectMemoization : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt inject_memoization(const Stmt &s, const std::map<std::string, Function> &env,
                         const std::string &name,
                         const std::vector<Function> &outputs) {
@@ -450,6 +450,8 @@ Stmt inject_memoization(const Stmt &s, const std::map<std::string, Function> &en
     return injector.mutate(s);
 }
 
+namespace {
+
 class RewriteMemoizedAllocations : public IRMutator {
 public:
     RewriteMemoizedAllocations(const std::map<std::string, Function> &e)
@@ -538,6 +540,8 @@ class RewriteMemoizedAllocations : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt rewrite_memoized_allocations(const Stmt &s, const std::map<std::string, Function> &env) {
 
     RewriteMemoizedAllocations rewriter(env);
diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp
index 926cc37c2cd3..1e7d49aa3e04 100644
--- a/src/ModulusRemainder.cpp
+++ b/src/ModulusRemainder.cpp
@@ -9,6 +9,7 @@ namespace Halide {
 namespace Internal {
 
 namespace {
+
 // A version of mod where a % 0 == a
 int64_t mod(int64_t a, int64_t b) {
     if (b == 0) {
@@ -17,7 +18,6 @@ int64_t mod(int64_t a, int64_t b) {
         return mod_imp(a, b);
     }
 }
-}  // namespace
 
 class ComputeModulusRemainder : public IRVisitor {
 public:
@@ -78,6 +78,207 @@ class ComputeModulusRemainder : public IRVisitor {
     void visit(const Atomic *) override;
 };
 
+void ComputeModulusRemainder::visit(const IntImm *op) {
+    // Equal to op->value modulo anything. We'll use zero as the
+    // modulus to mark this special case. We'd better be able to
+    // handle zero in the rest of the code...
+    result = {0, op->value};
+}
+
+void ComputeModulusRemainder::visit(const UIntImm *op) {
+    internal_error << "modulus_remainder of uint\n";
+}
+
+void ComputeModulusRemainder::visit(const FloatImm *) {
+    internal_error << "modulus_remainder of float\n";
+}
+
+void ComputeModulusRemainder::visit(const StringImm *) {
+    internal_error << "modulus_remainder of string\n";
+}
+
+void ComputeModulusRemainder::visit(const Cast *) {
+    // TODO: Could probably do something reasonable for integer
+    // upcasts and downcasts where the modulus is a power of two.
+    result = ModulusRemainder{};
+}
+
+void ComputeModulusRemainder::visit(const Variable *op) {
+    if (scope.contains(op->name)) {
+        result = scope.get(op->name);
+    } else {
+        result = ModulusRemainder{};
+    }
+}
+
+void ComputeModulusRemainder::visit(const Add *op) {
+    result = analyze(op->a) + analyze(op->b);
+}
+
+void ComputeModulusRemainder::visit(const Sub *op) {
+    result = analyze(op->a) - analyze(op->b);
+}
+
+void ComputeModulusRemainder::visit(const Mul *op) {
+    result = analyze(op->a) * analyze(op->b);
+}
+
+void ComputeModulusRemainder::visit(const Div *op) {
+    result = analyze(op->a) / analyze(op->b);
+}
+
+void ComputeModulusRemainder::visit(const Min *op) {
+    result = ModulusRemainder::unify(analyze(op->a), analyze(op->b));
+}
+
+void ComputeModulusRemainder::visit(const Max *op) {
+    result = ModulusRemainder::unify(analyze(op->a), analyze(op->b));
+}
+
+void ComputeModulusRemainder::visit(const EQ *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const NE *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const LT *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const LE *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const GT *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const GE *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const And *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const Or *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const Not *) {
+    internal_error << "modulus_remainder of bool\n";
+}
+
+void ComputeModulusRemainder::visit(const Select *op) {
+    result = ModulusRemainder::unify(analyze(op->true_value),
+                                     analyze(op->false_value));
+}
+
+void ComputeModulusRemainder::visit(const Load *) {
+    result = ModulusRemainder{};
+}
+
+void ComputeModulusRemainder::visit(const Ramp *) {
+    internal_error << "modulus_remainder of vector\n";
+}
+
+void ComputeModulusRemainder::visit(const Broadcast *) {
+    internal_error << "modulus_remainder of vector\n";
+}
+
+void ComputeModulusRemainder::visit(const Call *) {
+    result = ModulusRemainder{};
+}
+
+void ComputeModulusRemainder::visit(const Let *op) {
+    if (op->value.type().is_int()) {
+        ScopedBinding<ModulusRemainder> bind(scope, op->name, analyze(op->value));
+        result = analyze(op->body);
+    } else {
+        result = analyze(op->body);
+    }
+}
+
+void ComputeModulusRemainder::visit(const Shuffle *op) {
+    // It's possible that scalar expressions are extracting a lane of
+    // a vector - don't fail in this case, but stop
+    internal_assert(op->indices.size() == 1) << "modulus_remainder of vector\n";
+    result = ModulusRemainder{};
+}
+
+void ComputeModulusRemainder::visit(const VectorReduce *op) {
+    internal_assert(op->type.is_scalar()) << "modulus_remainder of vector\n";
+    result = ModulusRemainder{};
+}
+
+void ComputeModulusRemainder::visit(const LetStmt *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const AssertStmt *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const ProducerConsumer *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const For *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Acquire *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Store *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Provide *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Allocate *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Realize *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Block *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Fork *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Free *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const IfThenElse *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Evaluate *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Prefetch *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+void ComputeModulusRemainder::visit(const Atomic *) {
+    internal_error << "modulus_remainder of statement\n";
+}
+
+}  // namespace
+
 ModulusRemainder modulus_remainder(const Expr &e) {
     ComputeModulusRemainder mr(nullptr);
     return mr.analyze(e);
@@ -153,39 +354,6 @@ void modulus_remainder_test() {
     std::cout << "modulus_remainder test passed\n";
 }
 
-void ComputeModulusRemainder::visit(const IntImm *op) {
-    // Equal to op->value modulo anything. We'll use zero as the
-    // modulus to mark this special case. We'd better be able to
-    // handle zero in the rest of the code...
-    result = {0, op->value};
-}
-
-void ComputeModulusRemainder::visit(const UIntImm *op) {
-    internal_error << "modulus_remainder of uint\n";
-}
-
-void ComputeModulusRemainder::visit(const FloatImm *) {
-    internal_error << "modulus_remainder of float\n";
-}
-
-void ComputeModulusRemainder::visit(const StringImm *) {
-    internal_error << "modulus_remainder of string\n";
-}
-
-void ComputeModulusRemainder::visit(const Cast *) {
-    // TODO: Could probably do something reasonable for integer
-    // upcasts and downcasts where the modulus is a power of two.
-    result = ModulusRemainder{};
-}
-
-void ComputeModulusRemainder::visit(const Variable *op) {
-    if (scope.contains(op->name)) {
-        result = scope.get(op->name);
-    } else {
-        result = ModulusRemainder{};
-    }
-}
-
 int64_t gcd(int64_t a, int64_t b) {
     if (a < b) {
         std::swap(a, b);
@@ -210,10 +378,6 @@ int64_t lcm(int64_t a, int64_t b) {
     }
 }
 
-void ComputeModulusRemainder::visit(const Add *op) {
-    result = analyze(op->a) + analyze(op->b);
-}
-
 ModulusRemainder operator+(const ModulusRemainder &a, const ModulusRemainder &b) {
     if (add_would_overflow(64, a.remainder, b.remainder)) {
         return {1, 0};
@@ -224,10 +388,6 @@ ModulusRemainder operator+(const ModulusRemainder &a, const ModulusRemainder &b)
     }
 }
 
-void ComputeModulusRemainder::visit(const Sub *op) {
-    result = analyze(op->a) - analyze(op->b);
-}
-
 ModulusRemainder operator-(const ModulusRemainder &a, const ModulusRemainder &b) {
     if (sub_would_overflow(64, a.remainder, b.remainder)) {
         return {1, 0};
@@ -238,10 +398,6 @@ ModulusRemainder operator-(const ModulusRemainder &a, const ModulusRemainder &b)
     }
 }
 
-void ComputeModulusRemainder::visit(const Mul *op) {
-    result = analyze(op->a) * analyze(op->b);
-}
-
 ModulusRemainder operator*(const ModulusRemainder &a, const ModulusRemainder &b) {
     if (a.modulus == 0) {
         // a is constant
@@ -280,10 +436,6 @@ ModulusRemainder operator*(const ModulusRemainder &a, const ModulusRemainder &b)
     return ModulusRemainder{};
 }
 
-void ComputeModulusRemainder::visit(const Div *op) {
-    result = analyze(op->a) / analyze(op->b);
-}
-
 ModulusRemainder operator/(const ModulusRemainder &a, const ModulusRemainder &b) {
     // What can we say about:
     // floor((m1 * x + r1) / (m2 * y + r2))
@@ -423,155 +575,5 @@ ModulusRemainder operator%(const ModulusRemainder &a, int64_t b) {
     return a % ModulusRemainder(0, b);
 }
 
-void ComputeModulusRemainder::visit(const Min *op) {
-    result = ModulusRemainder::unify(analyze(op->a), analyze(op->b));
-}
-
-void ComputeModulusRemainder::visit(const Max *op) {
-    result = ModulusRemainder::unify(analyze(op->a), analyze(op->b));
-}
-
-void ComputeModulusRemainder::visit(const EQ *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const NE *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const LT *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const LE *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const GT *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const GE *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const And *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const Or *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const Not *) {
-    internal_error << "modulus_remainder of bool\n";
-}
-
-void ComputeModulusRemainder::visit(const Select *op) {
-    result = ModulusRemainder::unify(analyze(op->true_value),
-                                     analyze(op->false_value));
-}
-
-void ComputeModulusRemainder::visit(const Load *) {
-    result = ModulusRemainder{};
-}
-
-void ComputeModulusRemainder::visit(const Ramp *) {
-    internal_error << "modulus_remainder of vector\n";
-}
-
-void ComputeModulusRemainder::visit(const Broadcast *) {
-    internal_error << "modulus_remainder of vector\n";
-}
-
-void ComputeModulusRemainder::visit(const Call *) {
-    result = ModulusRemainder{};
-}
-
-void ComputeModulusRemainder::visit(const Let *op) {
-    if (op->value.type().is_int()) {
-        ScopedBinding<ModulusRemainder> bind(scope, op->name, analyze(op->value));
-        result = analyze(op->body);
-    } else {
-        result = analyze(op->body);
-    }
-}
-
-void ComputeModulusRemainder::visit(const Shuffle *op) {
-    // It's possible that scalar expressions are extracting a lane of
-    // a vector - don't fail in this case, but stop
-    internal_assert(op->indices.size() == 1) << "modulus_remainder of vector\n";
-    result = ModulusRemainder{};
-}
-
-void ComputeModulusRemainder::visit(const VectorReduce *op) {
-    internal_assert(op->type.is_scalar()) << "modulus_remainder of vector\n";
-    result = ModulusRemainder{};
-}
-
-void ComputeModulusRemainder::visit(const LetStmt *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const AssertStmt *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const ProducerConsumer *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const For *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Acquire *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Store *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Provide *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Allocate *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Realize *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Block *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Fork *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Free *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const IfThenElse *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Evaluate *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Prefetch *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
-void ComputeModulusRemainder::visit(const Atomic *) {
-    internal_error << "modulus_remainder of statement\n";
-}
-
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp
index 86617c42565d..66277e4e8354 100644
--- a/src/Monotonic.cpp
+++ b/src/Monotonic.cpp
@@ -28,6 +28,8 @@ std::ostream &operator<<(std::ostream &stream, const Monotonic &m) {
 
 using std::string;
 
+namespace {
+
 class MonotonicVisitor : public IRVisitor {
     const string &var;
 
@@ -462,6 +464,8 @@ class MonotonicVisitor : public IRVisitor {
     }
 };
 
+}  // namespace
+
 Monotonic is_monotonic(const Expr &e, const std::string &var, const Scope<Monotonic> &scope) {
     if (!e.defined()) {
         return Monotonic::Unknown;
diff --git a/src/PrintLoopNest.cpp b/src/PrintLoopNest.cpp
index 9ffaeacd9f13..19b0bf9ba926 100644
--- a/src/PrintLoopNest.cpp
+++ b/src/PrintLoopNest.cpp
@@ -27,6 +27,8 @@ using std::map;
 using std::string;
 using std::vector;
 
+namespace {
+
 class PrintLoopNest : public IRVisitor {
 public:
     PrintLoopNest(std::ostream &output, const map<string, Function> &e)
@@ -158,6 +160,8 @@ class PrintLoopNest : public IRVisitor {
     }
 };
 
+}  // namespace
+
 string print_loop_nest(const vector<Function> &output_funcs) {
     // Do the first part of lowering:
 
diff --git a/src/Profiling.cpp b/src/Profiling.cpp
index 5644bc97b76b..7d1a47d5e66a 100644
--- a/src/Profiling.cpp
+++ b/src/Profiling.cpp
@@ -19,6 +19,8 @@ using std::map;
 using std::string;
 using std::vector;
 
+namespace {
+
 class InjectProfiling : public IRMutator {
 public:
     map<string, int> indices;  // maps from func name -> index in buffer.
@@ -292,6 +294,8 @@ class InjectProfiling : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt inject_profiling(Stmt s, const string &pipeline_name) {
     InjectProfiling profiling(pipeline_name);
     s = profiling.mutate(s);
diff --git a/src/PurifyIndexMath.cpp b/src/PurifyIndexMath.cpp
index 6d3f94c73886..1ea205a6ff6b 100644
--- a/src/PurifyIndexMath.cpp
+++ b/src/PurifyIndexMath.cpp
@@ -6,6 +6,8 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 class PurifyIndexMath : public IRMutator {
     using IRMutator::visit;
 
@@ -21,6 +23,8 @@ class PurifyIndexMath : public IRMutator {
     }
 };
 
+}  // namespace
+
 Expr purify_index_math(const Expr &s) {
     return PurifyIndexMath().mutate(s);
 }
diff --git a/src/PythonExtensionGen.cpp b/src/PythonExtensionGen.cpp
index acf07759e293..4faf3cbb44a7 100644
--- a/src/PythonExtensionGen.cpp
+++ b/src/PythonExtensionGen.cpp
@@ -13,7 +13,9 @@ using std::ostream;
 using std::ostringstream;
 using std::string;
 
-static string sanitize_name(const string &name) {
+namespace {
+
+string sanitize_name(const string &name) {
     ostringstream oss;
     for (size_t i = 0; i < name.size(); i++) {
         if (name[i] == '.' || name[i] == '_') {
@@ -27,7 +29,7 @@ static string sanitize_name(const string &name) {
     return oss.str();
 }
 
-static string remove_namespaces(const string &name) {
+string remove_namespaces(const string &name) {
     size_t i = name.find_last_of(':');
     if (i == string::npos) {
         return name;
@@ -36,7 +38,7 @@ static string remove_namespaces(const string &name) {
     }
 }
 
-static bool can_convert(const LoweredArgument *arg) {
+bool can_convert(const LoweredArgument *arg) {
     if (arg->type.is_handle()) {
         if (arg->name == "__user_context") {
             /* __user_context is a void* pointer to a user supplied memory region.
@@ -95,6 +97,8 @@ std::pair<string, string> print_type(const LoweredArgument *arg) {
     }
 }
 
+}  // namespace
+
 void PythonExtensionGen::convert_buffer(const string &name, const LoweredArgument *arg) {
     internal_assert(arg->is_buffer());
     internal_assert(arg->dimensions);
diff --git a/src/Qualify.cpp b/src/Qualify.cpp
index 9e32868bc1f9..470162909f94 100644
--- a/src/Qualify.cpp
+++ b/src/Qualify.cpp
@@ -6,6 +6,8 @@ namespace Internal {
 
 using std::string;
 
+namespace {
+
 // Prefix all names in an expression with some string.
 class QualifyExpr : public IRMutator {
     using IRMutator::visit;
@@ -31,6 +33,8 @@ class QualifyExpr : public IRMutator {
     }
 };
 
+}  // namespace
+
 Expr qualify(const string &prefix, const Expr &value) {
     QualifyExpr q(prefix);
     return q.mutate(value);
diff --git a/src/Random.cpp b/src/Random.cpp
index f9c0cea35baa..6f9e4bfa3e92 100644
--- a/src/Random.cpp
+++ b/src/Random.cpp
@@ -93,6 +93,8 @@ Expr random_float(const vector<Expr> &e) {
     return clamp(reinterpret(Float(32), result) - 1.0f, 0.0f, 1.0f);
 }
 
+namespace {
+
 class LowerRandom : public IRMutator {
     using IRMutator::visit;
 
@@ -130,6 +132,8 @@ class LowerRandom : public IRMutator {
     }
 };
 
+}  // namespace
+
 Expr lower_random(const Expr &e, const vector<VarOrRVar> &free_vars, int tag) {
     LowerRandom r(free_vars, tag);
     return r.mutate(e);
diff --git a/src/RemoveDeadAllocations.cpp b/src/RemoveDeadAllocations.cpp
index 090138f46337..6c998e036460 100644
--- a/src/RemoveDeadAllocations.cpp
+++ b/src/RemoveDeadAllocations.cpp
@@ -6,6 +6,8 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 class RemoveDeadAllocations : public IRMutator {
     using IRMutator::visit;
 
@@ -83,6 +85,8 @@ class RemoveDeadAllocations : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt remove_dead_allocations(const Stmt &s) {
     return RemoveDeadAllocations().mutate(s);
 }
diff --git a/src/RemoveExternLoops.cpp b/src/RemoveExternLoops.cpp
index 84e932b0a2ec..9fb0e187b3eb 100644
--- a/src/RemoveExternLoops.cpp
+++ b/src/RemoveExternLoops.cpp
@@ -4,6 +4,8 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 class RemoveExternLoops : public IRMutator {
 private:
     using IRMutator::visit;
@@ -17,6 +19,8 @@ class RemoveExternLoops : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt remove_extern_loops(const Stmt &s) {
     return RemoveExternLoops().mutate(s);
 }
diff --git a/src/RemoveUndef.cpp b/src/RemoveUndef.cpp
index 85ec06ae09ba..48b8ba8320ce 100644
--- a/src/RemoveUndef.cpp
+++ b/src/RemoveUndef.cpp
@@ -10,6 +10,8 @@ namespace Internal {
 
 using std::vector;
 
+namespace {
+
 class RemoveUndef : public IRMutator {
 public:
     Expr predicate;
@@ -576,6 +578,8 @@ class RemoveUndef : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt remove_undef(Stmt s) {
     RemoveUndef r;
     s = r.mutate(s);
diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp
index f21044caa840..114516d9b1a8 100644
--- a/src/ScheduleFunctions.cpp
+++ b/src/ScheduleFunctions.cpp
@@ -31,6 +31,7 @@ using std::string;
 using std::vector;
 
 namespace {
+
 // A structure representing a containing LetStmt, IfThenElse, or For
 // loop. Used in build_provide_loop_nest below. Both If and IfInner represent
 // IfThenElse stmts, however, IfInner should not be reordered to outside of
@@ -57,8 +58,6 @@ bool var_name_match(const string &v1, const string &v2) {
             Internal::ends_with(v2, "." + v1));
 }
 
-}  // anonymous namespace
-
 class ContainsImpureCall : public IRVisitor {
     using IRVisitor::visit;
 
@@ -2285,6 +2284,8 @@ bool group_should_be_inlined(const vector<Function> &funcs) {
             funcs[0].schedule().compute_level().is_inlined());
 }
 
+}  // namespace
+
 std::ostream &operator<<(std::ostream &out, const std::vector<Function> &v) {
     out << "{ ";
     for (size_t i = 0; i < v.size(); ++i) {
diff --git a/src/ScheduleFunctions.h b/src/ScheduleFunctions.h
index f513210d3f0c..11324f77283f 100644
--- a/src/ScheduleFunctions.h
+++ b/src/ScheduleFunctions.h
@@ -12,9 +12,11 @@
 #include <vector>
 
 #include "Expr.h"
-#include "Target.h"
 
 namespace Halide {
+
+struct Target;
+
 namespace Internal {
 
 class Function;
diff --git a/src/SelectGPUAPI.cpp b/src/SelectGPUAPI.cpp
index 0957bbbd59c5..066b3a07b860 100644
--- a/src/SelectGPUAPI.cpp
+++ b/src/SelectGPUAPI.cpp
@@ -5,6 +5,8 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 class SelectGPUAPI : public IRMutator {
     using IRMutator::visit;
 
@@ -48,6 +50,8 @@ class SelectGPUAPI : public IRMutator {
     };
 };
 
+}  // namespace
+
 Stmt select_gpu_api(const Stmt &s, Target t) {
     return SelectGPUAPI(t).mutate(s);
 }
diff --git a/src/SkipStages.cpp b/src/SkipStages.cpp
index e4b519c0ea02..d756d768d23f 100644
--- a/src/SkipStages.cpp
+++ b/src/SkipStages.cpp
@@ -39,8 +39,6 @@ bool extern_call_uses_buffer(const Call *op, const std::string &func) {
     return false;
 }
 
-}  // namespace
-
 class PredicateFinder : public IRVisitor {
 public:
     Expr predicate;
@@ -508,6 +506,8 @@ class MightBeSkippable : public IRVisitor {
     set<string> candidates;
 };
 
+}  // namespace
+
 Stmt skip_stages(Stmt stmt, const vector<string> &order) {
     // Don't consider the last stage, because it's the output, so it's
     // never skippable.
diff --git a/src/SlidingWindow.cpp b/src/SlidingWindow.cpp
index 38488d08eb03..e55848db5783 100644
--- a/src/SlidingWindow.cpp
+++ b/src/SlidingWindow.cpp
@@ -82,8 +82,6 @@ Expr expand_expr(const Expr &e, const Scope<Expr> &scope) {
     return result;
 }
 
-}  // namespace
-
 // Perform sliding window optimization for a function over a
 // particular serial for loop
 class SlidingWindowOnFunctionAndLoop : public IRMutator {
@@ -423,6 +421,8 @@ class SlidingWindow : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt sliding_window(const Stmt &s, const map<string, Function> &env) {
     return SlidingWindow(env).mutate(s);
 }
diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp
index 01af13ce806e..5f7f2b14bf70 100644
--- a/src/StorageFolding.cpp
+++ b/src/StorageFolding.cpp
@@ -21,8 +21,6 @@ int64_t next_power_of_two(int64_t x) {
     return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
 }
 
-}  // namespace
-
 using std::map;
 using std::string;
 using std::vector;
@@ -998,6 +996,8 @@ class StorageFolding : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt storage_folding(const Stmt &s, const std::map<std::string, Function> &env) {
     return StorageFolding(env).mutate(s);
 }
diff --git a/src/StrictifyFloat.cpp b/src/StrictifyFloat.cpp
index 7947eb00ebe8..161e549f7c17 100644
--- a/src/StrictifyFloat.cpp
+++ b/src/StrictifyFloat.cpp
@@ -7,6 +7,8 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 class StrictifyFloat : public IRMutator {
     enum Strictness {
         FastMath,
@@ -59,6 +61,8 @@ class StrictifyFloat : public IRMutator {
     }
 };
 
+}  // namespace
+
 bool strictify_float(std::map<std::string, Function> &env, const Target &t) {
     bool any_strict_float = false;
     for (auto &iter : env) {
diff --git a/src/Substitute.cpp b/src/Substitute.cpp
index 5e4ffb289683..e188842923fd 100644
--- a/src/Substitute.cpp
+++ b/src/Substitute.cpp
@@ -9,6 +9,8 @@ namespace Internal {
 using std::map;
 using std::string;
 
+namespace {
+
 class Substitute : public IRMutator {
     const map<string, Expr> &replace;
     Scope<> hidden;
@@ -83,6 +85,8 @@ class Substitute : public IRMutator {
     }
 };
 
+}  // namespace
+
 Expr substitute(const string &name, const Expr &replacement, const Expr &expr) {
     map<string, Expr> m;
     m[name] = replacement;
@@ -107,6 +111,8 @@ Stmt substitute(const map<string, Expr> &m, const Stmt &stmt) {
     return s.mutate(stmt);
 }
 
+namespace {
+
 class SubstituteExpr : public IRMutator {
 public:
     Expr find, replacement;
@@ -122,6 +128,8 @@ class SubstituteExpr : public IRMutator {
     }
 };
 
+}  // namespace
+
 Expr substitute(const Expr &find, const Expr &replacement, const Expr &expr) {
     SubstituteExpr s;
     s.find = find;
@@ -136,6 +144,8 @@ Stmt substitute(const Expr &find, const Expr &replacement, const Stmt &stmt) {
     return s.mutate(stmt);
 }
 
+namespace {
+
 /** Substitute an expr for a var in a graph. */
 class GraphSubstitute : public IRGraphMutator {
     string var;
@@ -187,6 +197,8 @@ class GraphSubstituteExpr : public IRGraphMutator {
     }
 };
 
+}  // namespace
+
 Expr graph_substitute(const string &name, const Expr &replacement, const Expr &expr) {
     return GraphSubstitute(name, replacement).mutate(expr);
 }
@@ -203,6 +215,8 @@ Stmt graph_substitute(const Expr &find, const Expr &replacement, const Stmt &stm
     return GraphSubstituteExpr(find, replacement).mutate(stmt);
 }
 
+namespace {
+
 class SubstituteInAllLets : public IRGraphMutator {
 
     using IRGraphMutator::visit;
@@ -214,6 +228,8 @@ class SubstituteInAllLets : public IRGraphMutator {
     }
 };
 
+}  // namespace
+
 Expr substitute_in_all_lets(const Expr &expr) {
     return SubstituteInAllLets().mutate(expr);
 }
diff --git a/src/Tracing.cpp b/src/Tracing.cpp
index 1af6d2cec4ac..5e36a347bb30 100644
--- a/src/Tracing.cpp
+++ b/src/Tracing.cpp
@@ -15,6 +15,8 @@ using std::set;
 using std::string;
 using std::vector;
 
+namespace {
+
 struct TraceEventBuilder {
     string func;
     Expr trace_tag_expr = Expr("");
@@ -326,6 +328,8 @@ class RemoveRealizeOverOutput : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt inject_tracing(Stmt s, const string &pipeline_name, bool trace_pipeline,
                     const map<string, Function> &env, const vector<Function> &outputs,
                     const Target &t) {
diff --git a/src/UnifyDuplicateLets.cpp b/src/UnifyDuplicateLets.cpp
index 87e0b1e08b8c..5f6e120d6e76 100644
--- a/src/UnifyDuplicateLets.cpp
+++ b/src/UnifyDuplicateLets.cpp
@@ -9,6 +9,8 @@ namespace Internal {
 using std::map;
 using std::string;
 
+namespace {
+
 class UnifyDuplicateLets : public IRMutator {
     using IRMutator::visit;
 
@@ -112,6 +114,8 @@ class UnifyDuplicateLets : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt unify_duplicate_lets(const Stmt &s) {
     return UnifyDuplicateLets().mutate(s);
 }
diff --git a/src/UnrollLoops.cpp b/src/UnrollLoops.cpp
index 5b503d780785..2b80f5b7cde6 100644
--- a/src/UnrollLoops.cpp
+++ b/src/UnrollLoops.cpp
@@ -12,6 +12,8 @@ using std::vector;
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 class UnrollLoops : public IRMutator {
     using IRMutator::visit;
 
@@ -113,6 +115,8 @@ class UnrollLoops : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt unroll_loops(const Stmt &s) {
     return UnrollLoops().mutate(s);
 }
diff --git a/src/VaryingAttributes.cpp b/src/VaryingAttributes.cpp
index ffff2b398ceb..df9ebf94f6b1 100644
--- a/src/VaryingAttributes.cpp
+++ b/src/VaryingAttributes.cpp
@@ -12,6 +12,8 @@
 namespace Halide {
 namespace Internal {
 
+namespace {
+
 Stmt make_block(Stmt first, Stmt rest) {
     if (first.defined() && rest.defined()) {
         return Block::make(first, rest);
@@ -414,11 +416,15 @@ class FindLinearExpressions : public IRMutator {
     FindLinearExpressions() = default;
 };
 
+}  // namespace
+
 Stmt find_linear_expressions(const Stmt &s) {
 
     return FindLinearExpressions().mutate(s);
 }
 
+namespace {
+
 // This visitor produces a map containing name and expression pairs from varying
 // tagged intrinsics
 class FindVaryingAttributeTags : public IRVisitor {
@@ -455,10 +461,14 @@ class RemoveVaryingAttributeTags : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt remove_varying_attributes(const Stmt &s) {
     return RemoveVaryingAttributeTags().mutate(s);
 }
 
+namespace {
+
 // This visitor removes glsl_varying intrinsics and replaces them with
 // variables. After this visitor is called, the varying attribute expressions
 // will no longer appear in the IR tree, only variables with the .varying tag
@@ -482,10 +492,14 @@ class ReplaceVaryingAttributeTags : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt replace_varying_attributes(const Stmt &s) {
     return ReplaceVaryingAttributeTags().mutate(s);
 }
 
+namespace {
+
 // This visitor produces a set of variable names that are tagged with
 // ".varying".
 class FindVaryingAttributeVars : public IRVisitor {
@@ -501,6 +515,8 @@ class FindVaryingAttributeVars : public IRVisitor {
     std::set<std::string> variables;
 };
 
+}  // namespace
+
 // Remove varying attributes from the varying's map if they do not appear in the
 // loop_stmt because they were simplified away.
 void prune_varying_attributes(const Stmt &loop_stmt, std::map<std::string, Expr> &varying) {
@@ -522,6 +538,8 @@ void prune_varying_attributes(const Stmt &loop_stmt, std::map<std::string, Expr>
     }
 }
 
+namespace {
+
 // This visitor changes the type of variables tagged with .varying to float,
 // since GLSL will only interpolate floats. In the case that the type of the
 // varying attribute was integer, the interpolated float value is snapped to the
@@ -808,7 +826,6 @@ Stmt IRFilter::mutate(const Stmt &s) {
     return stmt;
 }
 
-namespace {
 template<typename T, typename A>
 void mutate_operator(IRFilter *mutator, const T *op, const A op_a, Stmt *stmt) {
     Stmt a = mutator->mutate(op_a);
@@ -827,7 +844,6 @@ void mutate_operator(IRFilter *mutator, const T *op, const A op_a, const B op_b,
     Stmt c = mutator->mutate(op_c);
     *stmt = make_block(make_block(a, b), c);
 }
-}  // namespace
 
 void IRFilter::visit(const IntImm *op) {
     stmt = Stmt();
@@ -1362,6 +1378,8 @@ class CreateVertexBufferHostLoops : public IRMutator {
     }
 };
 
+}  // namespace
+
 Stmt setup_gpu_vertex_buffer(const Stmt &s) {
     CreateVertexBufferHostLoops vb;
     return vb.mutate(s);

From b48606f15799a8f5c694ad0f1ba64d1e0304cfb5 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Fri, 11 Dec 2020 10:54:57 -0700
Subject: [PATCH 2/3] clang-format

---
 src/CodeGen_OpenGLCompute_Dev.cpp | 2 +-
 src/CodeGen_PTX_Dev.cpp           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/CodeGen_OpenGLCompute_Dev.cpp b/src/CodeGen_OpenGLCompute_Dev.cpp
index 7e5033668f88..a491c1cc5c60 100644
--- a/src/CodeGen_OpenGLCompute_Dev.cpp
+++ b/src/CodeGen_OpenGLCompute_Dev.cpp
@@ -1,6 +1,6 @@
+#include "CodeGen_OpenGLCompute_Dev.h"
 #include "CodeGen_GPU_Dev.h"
 #include "CodeGen_OpenGL_Dev.h"
-#include "CodeGen_OpenGLCompute_Dev.h"
 #include "Debug.h"
 #include "Deinterleave.h"
 #include "IRMatch.h"
diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
index d0032581008b..be2b2c04000e 100644
--- a/src/CodeGen_PTX_Dev.cpp
+++ b/src/CodeGen_PTX_Dev.cpp
@@ -1,8 +1,8 @@
-#include "CodeGen_GPU_Dev.h"
-#include "CodeGen_LLVM.h"
 #include "CodeGen_PTX_Dev.h"
 #include "CSE.h"
+#include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
+#include "CodeGen_LLVM.h"
 #include "Debug.h"
 #include "ExprUsesVar.h"
 #include "IREquality.h"

From 7fd118494c1e8f8088124af0070e9f5b851190d2 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Fri, 11 Dec 2020 11:12:45 -0700
Subject: [PATCH 3/3] Remove redundant static.

---
 src/CodeGen_D3D12Compute_Dev.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp
index 3335bf0ca482..afef3020264e 100644
--- a/src/CodeGen_D3D12Compute_Dev.cpp
+++ b/src/CodeGen_D3D12Compute_Dev.cpp
@@ -754,7 +754,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Select *op) {
     print_assignment(op->type, rhs.str());
 }
 
-static bool is_shared_allocation(const Allocate *op) {
+bool is_shared_allocation(const Allocate *op) {
     return op->memory_type == MemoryType::GPUShared;
 }