From 7d70656a4390453c6331001a01953213eb914cb8 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 25 Feb 2021 19:02:05 -0700 Subject: [PATCH 01/19] Remove unused vertex buffer parameters. --- src/CodeGen_GPU_Host.cpp | 12 ------------ src/runtime/HalideRuntimeCuda.h | 6 +----- src/runtime/HalideRuntimeD3D12Compute.h | 6 +----- src/runtime/HalideRuntimeMetal.h | 6 +----- src/runtime/HalideRuntimeOpenCL.h | 6 +----- src/runtime/HalideRuntimeOpenGLCompute.h | 6 +----- src/runtime/cuda.cpp | 6 +----- src/runtime/d3d12compute.cpp | 6 +----- src/runtime/metal.cpp | 6 +----- src/runtime/opencl.cpp | 6 +----- src/runtime/openglcompute.cpp | 9 ++------- 11 files changed, 11 insertions(+), 64 deletions(-) diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp index dea5a586c4ad..0ea02b28a04c 100644 --- a/src/CodeGen_GPU_Host.cpp +++ b/src/CodeGen_GPU_Host.cpp @@ -269,14 +269,6 @@ void CodeGen_GPU_Host::visit(const For *loop) { } } - Value *null_float_ptr = ConstantPointerNull::get(CodeGen_LLVM::f32_t->getPointerTo()); - Value *zero_int32 = codegen(Expr(cast(0))); - - Value *gpu_num_padded_attributes = zero_int32; - Value *gpu_vertex_buffer = null_float_ptr; - Value *gpu_num_coords_dim0 = zero_int32; - Value *gpu_num_coords_dim1 = zero_int32; - // compute a closure over the state passed into the kernel HostClosure c(loop->body, loop->name); @@ -500,10 +492,6 @@ void CodeGen_GPU_Host::visit(const For *loop) { 0, 0, "gpu_arg_is_buffer_ref" + api_unique_name), - gpu_num_padded_attributes, - gpu_vertex_buffer, - gpu_num_coords_dim0, - gpu_num_coords_dim1, }; std::string run_fn_name = "halide_" + api_unique_name + "_run"; llvm::Function *dev_run_fn = module->getFunction(run_fn_name); diff --git a/src/runtime/HalideRuntimeCuda.h b/src/runtime/HalideRuntimeCuda.h index 7f8481642ba2..3eff7d834543 100644 --- a/src/runtime/HalideRuntimeCuda.h +++ b/src/runtime/HalideRuntimeCuda.h @@ -33,11 +33,7 @@ extern int halide_cuda_run(void *user_context, int shared_mem_bytes, 
size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + int8_t arg_is_buffer[]); extern void halide_cuda_finalize_kernels(void *user_context, void *state_ptr); // @} diff --git a/src/runtime/HalideRuntimeD3D12Compute.h b/src/runtime/HalideRuntimeD3D12Compute.h index ef6d618b1b5d..5814f85a8de2 100644 --- a/src/runtime/HalideRuntimeD3D12Compute.h +++ b/src/runtime/HalideRuntimeD3D12Compute.h @@ -30,11 +30,7 @@ extern int halide_d3d12compute_run(void *user_context, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, - halide_type_t arg_types[], void *args[], int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + halide_type_t arg_types[], void *args[], int8_t arg_is_buffer[]); extern void halide_d3d12compute_finalize_kernels(void *user_context, void *state_ptr); // @} diff --git a/src/runtime/HalideRuntimeMetal.h b/src/runtime/HalideRuntimeMetal.h index 6e795416e442..802d6659317f 100644 --- a/src/runtime/HalideRuntimeMetal.h +++ b/src/runtime/HalideRuntimeMetal.h @@ -35,11 +35,7 @@ extern int halide_metal_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + int8_t arg_is_buffer[]); // @} /** Set the underlying MTLBuffer for a halide_buffer_t. 
This memory should be diff --git a/src/runtime/HalideRuntimeOpenCL.h b/src/runtime/HalideRuntimeOpenCL.h index 54b8b4157489..510dc6f1ba8e 100644 --- a/src/runtime/HalideRuntimeOpenCL.h +++ b/src/runtime/HalideRuntimeOpenCL.h @@ -34,11 +34,7 @@ extern int halide_opencl_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + int8_t arg_is_buffer[]); extern void halide_opencl_finalize_kernels(void *user_context, void *state_ptr); // @} diff --git a/src/runtime/HalideRuntimeOpenGLCompute.h b/src/runtime/HalideRuntimeOpenGLCompute.h index 2fd76c2fd697..decca61124f1 100644 --- a/src/runtime/HalideRuntimeOpenGLCompute.h +++ b/src/runtime/HalideRuntimeOpenGLCompute.h @@ -44,11 +44,7 @@ extern int halide_openglcompute_run(void *user_context, int shared_mem_bytes, struct halide_type_t arg_types[], void *args[], - int8_t is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1); + int8_t is_buffer[]); extern void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr); // @} diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 5260e2847b9d..0846bd9b5c40 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -1045,11 +1045,7 @@ WEAK int halide_cuda_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1) { + int8_t arg_is_buffer[]) { debug(user_context) << "CUDA: halide_cuda_run (" << "user_context: " << user_context << ", " diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index d190c164ad8e..eab19a6aacd5 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -2986,11 +2986,7 @@ WEAK int halide_d3d12compute_run(void *user_context, int blocksX, int blocksY, int 
blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, - halide_type_t arg_types[], void *args[], int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1) { + halide_type_t arg_types[], void *args[], int8_t arg_is_buffer[]) { TRACELOG; D3D12ContextHolder d3d12_context(user_context, true); diff --git a/src/runtime/metal.cpp b/src/runtime/metal.cpp index 9450ab5eee00..4d8e6d093b8d 100644 --- a/src/runtime/metal.cpp +++ b/src/runtime/metal.cpp @@ -728,11 +728,7 @@ WEAK int halide_metal_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1) { + int8_t arg_is_buffer[]) { #ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); #endif diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index c62b957f0e3b..04fce0178cb3 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -1043,11 +1043,7 @@ WEAK int halide_opencl_run(void *user_context, int shared_mem_bytes, size_t arg_sizes[], void *args[], - int8_t arg_is_buffer[], - int num_attributes, - float *vertex_buffer, - int num_coords_dim0, - int num_coords_dim1) { + int8_t arg_is_buffer[]) { debug(user_context) << "CL: halide_opencl_run (user_context: " << user_context << ", " << "entry: " << entry_name << ", " diff --git a/src/runtime/openglcompute.cpp b/src/runtime/openglcompute.cpp index 22b3ac15aa7d..99707ab02888 100644 --- a/src/runtime/openglcompute.cpp +++ b/src/runtime/openglcompute.cpp @@ -592,9 +592,7 @@ WEAK int halide_openglcompute_run(void *user_context, void *state_ptr, const char *entry_name, int blocksX, int blocksY, int blocksZ, int threadsX, int threadsY, int threadsZ, int shared_mem_bytes, halide_type_t arg_types[], void *args[], - int8_t arg_is_buffer[], int num_attributes, - float *vertex_buffer, int num_coords_dim0, - int num_coords_dim1) { + 
int8_t arg_is_buffer[]) { #ifdef DEBUG_RUNTIME uint64_t t_before = halide_current_time_ns(user_context); #endif @@ -604,10 +602,7 @@ WEAK int halide_openglcompute_run(void *user_context, void *state_ptr, << "entry: " << entry_name << ", " << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", " << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", " - << "shmem: " << shared_mem_bytes << ", " - << "num_attributes: " << num_attributes << ", " - << "num_coords_dim0: " << num_coords_dim0 << ", " - << "num_coords_dim1: " << num_coords_dim1 << "\n"; + << "shmem: " << shared_mem_bytes << "\n"; if (!global_state.initialized) { error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_run)."; From 041b9fe09865bf25fa26f90db72a5d7b631899a6 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Thu, 25 Feb 2021 19:48:06 -0700 Subject: [PATCH 02/19] Offload GPU code in a lowering pass instead of via CodeGen_GPU_Host. Fixes #5650, fixes #2797, fixes #2084, now #1971 is more relevant. 
--- Makefile | 4 +- src/CMakeLists.txt | 4 +- src/CodeGen_GPU_Host.cpp | 537 --------------------------------------- src/CodeGen_GPU_Host.h | 75 ------ src/CodeGen_LLVM.cpp | 43 +--- src/Lower.cpp | 10 + src/OffloadGPULoops.cpp | 344 +++++++++++++++++++++++++ src/OffloadGPULoops.h | 25 ++ 8 files changed, 384 insertions(+), 658 deletions(-) delete mode 100644 src/CodeGen_GPU_Host.cpp delete mode 100644 src/CodeGen_GPU_Host.h create mode 100644 src/OffloadGPULoops.cpp create mode 100644 src/OffloadGPULoops.h diff --git a/Makefile b/Makefile index edc66e4178ca..c685c299940c 100644 --- a/Makefile +++ b/Makefile @@ -422,7 +422,6 @@ SOURCE_FILES = \ CodeGen_C.cpp \ CodeGen_D3D12Compute_Dev.cpp \ CodeGen_GPU_Dev.cpp \ - CodeGen_GPU_Host.cpp \ CodeGen_Hexagon.cpp \ CodeGen_Internal.cpp \ CodeGen_LLVM.cpp \ @@ -498,6 +497,7 @@ SOURCE_FILES = \ ModulusRemainder.cpp \ Monotonic.cpp \ ObjectInstanceRegistry.cpp \ + OffloadGPULoops.cpp \ OutputImageParam.cpp \ ParallelRVar.cpp \ Parameter.cpp \ @@ -595,7 +595,6 @@ HEADER_FILES = \ CodeGen_C.h \ CodeGen_D3D12Compute_Dev.h \ CodeGen_GPU_Dev.h \ - CodeGen_GPU_Host.h \ CodeGen_Internal.h \ CodeGen_LLVM.h \ CodeGen_Metal_Dev.h \ @@ -679,6 +678,7 @@ HEADER_FILES = \ ModulusRemainder.h \ Monotonic.h \ ObjectInstanceRegistry.h \ + OffloadGPULoops.h \ OutputImageParam.h \ ParallelRVar.h \ Param.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 289b7bc9447d..b74006aa3528 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,7 +27,6 @@ set(HEADER_FILES CodeGen_C.h CodeGen_D3D12Compute_Dev.h CodeGen_GPU_Dev.h - CodeGen_GPU_Host.h CodeGen_Internal.h CodeGen_LLVM.h CodeGen_Metal_Dev.h @@ -110,6 +109,7 @@ set(HEADER_FILES ModulusRemainder.h Monotonic.h ObjectInstanceRegistry.h + OffloadGPULoops.h OutputImageParam.h ParallelRVar.h Param.h @@ -190,7 +190,6 @@ set(SOURCE_FILES CodeGen_C.cpp CodeGen_D3D12Compute_Dev.cpp CodeGen_GPU_Dev.cpp - CodeGen_GPU_Host.cpp CodeGen_Hexagon.cpp CodeGen_Internal.cpp CodeGen_LLVM.cpp @@ 
-266,6 +265,7 @@ set(SOURCE_FILES ModulusRemainder.cpp Monotonic.cpp ObjectInstanceRegistry.cpp + OffloadGPULoops.cpp OutputImageParam.cpp ParallelRVar.cpp Parameter.cpp diff --git a/src/CodeGen_GPU_Host.cpp b/src/CodeGen_GPU_Host.cpp deleted file mode 100644 index 0ea02b28a04c..000000000000 --- a/src/CodeGen_GPU_Host.cpp +++ /dev/null @@ -1,537 +0,0 @@ -#include - -#include "CodeGen_ARM.h" -#include "CodeGen_D3D12Compute_Dev.h" -#include "CodeGen_GPU_Host.h" -#include "CodeGen_Internal.h" -#include "CodeGen_MIPS.h" -#include "CodeGen_Metal_Dev.h" -#include "CodeGen_OpenCL_Dev.h" -#include "CodeGen_OpenGLCompute_Dev.h" -#include "CodeGen_PTX_Dev.h" -#include "CodeGen_PowerPC.h" -#include "CodeGen_RISCV.h" -#include "CodeGen_WebAssembly.h" -#include "CodeGen_X86.h" -#include "Debug.h" -#include "DeviceArgument.h" -#include "ExprUsesVar.h" -#include "IROperator.h" -#include "IRPrinter.h" -#include "LLVM_Headers.h" -#include "Simplify.h" -#include "Util.h" - -namespace Halide { -namespace Internal { - -using std::map; -using std::string; -using std::vector; - -using namespace llvm; - -namespace { - -// Sniff the contents of a kernel to extracts the bounds of all the -// thread indices (so we know how many threads to launch), and the -// amount of shared memory to allocate. 
-class ExtractBounds : public IRVisitor { -public: - Expr num_threads[4]; - Expr num_blocks[4]; - Expr shared_mem_size; - - ExtractBounds() - : shared_mem_size(0) { - for (int i = 0; i < 4; i++) { - num_threads[i] = num_blocks[i] = 1; - } - } - -private: - bool found_shared = false; - - using IRVisitor::visit; - - void visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { - internal_assert(is_const_zero(op->min)); - } - - if (ends_with(op->name, ".__thread_id_x")) { - num_threads[0] = op->extent; - } else if (ends_with(op->name, ".__thread_id_y")) { - num_threads[1] = op->extent; - } else if (ends_with(op->name, ".__thread_id_z")) { - num_threads[2] = op->extent; - } else if (ends_with(op->name, ".__thread_id_w")) { - num_threads[3] = op->extent; - } else if (ends_with(op->name, ".__block_id_x")) { - num_blocks[0] = op->extent; - } else if (ends_with(op->name, ".__block_id_y")) { - num_blocks[1] = op->extent; - } else if (ends_with(op->name, ".__block_id_z")) { - num_blocks[2] = op->extent; - } else if (ends_with(op->name, ".__block_id_w")) { - num_blocks[3] = op->extent; - } - - op->body.accept(this); - } - - void visit(const LetStmt *op) override { - if (expr_uses_var(shared_mem_size, op->name)) { - shared_mem_size = Let::make(op->name, op->value, shared_mem_size); - } - op->body.accept(this); - } - - void visit(const Allocate *allocate) override { - user_assert(!allocate->new_expr.defined()) << "Allocate node inside GPU kernel has custom new expression.\n" - << "(Memoization is not supported inside GPU kernels at present.)\n"; - - if (allocate->memory_type == MemoryType::GPUShared) { - internal_assert(allocate->extents.size() == 1); - shared_mem_size += allocate->extents[0] * allocate->type.bytes(); - found_shared = true; - } - allocate->body.accept(this); - } -}; - -Value *get_module_state(llvm::Module *module, const std::string &function_name, - const std::string &api_unique_name, bool create = true) { - std::string name = 
"module_state_" + function_name + "_" + api_unique_name; - GlobalVariable *module_state = module->getGlobalVariable(name, true); - if (!module_state && create) { - // Create a global variable to hold the module state - PointerType *void_ptr_type = llvm::Type::getInt8PtrTy(module->getContext()); - module_state = new GlobalVariable(*module, void_ptr_type, - false, GlobalVariable::InternalLinkage, - ConstantPointerNull::get(void_ptr_type), - name); - debug(4) << "Created device module state global variable\n"; - } - - return module_state; -} - -} // namespace - -template -CodeGen_GPU_Host::CodeGen_GPU_Host(const Target &target) - : CodeGen_CPU(target) { - // For the default GPU, the order of preferences is: Metal, - // OpenCL, CUDA, OpenGLCompute last. - // The code is in reverse order to allow later tests to override - // earlier ones. - if (target.has_feature(Target::OpenGLCompute)) { - debug(1) << "Constructing OpenGL Compute device codegen\n"; - cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(target); - } - if (target.has_feature(Target::CUDA)) { - debug(1) << "Constructing CUDA device codegen\n"; - cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(target); - } - if (target.has_feature(Target::OpenCL)) { - debug(1) << "Constructing OpenCL device codegen\n"; - cgdev[DeviceAPI::OpenCL] = new_CodeGen_OpenCL_Dev(target); - } - if (target.has_feature(Target::Metal)) { - debug(1) << "Constructing Metal device codegen\n"; - cgdev[DeviceAPI::Metal] = new_CodeGen_Metal_Dev(target); - } - if (target.has_feature(Target::D3D12Compute)) { - debug(1) << "Constructing Direct3D 12 Compute device codegen\n"; - cgdev[DeviceAPI::D3D12Compute] = new_CodeGen_D3D12Compute_Dev(target); - } - - if (cgdev.empty()) { - internal_error << "Requested unknown GPU target: " << target.to_string() << "\n"; - } -} - -template -void CodeGen_GPU_Host::compile_func(const LoweredFunc &f, - const std::string &simple_name, - const std::string &extern_name) { - function_name = simple_name; - - 
// Create a new module for all of the kernels we find in this function. - for (auto &i : cgdev) { - i.second->init_module(); - } - - // Call the base implementation to create the function. - CodeGen_CPU::compile_func(f, simple_name, extern_name); - - // We need to insert code after the existing entry block, so that - // the destructor stack slots exist before we do the assertions - // involved in initializing gpu kernels. - - // Split the entry block just before its end. - BasicBlock *entry = &function->getEntryBlock(); - llvm::Instruction *terminator = entry->getTerminator(); - internal_assert(terminator); - BasicBlock *post_entry = entry->splitBasicBlock(terminator); - - // Create some code that does the GPU initialization. - BasicBlock *init_kernels_bb = BasicBlock::Create(*context, "init_kernels", - function, post_entry); - - // The entry block should go to the init kernels block instead of - // the post entry block. - entry->getTerminator()->eraseFromParent(); - builder->SetInsertPoint(entry); - builder->CreateBr(init_kernels_bb); - - // Fill out the init kernels block - builder->SetInsertPoint(init_kernels_bb); - - for (auto &i : cgdev) { - CodeGen_GPU_Dev *gpu_codegen = i.second.get(); - std::string api_unique_name = gpu_codegen->api_unique_name(); - - // If the module state for this API/function did not get created, there were - // no kernels using this API. - llvm::Value *module_state = get_module_state(module.get(), function_name, api_unique_name, false); - if (!module_state) { - continue; - } - - debug(2) << "Generating init_kernels for " << api_unique_name << "\n"; - - std::vector kernel_src = gpu_codegen->compile_to_src(); - - Value *kernel_src_ptr = - CodeGen_CPU::create_binary_blob(kernel_src, - "halide_" + function_name + "_" + api_unique_name + "_kernel_src"); - - if (f.args[0].name == "__user_context") { - // The user context is first argument of the function. 
- // We retrieve it here so it's available for subsequent calls of - // get_user_context(). - sym_push("__user_context", iterator_to_pointer(function->arg_begin())); - } - - Value *user_context = get_user_context(); - Value *kernel_size = ConstantInt::get(i32_t, kernel_src.size()); - std::string init_kernels_name = "halide_" + api_unique_name + "_initialize_kernels"; - llvm::Function *init = module->getFunction(init_kernels_name); - internal_assert(init) << "Could not find function " + init_kernels_name + " in initial module\n"; - vector init_kernels_args = {user_context, module_state, kernel_src_ptr, kernel_size}; - Value *result = builder->CreateCall(init, init_kernels_args); - Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0)); - CodeGen_CPU::create_assertion(did_succeed, Expr(), result); - - // Generate a finalizer call as well to relase any refcounts or other resource usage - // specific to this filter call. - std::string finalize_kernels_name = "halide_" + api_unique_name + "_finalize_kernels"; - llvm::Function *finalize = module->getFunction(finalize_kernels_name); - Value *module_state_value = builder->CreateLoad(module_state); - register_destructor(finalize, module_state_value, CodeGen_CPU::Always); - } - - // the init kernels block should branch to the post-entry block - builder->CreateBr(post_entry); - - function_name = ""; -} - -template -void CodeGen_GPU_Host::visit(const For *loop) { - if (CodeGen_GPU_Dev::is_gpu_var(loop->name)) { - // We're in the loop over outermost block dimension - debug(2) << "Kernel launch: " << loop->name << "\n"; - - internal_assert(loop->device_api != DeviceAPI::Default_GPU) - << "A concrete device API should have been selected before codegen."; - - ExtractBounds bounds; - loop->accept(&bounds); - - debug(2) << "Kernel bounds: (" - << bounds.num_threads[0] << ", " - << bounds.num_threads[1] << ", " - << bounds.num_threads[2] << ", " - << bounds.num_threads[3] << ") threads, (" - << 
bounds.num_blocks[0] << ", " - << bounds.num_blocks[1] << ", " - << bounds.num_blocks[2] << ", " - << bounds.num_blocks[3] << ") blocks\n"; - - // compile the kernel - string kernel_name = unique_name("kernel_" + loop->name); - for (size_t i = 0; i < kernel_name.size(); i++) { - if (!isalnum(kernel_name[i])) { - kernel_name[i] = '_'; - } - } - - // compute a closure over the state passed into the kernel - HostClosure c(loop->body, loop->name); - - // Determine the arguments that must be passed into the halide function - vector closure_args = c.arguments(); - - // Sort the args by the size of the underlying type. This is - // helpful for avoiding struct-packing ambiguities in metal, - // which passes the scalar args as a struct. - std::sort(closure_args.begin(), closure_args.end(), - [](const DeviceArgument &a, const DeviceArgument &b) { - if (a.is_buffer == b.is_buffer) { - return a.type.bits() > b.type.bits(); - } else { - // Ensure that buffer arguments come first: - // for many OpenGL/Compute systems, the - // legal indices for buffer args are much - // more restrictive than for scalar args, - // and scalar args can be 'grown' by - // LICM. Putting buffers first makes it much - // more likely we won't fail on some - // hardware. 
- return a.is_buffer > b.is_buffer; - } - }); - - for (size_t i = 0; i < closure_args.size(); i++) { - if (closure_args[i].is_buffer && allocations.contains(closure_args[i].name)) { - closure_args[i].size = allocations.get(closure_args[i].name).constant_bytes; - } - } - - CodeGen_GPU_Dev *gpu_codegen = cgdev[loop->device_api].get(); - user_assert(gpu_codegen != nullptr) - << "Loop is scheduled on device " << loop->device_api - << " which does not appear in target " << target.to_string() << "\n"; - gpu_codegen->add_kernel(loop, kernel_name, closure_args); - - // get the actual name of the generated kernel for this loop - kernel_name = gpu_codegen->get_current_kernel_name(); - debug(2) << "Compiled launch to kernel \"" << kernel_name << "\"\n"; - Value *entry_name_str = builder->CreateGlobalStringPtr(kernel_name, "entry_name"); - - llvm::Type *target_size_t_type = (target.bits == 32) ? i32_t : i64_t; - - // build the kernel arguments array - llvm::PointerType *arg_t = i8_t->getPointerTo(); // void* - int num_args = (int)closure_args.size(); - - // nullptr-terminated list - llvm::Type *gpu_args_arr_type = ArrayType::get(arg_t, num_args + 1); - Value *gpu_args_arr = - create_alloca_at_entry( - gpu_args_arr_type, - 1, false, - kernel_name + "_args"); - - // nullptr-terminated list of size_t's - llvm::Type *gpu_arg_sizes_arr_type = ArrayType::get(target_size_t_type, num_args + 1); - llvm::ArrayType *gpu_arg_types_arr_type = ArrayType::get(type_t_type, num_args + 1); - vector arg_types_array_entries; - - std::string api_unique_name = gpu_codegen->api_unique_name(); - - Value *gpu_arg_sizes_arr = nullptr; - bool runtime_run_takes_types = gpu_codegen->kernel_run_takes_types(); - - if (!runtime_run_takes_types) { - gpu_arg_sizes_arr = - create_alloca_at_entry( - gpu_arg_sizes_arr_type, - 1, false, - kernel_name + "_arg_sizes"); - } - - llvm::Type *gpu_arg_is_buffer_arr_type = ArrayType::get(i8_t, num_args + 1); - Value *gpu_arg_is_buffer_arr = - create_alloca_at_entry( - 
gpu_arg_is_buffer_arr_type, - 1, false, - kernel_name + "_arg_is_buffer"); - - for (int i = 0; i < num_args; i++) { - // get the closure argument - string name = closure_args[i].name; - Value *val; - - if (closure_args[i].is_buffer) { - // If it's a buffer, get the .buffer symbol - val = sym_get(name + ".buffer"); - } else if (ends_with(name, ".varying")) { - // Expressions for varying attributes are passed in the - // expression mesh. Pass a non-nullptr value in the argument array - // to keep it in sync with the argument names encoded in the - // shader header - val = ConstantInt::get(target_size_t_type, 1); - } else { - // Otherwise just look up the symbol - val = sym_get(name); - } - - if (!closure_args[i].is_buffer) { - // allocate stack space to mirror the closure element. It - // might be in a register and we need a pointer to it for - // the gpu args array. - Value *ptr = create_alloca_at_entry(val->getType(), 1, false, name + ".stack"); - // store the closure value into the stack space - builder->CreateStore(val, ptr); - val = ptr; - } - - // store a void * pointer to the argument into the gpu_args_arr - Value *bits = builder->CreateBitCast(val, arg_t); - builder->CreateStore(bits, - builder->CreateConstGEP2_32( - gpu_args_arr_type, - gpu_args_arr, - 0, - i)); - - if (runtime_run_takes_types) { - Constant *arg_type_fields[] = { - ConstantInt::get(i8_t, closure_args[i].type.code()), - ConstantInt::get(i8_t, closure_args[i].type.bits()), - ConstantInt::get(i16_t, 1)}; - arg_types_array_entries.push_back(ConstantStruct::get(type_t_type, arg_type_fields)); - } else { - // store the size of the argument. - int size_bytes = (closure_args[i].is_buffer) ? 
8 : closure_args[i].type.bytes(); - builder->CreateStore(ConstantInt::get(target_size_t_type, size_bytes), - builder->CreateConstGEP2_32( - gpu_arg_sizes_arr_type, - gpu_arg_sizes_arr, - 0, - i)); - } - - builder->CreateStore(ConstantInt::get(i8_t, closure_args[i].is_buffer), - builder->CreateConstGEP2_32( - gpu_arg_is_buffer_arr_type, - gpu_arg_is_buffer_arr, - 0, - i)); - } - // nullptr-terminate the lists - builder->CreateStore(ConstantPointerNull::get(arg_t), - builder->CreateConstGEP2_32( - gpu_args_arr_type, - gpu_args_arr, - 0, - num_args)); - if (runtime_run_takes_types) { - Constant *arg_type_fields[] = { - ConstantInt::get(i8_t, 0), - ConstantInt::get(i8_t, 0), - ConstantInt::get(i16_t, 0)}; - arg_types_array_entries.push_back(ConstantStruct::get(type_t_type, arg_type_fields)); - } else { - builder->CreateStore(ConstantInt::get(target_size_t_type, 0), - builder->CreateConstGEP2_32( - gpu_arg_sizes_arr_type, - gpu_arg_sizes_arr, - 0, - num_args)); - } - builder->CreateStore(ConstantInt::get(i8_t, 0), - builder->CreateConstGEP2_32( - gpu_arg_is_buffer_arr_type, - gpu_arg_is_buffer_arr, - 0, - num_args)); - - GlobalVariable *arg_types_array_storage = nullptr; - if (runtime_run_takes_types) { - arg_types_array_storage = new GlobalVariable( - *module, - gpu_arg_types_arr_type, - /*isConstant*/ true, - GlobalValue::PrivateLinkage, - ConstantArray::get(gpu_arg_types_arr_type, arg_types_array_entries)); - } - - // TODO: only three dimensions can be passed to - // cuLaunchKernel. How should we handle blkid[3]? 
- internal_assert(is_const_one(bounds.num_threads[3]) && is_const_one(bounds.num_blocks[3])) - << bounds.num_threads[3] << ", " << bounds.num_blocks[3] << "\n"; - debug(4) << "CodeGen_GPU_Host get_user_context returned " << get_user_context() << "\n"; - debug(3) << "bounds.num_blocks[0] = " << bounds.num_blocks[0] << "\n"; - debug(3) << "bounds.num_blocks[1] = " << bounds.num_blocks[1] << "\n"; - debug(3) << "bounds.num_blocks[2] = " << bounds.num_blocks[2] << "\n"; - debug(3) << "bounds.num_threads[0] = " << bounds.num_threads[0] << "\n"; - debug(3) << "bounds.num_threads[1] = " << bounds.num_threads[1] << "\n"; - debug(3) << "bounds.num_threads[2] = " << bounds.num_threads[2] << "\n"; - - Constant *zero = ConstantInt::get(i32_t, 0); - Value *zeros[] = {zero, zero}; - - // Order-of-evaluation is guaranteed to be in order in brace-init-lists, - // so the multiple calls to codegen here are fine - Value *launch_args[] = { - get_user_context(), - builder->CreateLoad(get_module_state(module.get(), function_name, api_unique_name)), - entry_name_str, - codegen(bounds.num_blocks[0]), - codegen(bounds.num_blocks[1]), - codegen(bounds.num_blocks[2]), - codegen(bounds.num_threads[0]), - codegen(bounds.num_threads[1]), - codegen(bounds.num_threads[2]), - codegen(bounds.shared_mem_size), - runtime_run_takes_types ? 
ConstantExpr::getInBoundsGetElementPtr(gpu_arg_types_arr_type, arg_types_array_storage, zeros) : builder->CreateConstGEP2_32(gpu_arg_sizes_arr_type, gpu_arg_sizes_arr, 0, 0, "gpu_arg_sizes_ar_ref" + api_unique_name), - builder->CreateConstGEP2_32( - gpu_args_arr_type, - gpu_args_arr, - 0, - 0, - "gpu_args_arr_ref" + api_unique_name), - builder->CreateConstGEP2_32( - gpu_arg_is_buffer_arr_type, - gpu_arg_is_buffer_arr, - 0, - 0, - "gpu_arg_is_buffer_ref" + api_unique_name), - }; - std::string run_fn_name = "halide_" + api_unique_name + "_run"; - llvm::Function *dev_run_fn = module->getFunction(run_fn_name); - internal_assert(dev_run_fn) << "Could not find " << run_fn_name << " in module\n"; - Value *result = builder->CreateCall(dev_run_fn, launch_args); - Value *did_succeed = builder->CreateICmpEQ(result, ConstantInt::get(i32_t, 0)); - - CodeGen_CPU::create_assertion(did_succeed, - // Should have already called halide_error inside the gpu runtime - halide_error_code_device_run_failed, - result); - } else { - CodeGen_CPU::visit(loop); - } -} - -// Force template instantiation. 
-#ifdef WITH_X86 -template class CodeGen_GPU_Host; -#endif - -#if defined(WITH_ARM) || defined(WITH_AARCH64) -template class CodeGen_GPU_Host; -#endif - -#ifdef WITH_MIPS -template class CodeGen_GPU_Host; -#endif - -#ifdef WITH_POWERPC -template class CodeGen_GPU_Host; -#endif - -#ifdef WITH_WEBASSEMBLY -template class CodeGen_GPU_Host; -#endif - -#ifdef WITH_RISCV -template class CodeGen_GPU_Host; -#endif - -} // namespace Internal -} // namespace Halide diff --git a/src/CodeGen_GPU_Host.h b/src/CodeGen_GPU_Host.h deleted file mode 100644 index 6b867fb2a2c3..000000000000 --- a/src/CodeGen_GPU_Host.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef HALIDE_CODEGEN_GPU_HOST_H -#define HALIDE_CODEGEN_GPU_HOST_H - -/** \file - * Defines the code-generator for producing GPU host code - */ - -#include -#include - -#include "CodeGen_GPU_Dev.h" -#include "IR.h" - -namespace Halide { - -struct Target; - -namespace Internal { - -/** A code generator that emits GPU code from a given Halide stmt. */ -template -class CodeGen_GPU_Host : public CodeGen_CPU { -public: - /** Create a GPU code generator. GPU target is selected via - * CodeGen_GPU_Options. Processor features can be enabled using the - * appropriate flags from Target */ - CodeGen_GPU_Host(const Target &); - -protected: - void compile_func(const LoweredFunc &func, const std::string &simple_name, const std::string &extern_name) override; - - /** Declare members of the base class that must exist to help the - * compiler do name lookup. Annoying but necessary, because the - * compiler doesn't know that CodeGen_CPU will in fact inherit - * from CodeGen for every instantiation of this template. 
*/ - using CodeGen_CPU::allocations; - using CodeGen_CPU::builder; - using CodeGen_CPU::codegen; - using CodeGen_CPU::context; - using CodeGen_CPU::create_alloca_at_entry; - using CodeGen_CPU::function; - using CodeGen_CPU::get_user_context; - using CodeGen_CPU::halide_buffer_t_type; - using CodeGen_CPU::i16_t; - using CodeGen_CPU::i32_t; - using CodeGen_CPU::i64_t; - using CodeGen_CPU::i8_t; - using CodeGen_CPU::init_module; - using CodeGen_CPU::llvm_type_of; - using CodeGen_CPU::module; - using CodeGen_CPU::register_destructor; - using CodeGen_CPU::sym_exists; - using CodeGen_CPU::sym_get; - using CodeGen_CPU::sym_pop; - using CodeGen_CPU::sym_push; - using CodeGen_CPU::target; - using CodeGen_CPU::type_t_type; - using CodeGen_CPU::visit; - - /** Nodes for which we need to override default behavior for the GPU runtime */ - // @{ - void visit(const For *) override; - // @} - - std::string function_name; - -private: - /** Child code generator for device kernels. */ - std::map> cgdev; -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 6ae5e9a74547..97dbd8c225da 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -7,7 +7,6 @@ #include "CPlusPlusMangle.h" #include "CSE.h" #include "CodeGen_ARM.h" -#include "CodeGen_GPU_Host.h" #include "CodeGen_Hexagon.h" #include "CodeGen_Internal.h" #include "CodeGen_LLVM.h" @@ -239,47 +238,7 @@ void CodeGen_LLVM::set_context(llvm::LLVMContext &context) { } std::unique_ptr CodeGen_LLVM::new_for_target(const Target &target, llvm::LLVMContext &context) { - // The awkward mapping from targets to code generators - if (target.features_any_of({Target::CUDA, - Target::OpenCL, - Target::OpenGLCompute, - Target::Metal, - Target::D3D12Compute})) { -#ifdef WITH_X86 - if (target.arch == Target::X86) { - return make_codegen>(target, context); - } -#endif -#if defined(WITH_ARM) || defined(WITH_AARCH64) - if (target.arch == Target::ARM) { - return 
make_codegen>(target, context); - } -#endif -#ifdef WITH_MIPS - if (target.arch == Target::MIPS) { - return make_codegen>(target, context); - } -#endif -#ifdef WITH_POWERPC - if (target.arch == Target::POWERPC) { - return make_codegen>(target, context); - } -#endif -#ifdef WITH_WEBASSEMBLY - if (target.arch == Target::WebAssembly) { - return make_codegen>(target, context); - } -#endif -#ifdef WITH_RISCV - if (target.arch == Target::RISCV) { - return make_codegen>(target, context); - } -#endif - user_error << "Invalid target architecture for GPU backend: " - << target.to_string() << "\n"; - return nullptr; - - } else if (target.arch == Target::X86) { + if (target.arch == Target::X86) { return make_codegen(target, context); } else if (target.arch == Target::ARM) { return make_codegen(target, context); diff --git a/src/Lower.cpp b/src/Lower.cpp index d8a3b12a9b34..6855aed4ec61 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -40,6 +40,7 @@ #include "LoopCarry.h" #include "LowerWarpShuffles.h" #include "Memoization.h" +#include "OffloadGPULoops.h" #include "PartitionLoops.h" #include "Prefetch.h" #include "Profiling.h" @@ -440,6 +441,15 @@ Module lower(const vector &output_funcs, debug(1) << "Skipping Hexagon offload...\n"; } + if (t.has_gpu_feature()) { + debug(1) << "Offloading GPU loops...\n"; + s = inject_gpu_offload(s, t); + debug(2) << "Lowering after splitting off GPU loops:\n" + << s << "\n\n"; + } else { + debug(1) << "Skipping GPU offload...\n"; + } + if (!custom_passes.empty()) { for (size_t i = 0; i < custom_passes.size(); i++) { debug(1) << "Running custom lowering pass " << i << "...\n"; diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp new file mode 100644 index 000000000000..cd80ebb94d92 --- /dev/null +++ b/src/OffloadGPULoops.cpp @@ -0,0 +1,344 @@ +#include + +#include "Closure.h" +#include "CodeGen_D3D12Compute_Dev.h" +#include "CodeGen_GPU_Dev.h" +#include "CodeGen_Metal_Dev.h" +#include "CodeGen_OpenCL_Dev.h" +#include 
"CodeGen_OpenGLCompute_Dev.h" +#include "CodeGen_PTX_Dev.h" +#include "ExprUsesVar.h" +#include "InjectHostDevBufferCopies.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "IRPrinter.h" +#include "OffloadGPULoops.h" +#include "Util.h" + +namespace Halide { +namespace Internal { + +using std::map; +using std::string; +using std::unique_ptr; +using std::vector; + +namespace { + +// Sniff the contents of a kernel to extracts the bounds of all the +// thread indices (so we know how many threads to launch), and the +// amount of shared memory to allocate. +class ExtractBounds : public IRVisitor { +public: + Expr num_threads[4]; + Expr num_blocks[4]; + Expr shared_mem_size; + + ExtractBounds() + : shared_mem_size(0) { + for (int i = 0; i < 4; i++) { + num_threads[i] = num_blocks[i] = 1; + } + } + +private: + bool found_shared = false; + + using IRVisitor::visit; + + void visit(const For *op) override { + if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + internal_assert(is_const_zero(op->min)); + } + + if (ends_with(op->name, ".__thread_id_x")) { + num_threads[0] = op->extent; + } else if (ends_with(op->name, ".__thread_id_y")) { + num_threads[1] = op->extent; + } else if (ends_with(op->name, ".__thread_id_z")) { + num_threads[2] = op->extent; + } else if (ends_with(op->name, ".__thread_id_w")) { + num_threads[3] = op->extent; + } else if (ends_with(op->name, ".__block_id_x")) { + num_blocks[0] = op->extent; + } else if (ends_with(op->name, ".__block_id_y")) { + num_blocks[1] = op->extent; + } else if (ends_with(op->name, ".__block_id_z")) { + num_blocks[2] = op->extent; + } else if (ends_with(op->name, ".__block_id_w")) { + num_blocks[3] = op->extent; + } + + op->body.accept(this); + } + + void visit(const LetStmt *op) override { + if (expr_uses_var(shared_mem_size, op->name)) { + shared_mem_size = Let::make(op->name, op->value, shared_mem_size); + } + op->body.accept(this); + } + + void visit(const Allocate *allocate) override { + 
user_assert(!allocate->new_expr.defined()) << "Allocate node inside GPU kernel has custom new expression.\n" + << "(Memoization is not supported inside GPU kernels at present.)\n"; + + if (allocate->memory_type == MemoryType::GPUShared) { + internal_assert(allocate->extents.size() == 1); + shared_mem_size += allocate->extents[0] * allocate->type.bytes(); + found_shared = true; + } + allocate->body.accept(this); + } +}; + +Expr make_type_arg(const Type &t) { + vector args = { + cast(t.code()), + cast(t.bits()), + cast(t.lanes()), + }; + return Call::make(type_of(), Call::make_struct, args, Call::Intrinsic); +} + +class InjectGpuOffload : public IRMutator { + /** Child code generator for device kernels. */ + map> cgdev; + + map state_bufs; + + const Target ⌖ + + Expr state_var(const string &name, Type type, bool create) { + Expr ptr = state_var_ptr(name, type, create); + if (!ptr.defined()) { + return Expr(); + } + return Let::make(name, ptr, + Load::make(type_of(), name, 0, + Buffer<>(), Parameter(), const_true(), ModulusRemainder())); + } + + Expr state_var_ptr(const string &name, Type type, bool create) { + Expr &buf = state_bufs[name]; + if (!buf.defined() && create) { + auto storage = Buffer::make_scalar(name + "_buf"); + storage() = nullptr; + buf = Variable::make(type_of(), storage.name() + ".buffer", storage); + } + if (buf.defined()) { + return Call::make(Handle(), Call::buffer_get_host, {buf}, Call::Extern); + } else { + return Expr(); + } + } + + Expr module_state(const string &api_name, bool create = true) { + return state_var(api_name, type_of(), create); + } + + Expr module_state_ptr(const string &api_name, bool create = true) { + return state_var_ptr(api_name, type_of(), create); + } + + // Create a Buffer containing the given vector, and return an + // expression for a pointer to the first element. 
+ Expr make_buffer_ptr(const vector &data, const string &name) { + Buffer code((int)data.size(), name); + memcpy(code.data(), data.data(), (int)data.size()); + Expr buf = Variable::make(type_of(), name + ".buffer", code); + return Call::make(Handle(), Call::buffer_get_host, {buf}, Call::Extern); + } + + using IRMutator::visit; + + Stmt visit(const For *loop) override { + if (!CodeGen_GPU_Dev::is_gpu_var(loop->name)) { + return IRMutator::visit(loop); + } + + // We're in the loop over outermost block dimension + debug(2) << "Kernel launch: " << loop->name << "\n"; + + internal_assert(loop->device_api != DeviceAPI::Default_GPU) + << "A concrete device API should have been selected before codegen."; + + ExtractBounds bounds; + loop->accept(&bounds); + debug(2) << "Kernel bounds: (" + << bounds.num_threads[0] << ", " + << bounds.num_threads[1] << ", " + << bounds.num_threads[2] << ", " + << bounds.num_threads[3] << ") threads, (" + << bounds.num_blocks[0] << ", " + << bounds.num_blocks[1] << ", " + << bounds.num_blocks[2] << ", " + << bounds.num_blocks[3] << ") blocks\n"; + + // compute a closure over the state passed into the kernel + HostClosure c(loop->body, loop->name); + + // Determine the arguments that must be passed into the halide function + vector closure_args = c.arguments(); + + // Sort the args by the size of the underlying type. This is + // helpful for avoiding struct-packing ambiguities in metal, + // which passes the scalar args as a struct. + sort(closure_args.begin(), closure_args.end(), + [](const DeviceArgument &a, const DeviceArgument &b) { + if (a.is_buffer == b.is_buffer) { + return a.type.bits() > b.type.bits(); + } else { + // Ensure that buffer arguments come first: + // for many OpenGL/Compute systems, the + // legal indices for buffer args are much + // more restrictive than for scalar args, + // and scalar args can be 'grown' by + // LICM. Putting buffers first makes it much + // more likely we won't fail on some + // hardware. 
+ return a.is_buffer > b.is_buffer; + } + }); + + // compile the kernel + string kernel_name = c_print_name(unique_name("kernel_" + loop->name)); + + CodeGen_GPU_Dev *gpu_codegen = cgdev[loop->device_api].get(); + user_assert(gpu_codegen != nullptr) + << "Loop is scheduled on device " << loop->device_api + << " which does not appear in target " << target.to_string() << "\n"; + gpu_codegen->add_kernel(loop, kernel_name, closure_args); + + // get the actual name of the generated kernel for this loop + kernel_name = gpu_codegen->get_current_kernel_name(); + debug(2) << "Compiled launch to kernel \"" << kernel_name << "\"\n"; + + bool runtime_run_takes_types = gpu_codegen->kernel_run_takes_types(); + Type target_size_t_type = target.bits == 32 ? Int(32) : Int(64); + + vector args, arg_types_or_sizes, arg_is_buffer; + for (const DeviceArgument &i : closure_args) { + Expr val; + if (i.is_buffer) { + val = Variable::make(Handle(), i.name + ".buffer"); + } else { + val = Variable::make(i.type, i.name); + val = Call::make(type_of(), Call::make_struct, {val}, Call::Intrinsic); + } + args.push_back(val); + + if (runtime_run_takes_types) { + arg_types_or_sizes.push_back(make_type_arg(i.type.with_lanes(1))); + } else { + arg_types_or_sizes.push_back(cast(target_size_t_type, i.is_buffer ? 8 : i.type.bytes())); + } + + arg_is_buffer.push_back(cast(i.is_buffer)); + } + + // nullptr-terminate the lists + args.push_back(reinterpret(Handle(), cast(0))); + if (runtime_run_takes_types) { + internal_assert(halide_type_int == 0); + arg_types_or_sizes.push_back(make_type_arg(Type(halide_type_int, 0, 0))); + } else { + arg_types_or_sizes.push_back(cast(target_size_t_type, 0)); + } + arg_is_buffer.push_back(cast(0)); + + // TODO: only three dimensions can be passed to + // cuLaunchKernel. How should we handle blkid[3]? 
+ internal_assert(is_const_one(bounds.num_threads[3]) && is_const_one(bounds.num_blocks[3])) + << bounds.num_threads[3] << ", " << bounds.num_blocks[3] << "\n"; + debug(3) << "bounds.num_blocks[0] = " << bounds.num_blocks[0] << "\n"; + debug(3) << "bounds.num_blocks[1] = " << bounds.num_blocks[1] << "\n"; + debug(3) << "bounds.num_blocks[2] = " << bounds.num_blocks[2] << "\n"; + debug(3) << "bounds.num_threads[0] = " << bounds.num_threads[0] << "\n"; + debug(3) << "bounds.num_threads[1] = " << bounds.num_threads[1] << "\n"; + debug(3) << "bounds.num_threads[2] = " << bounds.num_threads[2] << "\n"; + + string api_unique_name = gpu_codegen->api_unique_name(); + vector run_args = { + module_state(api_unique_name), + kernel_name, + Expr(bounds.num_blocks[0]), + Expr(bounds.num_blocks[1]), + Expr(bounds.num_blocks[2]), + Expr(bounds.num_threads[0]), + Expr(bounds.num_threads[1]), + Expr(bounds.num_threads[2]), + Expr(bounds.shared_mem_size), + Call::make(Handle(), Call::make_struct, arg_types_or_sizes, Call::Intrinsic), + Call::make(Handle(), Call::make_struct, args, Call::Intrinsic), + Call::make(Handle(), Call::make_struct, arg_is_buffer, Call::Intrinsic), + }; + return call_extern_and_assert("halide_" + api_unique_name + "_run", run_args); + } + +public: + InjectGpuOffload(const Target &target) : target(target) { + if (target.has_feature(Target::OpenGLCompute)) { + cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(target); + } + if (target.has_feature(Target::CUDA)) { + cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(target); + } + if (target.has_feature(Target::OpenCL)) { + cgdev[DeviceAPI::OpenCL] = new_CodeGen_OpenCL_Dev(target); + } + if (target.has_feature(Target::Metal)) { + cgdev[DeviceAPI::Metal] = new_CodeGen_Metal_Dev(target); + } + if (target.has_feature(Target::D3D12Compute)) { + cgdev[DeviceAPI::D3D12Compute] = new_CodeGen_D3D12Compute_Dev(target); + } + + internal_assert(!cgdev.empty()) << "Requested unknown GPU target: " << 
target.to_string() << "\n"; + } + + Stmt inject(Stmt s) { + // Create a new module for all of the kernels we find in this function. + for (auto &i : cgdev) { + i.second->init_module(); + } + + Stmt result = mutate(s); + + for (auto &i : cgdev) { + string api_unique_name = i.second->api_unique_name(); + + // If the module state for this API/function did not get created, there were + // no kernels using this API. + Expr state_ptr = module_state_ptr(api_unique_name, false); + if (!state_ptr.defined()) { + continue; + } + + debug(2) << "Generating init_kernels for " << api_unique_name << "\n"; + vector kernel_src = i.second->compile_to_src(); + Expr kernel_src_buf = make_buffer_ptr(kernel_src, api_unique_name + "_kernels"); + + string init_kernels_name = "halide_" + api_unique_name + "_initialize_kernels"; + vector init_args = {state_ptr, kernel_src_buf, Expr((int)kernel_src.size())}; + Stmt init_kernels = call_extern_and_assert(init_kernels_name, init_args); + + string destructor_name = "halide_" + api_unique_name + "_finalize_kernels"; + vector finalize_args = {Expr(destructor_name), module_state(api_unique_name)}; + Stmt register_destructor = Evaluate::make( + Call::make(Handle(), Call::register_destructor, finalize_args, Call::Intrinsic)); + + result = Block::make({init_kernels, register_destructor, result}); + } + return result; + } +}; + +} // namespace + +Stmt inject_gpu_offload(Stmt s, const Target &host_target) { + return InjectGpuOffload(host_target).inject(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/OffloadGPULoops.h b/src/OffloadGPULoops.h new file mode 100644 index 000000000000..c513a35fd8d5 --- /dev/null +++ b/src/OffloadGPULoops.h @@ -0,0 +1,25 @@ +#ifndef HALIDE_OFFLOAD_GPU_LOOPS_H +#define HALIDE_OFFLOAD_GPU_LOOPS_H + +/** \file + * Defines a lowering pass to pull loops marked with + * GPU device APIs to a separate module, and call them through the + * appropriate host runtime module. 
+ */ + +#include "Expr.h" + +namespace Halide { + +struct Target; + +namespace Internal { + +/** Pull loops marked with GPU device APIs to a separate + * module, and call them through the appropriate host runtime module. */ +Stmt inject_gpu_offload(Stmt s, const Target &host_target); + +} // namespace Internal +} // namespace Halide + +#endif From c87f37e84384a05e61e506fa458d31ac6b01e5a4 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:06:00 -0700 Subject: [PATCH 03/19] clang-format. --- src/OffloadGPULoops.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index cd80ebb94d92..04d4d5631437 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -187,7 +187,7 @@ class InjectGpuOffload : public IRMutator { sort(closure_args.begin(), closure_args.end(), [](const DeviceArgument &a, const DeviceArgument &b) { if (a.is_buffer == b.is_buffer) { - return a.type.bits() > b.type.bits(); + return a.type.bits() > b.type.bits(); } else { // Ensure that buffer arguments come first: // for many OpenGL/Compute systems, the @@ -277,7 +277,8 @@ class InjectGpuOffload : public IRMutator { } public: - InjectGpuOffload(const Target &target) : target(target) { + InjectGpuOffload(const Target &target) + : target(target) { if (target.has_feature(Target::OpenGLCompute)) { cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(target); } From 797e43e9660cf37ac8e3c32e318b5d61b7a9fa06 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:08:25 -0700 Subject: [PATCH 04/19] clang-format sorting is case sensitive!? 
--- src/OffloadGPULoops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 04d4d5631437..187c735dbb97 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -8,10 +8,10 @@ #include "CodeGen_OpenGLCompute_Dev.h" #include "CodeGen_PTX_Dev.h" #include "ExprUsesVar.h" -#include "InjectHostDevBufferCopies.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" +#include "InjectHostDevBufferCopies.h" #include "OffloadGPULoops.h" #include "Util.h" From da6effb095b3825c6ff18917c68d72f441ebdfcb Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:33:34 -0700 Subject: [PATCH 05/19] clang-tidy --- src/OffloadGPULoops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 187c735dbb97..ed9743a9ac05 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -298,7 +298,7 @@ class InjectGpuOffload : public IRMutator { internal_assert(!cgdev.empty()) << "Requested unknown GPU target: " << target.to_string() << "\n"; } - Stmt inject(Stmt s) { + Stmt inject(const Stmt &s) { // Create a new module for all of the kernels we find in this function. for (auto &i : cgdev) { i.second->init_module(); From ebbb9a5554aad9dfab133ebc0eb33a16aac88a4f Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:31:39 -0700 Subject: [PATCH 06/19] Move codegen backends into anonymous namespaces in source files. 
--- Makefile | 9 +-- src/CMakeLists.txt | 7 +-- src/CodeGen_ARM.cpp | 106 ++++++++++++++++++++++++++---------- src/CodeGen_ARM.h | 72 ------------------------ src/CodeGen_Hexagon.cpp | 17 +----- src/CodeGen_Hexagon.h | 29 ---------- src/CodeGen_LLVM.cpp | 58 +++++--------------- src/CodeGen_LLVM.h | 15 ----- src/CodeGen_MIPS.cpp | 43 ++++++++++++--- src/CodeGen_MIPS.h | 34 ------------ src/CodeGen_PTX_Dev.cpp | 24 ++++---- src/CodeGen_PowerPC.cpp | 67 ++++++++++++++++------- src/CodeGen_PowerPC.h | 42 -------------- src/CodeGen_RISCV.cpp | 43 ++++++++++++--- src/CodeGen_RISCV.h | 33 ----------- src/CodeGen_Targets.h | 30 ++++++++++ src/CodeGen_WebAssembly.cpp | 59 +++++++++++++------- src/CodeGen_WebAssembly.h | 35 ------------ src/CodeGen_X86.cpp | 84 ++++++++++++++++++++-------- src/CodeGen_X86.h | 58 -------------------- src/OffloadGPULoops.cpp | 2 +- src/WasmExecutor.cpp | 4 +- 22 files changed, 359 insertions(+), 512 deletions(-) delete mode 100644 src/CodeGen_ARM.h delete mode 100644 src/CodeGen_Hexagon.h delete mode 100644 src/CodeGen_MIPS.h delete mode 100644 src/CodeGen_PowerPC.h delete mode 100644 src/CodeGen_RISCV.h create mode 100644 src/CodeGen_Targets.h delete mode 100644 src/CodeGen_WebAssembly.h delete mode 100644 src/CodeGen_X86.h diff --git a/Makefile b/Makefile index c685c299940c..f97386f093da 100644 --- a/Makefile +++ b/Makefile @@ -591,23 +591,18 @@ HEADER_FILES = \ Buffer.h \ CanonicalizeGPUVars.h \ Closure.h \ - CodeGen_ARM.h \ CodeGen_C.h \ CodeGen_D3D12Compute_Dev.h \ CodeGen_GPU_Dev.h \ CodeGen_Internal.h \ CodeGen_LLVM.h \ CodeGen_Metal_Dev.h \ - CodeGen_MIPS.h \ CodeGen_OpenCL_Dev.h \ CodeGen_OpenGLCompute_Dev.h \ CodeGen_Posix.h \ - CodeGen_PowerPC.h \ CodeGen_PTX_Dev.h \ CodeGen_PyTorch.h \ - CodeGen_RISCV.h \ - CodeGen_WebAssembly.h \ - CodeGen_X86.h \ + CodeGen_Targets.h \ CompilerLogger.h \ ConciseCasts.h \ CPlusPlusMangle.h \ @@ -1091,7 +1086,7 @@ $(BUILD_DIR)/initmod_ptx.%_ll.o: $(BUILD_DIR)/initmod_ptx.%_ll.cpp 
$(BUILD_DIR)/initmod.%.o: $(BUILD_DIR)/initmod.%.cpp $(CXX) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o -$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp $(SRC_DIR)/%.h $(BUILD_DIR)/llvm_ok +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp $(BUILD_DIR)/llvm_ok @mkdir -p $(@D) $(CXX) $(CXX_FLAGS) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b74006aa3528..8bc68eeeb26c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -23,23 +23,18 @@ set(HEADER_FILES Buffer.h CanonicalizeGPUVars.h Closure.h - CodeGen_ARM.h CodeGen_C.h CodeGen_D3D12Compute_Dev.h CodeGen_GPU_Dev.h CodeGen_Internal.h CodeGen_LLVM.h CodeGen_Metal_Dev.h - CodeGen_MIPS.h CodeGen_OpenCL_Dev.h CodeGen_OpenGLCompute_Dev.h CodeGen_Posix.h - CodeGen_PowerPC.h CodeGen_PTX_Dev.h CodeGen_PyTorch.h - CodeGen_RISCV.h - CodeGen_WebAssembly.h - CodeGen_X86.h + CodeGen_Targets.h CompilerLogger.h ConciseCasts.h CPlusPlusMangle.h diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 89260d758cc7..31cd8d7d8e7f 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1,9 +1,8 @@ -#include #include #include "CSE.h" -#include "CodeGen_ARM.h" #include "CodeGen_Internal.h" +#include "CodeGen_Posix.h" #include "ConciseCasts.h" #include "Debug.h" #include "IREquality.h" @@ -25,6 +24,8 @@ using std::vector; using namespace Halide::ConciseCasts; using namespace llvm; +#if defined(WITH_ARM) || defined(WITH_AARCH64) + namespace { // Broadcast to an unknown number of lanes, for making patterns. @@ -32,21 +33,59 @@ Expr bc(Expr x) { return Broadcast::make(std::move(x), 0); } -} // namespace +/** A code generator that emits ARM code from a given Halide stmt. */ +class CodeGen_ARM : public CodeGen_Posix { +public: + /** Create an ARM code generator for the given arm target. 
*/ + CodeGen_ARM(const Target &); + +protected: + using CodeGen_Posix::visit; + + /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that + * takes one vector argument and splits it into two to call inner. */ + llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name); + void init_module() override; + + /** Nodes for which we want to emit specific neon intrinsics */ + // @{ + void visit(const Cast *) override; + void visit(const Sub *) override; + void visit(const Mul *) override; + void visit(const Min *) override; + void visit(const Max *) override; + void visit(const Store *) override; + void visit(const Load *) override; + void visit(const Call *) override; + void visit(const LT *) override; + void visit(const LE *) override; + void codegen_vector_reduce(const VectorReduce *, const Expr &) override; + // @} + + /** Various patterns to peephole match against */ + struct Pattern { + string intrin; ///< Name of the intrinsic + Expr pattern; ///< The pattern to match against + Pattern() = default; + Pattern(const string &intrin, Expr p) + : intrin(intrin), pattern(std::move(p)) { + } + }; + vector casts, averagings, negations; + + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; + + // NEON can be disabled for older processors. 
+ bool neon_intrinsics_disabled() { + return target.has_feature(Target::NoNEON); + } +}; CodeGen_ARM::CodeGen_ARM(const Target &target) : CodeGen_Posix(target) { - if (target.bits == 32) { -#if !defined(WITH_ARM) - user_error << "arm not enabled for this build of Halide."; -#endif - user_assert(llvm_ARM_enabled) << "llvm build not configured with ARM target enabled\n."; - } else { -#if !defined(WITH_AARCH64) - user_error << "aarch64 not enabled for this build of Halide."; -#endif - user_assert(llvm_AArch64_enabled) << "llvm build not configured with AArch64 target enabled.\n"; - } // RADDHN - Add and narrow with rounding // These must come before other narrowing rounding shift patterns @@ -162,8 +201,6 @@ CodeGen_ARM::CodeGen_ARM(const Target &target) // clang-format on } -namespace { - constexpr int max_intrinsic_args = 4; struct ArmIntrinsic { @@ -512,9 +549,7 @@ const ArmIntrinsic intrinsic_defs[] = { }; // clang-format on -} // namespace - -llvm::Function *CodeGen_ARM::define_concat_args_wrapper(llvm::Function *inner, const std::string &name) { +llvm::Function *CodeGen_ARM::define_concat_args_wrapper(llvm::Function *inner, const string &name) { llvm::FunctionType *inner_ty = inner->getFunctionType(); internal_assert(inner_ty->getNumParams() == 2); @@ -558,7 +593,7 @@ void CodeGen_ARM::init_module() { return; } - std::string prefix = target.bits == 32 ? "llvm.arm.neon." : "llvm.aarch64.neon."; + string prefix = target.bits == 32 ? "llvm.arm.neon." : "llvm.aarch64.neon."; for (const ArmIntrinsic &intrin : intrinsic_defs) { // Get the name of the intrinsic with the appropriate prefix. const char *intrin_name = nullptr; @@ -570,13 +605,13 @@ void CodeGen_ARM::init_module() { if (!intrin_name) { continue; } - std::string full_name = intrin_name; + string full_name = intrin_name; if (!starts_with(full_name, "llvm.")) { full_name = prefix + full_name; } // We might have to generate versions of this intrinsic with multiple widths. 
- std::vector width_factors = {1}; + vector width_factors = {1}; if (intrin.flags & ArmIntrinsic::HalfWidth) { width_factors.push_back(2); } @@ -585,7 +620,7 @@ void CodeGen_ARM::init_module() { Type ret_type = intrin.ret_type; ret_type = ret_type.with_lanes(ret_type.lanes() * width_factor); internal_assert(ret_type.bits() * ret_type.lanes() <= 128) << full_name << "\n"; - std::vector arg_types; + vector arg_types; arg_types.reserve(4); for (halide_type_t i : intrin.arg_types) { if (i.bits == 0) { @@ -603,7 +638,7 @@ void CodeGen_ARM::init_module() { mangled_name_builder << full_name; if (starts_with(full_name, "llvm.") && (intrin.flags & ArmIntrinsic::NoMangle) == 0) { // Append LLVM name mangling for either the return type or the arguments, or both. - std::vector types; + vector types; if (intrin.flags & ArmIntrinsic::MangleArgs) { types = arg_types; } else if (intrin.flags & ArmIntrinsic::MangleRetArgs) { @@ -622,12 +657,12 @@ void CodeGen_ARM::init_module() { mangled_name_builder << t.bits(); } } - std::string mangled_name = mangled_name_builder.str(); + string mangled_name = mangled_name_builder.str(); llvm::Function *intrin_impl = nullptr; if (intrin.flags & ArmIntrinsic::SplitArg0) { // This intrinsic needs a wrapper to split the argument. 
- std::string wrapper_name = intrin.name + unique_name("_wrapper"); + string wrapper_name = intrin.name + unique_name("_wrapper"); Type split_arg_type = arg_types[0].with_lanes(arg_types[0].lanes() / 2); llvm::Function *to_wrap = get_llvm_intrin(ret_type, mangled_name, {split_arg_type, split_arg_type}); intrin_impl = define_concat_args_wrapper(to_wrap, wrapper_name); @@ -1178,7 +1213,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init // clang-format on int factor = op->value.type().lanes() / op->type.lanes(); - std::vector matches; + vector matches; for (const Pattern &p : patterns) { if (op->op != p.reduce_op || factor % p.factor != 0) { continue; @@ -1208,7 +1243,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init // TODO: Move this to be patterns? The patterns are pretty trivial, but some // of the other logic is tricky. const char *intrin = nullptr; - std::vector intrin_args; + vector intrin_args; Expr accumulator = init; if (op->op == VectorReduce::Add && factor == 2) { Type narrow_type = op->type.narrow().with_lanes(op->value.type().lanes()); @@ -1340,5 +1375,20 @@ int CodeGen_ARM::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_ARM(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_ARM || WITH_AARCH64 + +std::unique_ptr new_CodeGen_ARM(const Target &target) { + user_error << "ARM not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_ARM || WITH_AARCH64 + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_ARM.h b/src/CodeGen_ARM.h deleted file mode 100644 index 39d1a39d7a0f..000000000000 --- a/src/CodeGen_ARM.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef HALIDE_CODEGEN_ARM_H -#define HALIDE_CODEGEN_ARM_H - -/** \file - * Defines the code-generator for producing ARM machine code - */ - -#include - -#include "CodeGen_Posix.h" - -namespace Halide { - -struct Target; - -namespace 
Internal { - -/** A code generator that emits ARM code from a given Halide stmt. */ -class CodeGen_ARM : public CodeGen_Posix { -public: - /** Create an ARM code generator for the given arm target. */ - CodeGen_ARM(const Target &); - -protected: - using CodeGen_Posix::visit; - - /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that - * takes one vector argument and splits it into two to call inner. */ - llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const std::string &name); - void init_module() override; - - /** Nodes for which we want to emit specific neon intrinsics */ - // @{ - void visit(const Cast *) override; - void visit(const Sub *) override; - void visit(const Mul *) override; - void visit(const Min *) override; - void visit(const Max *) override; - void visit(const Store *) override; - void visit(const Load *) override; - void visit(const Call *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void codegen_vector_reduce(const VectorReduce *, const Expr &) override; - // @} - - /** Various patterns to peephole match against */ - struct Pattern { - std::string intrin; ///< Name of the intrinsic - Expr pattern; ///< The pattern to match against - Pattern() = default; - Pattern(const std::string &intrin, Expr p) - : intrin(intrin), pattern(std::move(p)) { - } - }; - std::vector casts, averagings, negations; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - - // NEON can be disabled for older processors. 
- bool neon_intrinsics_disabled() { - return target.has_feature(Target::NoNEON); - } -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 223a5231da66..1f033fdcc33c 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1,7 +1,3 @@ -#include "CodeGen_Hexagon.h" - -#include -#include #include #include @@ -12,14 +8,11 @@ #include "Debug.h" #include "HexagonOptimize.h" #include "IREquality.h" -#include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" #include "IRPrinter.h" -#include "LICM.h" #include "LLVM_Headers.h" #include "LoopCarry.h" -#include "Monotonic.h" #include "Simplify.h" #include "Substitute.h" #include "Target.h" @@ -138,8 +131,6 @@ class CodeGen_Hexagon : public CodeGen_Posix { CodeGen_Hexagon::CodeGen_Hexagon(const Target &t) : CodeGen_Posix(t) { - user_assert(llvm_Hexagon_enabled) - << "llvm build not configured with Hexagon target enabled.\n"; if (target.has_feature(Halide::Target::HVX_v66)) { isa_version = 66; } else if (target.has_feature(Halide::Target::HVX_v65)) { @@ -2326,15 +2317,13 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { } // namespace -std::unique_ptr new_CodeGen_Hexagon(const Target &target, llvm::LLVMContext &context) { - std::unique_ptr ret(std::make_unique(target)); - ret->set_context(context); - return ret; +std::unique_ptr new_CodeGen_Hexagon(const Target &target) { + return std::make_unique(target); } #else // WITH_HEXAGON -std::unique_ptr new_CodeGen_Hexagon(const Target &target, llvm::LLVMContext &context) { +std::unique_ptr new_CodeGen_Hexagon(const Target &target) { user_error << "hexagon not enabled for this build of Halide.\n"; return nullptr; } diff --git a/src/CodeGen_Hexagon.h b/src/CodeGen_Hexagon.h deleted file mode 100644 index a844c594e6c7..000000000000 --- a/src/CodeGen_Hexagon.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef HALIDE_CODEGEN_HEXAGON_H -#define HALIDE_CODEGEN_HEXAGON_H - -/** \file - * Defines 
the code-generator for producing Hexagon machine code - */ - -#include - -namespace llvm { - -class LLVMContext; - -} - -namespace Halide { - -struct Target; - -namespace Internal { - -class CodeGen_Posix; - -std::unique_ptr new_CodeGen_Hexagon(const Target &target, llvm::LLVMContext &context); - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 97dbd8c225da..913cf0c6f786 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1,20 +1,13 @@ -#include #include #include -#include #include #include "CPlusPlusMangle.h" #include "CSE.h" -#include "CodeGen_ARM.h" -#include "CodeGen_Hexagon.h" #include "CodeGen_Internal.h" #include "CodeGen_LLVM.h" -#include "CodeGen_MIPS.h" -#include "CodeGen_PowerPC.h" -#include "CodeGen_RISCV.h" -#include "CodeGen_WebAssembly.h" -#include "CodeGen_X86.h" +#include "CodeGen_Posix.h" +#include "CodeGen_Targets.h" #include "CompilerLogger.h" #include "Debug.h" #include "Deinterleave.h" @@ -81,8 +74,7 @@ using std::vector; #define InitializeTarget(target) \ LLVMInitialize##target##Target(); \ LLVMInitialize##target##TargetInfo(); \ - LLVMInitialize##target##TargetMC(); \ - llvm_##target##_enabled = true; + LLVMInitialize##target##TargetMC(); #define InitializeAsmParser(target) \ LLVMInitialize##target##AsmParser(); @@ -222,41 +214,30 @@ CodeGen_LLVM::CodeGen_LLVM(const Target &t) initialize_llvm(); } -namespace { - -template -std::unique_ptr make_codegen(const Target &target, llvm::LLVMContext &context) { - std::unique_ptr ret = std::make_unique(target); - ret->set_context(context); - return ret; -} - -} // namespace - void CodeGen_LLVM::set_context(llvm::LLVMContext &context) { this->context = &context; } std::unique_ptr CodeGen_LLVM::new_for_target(const Target &target, llvm::LLVMContext &context) { + std::unique_ptr result; if (target.arch == Target::X86) { - return make_codegen(target, context); + result = new_CodeGen_X86(target); } else if (target.arch == 
Target::ARM) { - return make_codegen(target, context); + result = new_CodeGen_ARM(target); } else if (target.arch == Target::MIPS) { - return make_codegen(target, context); + result = new_CodeGen_MIPS(target); } else if (target.arch == Target::POWERPC) { - return make_codegen(target, context); + result = new_CodeGen_PowerPC(target); } else if (target.arch == Target::Hexagon) { - return new_CodeGen_Hexagon(target, context); + result = new_CodeGen_Hexagon(target); } else if (target.arch == Target::WebAssembly) { - return make_codegen(target, context); + result = new_CodeGen_WebAssembly(target); } else if (target.arch == Target::RISCV) { - return make_codegen(target, context); + result = new_CodeGen_RISCV(target); } - - user_error << "Unknown target architecture: " - << target.to_string() << "\n"; - return nullptr; + user_assert(result) << "Unknown target architecture: " << target.to_string() << "\n"; + result->set_context(context); + return result; } void CodeGen_LLVM::initialize_llvm() { @@ -352,17 +333,6 @@ CodeGen_LLVM::~CodeGen_LLVM() { delete builder; } -bool CodeGen_LLVM::llvm_X86_enabled = false; -bool CodeGen_LLVM::llvm_ARM_enabled = false; -bool CodeGen_LLVM::llvm_Hexagon_enabled = false; -bool CodeGen_LLVM::llvm_AArch64_enabled = false; -bool CodeGen_LLVM::llvm_NVPTX_enabled = false; -bool CodeGen_LLVM::llvm_Mips_enabled = false; -bool CodeGen_LLVM::llvm_PowerPC_enabled = false; -bool CodeGen_LLVM::llvm_AMDGPU_enabled = false; -bool CodeGen_LLVM::llvm_WebAssembly_enabled = false; -bool CodeGen_LLVM::llvm_RISCV_enabled = false; - namespace { struct MangledNames { diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 060fe9ee10ce..092bc7713b5b 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -138,21 +138,6 @@ class CodeGen_LLVM : public IRVisitor { * of functions as. 
*/ virtual Type upgrade_type_for_argument_passing(const Type &) const; - /** State needed by llvm for code generation, including the - * current module, function, context, builder, and most recently - * generated llvm value. */ - //@{ - static bool llvm_X86_enabled; - static bool llvm_ARM_enabled; - static bool llvm_Hexagon_enabled; - static bool llvm_AArch64_enabled; - static bool llvm_NVPTX_enabled; - static bool llvm_Mips_enabled; - static bool llvm_PowerPC_enabled; - static bool llvm_AMDGPU_enabled; - static bool llvm_WebAssembly_enabled; - static bool llvm_RISCV_enabled; - std::unique_ptr module; llvm::Function *function; llvm::LLVMContext *context; diff --git a/src/CodeGen_MIPS.cpp b/src/CodeGen_MIPS.cpp index ddc0b9d9c9ea..4118a12b684f 100644 --- a/src/CodeGen_MIPS.cpp +++ b/src/CodeGen_MIPS.cpp @@ -1,20 +1,32 @@ -#include "CodeGen_MIPS.h" -#include "LLVM_Headers.h" -#include "Util.h" +#include "CodeGen_Posix.h" namespace Halide { namespace Internal { using std::string; -using namespace llvm; +#if defined(WITH_MIPS) + +namespace { + +/** A code generator that emits mips code from a given Halide stmt. */ +class CodeGen_MIPS : public CodeGen_Posix { +public: + /** Create a mips code generator. Processor features can be + * enabled using the appropriate flags in the target struct. 
*/ + CodeGen_MIPS(const Target &); + +protected: + using CodeGen_Posix::visit; + + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; +}; CodeGen_MIPS::CodeGen_MIPS(const Target &t) : CodeGen_Posix(t) { -#if !defined(WITH_MIPS) - user_error << "llvm build not configured with MIPS target enabled.\n"; -#endif - user_assert(llvm_Mips_enabled) << "llvm build not configured with MIPS target enabled.\n"; } string CodeGen_MIPS::mcpu() const { @@ -41,5 +53,20 @@ int CodeGen_MIPS::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_MIPS(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_MIPS + +std::unique_ptr new_CodeGen_MIPS(const Target &target) { + user_error << "MIPS not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_MIPS + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_MIPS.h b/src/CodeGen_MIPS.h deleted file mode 100644 index fe5e2d2cb12e..000000000000 --- a/src/CodeGen_MIPS.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef HALIDE_CODEGEN_MIPS_H -#define HALIDE_CODEGEN_MIPS_H - -/** \file - * Defines the code-generator for producing MIPS machine code. - */ - -#include "CodeGen_Posix.h" - -namespace Halide { -namespace Internal { - -/** A code generator that emits mips code from a given Halide stmt. */ -class CodeGen_MIPS : public CodeGen_Posix { -public: - /** Create a mips code generator. Processor features can be - * enabled using the appropriate flags in the target struct. 
*/ - CodeGen_MIPS(const Target &); - - static void test(); - -protected: - using CodeGen_Posix::visit; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index efe0b25bd2c0..26822cda2296 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -37,6 +37,8 @@ using namespace Halide::ConciseCasts; using namespace llvm; +#ifdef WITH_NVPTX + namespace { /** A code generator that emits GPU code from a given Halide stmt. */ @@ -111,11 +113,6 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev { CodeGen_PTX_Dev::CodeGen_PTX_Dev(const Target &host) : CodeGen_LLVM(host) { -#if !defined(WITH_NVPTX) - user_error << "ptx not enabled for this build of Halide.\n"; -#endif - user_assert(llvm_NVPTX_enabled) << "llvm build not configured with nvptx target enabled\n."; - context = new llvm::LLVMContext(); } @@ -229,9 +226,7 @@ void CodeGen_PTX_Dev::add_kernel(Stmt stmt, void CodeGen_PTX_Dev::init_module() { init_context(); -#ifdef WITH_NVPTX module = get_initial_module_for_ptx_device(target, context); -#endif declare_intrin_overload("dp4a", Int(32), "dp4a_s32_s32", {Int(8, 4), Int(8, 4), Int(32)}); declare_intrin_overload("dp4a", Int(32), "dp4a_s32_u32", {Int(8, 4), UInt(8, 4), Int(32)}); @@ -587,9 +582,6 @@ bool CodeGen_PTX_Dev::use_soft_float_abi() const { } vector CodeGen_PTX_Dev::compile_to_src() { - -#ifdef WITH_NVPTX - debug(2) << "In CodeGen_PTX_Dev::compile_to_src"; // DISABLED - hooked in here to force PrintBeforeAll option - seems to be the only way? 
@@ -755,9 +747,6 @@ vector CodeGen_PTX_Dev::compile_to_src() { // Null-terminate the ptx source buffer.push_back(0); return buffer; -#else // WITH_NVPTX - return vector(); -#endif } int CodeGen_PTX_Dev::native_vector_bits() const { @@ -801,5 +790,14 @@ std::unique_ptr new_CodeGen_PTX_Dev(const Target &target) { return std::make_unique(target); } +#else // WITH_PTX + +std::unique_ptr new_CodeGen_PTX_Dev(const Target &target) { + user_error << "PTX not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_PTX + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp index e81d6f281b93..312649280f62 100644 --- a/src/CodeGen_PowerPC.cpp +++ b/src/CodeGen_PowerPC.cpp @@ -1,9 +1,4 @@ -#include "CodeGen_PowerPC.h" -#include "ConciseCasts.h" -#include "IRMatch.h" -#include "IROperator.h" -#include "LLVM_Headers.h" -#include "Util.h" +#include "CodeGen_Posix.h" namespace Halide { namespace Internal { @@ -11,19 +6,38 @@ namespace Internal { using std::string; using std::vector; -using namespace Halide::ConciseCasts; -using namespace llvm; +#if defined(WITH_POWERPC) + +namespace { + +/** A code generator that emits mips code from a given Halide stmt. */ +class CodeGen_PowerPC : public CodeGen_Posix { +public: + /** Create a powerpc code generator. Processor features can be + * enabled using the appropriate flags in the target struct. 
*/ + CodeGen_PowerPC(const Target &); + +protected: + void init_module() override; + + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; + + using CodeGen_Posix::visit; + + /** Nodes for which we want to emit specific PowerPC intrinsics */ + // @{ + void visit(const Min *) override; + void visit(const Max *) override; + // @} +}; CodeGen_PowerPC::CodeGen_PowerPC(const Target &t) : CodeGen_Posix(t) { -#if !defined(WITH_POWERPC) - user_error << "llvm build not configured with PowerPC target enabled.\n"; -#endif - user_assert(llvm_PowerPC_enabled) << "llvm build not configured with PowerPC target enabled.\n"; } -namespace { - const int max_intrinsic_args = 4; struct PowerPCIntrinsic { @@ -81,8 +95,6 @@ const PowerPCIntrinsic intrinsic_defs[] = { }; // clang-format on -} // namespace - void CodeGen_PowerPC::init_module() { CodeGen_Posix::init_module(); @@ -92,7 +104,7 @@ void CodeGen_PowerPC::init_module() { } Type ret_type = i.ret_type; - std::vector arg_types; + vector arg_types; arg_types.reserve(max_intrinsic_args); for (halide_type_t j : i.arg_types) { if (j.bits == 0) { @@ -140,9 +152,9 @@ string CodeGen_PowerPC::mcpu() const { } string CodeGen_PowerPC::mattrs() const { - std::string features; - std::string separator; - std::string enable; + string features; + string separator; + string enable; features += "+altivec"; separator = ","; @@ -172,5 +184,20 @@ int CodeGen_PowerPC::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_PowerPC(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_POWERPC + +std::unique_ptr new_CodeGen_PowerPC(const Target &target) { + user_error << "PowerPC not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_POWERPC + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_PowerPC.h b/src/CodeGen_PowerPC.h deleted file mode 100644 
index c23cc5011e68..000000000000 --- a/src/CodeGen_PowerPC.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef HALIDE_CODEGEN_POWERPC_H -#define HALIDE_CODEGEN_POWERPC_H - -/** \file - * Defines the code-generator for producing POWERPC machine code. - */ - -#include "CodeGen_Posix.h" - -namespace Halide { -namespace Internal { - -/** A code generator that emits mips code from a given Halide stmt. */ -class CodeGen_PowerPC : public CodeGen_Posix { -public: - /** Create a powerpc code generator. Processor features can be - * enabled using the appropriate flags in the target struct. */ - CodeGen_PowerPC(const Target &); - - static void test(); - -protected: - void init_module() override; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - - using CodeGen_Posix::visit; - - /** Nodes for which we want to emit specific PowerPC intrinsics */ - // @{ - void visit(const Min *) override; - void visit(const Max *) override; - // @} -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index 3fa9e3a529c3..01395f596b91 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -1,19 +1,33 @@ -#include "CodeGen_RISCV.h" -#include "LLVM_Headers.h" -#include "Util.h" +#include "CodeGen_Posix.h" namespace Halide { namespace Internal { using std::string; -using namespace llvm; +#if defined(WITH_RISCV) + +namespace { + +/** A code generator that emits mips code from a given Halide stmt. */ +class CodeGen_RISCV : public CodeGen_Posix { +public: + /** Create a mips code generator. Processor features can be + * enabled using the appropriate flags in the target struct. 
*/ + CodeGen_RISCV(const Target &); + +protected: + using CodeGen_Posix::visit; + + string mcpu() const override; + string mattrs() const override; + string mabi() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; +}; CodeGen_RISCV::CodeGen_RISCV(const Target &t) : CodeGen_Posix(t) { -#if !defined(WITH_RISCV) - user_error << "llvm build not configured with RISCV target enabled.\n"; -#endif } string CodeGen_RISCV::mcpu() const { @@ -57,5 +71,20 @@ int CodeGen_RISCV::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_RISCV(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_RISCV + +std::unique_ptr new_CodeGen_RISCV(const Target &target) { + user_error << "RISCV not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_RISCV + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_RISCV.h b/src/CodeGen_RISCV.h deleted file mode 100644 index d6cb8328dec5..000000000000 --- a/src/CodeGen_RISCV.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef HALIDE_CODEGEN_RISCV_H -#define HALIDE_CODEGEN_RISCV_H - -/** \file - * Defines the code-generator for producing RISCV machine code. - */ - -#include "CodeGen_Posix.h" - -namespace Halide { -namespace Internal { - -/** A code generator that emits mips code from a given Halide stmt. */ -class CodeGen_RISCV : public CodeGen_Posix { -public: - /** Create a mips code generator. Processor features can be - * enabled using the appropriate flags in the target struct. 
*/ - CodeGen_RISCV(const Target &); - -protected: - using CodeGen_Posix::visit; - - std::string mcpu() const override; - std::string mattrs() const override; - std::string mabi() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_Targets.h b/src/CodeGen_Targets.h new file mode 100644 index 000000000000..1667703fe5e3 --- /dev/null +++ b/src/CodeGen_Targets.h @@ -0,0 +1,30 @@ +#ifndef HALIDE_CODEGEN_TARGETS_H +#define HALIDE_CODEGEN_TARGETS_H + +/** \file + * Provides constructors for code generators for various targets. + */ + +#include + +namespace Halide { + +struct Target; + +namespace Internal { + +class CodeGen_Posix; + +/** Construct CodeGen object for a variety of targets. */ +std::unique_ptr new_CodeGen_ARM(const Target &target); +std::unique_ptr new_CodeGen_Hexagon(const Target &target); +std::unique_ptr new_CodeGen_MIPS(const Target &target); +std::unique_ptr new_CodeGen_PowerPC(const Target &target); +std::unique_ptr new_CodeGen_RISCV(const Target &target); +std::unique_ptr new_CodeGen_X86(const Target &target); +std::unique_ptr new_CodeGen_WebAssembly(const Target &target); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 6d5292195abe..fe7ee2fe6db0 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -1,33 +1,37 @@ -#include "CodeGen_WebAssembly.h" - -#include "ConciseCasts.h" -#include "IRMatch.h" -#include "IROperator.h" -#include "LLVM_Headers.h" -#include "Util.h" +#include "CodeGen_Posix.h" #include namespace Halide { namespace Internal { -using namespace Halide::ConciseCasts; -using namespace llvm; using std::string; -using std::vector; + +#if defined(WITH_WEBASSEMBLY) + +namespace { + +/** A code generator that emits WebAssembly code from a given Halide stmt. 
*/ +class CodeGen_WebAssembly : public CodeGen_Posix { +public: + CodeGen_WebAssembly(const Target &); + +protected: + using CodeGen_Posix::visit; + + void init_module() override; + + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; + bool use_pic() const override; +}; CodeGen_WebAssembly::CodeGen_WebAssembly(const Target &t) : CodeGen_Posix(t) { -#if !defined(WITH_WEBASSEMBLY) - user_error << "llvm build not configured with WebAssembly target enabled.\n"; -#endif - user_assert(LLVM_VERSION >= 110) << "Generating WebAssembly is only supported under LLVM 11+."; - user_assert(llvm_WebAssembly_enabled) << "llvm build not configured with WebAssembly target enabled.\n"; - user_assert(target.bits == 32) << "Only wasm32 is supported."; } -namespace { - constexpr int max_intrinsic_args = 4; struct WasmIntrinsic { @@ -65,8 +69,6 @@ const WasmIntrinsic intrinsic_defs[] = { }; // clang-format on -} // namespace - void CodeGen_WebAssembly::init_module() { CodeGen_Posix::init_module(); @@ -140,5 +142,22 @@ int CodeGen_WebAssembly::native_vector_bits() const { return 128; } +} // namespace + +std::unique_ptr new_CodeGen_WebAssembly(const Target &target) { + user_assert(LLVM_VERSION >= 110) << "Generating WebAssembly is only supported under LLVM 11+."; + user_assert(target.bits == 32) << "Only wasm32 is supported."; + return std::make_unique(target); +} + +#else // WITH_WEBASSEMBLY + +std::unique_ptr new_CodeGen_WebAssembly(const Target &target) { + user_error << "WebAssembly not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_WEBASSEMBLY + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_WebAssembly.h b/src/CodeGen_WebAssembly.h deleted file mode 100644 index ffbedae8d907..000000000000 --- a/src/CodeGen_WebAssembly.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef HALIDE_CODEGEN_WEBASSEMBLY_H -#define HALIDE_CODEGEN_WEBASSEMBLY_H - -/** 
\file - * Defines the code-generator for producing WebAssembly machine code. - */ - -#include "CodeGen_Posix.h" - -namespace Halide { -namespace Internal { - -/** A code generator that emits WebAssembly code from a given Halide stmt. */ -class CodeGen_WebAssembly : public CodeGen_Posix { -public: - CodeGen_WebAssembly(const Target &); - - static void test(); - -protected: - using CodeGen_Posix::visit; - - void init_module() override; - - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - bool use_pic() const override; -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 83cdae86dbdd..fbfc4881e70d 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1,17 +1,12 @@ -#include - -#include "CodeGen_X86.h" +#include "CodeGen_Posix.h" #include "ConciseCasts.h" #include "Debug.h" #include "IRMatch.h" #include "IRMutator.h" #include "IROperator.h" -#include "JITModule.h" #include "LLVM_Headers.h" -#include "Param.h" #include "Simplify.h" #include "Util.h" -#include "Var.h" namespace Halide { namespace Internal { @@ -22,7 +17,10 @@ using std::vector; using namespace Halide::ConciseCasts; using namespace llvm; +#if defined(WITH_X86) + namespace { + // Populate feature flags in a target according to those implied by // existing flags, so that instruction patterns can just check for the // oldest feature flag that supports an instruction. @@ -46,20 +44,49 @@ Target complete_x86_target(Target t) { } return t; } -} // namespace + +/** A code generator that emits x86 code from a given Halide stmt. */ +class CodeGen_X86 : public CodeGen_Posix { +public: + /** Create an x86 code generator. Processor features can be + * enabled using the appropriate flags in the target struct. 
*/ + CodeGen_X86(Target); + +protected: + string mcpu() const override; + string mattrs() const override; + bool use_soft_float_abi() const override; + int native_vector_bits() const override; + + int vector_lanes_for_slice(const Type &t) const; + + llvm::Type *llvm_type_of(const Type &t) const override; + + using CodeGen_Posix::visit; + + void init_module() override; + + /** Nodes for which we want to emit specific sse/avx intrinsics */ + // @{ + void visit(const Add *) override; + void visit(const Sub *) override; + void visit(const Cast *) override; + void visit(const Call *) override; + void visit(const GT *) override; + void visit(const LT *) override; + void visit(const LE *) override; + void visit(const GE *) override; + void visit(const EQ *) override; + void visit(const NE *) override; + void visit(const Select *) override; + void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; + // @} +}; CodeGen_X86::CodeGen_X86(Target t) : CodeGen_Posix(complete_x86_target(t)) { - -#if !defined(WITH_X86) - user_error << "x86 not enabled for this build of Halide.\n"; -#endif - - user_assert(llvm_X86_enabled) << "llvm build not configured with X86 target enabled.\n"; } -namespace { - const int max_intrinsic_args = 4; struct x86Intrinsic { @@ -160,8 +187,6 @@ const x86Intrinsic intrinsic_defs[] = { }; // clang-format on -} // namespace - void CodeGen_X86::init_module() { CodeGen_Posix::init_module(); @@ -171,7 +196,7 @@ void CodeGen_X86::init_module() { } Type ret_type = i.ret_type; - std::vector arg_types; + vector arg_types; arg_types.reserve(max_intrinsic_args); for (halide_type_t j : i.arg_types) { if (j.bits == 0) { @@ -184,8 +209,6 @@ void CodeGen_X86::init_module() { } } -namespace { - // i32(i16_a)*i32(i16_b) +/- i32(i16_c)*i32(i16_d) can be done by // interleaving a, c, and b, d, and then using pmaddwd. We // recognize it here, and implement it in the initial module. 
@@ -230,8 +253,6 @@ bool should_use_pmaddwd(const Expr &a, const Expr &b, vector &result) { return false; } -} // namespace - void CodeGen_X86::visit(const Add *op) { vector matches; if (should_use_pmaddwd(op->a, op->b, matches)) { @@ -581,8 +602,8 @@ string CodeGen_X86::mcpu() const { } string CodeGen_X86::mattrs() const { - std::string features; - std::string separator; + string features; + string separator; if (target.has_feature(Target::FMA)) { features += "+fma"; separator = ","; @@ -669,5 +690,20 @@ llvm::Type *CodeGen_X86::llvm_type_of(const Type &t) const { } } +} // namespace + +std::unique_ptr new_CodeGen_X86(const Target &target) { + return std::make_unique(target); +} + +#else // WITH_X86 + +std::unique_ptr new_CodeGen_X86(const Target &target) { + user_error << "x86 not enabled for this build of Halide.\n"; + return nullptr; +} + +#endif // WITH_X86 + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_X86.h b/src/CodeGen_X86.h deleted file mode 100644 index 5d1d503df6a0..000000000000 --- a/src/CodeGen_X86.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef HALIDE_CODEGEN_X86_H -#define HALIDE_CODEGEN_X86_H - -/** \file - * Defines the code-generator for producing x86 machine code - */ - -#include "CodeGen_Posix.h" - -namespace llvm { -class JITEventListener; -} - -namespace Halide { -namespace Internal { - -/** A code generator that emits x86 code from a given Halide stmt. */ -class CodeGen_X86 : public CodeGen_Posix { -public: - /** Create an x86 code generator. Processor features can be - * enabled using the appropriate flags in the target struct. 
*/ - CodeGen_X86(Target); - -protected: - std::string mcpu() const override; - std::string mattrs() const override; - bool use_soft_float_abi() const override; - int native_vector_bits() const override; - - int vector_lanes_for_slice(const Type &t) const; - - llvm::Type *llvm_type_of(const Type &t) const override; - - using CodeGen_Posix::visit; - - void init_module() override; - - /** Nodes for which we want to emit specific sse/avx intrinsics */ - // @{ - void visit(const Add *) override; - void visit(const Sub *) override; - void visit(const Cast *) override; - void visit(const Call *) override; - void visit(const GT *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void visit(const GE *) override; - void visit(const EQ *) override; - void visit(const NE *) override; - void visit(const Select *) override; - void codegen_vector_reduce(const VectorReduce *, const Expr &init) override; - // @} -}; - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 187c735dbb97..ed9743a9ac05 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -298,7 +298,7 @@ class InjectGpuOffload : public IRMutator { internal_assert(!cgdev.empty()) << "Requested unknown GPU target: " << target.to_string() << "\n"; } - Stmt inject(Stmt s) { + Stmt inject(const Stmt &s) { // Create a new module for all of the kernels we find in this function. for (auto &i : cgdev) { i.second->init_module(); diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index 8415e0eca39d..71136eb02be8 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -1,6 +1,6 @@ #include "WasmExecutor.h" -#include "CodeGen_WebAssembly.h" +#include "CodeGen_Targets.h" #include "Error.h" #include "Float16.h" #include "Func.h" @@ -285,7 +285,7 @@ std::vector compile_to_wasm(const Module &module, const std::string &fn_na // for the alloca usage. 
size_t stack_size = 65536; { - std::unique_ptr cg(new CodeGen_WebAssembly(module.target())); + std::unique_ptr cg(new_CodeGen_WebAssembly(module.target())); cg->set_context(context); fn_module = cg->compile(module); stack_size += cg->get_requested_alloca_total(); From 2ab52cb2ba88e3cb33f669e70ba8a60f963fa66e Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 17:43:44 -0700 Subject: [PATCH 07/19] clang-format --- src/CodeGen_ARM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 31cd8d7d8e7f..e43097b95180 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -65,7 +65,7 @@ class CodeGen_ARM : public CodeGen_Posix { /** Various patterns to peephole match against */ struct Pattern { string intrin; ///< Name of the intrinsic - Expr pattern; ///< The pattern to match against + Expr pattern; ///< The pattern to match against Pattern() = default; Pattern(const string &intrin, Expr p) : intrin(intrin), pattern(std::move(p)) { From 7c0c5ddee2463ee87d9abb1e8f5aedd8b4097455 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Fri, 26 Feb 2021 18:15:21 -0700 Subject: [PATCH 08/19] Pass type arguments correctly. --- src/OffloadGPULoops.cpp | 18 +++++------------- src/OffloadGPULoops.h | 2 +- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index ed9743a9ac05..f25b3ded42bd 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -92,15 +92,6 @@ class ExtractBounds : public IRVisitor { } }; -Expr make_type_arg(const Type &t) { - vector args = { - cast(t.code()), - cast(t.bits()), - cast(t.lanes()), - }; - return Call::make(type_of(), Call::make_struct, args, Call::Intrinsic); -} - class InjectGpuOffload : public IRMutator { /** Child code generator for device kernels. 
*/ map> cgdev; @@ -229,7 +220,8 @@ class InjectGpuOffload : public IRMutator { args.push_back(val); if (runtime_run_takes_types) { - arg_types_or_sizes.push_back(make_type_arg(i.type.with_lanes(1))); + internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); + arg_types_or_sizes.push_back(Expr(*(const uint32_t*)&i.type)); } else { arg_types_or_sizes.push_back(cast(target_size_t_type, i.is_buffer ? 8 : i.type.bytes())); } @@ -240,8 +232,8 @@ class InjectGpuOffload : public IRMutator { // nullptr-terminate the lists args.push_back(reinterpret(Handle(), cast(0))); if (runtime_run_takes_types) { - internal_assert(halide_type_int == 0); - arg_types_or_sizes.push_back(make_type_arg(Type(halide_type_int, 0, 0))); + internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); + arg_types_or_sizes.push_back(cast(0)); } else { arg_types_or_sizes.push_back(cast(target_size_t_type, 0)); } @@ -337,7 +329,7 @@ class InjectGpuOffload : public IRMutator { } // namespace -Stmt inject_gpu_offload(Stmt s, const Target &host_target) { +Stmt inject_gpu_offload(const Stmt &s, const Target &host_target) { return InjectGpuOffload(host_target).inject(s); } diff --git a/src/OffloadGPULoops.h b/src/OffloadGPULoops.h index c513a35fd8d5..d927f1a8b780 100644 --- a/src/OffloadGPULoops.h +++ b/src/OffloadGPULoops.h @@ -17,7 +17,7 @@ namespace Internal { /** Pull loops marked with GPU device APIs to a separate * module, and call them through the appropriate host runtime module. 
*/ -Stmt inject_gpu_offload(Stmt s, const Target &host_target); +Stmt inject_gpu_offload(const Stmt &s, const Target &host_target); } // namespace Internal } // namespace Halide From 902453332ad17dd560f6f4ae638864a5ab929dc7 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 26 Feb 2021 17:23:22 -0800 Subject: [PATCH 09/19] Update OffloadGPULoops.cpp --- src/OffloadGPULoops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index f25b3ded42bd..cc34469c9cbe 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -221,7 +221,7 @@ class InjectGpuOffload : public IRMutator { if (runtime_run_takes_types) { internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); - arg_types_or_sizes.push_back(Expr(*(const uint32_t*)&i.type)); + arg_types_or_sizes.push_back(Expr(*(const uint32_t *)&i.type)); } else { arg_types_or_sizes.push_back(cast(target_size_t_type, i.is_buffer ? 8 : i.type.bytes())); } From 25107fb9d3a6f3a6c0551d39076c99d18197d02a Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sun, 28 Feb 2021 09:51:00 -0800 Subject: [PATCH 10/19] trigger buildbots From 571fda3bbb056120af33528b003f364271a4138c Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sun, 28 Feb 2021 09:51:13 -0800 Subject: [PATCH 11/19] trigger buildbots From bf62dc4b723e939a1d843a1cebb40a89eb24eeb7 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 15:04:56 -0700 Subject: [PATCH 12/19] Hack around tests that rely on the IR for offloaded GPU loops. 
--- src/Lower.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/Lower.cpp b/src/Lower.cpp index 6855aed4ec61..07cb56f3556c 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -441,15 +441,9 @@ Module lower(const vector &output_funcs, debug(1) << "Skipping Hexagon offload...\n"; } - if (t.has_gpu_feature()) { - debug(1) << "Offloading GPU loops...\n"; - s = inject_gpu_offload(s, t); - debug(2) << "Lowering after splitting off GPU loops:\n" - << s << "\n\n"; - } else { - debug(1) << "Skipping GPU offload...\n"; - } - + // TODO: Several tests depend on these custom passes running before + // inject_gpu_offload. We should either make this consistent with + // inject_hexagon_rpc above, or find a way to avoid this dependency. if (!custom_passes.empty()) { for (size_t i = 0; i < custom_passes.size(); i++) { debug(1) << "Running custom lowering pass " << i << "...\n"; @@ -459,6 +453,15 @@ Module lower(const vector &output_funcs, } } + if (t.has_gpu_feature()) { + debug(1) << "Offloading GPU loops...\n"; + s = inject_gpu_offload(s, t); + debug(2) << "Lowering after splitting off GPU loops:\n" + << s << "\n\n"; + } else { + debug(1) << "Skipping GPU offload...\n"; + } + vector public_args = args; for (const auto &out : outputs) { for (const Parameter &buf : out.output_buffers()) { From 5ee9236161d149c5087e475d7eb2dd5a205bf9e7 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 15:22:51 -0700 Subject: [PATCH 13/19] Fix missing include. 
--- src/WasmExecutor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index 71136eb02be8..6a6208d47ba7 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -1,5 +1,6 @@ #include "WasmExecutor.h" +#include "CodeGen_Posix.h" #include "CodeGen_Targets.h" #include "Error.h" #include "Float16.h" From dc7e61dbbc7d04999c5e7f8aee7993f68ba6477d Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 15:26:14 -0700 Subject: [PATCH 14/19] Remove unused include. --- test/internal.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/internal.cpp b/test/internal.cpp index bb50f45390d8..35e9fa4efcbc 100644 --- a/test/internal.cpp +++ b/test/internal.cpp @@ -5,7 +5,6 @@ #include "CSE.h" #include "CodeGen_C.h" #include "CodeGen_PyTorch.h" -#include "CodeGen_X86.h" #include "Deinterleave.h" #include "Func.h" #include "Generator.h" From 4a590959f377395a3c32e2ad57a145fd159eb36e Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 15:28:34 -0700 Subject: [PATCH 15/19] clang-tidy --- src/OffloadGPULoops.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index cc34469c9cbe..1c5b38f4738c 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -217,27 +217,27 @@ class InjectGpuOffload : public IRMutator { val = Variable::make(i.type, i.name); val = Call::make(type_of(), Call::make_struct, {val}, Call::Intrinsic); } - args.push_back(val); + args.emplace_back(val); if (runtime_run_takes_types) { internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); - arg_types_or_sizes.push_back(Expr(*(const uint32_t *)&i.type)); + arg_types_or_sizes.emplace_back(Expr(*(const uint32_t *)&i.type)); } else { - arg_types_or_sizes.push_back(cast(target_size_t_type, i.is_buffer ? 8 : i.type.bytes())); + arg_types_or_sizes.emplace_back(cast(target_size_t_type, i.is_buffer ? 
8 : i.type.bytes())); } - arg_is_buffer.push_back(cast<uint8_t>(i.is_buffer)); + arg_is_buffer.emplace_back(cast<uint8_t>(i.is_buffer)); } // nullptr-terminate the lists - args.push_back(reinterpret(Handle(), cast<uint64_t>(0))); + args.emplace_back(reinterpret(Handle(), cast<uint64_t>(0))); if (runtime_run_takes_types) { internal_assert(sizeof(halide_type_t) == sizeof(uint32_t)); - arg_types_or_sizes.push_back(cast<uint32_t>(0)); + arg_types_or_sizes.emplace_back(cast<uint32_t>(0)); } else { - arg_types_or_sizes.push_back(cast(target_size_t_type, 0)); + arg_types_or_sizes.emplace_back(cast(target_size_t_type, 0)); } - arg_is_buffer.push_back(cast<uint8_t>(0)); + arg_is_buffer.emplace_back(cast<uint8_t>(0)); // TODO: only three dimensions can be passed to // cuLaunchKernel. How should we handle blkid[3]? From db8841186f6d3dc09086096022c1b04e9f6ce040 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Sun, 28 Feb 2021 21:49:42 -0700 Subject: [PATCH 16/19] Use custom lowering pass to see code before GPU offloading --- test/correctness/trim_no_ops.cpp | 50 +++++++++++++++----------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/test/correctness/trim_no_ops.cpp b/test/correctness/trim_no_ops.cpp index 3453cc74981e..c2773bd0b635 100644 --- a/test/correctness/trim_no_ops.cpp +++ b/test/correctness/trim_no_ops.cpp @@ -2,7 +2,7 @@ using namespace Halide; -class CountConditionals : public Internal::IRVisitor { +class CountConditionals : public Internal::IRMutator { public: int count = 0; int count_if = 0; @@ -10,33 +10,27 @@ class CountConditionals : public Internal::IRVisitor { bool in_produce = false; private: - using Internal::IRVisitor::visit; + using Internal::IRMutator::visit; - void visit(const Internal::Select *op) override { + Expr visit(const Internal::Select *op) override { if (in_produce) { count++; count_select++; } - Internal::IRVisitor::visit(op); + return Internal::IRMutator::visit(op); } - void visit(const Internal::IfThenElse *op) override { + Internal::Stmt visit(const Internal::IfThenElse *op) override { 
if (in_produce) { count++; count_if++; } - Internal::IRVisitor::visit(op); + return Internal::IRMutator::visit(op); } - void visit(const Internal::ProducerConsumer *op) override { - if (op->is_producer) { - bool old_in_produce = in_produce; - in_produce = true; - Internal::IRVisitor::visit(op); - in_produce = old_in_produce; - } else { - IRVisitor::visit(op); - } + Internal::Stmt visit(const Internal::ProducerConsumer *op) override { + Internal::ScopedValue<bool> v(in_produce, op->is_producer); + return Internal::IRMutator::visit(op); } }; @@ -52,10 +46,10 @@ int main(int argc, char **argv) { f(x) *= select(x > 20 && x < 30, 2, 1); f(x) = select(x >= 60 && x <= 100, 100 - f(x), f(x)); - // There should be no selects or ifs after trim_no_ops runs - Module m = f.compile_to_module({}); CountConditionals s; - m.functions().front().body.accept(&s); + f.add_custom_lowering_pass(&s, []() {}); + Module m = f.compile_to_module({}); + if (s.count != 0) { std::cerr << "There were conditionals in the lowered code: \n" << m.functions().front().body << "\n"; @@ -86,11 +80,12 @@ int main(int argc, char **argv) { Var x, y; f(x, y) = x + y; f(x, y) += select((x == 10) && (x < y), 1, 0); - Module m = f.compile_to_module({}); // There should be no selects after trim_no_ops runs CountConditionals s; - m.functions().front().body.accept(&s); + f.add_custom_lowering_pass(&s, []() {}); + Module m = f.compile_to_module({}); + if (s.count != 0) { std::cerr << "There were selects in the lowered code: \n" << m.functions().front().body << "\n"; @@ -128,9 +123,10 @@ int main(int argc, char **argv) { hist(f(clamp(xi, 0, 73), clamp(yi, 0, 73))) += select(xi >= 0 && xi <= 73 && yi >= 0 && yi <= 73, 1, 0); - Module m = hist.compile_to_module({}); CountConditionals s; - m.functions().front().body.accept(&s); + hist.add_custom_lowering_pass(&s, []() {}); + Module m = hist.compile_to_module({}); + if (s.count != 0) { std::cerr << "There were selects in the lowered code: \n" << m.functions().front().body << 
"\n"; @@ -169,9 +165,10 @@ int main(int argc, char **argv) { f.tile(x, y, xi, yi, 4, 4); // Check there are no if statements. - Module m = f.compile_to_module({}); CountConditionals s; - m.functions().front().body.accept(&s); + f.add_custom_lowering_pass(&s, []() {}); + Module m = f.compile_to_module({}); + if (s.count != 0) { std::cerr << "There were selects or ifs in the lowered code: \n" << m.functions().front().body << "\n"; @@ -207,9 +204,10 @@ int main(int argc, char **argv) { // if condition since it depends on gpu outer loop r.y Target gpu_target(get_host_target()); gpu_target.set_feature(Target::CUDA); - Module m = f.compile_to_module({}, "", gpu_target); CountConditionals s; - m.functions().front().body.accept(&s); + f.add_custom_lowering_pass(&s, []() {}); + Module m = f.compile_to_module({}, "", gpu_target); + if (s.count_select != 0) { std::cerr << "There were selects in the lowered code: \n" << m.functions().front().body << "\n"; From 18b459ba2f863ee576019f21aaefeaf643cd2fb8 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 1 Mar 2021 10:44:39 -0700 Subject: [PATCH 17/19] Speculative fix for segfault --- test/correctness/trim_no_ops.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/correctness/trim_no_ops.cpp b/test/correctness/trim_no_ops.cpp index c2773bd0b635..c989a3a5c8f9 100644 --- a/test/correctness/trim_no_ops.cpp +++ b/test/correctness/trim_no_ops.cpp @@ -116,6 +116,7 @@ int main(int argc, char **argv) { f.compute_root(); Func hist; + Buffer<int> hist_result; { RDom r(0, 10, 0, 10, 0, 10, 0, 10); Expr xi = r[0] + r[2] * 10, yi = r[1] + r[3] * 10; @@ -132,8 +133,8 @@ int main(int argc, char **argv) { << m.functions().front().body << "\n"; return -1; } + hist_result = hist.realize({256}); } - Buffer<int> hist_result = hist.realize({256}); // Also check the output is correct. 
Func true_hist; From 04d4a1f870579d2a9bdb9b8b6c6a611fce4c4c32 Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 1 Mar 2021 12:51:43 -0700 Subject: [PATCH 18/19] Fix const correctness --- src/CodeGen_C.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index ac490283738e..2516fe26a71f 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -1728,7 +1728,8 @@ string CodeGen_C::print_assignment(Type t, const std::string &rhs) { auto cached = cache.find(rhs); if (cached == cache.end()) { id = unique_name('_'); - stream << get_indent() << print_type(t, AppendSpace) << (output_kind == CPlusPlusImplementation ? "const " : "") << id << " = " << rhs << ";\n"; + const char *const_flag = output_kind == CPlusPlusImplementation ? "const " : ""; + stream << get_indent() << print_type(t, AppendSpace) << const_flag << id << " = " << rhs << ";\n"; cache[rhs] = id; } else { id = cached->second; @@ -2316,7 +2317,8 @@ void CodeGen_C::visit(const Load *op) { bool type_cast_needed = !(allocations.contains(op->name) && allocations.get(op->name).type.element_of() == t.element_of()); if (type_cast_needed) { - rhs << "((const " << print_type(t.element_of()) << " *)" << name << ")"; + const char *const_flag = output_kind == CPlusPlusImplementation ? "const " : ""; + rhs << "((" << const_flag << " " << print_type(t.element_of()) << " *)" << name << ")"; } else { rhs << name; } From 7abf29deefcaf2d6596a3639d07ea1523a8ae01f Mon Sep 17 00:00:00 2001 From: Dillon Sharlet Date: Mon, 1 Mar 2021 12:51:57 -0700 Subject: [PATCH 19/19] Fix error on unused variables in generated code. 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c685c299940c..8c3f6d77bde3 100644 --- a/Makefile +++ b/Makefile @@ -1568,7 +1568,7 @@ $(FILTERS_DIR)/gpu_multi_context_threaded_%.a: $(BIN_DIR)/gpu_multi_context_thre @mkdir -p $(@D) $(CURDIR)/$< -g gpu_multi_context_threaded_$* $(GEN_AOT_OUTPUTS) -o $(CURDIR)/$(FILTERS_DIR) target=$(TARGET)-no_runtime-user_context -GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas +GEN_AOT_CXX_FLAGS=$(TEST_CXX_FLAGS) -Wno-unknown-pragmas -Wno-unused-variable GEN_AOT_INCLUDES=-I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common -I $(ROOT_DIR)/apps/support -I $(SRC_DIR)/runtime -I$(ROOT_DIR)/tools GEN_AOT_LD_FLAGS=$(COMMON_LD_FLAGS)