diff --git a/apps/onnx/model.cpp b/apps/onnx/model.cpp index b2d1738d3680..eb7327974612 100644 --- a/apps/onnx/model.cpp +++ b/apps/onnx/model.cpp @@ -344,8 +344,6 @@ std::vector run( } Halide::Realization real(outputs); Halide::Target tgt = Halide::get_host_target(); - // Don't allow LLVM to mess with the code. - tgt.set_feature(Halide::Target::DisableLLVMLoopOpt, true); // Don't create buffers larger than 2GB since we use 32bit signed indices to // index the data stored in them. tgt.set_feature(Halide::Target::LargeBuffers, false); @@ -461,8 +459,6 @@ double benchmark( Halide::Realization real(outputs); Halide::Target tgt = Halide::get_host_target(); - // Don't allow LLVM to mess with the code. - tgt.set_feature(Halide::Target::DisableLLVMLoopOpt, true); // Don't create buffers larger than 2GB since we use 32bit signed indices to // index the data stored in them. tgt.set_feature(Halide::Target::LargeBuffers, false); diff --git a/python_bindings/src/PyEnums.cpp b/python_bindings/src/PyEnums.cpp index f232b43c4c59..ce585f7ef06b 100644 --- a/python_bindings/src/PyEnums.cpp +++ b/python_bindings/src/PyEnums.cpp @@ -140,7 +140,6 @@ void define_enums(py::module &m) { .value("HexagonDma", Target::Feature::HexagonDma) .value("EmbedBitcode", Target::Feature::EmbedBitcode) .value("EnableLLVMLoopOpt", Target::Feature::EnableLLVMLoopOpt) - .value("DisableLLVMLoopOpt", Target::Feature::DisableLLVMLoopOpt) .value("WasmSimd128", Target::Feature::WasmSimd128) .value("WasmSignExt", Target::Feature::WasmSignExt) .value("WasmSatFloatToInt", Target::Feature::WasmSatFloatToInt) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index d408f1dea135..d3e782cbca5d 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1081,14 +1081,7 @@ void CodeGen_LLVM::optimize_module() { std::unique_ptr tm = make_target_machine(*module); - // At present, we default to *enabling* LLVM loop optimization, - // unless DisableLLVMLoopOpt is set; we're going to flip this to defaulting - // 
to *not* enabling these optimizations (and removing the DisableLLVMLoopOpt feature). - // See https://github.com/halide/Halide/issues/4113 for more info. - // (Note that setting EnableLLVMLoopOpt always enables loop opt, regardless - // of the setting of DisableLLVMLoopOpt.) - const bool do_loop_opt = !get_target().has_feature(Target::DisableLLVMLoopOpt) || - get_target().has_feature(Target::EnableLLVMLoopOpt); + const bool do_loop_opt = get_target().has_feature(Target::EnableLLVMLoopOpt); PipelineTuningOptions pto; pto.LoopInterleaving = do_loop_opt; diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index d1c3dd757de0..d43f83df9e31 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -667,20 +667,17 @@ vector CodeGen_PTX_Dev::compile_to_src() { } } - // At present, we default to *enabling* LLVM loop optimization, - // unless DisableLLVMLoopOpt is set; we're going to flip this to defaulting - // to *not* enabling these optimizations (and removing the DisableLLVMLoopOpt feature). - // See https://github.com/halide/Halide/issues/4113 for more info. - // (Note that setting EnableLLVMLoopOpt always enables loop opt, regardless - // of the setting of DisableLLVMLoopOpt.) - const bool do_loop_opt = !target.has_feature(Target::DisableLLVMLoopOpt) || - target.has_feature(Target::EnableLLVMLoopOpt); + const bool do_loop_opt = target.has_feature(Target::EnableLLVMLoopOpt); PassManagerBuilder b; b.OptLevel = 3; b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false); b.LoopVectorize = do_loop_opt; b.SLPVectorize = true; + // Setting DisableUnrollLoops = true can occasionally generate PTX code that + // will fail at runtime under some conditions (e.g. correctness_gpu_dynamic_shared + // using NVidia driver 460.x). 
+ // TODO(review): forcing b.DisableUnrollLoops = false is a possible workaround but is NOT applied; the flag below still follows !do_loop_opt. b.DisableUnrollLoops = !do_loop_opt; target_machine->adjustPassManager(b); diff --git a/src/HexagonOffload.cpp b/src/HexagonOffload.cpp index 9d4512ce3d0d..b0fdac95a741 100644 --- a/src/HexagonOffload.cpp +++ b/src/HexagonOffload.cpp @@ -977,7 +977,6 @@ Stmt inject_hexagon_rpc(Stmt s, const Target &host_target, Target::HVX_v62, Target::HVX_v65, Target::HVX_v66, - Target::DisableLLVMLoopOpt, }; for (Target::Feature i : shared_features) { if (host_target.has_feature(i)) { diff --git a/src/Target.cpp b/src/Target.cpp index 4e21db617f68..2a838cee3010 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -370,7 +370,6 @@ const std::map feature_name_map = { {"check_unsafe_promises", Target::CheckUnsafePromises}, {"hexagon_dma", Target::HexagonDma}, {"embed_bitcode", Target::EmbedBitcode}, - {"disable_llvm_loop_opt", Target::DisableLLVMLoopOpt}, {"enable_llvm_loop_opt", Target::EnableLLVMLoopOpt}, {"wasm_simd128", Target::WasmSimd128}, {"wasm_signext", Target::WasmSignExt}, diff --git a/src/Target.h b/src/Target.h index 1f4e55bc7b55..90cf1d61fed1 100644 --- a/src/Target.h +++ b/src/Target.h @@ -117,7 +117,6 @@ struct Target { CheckUnsafePromises = halide_target_feature_check_unsafe_promises, EmbedBitcode = halide_target_feature_embed_bitcode, EnableLLVMLoopOpt = halide_target_feature_enable_llvm_loop_opt, - DisableLLVMLoopOpt = halide_target_feature_disable_llvm_loop_opt, WasmSimd128 = halide_target_feature_wasm_simd128, WasmSignExt = halide_target_feature_wasm_signext, WasmSatFloatToInt = halide_target_feature_wasm_sat_float_to_int, diff --git a/src/autoschedulers/adams2019/autotune_loop.sh b/src/autoschedulers/adams2019/autotune_loop.sh index d36830f71249..b11aaa1d24ab 100755 --- a/src/autoschedulers/adams2019/autotune_loop.sh +++ b/src/autoschedulers/adams2019/autotune_loop.sh @@ -65,14 +65,6 @@ else echo Copying starting weights from ${START_WEIGHTS_FILE} to ${WEIGHTS} fi -# We could add this unconditionally, but it's 
easier to wade thru -# results if we only add if needed -for F in disable_llvm_loop_opt; do - if [[ ! ${HL_TARGET} =~ .*${F}.* ]]; then - HL_TARGET="${HL_TARGET}-${F}" - fi -done - # A batch of this many samples is built in parallel, and then # benchmarked serially. BATCH_SIZE=32 diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 6496f1eebc58..fec3ffd7c252 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1323,8 +1323,7 @@ typedef enum halide_target_feature_t { halide_target_feature_check_unsafe_promises, ///< Insert assertions for promises. halide_target_feature_hexagon_dma, ///< Enable Hexagon DMA buffers. halide_target_feature_embed_bitcode, ///< Emulate clang -fembed-bitcode flag. - halide_target_feature_enable_llvm_loop_opt, ///< Enable loop vectorization + unrolling in LLVM. Overrides halide_target_feature_disable_llvm_loop_opt. (Ignored for non-LLVM targets.) - halide_target_feature_disable_llvm_loop_opt, ///< Disable loop vectorization + unrolling in LLVM. (Ignored for non-LLVM targets.) + halide_target_feature_enable_llvm_loop_opt, ///< Enable loop vectorization + unrolling in LLVM. halide_target_feature_wasm_simd128, ///< Enable +simd128 instructions for WebAssembly codegen. halide_target_feature_wasm_signext, ///< Enable +sign-ext instructions for WebAssembly codegen. halide_target_feature_wasm_sat_float_to_int, ///< Enable saturating (nontrapping) float-to-int instructions for WebAssembly codegen. 
diff --git a/test/correctness/float16_t.cpp b/test/correctness/float16_t.cpp index 7962af423273..21d8f082a59d 100644 --- a/test/correctness/float16_t.cpp +++ b/test/correctness/float16_t.cpp @@ -238,7 +238,7 @@ int main(int argc, char **argv) { to_f16.compute_root().vectorize(x, 8, TailStrategy::RoundUp); from_f16.compute_root().vectorize(x, 8, TailStrategy::RoundUp); - from_f16.compile_to_assembly("/dev/stdout", {}, Target("host-no_asserts-no_bounds_query-no_runtime-disable_llvm_loop_unroll-disable_llvm_loop_vectorize")); + from_f16.compile_to_assembly("/dev/stdout", {}, Target("host-no_asserts-no_bounds_query-no_runtime")); } // Check infinity handling for both float16_t and Halide codegen. diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 29f151e10455..df343bd0d7aa 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -51,8 +51,7 @@ class SimdOpCheckTest { target = target .with_feature(Target::NoBoundsQuery) .with_feature(Target::NoAsserts) - .with_feature(Target::NoRuntime) - .with_feature(Target::DisableLLVMLoopOpt); + .with_feature(Target::NoRuntime); num_threads = Internal::ThreadPool::num_processors_online(); } virtual ~SimdOpCheckTest() = default; diff --git a/test/performance/nested_vectorization_gemm.cpp b/test/performance/nested_vectorization_gemm.cpp index 25a0bc746fb1..88904879dfab 100644 --- a/test/performance/nested_vectorization_gemm.cpp +++ b/test/performance/nested_vectorization_gemm.cpp @@ -10,9 +10,6 @@ int main(int argc, char **argv) { printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n"); return 0; } - // We don't want to be sensitive to LLVM pulling the same tricks - // or not. - target.set_feature(Target::DisableLLVMLoopOpt); // 8-bit mat-mul into 32-bit accumulator {