halide · steven-johnson · Jun 9, 2020 · Jul 29, 2020 · Apr 8, 2021 · Apr 8, 2021
diff --git a/apps/onnx/model.cpp b/apps/onnx/model.cpp
@@ -344,8 +344,6 @@ std::vector<py::array> run(
     }
     Halide::Realization real(outputs);
     Halide::Target tgt = Halide::get_host_target();
-    // Don't allow LLVM to mess with the code.
-    tgt.set_feature(Halide::Target::DisableLLVMLoopOpt, true);
     // Don't create buffers larger than 2GB since we use 32bit signed indices to
     // index the data stored in them.
     tgt.set_feature(Halide::Target::LargeBuffers, false);
@@ -461,8 +459,6 @@ double benchmark(
 
     Halide::Realization real(outputs);
     Halide::Target tgt = Halide::get_host_target();
-    // Don't allow LLVM to mess with the code.
-    tgt.set_feature(Halide::Target::DisableLLVMLoopOpt, true);
     // Don't create buffers larger than 2GB since we use 32bit signed indices to
     // index the data stored in them.
     tgt.set_feature(Halide::Target::LargeBuffers, false);

diff --git a/python_bindings/src/PyEnums.cpp b/python_bindings/src/PyEnums.cpp
@@ -140,7 +140,6 @@ void define_enums(py::module &m) {
         .value("HexagonDma", Target::Feature::HexagonDma)
         .value("EmbedBitcode", Target::Feature::EmbedBitcode)
         .value("EnableLLVMLoopOpt", Target::Feature::EnableLLVMLoopOpt)
-        .value("DisableLLVMLoopOpt", Target::Feature::DisableLLVMLoopOpt)
         .value("WasmSimd128", Target::Feature::WasmSimd128)
         .value("WasmSignExt", Target::Feature::WasmSignExt)
         .value("WasmSatFloatToInt", Target::Feature::WasmSatFloatToInt)

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -1081,14 +1081,7 @@ void CodeGen_LLVM::optimize_module() {
 
     std::unique_ptr<TargetMachine> tm = make_target_machine(*module);
 
-    // At present, we default to *enabling* LLVM loop optimization,
-    // unless DisableLLVMLoopOpt is set; we're going to flip this to defaulting
-    // to *not* enabling these optimizations (and removing the DisableLLVMLoopOpt feature).
-    // See https://github.com/halide/Halide/issues/4113 for more info.
-    // (Note that setting EnableLLVMLoopOpt always enables loop opt, regardless
-    // of the setting of DisableLLVMLoopOpt.)
-    const bool do_loop_opt = !get_target().has_feature(Target::DisableLLVMLoopOpt) ||
-                             get_target().has_feature(Target::EnableLLVMLoopOpt);
+    const bool do_loop_opt = get_target().has_feature(Target::EnableLLVMLoopOpt);
 
     PipelineTuningOptions pto;
     pto.LoopInterleaving = do_loop_opt;

diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp
@@ -667,20 +667,17 @@ vector<char> CodeGen_PTX_Dev::compile_to_src() {
         }
     }
 
-    // At present, we default to *enabling* LLVM loop optimization,
-    // unless DisableLLVMLoopOpt is set; we're going to flip this to defaulting
-    // to *not* enabling these optimizations (and removing the DisableLLVMLoopOpt feature).
-    // See https://github.com/halide/Halide/issues/4113 for more info.
-    // (Note that setting EnableLLVMLoopOpt always enables loop opt, regardless
-    // of the setting of DisableLLVMLoopOpt.)
-    const bool do_loop_opt = !target.has_feature(Target::DisableLLVMLoopOpt) ||
-                             target.has_feature(Target::EnableLLVMLoopOpt);
+    const bool do_loop_opt = target.has_feature(Target::EnableLLVMLoopOpt);
 
     PassManagerBuilder b;
     b.OptLevel = 3;
     b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false);
     b.LoopVectorize = do_loop_opt;
     b.SLPVectorize = true;
+    // Setting DisableUnrollLoops = true can occasionally generate PTX code that
+    // fails at runtime under some conditions (e.g. correctness_gpu_dynamic_shared
+    // using NVIDIA driver 460.x). Disabled workaround, kept for reference:
+    // b.DisableUnrollLoops = false;  // instead of !do_loop_opt
     b.DisableUnrollLoops = !do_loop_opt;
 
     target_machine->adjustPassManager(b);

diff --git a/src/HexagonOffload.cpp b/src/HexagonOffload.cpp
@@ -977,7 +977,6 @@ Stmt inject_hexagon_rpc(Stmt s, const Target &host_target,
         Target::HVX_v62,
         Target::HVX_v65,
         Target::HVX_v66,
-        Target::DisableLLVMLoopOpt,
     };
     for (Target::Feature i : shared_features) {
         if (host_target.has_feature(i)) {

diff --git a/src/Target.cpp b/src/Target.cpp
@@ -370,7 +370,6 @@ const std::map<std::string, Target::Feature> feature_name_map = {
     {"check_unsafe_promises", Target::CheckUnsafePromises},
     {"hexagon_dma", Target::HexagonDma},
     {"embed_bitcode", Target::EmbedBitcode},
-    {"disable_llvm_loop_opt", Target::DisableLLVMLoopOpt},
     {"enable_llvm_loop_opt", Target::EnableLLVMLoopOpt},
     {"wasm_simd128", Target::WasmSimd128},
     {"wasm_signext", Target::WasmSignExt},

diff --git a/src/Target.h b/src/Target.h
@@ -117,7 +117,6 @@ struct Target {
         CheckUnsafePromises = halide_target_feature_check_unsafe_promises,
         EmbedBitcode = halide_target_feature_embed_bitcode,
         EnableLLVMLoopOpt = halide_target_feature_enable_llvm_loop_opt,
-        DisableLLVMLoopOpt = halide_target_feature_disable_llvm_loop_opt,
         WasmSimd128 = halide_target_feature_wasm_simd128,
         WasmSignExt = halide_target_feature_wasm_signext,
         WasmSatFloatToInt = halide_target_feature_wasm_sat_float_to_int,

diff --git a/src/autoschedulers/adams2019/autotune_loop.sh b/src/autoschedulers/adams2019/autotune_loop.sh
@@ -65,14 +65,6 @@ else
     echo Copying starting weights from ${START_WEIGHTS_FILE} to ${WEIGHTS}
 fi
 
-# We could add this unconditionally, but it's easier to wade thru
-# results if we only add if needed
-for F in disable_llvm_loop_opt; do
-    if [[ ! ${HL_TARGET} =~ .*${F}.* ]]; then
-        HL_TARGET="${HL_TARGET}-${F}"
-    fi
-done
-
 # A batch of this many samples is built in parallel, and then
 # benchmarked serially.
 BATCH_SIZE=32

diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h
@@ -1323,8 +1323,7 @@ typedef enum halide_target_feature_t {
     halide_target_feature_check_unsafe_promises,  ///< Insert assertions for promises.
     halide_target_feature_hexagon_dma,            ///< Enable Hexagon DMA buffers.
     halide_target_feature_embed_bitcode,          ///< Emulate clang -fembed-bitcode flag.
-    halide_target_feature_enable_llvm_loop_opt,   ///< Enable loop vectorization + unrolling in LLVM. Overrides halide_target_feature_disable_llvm_loop_opt. (Ignored for non-LLVM targets.)
-    halide_target_feature_disable_llvm_loop_opt,  ///< Disable loop vectorization + unrolling in LLVM. (Ignored for non-LLVM targets.)
+    halide_target_feature_enable_llvm_loop_opt,   ///< Enable loop vectorization + unrolling in LLVM.
     halide_target_feature_wasm_simd128,           ///< Enable +simd128 instructions for WebAssembly codegen.
     halide_target_feature_wasm_signext,           ///< Enable +sign-ext instructions for WebAssembly codegen.
     halide_target_feature_wasm_sat_float_to_int,  ///< Enable saturating (nontrapping) float-to-int instructions for WebAssembly codegen.

diff --git a/test/correctness/float16_t.cpp b/test/correctness/float16_t.cpp
@@ -238,7 +238,7 @@ int main(int argc, char **argv) {
         to_f16.compute_root().vectorize(x, 8, TailStrategy::RoundUp);
         from_f16.compute_root().vectorize(x, 8, TailStrategy::RoundUp);
 
-        from_f16.compile_to_assembly("/dev/stdout", {}, Target("host-no_asserts-no_bounds_query-no_runtime-disable_llvm_loop_unroll-disable_llvm_loop_vectorize"));
+        from_f16.compile_to_assembly("/dev/stdout", {}, Target("host-no_asserts-no_bounds_query-no_runtime"));
     }
 
     // Check infinity handling for both float16_t and Halide codegen.

diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h
@@ -51,8 +51,7 @@ class SimdOpCheckTest {
         target = target
                      .with_feature(Target::NoBoundsQuery)
                      .with_feature(Target::NoAsserts)
-                     .with_feature(Target::NoRuntime)
-                     .with_feature(Target::DisableLLVMLoopOpt);
+                     .with_feature(Target::NoRuntime);
         num_threads = Internal::ThreadPool<void>::num_processors_online();
     }
     virtual ~SimdOpCheckTest() = default;

diff --git a/test/performance/nested_vectorization_gemm.cpp b/test/performance/nested_vectorization_gemm.cpp
@@ -10,9 +10,6 @@ int main(int argc, char **argv) {
         printf("[SKIP] Performance tests are meaningless and/or misleading under WebAssembly interpreter.\n");
         return 0;
     }
-    // We don't want to be sensitive to LLVM pulling the same tricks
-    // or not.
-    target.set_feature(Target::DisableLLVMLoopOpt);
 
     // 8-bit mat-mul into 32-bit accumulator
     {