From 9c6c062248d023dd5a74cfbc48ae08ccf598da51 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 27 Sep 2023 11:37:28 -0700 Subject: [PATCH 01/10] Disallow async nestings that violate read after write dependencies Fixes #7867 --- src/AsyncProducers.cpp | 52 ++++++++++++++++++++++++++++++++++ test/error/CMakeLists.txt | 1 + test/performance/async_gpu.cpp | 33 ++++++++++++++++----- 3 files changed, 79 insertions(+), 7 deletions(-) diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 00f27603d971..6b6425ad6421 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -109,15 +109,56 @@ class NoOpCollapsingMutator : public IRMutator { class GenerateProducerBody : public NoOpCollapsingMutator { const string &func; vector sema; + std::set producers_dropped; + bool found_producer = false; using NoOpCollapsingMutator::visit; + void bad_producer_nesting_error(const string &producer, const string &async_consumer) { + user_error + << "The non-async Func " << producer << " is consumed by async Func " << async_consumer + << " and is compute_at a location in between the store_at " + << "location and the compute_at location of " << async_consumer + << ". This is unsupported. Either schedule " << producer + << " it outside the store_at location of " << async_consumer + << " or inside the compute_at location of " << async_consumer << "."; + } + // Preserve produce nodes and add synchronization Stmt visit(const ProducerConsumer *op) override { if (op->name == func && op->is_producer) { + found_producer = true; + // Add post-synchronization internal_assert(!sema.empty()) << "Duplicate produce node: " << op->name << "\n"; Stmt body = op->body; + + // We don't currently support waiting on producers to the producer + // half of the fork node. Or rather, if you want to do that you have + // to schedule those Funcs as async too. Check for any consume nodes + // where the producer has gone to the consumer side of the fork + // node. + class FindBadConsumeNodes : public IRVisitor { + const std::set &producers_dropped; + using IRVisitor::visit; + + void visit(const ProducerConsumer *op) override { + if (!op->is_producer && producers_dropped.count(op->name)) { + found = op->name; + } + } + + public: + string found; + FindBadConsumeNodes(const std::set &p) + : producers_dropped(p) { + } + } finder(producers_dropped); + body.accept(&finder); + if (!finder.found.empty()) { + bad_producer_nesting_error(finder.found, func); + } + while (!sema.empty()) { Expr release = Call::make(Int(32), "halide_semaphore_release", {sema.back(), 1}, Call::Extern); body = Block::make(body, Evaluate::make(release)); @@ -125,7 +166,18 @@ class GenerateProducerBody : public NoOpCollapsingMutator { } return ProducerConsumer::make_produce(op->name, body); } else { + if (op->is_producer) { + producers_dropped.insert(op->name); + } + bool found_producer_before = found_producer; Stmt body = mutate(op->body); + if (!op->is_producer && producers_dropped.count(op->name) && + found_producer && !found_producer_before) { + // We've found a consume node wrapping our async producer where + // the corresponding producer node was dropped from this half of + // the fork. + bad_producer_nesting_error(op->name, func); + } if (is_no_op(body) || op->is_producer) { return body; } else { diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 6e69490657f5..6a7521446718 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -9,6 +9,7 @@ tests(GROUPS error auto_schedule_no_parallel.cpp auto_schedule_no_reorder.cpp autodiff_unbounded.cpp + bad_async_producer.cpp bad_bound.cpp bad_bound_storage.cpp bad_compute_at.cpp diff --git a/test/performance/async_gpu.cpp b/test/performance/async_gpu.cpp index 9d78efe4022e..55263e39546f 100644 --- a/test/performance/async_gpu.cpp +++ b/test/performance/async_gpu.cpp @@ -9,7 +9,7 @@ Expr expensive(Expr x, int c) { if (c <= 0) { return x; } else { - return expensive(fast_pow(x, x + 1), c - 1); + return expensive(x * (x + 1), c - 1); } } @@ -31,11 +31,12 @@ int main(int argc, char **argv) { } double times[2]; + uint32_t correct = 0; for (int use_async = 0; use_async < 2; use_async++) { Var x, y, t, xi, yi; - ImageParam in(Float(32), 3); - Func cpu, gpu; + ImageParam in(UInt(32), 3); + Func cpu("cpu"), gpu("gpu"); // We have a two-stage pipeline that processes frames. We want // to run the first stage on the GPU and the second stage on @@ -50,19 +51,21 @@ int main(int argc, char **argv) { // Assume GPU memory is limited, and compute the GPU stage one // frame at a time. Hoist the allocation to the top level. - gpu.compute_at(cpu, t).store_root().gpu_tile(x, y, xi, yi, 8, 8); + gpu.compute_at(gpu.in(), Var::outermost()).store_root().gpu_tile(x, y, xi, yi, 8, 8); // Stage the copy-back of the GPU result into a host-side // double-buffer. gpu.in().copy_to_host().compute_at(cpu, t).store_root().fold_storage(t, 2); if (use_async) { + // gpu.async(); gpu.in().async(); - gpu.async(); } - in.set(Buffer(800, 800, 16)); - Buffer out(800, 800, 16); + Buffer in_buf(800, 800, 16); + in_buf.fill(17); + in.set(in_buf); + Buffer out(800, 800, 16); cpu.compile_jit(); @@ -70,6 +73,22 @@ int main(int argc, char **argv) { cpu.realize(out); }); + if (!use_async) { + correct = out(0, 0, 0); + } else { + for (int t = 0; t < out.dim(2).extent(); t++) { + for (int y = 0; y < out.dim(1).extent(); y++) { + for (int x = 0; x < out.dim(0).extent(); x++) { + if (out(x, y, t) != correct) { + printf("Async output at (%d, %d, %d) is %u instead of %u\n", + x, y, t, out(x, y, t), correct); + return 1; + } + } + } + } + } + printf("%s: %f\n", use_async ? "with async" : "without async", times[use_async]); From 4ccbe8d6ff0e475794dfe9b2f983b3f0463647f0 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 27 Sep 2023 11:48:29 -0700 Subject: [PATCH 02/10] Add test --- test/error/bad_async_producer.cpp | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 test/error/bad_async_producer.cpp diff --git a/test/error/bad_async_producer.cpp b/test/error/bad_async_producer.cpp new file mode 100644 index 000000000000..9e78e268958c --- /dev/null +++ b/test/error/bad_async_producer.cpp @@ -0,0 +1,31 @@ + +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + + Func f{"f"}, g{"g"}, h{"h"}; + Var x; + + f(x) = cast(x + 7); + g(x) = f(x); + h(x) = g(x); + + // The schedule below is an error. It should really be: + // f.store_root().compute_at(g, Var::outermost()); + // So that it's nested inside the consumer h. + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x).async(); + + Buffer buf = h.realize({32}); + for (int i = 0; i < buf.dim(0).extent(); i++) { + uint8_t correct = i + 7; + if (buf(i) != correct) { + printf("buf(%d) = %d instead of %d\n", i, buf(i), correct); + return 1; + } + } + + return 0; +} From 4f60796fe5299957f13d955885b9a766bfe88995 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 28 Sep 2023 09:39:11 -0700 Subject: [PATCH 03/10] Add another failure case, and improve error message --- src/AsyncProducers.cpp | 9 ++++----- test/error/CMakeLists.txt | 1 + test/error/bad_async_producer_2.cpp | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) create mode 100644 test/error/bad_async_producer_2.cpp diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 6b6425ad6421..f81280a3e0b2 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -116,12 +116,11 @@ class GenerateProducerBody : public NoOpCollapsingMutator { void bad_producer_nesting_error(const string &producer, const string &async_consumer) { user_error - << "The non-async Func " << producer << " is consumed by async Func " << async_consumer - << " and is compute_at a location in between the store_at " + << "The Func " << producer << " is consumed by async Func " << async_consumer + << " and has a compute_at location in between the store_at " << "location and the compute_at location of " << async_consumer - << ". This is unsupported. Either schedule " << producer - << " it outside the store_at location of " << async_consumer - << " or inside the compute_at location of " << async_consumer << "."; + << ". This is only legal when " << producer + << " is both async and has a store_at location outside the store_at location of the consumer."; } // Preserve produce nodes and add synchronization diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 6a7521446718..e4de95888ee8 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -10,6 +10,7 @@ tests(GROUPS error auto_schedule_no_reorder.cpp autodiff_unbounded.cpp bad_async_producer.cpp + bad_async_producer_2.cpp bad_bound.cpp bad_bound_storage.cpp bad_compute_at.cpp diff --git a/test/error/bad_async_producer_2.cpp b/test/error/bad_async_producer_2.cpp new file mode 100644 index 000000000000..d9929c56b3c1 --- /dev/null +++ b/test/error/bad_async_producer_2.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" + +using namespace Halide; + +// From https://github.com/halide/Halide/issues/5201 +int main(int argc, char **argv) { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer2(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_at(consumer, y).async(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + return 0; +} From 50d9470e8f352e8f9966018c00eed153f610c678 Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Thu, 20 Aug 2020 12:47:42 -0700 Subject: [PATCH 04/10] Add some more tests --- test/correctness/async_order.cpp | 95 ++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 test/correctness/async_order.cpp diff --git a/test/correctness/async_order.cpp b/test/correctness/async_order.cpp new file mode 100644 index 000000000000..221063b0fb9e --- /dev/null +++ b/test/correctness/async_order.cpp @@ -0,0 +1,95 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_at(consumer, y); + producer2.compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize(16, 16); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_root(); + producer2.store_root().compute_at(consumer, y).async(); + // Correct + // producer1.compute_at(consumer, y); + // producer2.compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize(16, 16); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.store_root().compute_at(consumer, y).async(); + producer2.store_root().compute_at(consumer, y).async(); + // Correct + // producer1.compute_at(consumer, y); + // producer2.compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize(16, 16); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + + printf("Success!\n"); + return 0; +} From 3a2f087c91e91928abaf7be628814bdde14637b1 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 28 Sep 2023 10:44:06 -0700 Subject: [PATCH 05/10] Update test --- test/correctness/async_order.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/test/correctness/async_order.cpp b/test/correctness/async_order.cpp index 221063b0fb9e..15a999171215 100644 --- a/test/correctness/async_order.cpp +++ b/test/correctness/async_order.cpp @@ -19,7 +19,7 @@ int main(int argc, char **argv) { consumer.bound(x, 0, 16).bound(y, 0, 16); - Buffer out = consumer.realize(16, 16); + Buffer out = consumer.realize({16, 16}); out.for_each_element([&](int x, int y) { int correct = 2 * (x + y); @@ -42,13 +42,10 @@ int main(int argc, char **argv) { producer1.compute_root(); producer2.store_root().compute_at(consumer, y).async(); - // Correct - // producer1.compute_at(consumer, y); - // producer2.compute_at(consumer, y).async(); consumer.bound(x, 0, 16).bound(y, 0, 16); - Buffer out = consumer.realize(16, 16); + Buffer out = consumer.realize({16, 16}); out.for_each_element([&](int x, int y) { int correct = 2 * (x + y); @@ -72,13 +69,10 @@ int main(int argc, char **argv) { producer1.store_root().compute_at(consumer, y).async(); producer2.store_root().compute_at(consumer, y).async(); - // Correct - // producer1.compute_at(consumer, y); - // producer2.compute_at(consumer, y).async(); consumer.bound(x, 0, 16).bound(y, 0, 16); - Buffer out = consumer.realize(16, 16); + Buffer out = consumer.realize({16, 16}); out.for_each_element([&](int x, int y) { int correct = 2 * (x + y); From fc451397d9f9b5c7fda0ac030b1e70291e0bae73 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 28 Sep 2023 10:46:02 -0700 Subject: [PATCH 06/10] Add new test to cmakelists --- test/correctness/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 4f718661133b..3139cbbae47a 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -8,6 +8,7 @@ tests(GROUPS correctness align_bounds.cpp argmax.cpp async_device_copy.cpp + async_order.cpp autodiff.cpp bad_likely.cpp bit_counting.cpp From 33fa8a69bb1a066a3c4b41d67af0a0f518f9c6ff Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 4 Oct 2023 12:02:13 -0700 Subject: [PATCH 07/10] Fix for llvm trunk --- test/correctness/simd_op_check_riscv.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/correctness/simd_op_check_riscv.cpp b/test/correctness/simd_op_check_riscv.cpp index 05ba5a8f7315..350faa129848 100644 --- a/test/correctness/simd_op_check_riscv.cpp +++ b/test/correctness/simd_op_check_riscv.cpp @@ -70,10 +70,14 @@ class SimdOpCheckRISCV : public SimdOpCheckTest { // Basic math operations. check("vadd.vv", lanes, i_1 + i_2); check("vadd.vv", lanes, u_1 + u_2); - check("vadd.vi", lanes, i_1 + 2); - check("vadd.vi", lanes, u_1 + 2); - check("vadd.vx", lanes, i_1 + 42); - check("vadd.vx", lanes, u_1 + 42); + + // Vector + immediate / scalar form. Disabled because LLVM 18 broadcasts + // scalars to vectors registers outside the loop. + // check("vadd.vi", lanes, i_1 + 2); + // check("vadd.vi", lanes, u_1 + 2); + // check("vadd.vx", lanes, i_1 + 42); + // check("vadd.vx", lanes, u_1 + 42); + check("vsub.vv", lanes, i_1 - i_2); check("vsub.vv", lanes, u_1 - u_2); // TODO: these seem to compile to a vector add From 14c7d7328946595fbb8412d207e75ba3cf54ac09 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 4 Oct 2023 12:04:55 -0700 Subject: [PATCH 08/10] Always acquire the folding semaphore, even if unused --- src/StorageFolding.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp index d24ab58dfd95..6e5cad3eb1fa 100644 --- a/src/StorageFolding.cpp +++ b/src/StorageFolding.cpp @@ -803,11 +803,6 @@ class AttemptStorageFoldingOfFunction : public IRMutator { to_release = max_required - max_required_next; // This is the last time we use these entries } - if (provided.used.defined()) { - to_acquire = select(provided.used, to_acquire, 0); - } - // We should always release the required region, even if we don't use it. - // On the first iteration, we need to acquire the extent of the region shared // between the producer and consumer, and we need to release it on the last // iteration. From 34d1832c24fc22bb37574e772791e2c1468ee82f Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 14 Nov 2023 18:33:22 -0800 Subject: [PATCH 09/10] Skip async_order test under wasm --- test/correctness/async_order.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/correctness/async_order.cpp b/test/correctness/async_order.cpp index 15a999171215..f712d7e19c43 100644 --- a/test/correctness/async_order.cpp +++ b/test/correctness/async_order.cpp @@ -4,6 +4,11 @@ using namespace Halide; int main(int argc, char **argv) { + if (get_jit_target_from_environment().arch == Target::WebAssembly) { + printf("[SKIP] WebAssembly does not support async() yet.\n"); + return 0; + } + { Func producer1, producer2, consumer; Var x, y; From 153709b2fcfb4863acb188f8553839110ef6a942 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 29 Nov 2023 14:46:45 -0800 Subject: [PATCH 10/10] trigger buildbots