From 4e3229969685b46bc7d3a311f3e84be7b48f6428 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Wed, 4 Mar 2026 15:21:41 +0800 Subject: [PATCH 1/7] enable WAR memory dependency representation in taskflow --- include/TaskflowDialect/TaskflowOps.td | 4 ++- .../AffineToTaskflow/AffineToTaskflowPass.cpp | 34 ++++++++++++++++-- lib/TaskflowDialect/TaskflowOps.cpp | 34 +++++++++++++++--- .../MemoryAccessStreamingFusion.cpp | 35 ++++++++++++++++--- 4 files changed, 94 insertions(+), 13 deletions(-) diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index 8359b0cc..3f6131a7 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -70,6 +70,7 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ ); let results = (outs + Variadic:$read_outputs, Variadic:$write_outputs, Variadic:$value_outputs ); @@ -94,6 +95,7 @@ def TaskflowYieldOp : TaskflowOpBase<"yield", [Terminator, Pure, ReturnLike, Att }]; let arguments = (ins + Variadic:$read_results, Variadic:$memory_results, Variadic:$value_results); @@ -102,7 +104,7 @@ def TaskflowYieldOp : TaskflowOpBase<"yield", [Terminator, Pure, ReturnLike, Att let builders = [ // Default builder for empty yield. OpBuilder<(ins), [{ - build($_builder, $_state, ValueRange{}, ValueRange{}); + build($_builder, $_state, ValueRange{}, ValueRange{}, ValueRange{}); }]> ]; } diff --git a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp index 318c530d..5979edd0 100644 --- a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp +++ b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp @@ -233,6 +233,12 @@ static TaskflowTaskOp convertLoopToTask( //------------------------------------------------------------------- // Step 5: Prepares output types. //------------------------------------------------------------------- + // Read output types: passthrough read memrefs for WAR dependency tracking. 
+ SmallVector read_output_types; + for (Value memref : read_memrefs) { + read_output_types.push_back(memref.getType()); + } + SmallVector memory_output_types; for (Value memref : output_memrefs) { memory_output_types.push_back(memref.getType()); @@ -248,7 +254,8 @@ static TaskflowTaskOp convertLoopToTask( //------------------------------------------------------------------- TaskflowTaskOp task_op = builder.create( loc, - /*memory_outputs=*/memory_output_types, + /*read_outputs=*/read_output_types, + /*write_outputs=*/memory_output_types, /*value_outputs=*/value_output_types, /*read_inputs=*/read_inputs, /*write_inputs=*/write_inputs, @@ -294,9 +301,19 @@ static TaskflowTaskOp convertLoopToTask( // Step 8: Creates the yield operation. //--------------------------------------------------------------- task_builder.setInsertionPointToEnd(task_body); + SmallVector read_yield_operands; SmallVector memory_yield_operands; SmallVector value_yield_operands; + // Read yield outputs: passthrough read memref block args for WAR tracking. + for (Value memref : read_memrefs) { + if (input_to_block_arg.count(memref)) { + read_yield_operands.push_back(input_to_block_arg[memref]); + } else { + assert(false && "Read memref not in inputs!"); + } + } + // Memory yield outputs: yield the written memrefs. for (Value memref : output_memrefs) { if (input_to_block_arg.count(memref)) { @@ -310,14 +327,25 @@ static TaskflowTaskOp convertLoopToTask( for (Value result : cloned_loop->getResults()) { value_yield_operands.push_back(result); } - task_builder.create(loc, memory_yield_operands, + task_builder.create(loc, read_yield_operands, + memory_yield_operands, value_yield_operands); //------------------------------------------------------------------- // Step 9 : Updates value mapping with task outputs for subsequent tasks // conversion. //------------------------------------------------------------------- - // Memory outputs. + // Read outputs: establishes WAR dependency chain. 
+ // Only update mapping for memrefs not already mapped by a prior write. + for (auto [memref, task_read_output] : + llvm::zip(read_memrefs, task_op.getReadOutputs())) { + if (!value_mapping.count(memref)) { + value_mapping[memref] = task_read_output; + } + } + + // Memory outputs (write): establishes RAW/WAW dependency chain. + // Write outputs always overwrite read outputs in the mapping. for (auto [memref, task_output] : llvm::zip(output_memrefs, task_op.getWriteOutputs())) { value_mapping[memref] = task_output; diff --git a/lib/TaskflowDialect/TaskflowOps.cpp b/lib/TaskflowDialect/TaskflowOps.cpp index 3f360e6a..381ea0ff 100644 --- a/lib/TaskflowDialect/TaskflowOps.cpp +++ b/lib/TaskflowDialect/TaskflowOps.cpp @@ -132,6 +132,8 @@ ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { static_cast(original_write_operands.size())})); // Adds result segment sizes. + // read_outputs count matches read_memrefs count (WAR dependency tracking). + size_t num_read_outputs = read_operands.size(); size_t num_write_outputs = 0; size_t num_value_outputs = 0; for (Type t : func_type.getResults()) { @@ -140,9 +142,12 @@ ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { else num_value_outputs++; } + // Total memref results include both read_outputs and write_outputs. 
+ num_write_outputs = num_write_outputs - num_read_outputs; result.addAttribute("resultSegmentSizes", parser.getBuilder().getDenseI32ArrayAttr( - {static_cast(num_write_outputs), + {static_cast(num_read_outputs), + static_cast(num_write_outputs), static_cast(num_value_outputs)})); return success(); @@ -218,7 +223,8 @@ void TaskflowTaskOp::print(OpAsmPrinter &printer) { getValueInputs().getTypes()), printer); printer << ") -> ("; - llvm::interleaveComma(llvm::concat(getWriteOutputs().getTypes(), + llvm::interleaveComma(llvm::concat(getReadOutputs().getTypes(), + getWriteOutputs().getTypes(), getValueOutputs().getTypes()), printer); printer << ")"; @@ -234,11 +240,20 @@ void TaskflowTaskOp::print(OpAsmPrinter &printer) { ParseResult TaskflowYieldOp::parse(OpAsmParser &parser, OperationState &result) { + SmallVector read_operands; + SmallVector read_types; SmallVector write_operands; SmallVector write_types; SmallVector value_operands; SmallVector value_types; + // Parses reads (WAR dependency passthrough). + if (succeeded(parser.parseOptionalKeyword("reads"))) { + if (parser.parseLParen() || parser.parseOperandList(read_operands) || + parser.parseColonTypeList(read_types) || parser.parseRParen()) + return failure(); + } + // Parses writes. 
if (succeeded(parser.parseOptionalKeyword("writes"))) { if (parser.parseLParen() || parser.parseOperandList(write_operands) || @@ -253,7 +268,9 @@ ParseResult TaskflowYieldOp::parse(OpAsmParser &parser, return failure(); } - if (parser.resolveOperands(write_operands, write_types, + if (parser.resolveOperands(read_operands, read_types, + parser.getCurrentLocation(), result.operands) || + parser.resolveOperands(write_operands, write_types, parser.getCurrentLocation(), result.operands) || parser.resolveOperands(value_operands, value_types, parser.getCurrentLocation(), result.operands)) @@ -261,13 +278,22 @@ ParseResult TaskflowYieldOp::parse(OpAsmParser &parser, result.addAttribute("operandSegmentSizes", parser.getBuilder().getDenseI32ArrayAttr( - {static_cast(write_operands.size()), + {static_cast(read_operands.size()), + static_cast(write_operands.size()), static_cast(value_operands.size())})); return success(); } void TaskflowYieldOp::print(OpAsmPrinter &printer) { + if (!getReadResults().empty()) { + printer << " reads("; + llvm::interleaveComma(getReadResults(), printer); + printer << " : "; + llvm::interleaveComma(getReadResults().getTypes(), printer); + printer << ")"; + } + if (!getMemoryResults().empty()) { printer << " writes("; llvm::interleaveComma(getMemoryResults(), printer); diff --git a/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp b/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp index f99747fc..7b0a496d 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp @@ -145,6 +145,10 @@ class MemoryDependencyAnalysis { for (auto task_op : tasks) { auto &task_info = task_map[task_op.getOperation()]; + // Map read_outputs for WAR dependency tracking. 
+ for (Value ro : task_op.getReadOutputs()) { + write_output_to_producer[ro] = &task_info; + } for (Value wo : task_op.getWriteOutputs()) { write_output_to_producer[wo] = &task_info; } @@ -432,8 +436,12 @@ class TaskFuser { // Step 2: Builds the result types (same as reader's outputs). // Writer's value_outputs are not included because canFuse rejects // writers with value_outputs. + SmallVector read_output_types; SmallVector write_output_types; SmallVector value_output_types; + for (Value v : reader_op.getReadOutputs()) { + read_output_types.push_back(v.getType()); + } for (Value v : reader_op.getWriteOutputs()) { write_output_types.push_back(v.getType()); } @@ -448,9 +456,10 @@ class TaskFuser { .str(); auto fused_task = builder.create( - writer_op.getLoc(), write_output_types, value_output_types, - fused_read_memrefs, fused_write_memrefs, fused_value_inputs, fused_name, - fused_original_read_memrefs, fused_original_write_memrefs); + writer_op.getLoc(), read_output_types, write_output_types, + value_output_types, fused_read_memrefs, fused_write_memrefs, + fused_value_inputs, fused_name, fused_original_read_memrefs, + fused_original_write_memrefs); // Step 4: Builds fused task body by merging loop nests. if (!buildFusedBody(fused_task, writer_op, reader_op, intermediate, @@ -771,6 +780,7 @@ class TaskFuser { } OpBuilder yield_builder(fused_block, fused_block->end()); + SmallVector yield_reads; SmallVector yield_writes; SmallVector yield_values; @@ -778,6 +788,15 @@ class TaskFuser { auto reader_yield = cast(reader_body.getTerminator()); + // Maps reader yield's read results to fused block args. + for (Value v : reader_yield.getReadResults()) { + if (reader_mapping.contains(v)) { + yield_reads.push_back(reader_mapping.lookup(v)); + } else { + yield_reads.push_back(v); + } + } + // Maps reader yield's memory results to fused block args. 
for (Value v : reader_yield.getMemoryResults()) { if (reader_mapping.contains(v)) { @@ -794,8 +813,8 @@ class TaskFuser { } } - yield_builder.create(reader_op.getLoc(), - yield_writes, yield_values); + yield_builder.create( + reader_op.getLoc(), yield_reads, yield_writes, yield_values); return true; } @@ -920,6 +939,12 @@ class TaskFuser { taskflow::TaskflowTaskOp fused_task, Value intermediate) { + // Replaces reader's read_outputs with fused task's read_outputs. + for (unsigned i = 0; i < reader_op.getReadOutputs().size(); ++i) { + reader_op.getReadOutputs()[i].replaceAllUsesWith( + fused_task.getReadOutputs()[i]); + } + // Replaces reader's write_outputs with fused task's write_outputs. for (unsigned i = 0; i < reader_op.getWriteOutputs().size(); ++i) { reader_op.getWriteOutputs()[i].replaceAllUsesWith( From d5ff221c6b36c2ad433d39ad1f2b5a0a41f6b39c Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Thu, 5 Mar 2026 10:13:02 +0800 Subject: [PATCH 2/7] update partial test --- test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir | 4 ++-- test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir | 2 +- test/e2e/tosa_e2e.mlir | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir index da68e4b3..15f71a8b 100644 --- a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir @@ -16,7 +16,7 @@ module { } // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>, %arg2: memref<16xf32>) { -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%arg2 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %read_outputs:2, %write_outputs = 
taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%arg2 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: memref<16xf32>, %arg5: memref<16xf32>): // CHECK-NEXT: affine.for %arg6 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg3[%arg6] : memref<16xf32> @@ -24,7 +24,7 @@ module { // CHECK-NEXT: %2 = arith.addf %0, %1 : f32 // CHECK-NEXT: affine.store %2, %arg5[%arg6] : memref<16xf32> // CHECK-NEXT: } -// CHECK-NEXT: taskflow.yield writes(%arg5 : memref<16xf32>) +// CHECK-NEXT: taskflow.yield reads(%arg3, %arg4 : memref<16xf32>, memref<16xf32>) writes(%arg5 : memref<16xf32>) // CHECK-NEXT: } // CHECK-NEXT: return // CHECK-NEXT: } diff --git a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir index e47beb66..b2016317 100644 --- a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir @@ -11,7 +11,7 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) 
[original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> diff --git a/test/e2e/tosa_e2e.mlir b/test/e2e/tosa_e2e.mlir index 5e2413b6..c6d5d310 100644 --- a/test/e2e/tosa_e2e.mlir +++ b/test/e2e/tosa_e2e.mlir @@ -11,7 +11,7 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf // CHECK: func.func @test_e2e(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> From 7ed546441a696d106bca517f067e5e1631552d05 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 6 Mar 2026 23:00:06 +0800 Subject: [PATCH 3/7] fix bugs in memory access streaming fusion pass --- include/TaskflowDialect/TaskflowOps.td | 2 +- 
.../MemoryAccessStreamingFusion.cpp | 56 +- .../ResourceAwareTaskOptimizationPass.cpp | 630 ++++++++++-------- .../TosaToTaskflow/tosa-to-taskflow.mlir | 4 +- test/e2e/tosa_e2e.mlir | 4 +- test/multi-cgra/kernel_mapping/fir/fir.mlir | 24 +- .../loop-in-kernel/loop-in-kernel.mlir | 16 +- .../irregular-loop/irregular-loop.mlir | 12 +- .../taskflow/multi-nested/multi-nested.mlir | 56 +- 9 files changed, 436 insertions(+), 368 deletions(-) diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index 3f6131a7..3e252a5a 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -39,7 +39,7 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ 1. Memory dependencies: memrefs that are read or written by the task 2. Value dependencies: SSA values from producer tasks - The `read_memrefs` and `write_memrefs` attributes record the actural + The `original_read_memrefs` and `original_write_memrefs` attributes record the actual original memrefs that this task accesses, enabling data placement analysis for multi-CGRA mapping. diff --git a/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp b/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp index 7b0a496d..8c39278c 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp @@ -439,7 +439,8 @@ class TaskFuser { SmallVector read_output_types; SmallVector write_output_types; SmallVector value_output_types; - for (Value v : reader_op.getReadOutputs()) { + // read_outputs corresponds to fused_reads (passthrough for WAR tracking). 
+ for (Value v : fused_read_memrefs) { read_output_types.push_back(v.getType()); } for (Value v : reader_op.getWriteOutputs()) { @@ -788,13 +789,11 @@ class TaskFuser { auto reader_yield = cast(reader_body.getTerminator()); - // Maps reader yield's read results to fused block args. - for (Value v : reader_yield.getReadResults()) { - if (reader_mapping.contains(v)) { - yield_reads.push_back(reader_mapping.lookup(v)); - } else { - yield_reads.push_back(v); - } + // Read yield outputs: passthrough the fused block's read_memref args. + // These correspond to fused_reads (writer reads + reader reads - + // intermediate). + for (unsigned i = 0; i < fused_reads.size(); ++i) { + yield_reads.push_back(fused_block->getArgument(i)); } // Maps reader yield's memory results to fused block args. @@ -939,10 +938,40 @@ class TaskFuser { taskflow::TaskflowTaskOp fused_task, Value intermediate) { - // Replaces reader's read_outputs with fused task's read_outputs. + // Helper: finds the index of an outer memref in fused_reads. + auto findInFusedReads = [&](Value outer_memref) -> int { + for (unsigned i = 0; i < fused_task.getReadMemrefs().size(); ++i) { + if (fused_task.getReadMemrefs()[i] == outer_memref) + return i; + } + return -1; + }; + + // Replaces writer's read_outputs: map each to the fused task's + // corresponding read_output by finding the writer's read_memref + // in the fused task's read_memrefs. + for (unsigned i = 0; i < writer_op.getReadOutputs().size(); ++i) { + Value writer_read_input = writer_op.getReadMemrefs()[i]; + int fused_idx = findInFusedReads(writer_read_input); + if (fused_idx >= 0) { + writer_op.getReadOutputs()[i].replaceAllUsesWith( + fused_task.getReadOutputs()[fused_idx]); + } + } + + // Replaces reader's read_outputs: skip intermediate, map others. 
for (unsigned i = 0; i < reader_op.getReadOutputs().size(); ++i) { - reader_op.getReadOutputs()[i].replaceAllUsesWith( - fused_task.getReadOutputs()[i]); + Value orig = (i < reader_op.getOriginalReadMemrefs().size()) + ? reader_op.getOriginalReadMemrefs()[i] + : reader_op.getReadMemrefs()[i]; + if (orig == intermediate) + continue; // Intermediate read_output is dead after fusion. + Value reader_read_input = reader_op.getReadMemrefs()[i]; + int fused_idx = findInFusedReads(reader_read_input); + if (fused_idx >= 0) { + reader_op.getReadOutputs()[i].replaceAllUsesWith( + fused_task.getReadOutputs()[fused_idx]); + } } // Replaces reader's write_outputs with fused task's write_outputs. @@ -960,11 +989,6 @@ class TaskFuser { // Erases original tasks (reader first since writer might be used by it // through the intermediate, but we've already replaced all uses). reader_op.erase(); - - // Writer's outputs: The intermediate memref output is no longer used. - // Other outputs should have been handled, but let's verify. - // If the writer has other outputs besides the intermediate, those - // should not exist in the single-reader case. writer_op.erase(); // Erases the intermediate memref allocation if it's now dead. diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index c5052b83..17e341fe 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -7,8 +7,9 @@ // lowering compiled_ii. Latency model: II * (trip_count - 1) + steps. // // Targets a 4x4 CGRA grid (16 CGRAs total). Each task may use up to 4 CGRAs. -// Supported per-task shapes: rect (1×1..4×1/1×4/2×2), L (3 or 4 CGRAs), T (4 CGRAs). -// Compiled_ii must come from the downstream pipeline (asserts on failure). 
+// Supported per-task shapes: rect (1×1..4×1/1×4/2×2), L (3 or 4 CGRAs), T (4 +// CGRAs). Compiled_ii must come from the downstream pipeline (asserts on +// failure). // //===----------------------------------------------------------------------===// @@ -36,7 +37,6 @@ #include #include - using namespace mlir; using namespace mlir::taskflow; @@ -50,7 +50,7 @@ constexpr int kCgraGridRows = 4; constexpr int kCgraGridCols = 4; constexpr int kTotalCGRAs = kCgraGridRows * kCgraGridCols; // 16 constexpr int kMaxBalanceIterations = 100; -constexpr int kMaxCgrasPerTask = 4; // Max CGRAs allocatable to a single task. +constexpr int kMaxCgrasPerTask = 4; // Max CGRAs allocatable to a single task. // Sentinel value: 0 means "not yet profiled". After profileTask() runs, // both steps and ii MUST be > 0. An assert fires if profiling fails. @@ -69,9 +69,9 @@ constexpr int64_t kUnprofiled = 0; // (col, row) coordinates of the occupied CGRAs. `rows`/`cols` give the // bounding box so that tile-level x_tiles/y_tiles can be computed. struct CgraShape { - int rows; // Bounding-box CGRA rows. - int cols; // Bounding-box CGRA columns. - bool is_rectangular; // True if all cells in the bbox are used. + int rows; // Bounding-box CGRA rows. + int cols; // Bounding-box CGRA columns. + bool is_rectangular; // True if all cells in the bbox are used. // Explicit CGRA positions for non-rectangular shapes. // Each pair is (col, row) in CGRA coordinates. Empty for rectangles. 
SmallVector> cgra_positions; @@ -113,7 +113,8 @@ static SmallVector getRectangularShapes(int cgra_count) { for (int r = 1; r <= kCgraGridRows; ++r) { for (int c = 1; c <= kCgraGridCols; ++c) { if (r * c == cgra_count) { - shapes.push_back({r, c, /*is_rectangular=*/true, /*cgra_positions=*/{}}); + shapes.push_back( + {r, c, /*is_rectangular=*/true, /*cgra_positions=*/{}}); } } } @@ -135,34 +136,37 @@ static SmallVector getNonRectangularShapes(int cgra_count) { if (cgra_count == 3) { // L-shape 3 CGRAs: (0,0)(1,0)(0,1) — bbox 2×2 - shapes.push_back({2, 2, false, {{0,0},{1,0},{0,1}}}); + shapes.push_back({2, 2, false, {{0, 0}, {1, 0}, {0, 1}}}); } if (cgra_count == 4) { // T-shape: three in a row + one below centre // (0,0)(1,0)(2,0)(1,1) — bbox 2×3 - shapes.push_back({2, 3, false, {{0,0},{1,0},{2,0},{1,1}}}); + shapes.push_back({2, 3, false, {{0, 0}, {1, 0}, {2, 0}, {1, 1}}}); // L-shape: three in a column + one offset // (0,0)(0,1)(0,2)(1,2) — bbox 3×2 - shapes.push_back({3, 2, false, {{0,0},{0,1},{0,2},{1,2}}}); + shapes.push_back({3, 2, false, {{0, 0}, {0, 1}, {0, 2}, {1, 2}}}); } return shapes; } // Picks the best shape for display/profiling. -// We prefer shapes with the most compact physical layout (smallest maximum distance -// between nodes) to minimize communication latency. In cases of identical bounding -// box area, we prefer more square-like bounds over long rectangles. +// We prefer shapes with the most compact physical layout (smallest maximum +// distance between nodes) to minimize communication latency. In cases of +// identical bounding box area, we prefer more square-like bounds over long +// rectangles. // -// TODO: This function only picks a localized shape for an idealized single task mapping. -// Global placement and conflict resolution across multiple tasks is legitimately deferred -// to downstream map-on-cgra pass, as speculative profiling assumes unconstrained placement. 
+// TODO: This function only picks a localized shape for an idealized single task +// mapping. Global placement and conflict resolution across multiple tasks is +// legitimately deferred to downstream map-on-cgra pass, as speculative +// profiling assumes unconstrained placement. static CgraShape pickBestShape(int cgra_count) { - // For cgra_count == 3, the 2x2 L-shape has a smaller maximum physical routing distance - // (dist=2) compared to a 1x3 rectangle (dist=3), despite having a larger bounding box. - // We explicitly prefer the more compact L-shape here for better speculative latency. + // For cgra_count == 3, the 2x2 L-shape has a smaller maximum physical routing + // distance (dist=2) compared to a 1x3 rectangle (dist=3), despite having a + // larger bounding box. We explicitly prefer the more compact L-shape here for + // better speculative latency. if (cgra_count == 3) { auto non_rect_shapes = getNonRectangularShapes(3); if (!non_rect_shapes.empty()) { @@ -177,12 +181,14 @@ static CgraShape pickBestShape(int cgra_count) { if (!candidates.empty()) { return *std::min_element(candidates.begin(), candidates.end(), - [](const CgraShape &a, const CgraShape &b) { - int area_a = a.area(); - int area_b = b.area(); - if (area_a != area_b) return area_a < area_b; - return std::abs(a.rows - a.cols) < std::abs(b.rows - b.cols); - }); + [](const CgraShape &a, const CgraShape &b) { + int area_a = a.area(); + int area_b = b.area(); + if (area_a != area_b) + return area_a < area_b; + return std::abs(a.rows - a.cols) < + std::abs(b.rows - b.cols); + }); } // Fallback: smallest bounding box (should not be reached for 1..4 CGRAs). @@ -218,9 +224,7 @@ struct TaskGraphNode { // Returns estimated task latency using the pipelined execution model: // latency = II * (trip_count - 1) + steps. 
- int64_t estimatedLatency() const { - return ii * (trip_count - 1) + steps; - } + int64_t estimatedLatency() const { return ii * (trip_count - 1) + steps; } }; class TaskDependencyGraph { @@ -233,10 +237,11 @@ class TaskDependencyGraph { size_t task_id = 0; func.walk([&](TaskflowTaskOp task) { auto node = std::make_unique(task_id++, task); - + // If the task already has profiling attributes (e.g., from fusion), // skip expensive speculative lowering and use those directly. - bool has_precomputed = task->hasAttr("compiled_ii") && task->hasAttr("steps"); + bool has_precomputed = + task->hasAttr("compiled_ii") && task->hasAttr("steps"); if (!has_precomputed) { // Speculative lowering to Neura to get real metrics. profileTask(node.get(), task, skip_mapper); @@ -248,7 +253,7 @@ class TaskDependencyGraph { } else { node->trip_count = computeTripCount(task); } - + // Overrides with explicit attributes if present. if (auto attr = task->getAttrOfType("steps")) { node->steps = attr.getInt(); @@ -259,7 +264,7 @@ class TaskDependencyGraph { if (auto attr = task->getAttrOfType("cgra_count")) { node->cgra_count = attr.getInt(); } - + op_to_node[task] = node.get(); nodes.push_back(std::move(node)); }); @@ -285,7 +290,7 @@ class TaskDependencyGraph { } } } - // WAW: producer wrote a memref that this task also writes. + // WAW/WAR: producer wrote or read a memref that this task writes. 
for (Value memref : consumer->op.getWriteMemrefs()) { if (auto producer_op = memref.getDefiningOp()) { if (auto *producer = op_to_node[producer_op.getOperation()]) { @@ -295,12 +300,10 @@ class TaskDependencyGraph { } } - llvm::errs() << "TaskDependencyGraph: " << nodes.size() - << " tasks\n"; + llvm::errs() << "TaskDependencyGraph: " << nodes.size() << " tasks\n"; for (auto &n : nodes) { - llvm::errs() << " Task " << n->id << " (" - << n->op.getTaskName().str() << "): trip_count=" - << n->trip_count << ", ii=" << n->ii + llvm::errs() << " Task " << n->id << " (" << n->op.getTaskName().str() + << "): trip_count=" << n->trip_count << ", ii=" << n->ii << ", steps=" << n->steps << ", preds=" << n->predecessors.size() << ", succs=" << n->successors.size() << "\n"; @@ -311,14 +314,17 @@ class TaskDependencyGraph { // source_node to dest_node. bool hasDependency(TaskGraphNode *source_node, TaskGraphNode *dest_node) const { - if (source_node == dest_node) return true; + if (source_node == dest_node) + return true; DenseSet visited; SmallVector worklist; worklist.push_back(source_node); while (!worklist.empty()) { auto *current = worklist.pop_back_val(); - if (current == dest_node) return true; - if (!visited.insert(current).second) continue; + if (current == dest_node) + return true; + if (!visited.insert(current).second) + continue; for (auto *succ : current->successors) { worklist.push_back(succ); } @@ -395,7 +401,7 @@ class TaskDependencyGraph { cloned_kernel = k; }); } - + // Computes tile dimensions for the target CGRA shape. 
int per_cgra_cols = neura::getArchitecture().getPerCgraColumns(); int per_cgra_rows = neura::getArchitecture().getPerCgraRows(); @@ -409,9 +415,9 @@ class TaskDependencyGraph { for (auto &[cgra_c, cgra_r] : node->shape.cgra_positions) { for (int tr = 0; tr < per_cgra_rows; ++tr) { for (int tc = 0; tc < per_cgra_cols; ++tc) { - if (!os.str().empty()) os << ","; - os << (cgra_c * per_cgra_cols + tc) - << "_" + if (!os.str().empty()) + os << ","; + os << (cgra_c * per_cgra_cols + tc) << "_" << (cgra_r * per_cgra_rows + tr); } } @@ -423,19 +429,17 @@ class TaskDependencyGraph { int compiled_ii = 0; int cp_depth = 1; - if (succeeded( - runNeuraPipelineOnKernel(ctx, cloned_kernel, phase2_module, - compiled_ii, cp_depth, - x_tiles, y_tiles, valid_tiles, - skip_mapper))) { + if (succeeded(runNeuraPipelineOnKernel( + ctx, cloned_kernel, phase2_module, compiled_ii, cp_depth, x_tiles, + y_tiles, valid_tiles, skip_mapper))) { llvm::errs() << "[profileTask] kernel in " << task.getTaskName() << ": compiled_ii=" << compiled_ii << ", cp_depth=" << cp_depth << "\n"; } else { llvm::errs() << "[profileTask] Phase 2 failed for kernel in " << task.getTaskName() << ", extracting partial\n"; - extractMetricsFromPartialIR(phase2_module, compiled_ii, cp_depth, - x_tiles, y_tiles); + extractMetricsFromPartialIR(phase2_module, compiled_ii, cp_depth, x_tiles, + y_tiles); } phase2_module.erase(); @@ -445,8 +449,8 @@ class TaskDependencyGraph { node->steps = std::max(cp_depth, 1); llvm::errs() << "[profileTask] " << task.getTaskName() - << ": compiled_ii=" << node->ii - << ", steps=" << node->steps << "\n"; + << ": compiled_ii=" << node->ii << ", steps=" << node->steps + << "\n"; // Erases the temporary module. tmp_mod.erase(); @@ -457,15 +461,10 @@ class TaskDependencyGraph { // x_tiles/y_tiles: multi-CGRA tile grid dimensions. // valid_tiles: explicit tile list for non-rectangular shapes (empty = full). // skip_mapper: skip MapToAcceleratorPass, use ResMII/RecMII only. 
- LogicalResult runNeuraPipelineOnKernel(MLIRContext *ctx, - neura::KernelOp kernel, - ModuleOp dst_module, - int &compiled_ii, - int &cp_depth, - int x_tiles = 0, - int y_tiles = 0, - const std::string &valid_tiles = "", - bool skip_mapper = false) { + LogicalResult runNeuraPipelineOnKernel( + MLIRContext *ctx, neura::KernelOp kernel, ModuleOp dst_module, + int &compiled_ii, int &cp_depth, int x_tiles = 0, int y_tiles = 0, + const std::string &valid_tiles = "", bool skip_mapper = false) { Location loc = kernel.getLoc(); OpBuilder builder(ctx); builder.setInsertionPointToStart(dst_module.getBody()); @@ -484,12 +483,11 @@ class TaskDependencyGraph { SmallVector result_types(kernel.getResultTypes()); auto func_type = builder.getFunctionType(arg_types, result_types); - auto wrapper_func = builder.create( - loc, "__speculative_kernel__", func_type); + auto wrapper_func = + builder.create(loc, "__speculative_kernel__", func_type); // Tags as neura accelerator — all downstream passes check this. - wrapper_func->setAttr("accelerator", - builder.getStringAttr("neura")); + wrapper_func->setAttr("accelerator", builder.getStringAttr("neura")); // Clones the entire kernel region (all blocks) into the func body. Region &func_region = wrapper_func.getBody(); @@ -521,8 +519,8 @@ class TaskDependencyGraph { if (failed(pm.run(dst_module))) { // Pre-mapper pipeline failed — extract best-effort metrics from partial // Neura IR using ResMII/RecMII analysis with the correct multi-CGRA arch. 
- extractMetricsFromPartialIR(dst_module, compiled_ii, cp_depth, - x_tiles, y_tiles); + extractMetricsFromPartialIR(dst_module, compiled_ii, cp_depth, x_tiles, + y_tiles); return failure(); } @@ -532,16 +530,18 @@ class TaskDependencyGraph { std::unique_ptr custom_arch; const neura::Architecture *arch_ptr = &neura::getArchitecture(); if (x_tiles > 0 && y_tiles > 0) { - custom_arch = neura::getArchitecture().cloneWithNewDimensions( - y_tiles, x_tiles); + custom_arch = + neura::getArchitecture().cloneWithNewDimensions(y_tiles, x_tiles); arch_ptr = custom_arch.get(); } const neura::Architecture &architecture = *arch_ptr; dst_module.walk([&](func::FuncOp fn) { - if (!fn->hasAttr("accelerator")) return; + if (!fn->hasAttr("accelerator")) + return; Region ®ion = fn.getBody(); - if (region.empty()) return; + if (region.empty()) + return; int res_mii = neura::calculateResMii(region, architecture); auto cycles = neura::collectRecurrenceCycles(region); int rec_mii = 1; @@ -551,10 +551,12 @@ class TaskDependencyGraph { // Derives cp_depth from ALAP (As-Late-As-Possible) scheduling levels. 
std::set critical_ops; for (auto &cycle : cycles) - for (Operation *op : cycle.operations) critical_ops.insert(op); + for (Operation *op : cycle.operations) + critical_ops.insert(op); auto sorted_ops = neura::getTopologicallySortedOps(region); if (!sorted_ops.empty()) { - auto level_buckets = neura::getOpsInAlapLevels(sorted_ops, critical_ops); + auto level_buckets = + neura::getOpsInAlapLevels(sorted_ops, critical_ops); cp_depth = std::max(cp_depth, (int)level_buckets.size()); } llvm::errs() << "[profileTask] analytical fallback: res_mii=" << res_mii @@ -586,18 +588,20 @@ class TaskDependencyGraph { bool all_data_movs_ok = true; int total_mapped_ops = 0; dst_module.walk([&](func::FuncOp fn) { - if (!fn->hasAttr("accelerator")) return; + if (!fn->hasAttr("accelerator")) + return; fn.walk([&](Operation *op) { - if (isa(op)) return; + if (isa(op)) + return; total_mapped_ops++; if (isa(op)) return; for (Value operand : op->getOperands()) { Operation *producer = operand.getDefiningOp(); - if (!producer) continue; - if (!isa(producer)) all_data_movs_ok = false; } @@ -609,8 +613,8 @@ class TaskDependencyGraph { << " limit=" << kMapperOpLimit << "\n"; if (all_data_movs_ok && total_mapped_ops <= kMapperOpLimit) { - // Runs MapToAcceleratorPass in a fresh pass manager on the already-lowered - // dst_module (pre-mapper pipeline already ran above). + // Runs MapToAcceleratorPass in a fresh pass manager on the + // already-lowered dst_module (pre-mapper pipeline already ran above). // Passes the correct tile dimensions so the mapper uses the right array. PassManager pm2(ctx); pm2.enableVerifier(false); @@ -625,11 +629,13 @@ class TaskDependencyGraph { } if (succeeded(pm2.run(dst_module))) { - // Reads true compiled_ii from mapping_info; overrides analytical estimate. + // Reads true compiled_ii from mapping_info; overrides analytical + // estimate. 
dst_module.walk([&](func::FuncOp fn) { - if (!fn->hasAttr("accelerator")) return; - if (auto mapping_info = - fn->getAttrOfType(neura::attr::kMappingInfo)) { + if (!fn->hasAttr("accelerator")) + return; + if (auto mapping_info = fn->getAttrOfType( + neura::attr::kMappingInfo)) { if (auto ii_attr = mapping_info.getAs(neura::attr::kCompiledII)) { compiled_ii = (int)ii_attr.getInt(); // authoritative value @@ -656,15 +662,15 @@ class TaskDependencyGraph { // Extracts ResMII/RecMII from partially-lowered IR when the full pipeline // fails. Uses custom arch sized to x_tiles × y_tiles if provided. - void extractMetricsFromPartialIR(ModuleOp tmp_module, - int &out_ii, int &out_cp_depth, - int x_tiles = 0, int y_tiles = 0) { + void extractMetricsFromPartialIR(ModuleOp tmp_module, int &out_ii, + int &out_cp_depth, int x_tiles = 0, + int y_tiles = 0) { // Builds architecture: uses custom tile dimensions if provided. std::unique_ptr custom_arch; const neura::Architecture *arch_ptr = &neura::getArchitecture(); if (x_tiles > 0 && y_tiles > 0) { - custom_arch = neura::getArchitecture().cloneWithNewDimensions( - y_tiles, x_tiles); + custom_arch = + neura::getArchitecture().cloneWithNewDimensions(y_tiles, x_tiles); arch_ptr = custom_arch.get(); } const neura::Architecture &architecture = *arch_ptr; @@ -704,8 +710,7 @@ class TaskDependencyGraph { out_cp_depth = std::max(cp_depth, 1); llvm::errs() << "[profileTask] (partial) ii=" << out_ii - << " (res_mii=" << res_mii - << ", rec_mii=" << rec_mii + << " (res_mii=" << res_mii << ", rec_mii=" << rec_mii << "), steps=" << out_cp_depth << "\n"; } @@ -743,7 +748,8 @@ class TaskDependencyGraph { int64_t range = ub.getInt() - lb.getInt(); int64_t step = st.getInt(); int64_t tc = (range + step - 1) / step; - if (tc > 0) kernel_product *= tc; + if (tc > 0) + kernel_product *= tc; } } }); @@ -774,7 +780,8 @@ class TaskDependencyGraph { int64_t lb = counter.getLowerBound().getSExtValue(); int64_t ub = counter.getUpperBound().getSExtValue(); 
int64_t step = counter.getStep().getSExtValue(); - if (step <= 0) return 1; + if (step <= 0) + return 1; int64_t range = ub - lb; return (range > 0) ? ((range + step - 1) / step) : 1; }; @@ -802,7 +809,6 @@ class TaskDependencyGraph { return (total > 0) ? total : 1; } - }; //===----------------------------------------------------------------------===// @@ -847,15 +853,16 @@ class PipelineBalancer { int new_cgra_count = old_cgra_count + 1; // Check if incrementing cgra_count is feasible on the 4×4 grid. - // TODO: This currently only checks the capacity (total CGRA count). Ideally, - // we should invoke a global placement pass (aka MapTaskOnCgraPass) here to - // verify if the speculatively increased CGRA count and its proposed shape - // actually fit on the 4x4 grid alongside other previously allocated tasks. + // TODO: This currently only checks the capacity (total CGRA count). + // Ideally, we should invoke a global placement pass (aka + // MapTaskOnCgraPass) here to verify if the speculatively increased CGRA + // count and its proposed shape actually fit on the 4x4 grid alongside + // other previously allocated tasks. // - // Currently, MapTaskOnCgraPass does not support multi-CGRA task placement. - // Once it does, we should call it here; if global placement fails for the - // "best" shape, we should backtrack and try alternative shapes before - // saturating the node. + // Currently, MapTaskOnCgraPass does not support multi-CGRA task + // placement. Once it does, we should call it here; if global placement + // fails for the "best" shape, we should backtrack and try alternative + // shapes before saturating the node. if (!canFitOnGrid(new_cgra_count)) { saturated_nodes.insert(bottleneck); continue; @@ -863,24 +870,27 @@ class PipelineBalancer { // Saves state for potential rollback. 
int64_t old_latency = bottleneck->estimatedLatency(); - int64_t old_ii = bottleneck->ii; - int64_t old_steps = bottleneck->steps; + int64_t old_ii = bottleneck->ii; + int64_t old_steps = bottleneck->steps; CgraShape old_shape = bottleneck->shape; // Speculatively applies the new CGRA count and re-profiles. bottleneck->cgra_count = new_cgra_count; bottleneck->shape = pickBestShape(new_cgra_count); - llvm::errs() - << " Balance: trying Task " << bottleneck->id << " (" - << bottleneck->op.getTaskName().str() - << ") cgra_count=" << old_cgra_count << "->" << new_cgra_count - << ", shape=" << bottleneck->shape.describe(new_cgra_count) - << ", tile_array=" - << (bottleneck->shape.rows * neura::getArchitecture().getPerCgraRows()) - << "x" - << (bottleneck->shape.cols * neura::getArchitecture().getPerCgraColumns()) - << ", old_ii=" << old_ii << ", old_lat=" << old_latency << "\n"; + llvm::errs() << " Balance: trying Task " << bottleneck->id << " (" + << bottleneck->op.getTaskName().str() + << ") cgra_count=" << old_cgra_count << "->" + << new_cgra_count + << ", shape=" << bottleneck->shape.describe(new_cgra_count) + << ", tile_array=" + << (bottleneck->shape.rows * + neura::getArchitecture().getPerCgraRows()) + << "x" + << (bottleneck->shape.cols * + neura::getArchitecture().getPerCgraColumns()) + << ", old_ii=" << old_ii << ", old_lat=" << old_latency + << "\n"; profile_fn(bottleneck, bottleneck->op); @@ -889,23 +899,22 @@ class PipelineBalancer { if (new_latency < old_latency) { // Accepted: the larger array produces a measurably better latency. 
changed = true; - llvm::errs() - << " Balance: ACCEPTED Task " << bottleneck->id << " (" - << bottleneck->op.getTaskName().str() - << ") cgra_count=" << new_cgra_count - << ", ii=" << old_ii << "->" << bottleneck->ii - << ", lat=" << old_latency << "->" << new_latency - << ", total_cgras=" << graph.getTotalAllocatedCGRAs() << "\n"; + llvm::errs() << " Balance: ACCEPTED Task " << bottleneck->id << " (" + << bottleneck->op.getTaskName().str() + << ") cgra_count=" << new_cgra_count << ", ii=" << old_ii + << "->" << bottleneck->ii << ", lat=" << old_latency + << "->" << new_latency + << ", total_cgras=" << graph.getTotalAllocatedCGRAs() + << "\n"; } else { // Rejected: no latency improvement — roll back and mark saturated. - llvm::errs() - << " Balance: REJECTED Task " << bottleneck->id - << " (ii=" << bottleneck->ii << ", lat=" << new_latency - << " >= old_lat=" << old_latency << "). Reverting.\n"; + llvm::errs() << " Balance: REJECTED Task " << bottleneck->id + << " (ii=" << bottleneck->ii << ", lat=" << new_latency + << " >= old_lat=" << old_latency << "). Reverting.\n"; bottleneck->cgra_count = old_cgra_count; - bottleneck->shape = old_shape; - bottleneck->ii = old_ii; - bottleneck->steps = old_steps; + bottleneck->shape = old_shape; + bottleneck->ii = old_ii; + bottleneck->steps = old_steps; saturated_nodes.insert(bottleneck); } } @@ -913,109 +922,112 @@ class PipelineBalancer { return changed; } - private: - // Computes the weighted critical path length from a given node to any sink. - int64_t computeCriticalPathFrom(TaskGraphNode *node, - DenseMap &cache) { - auto it = cache.find(node); - if (it != cache.end()) { - return it->second; - } - - int64_t max_successor_path = 0; - for (auto *succ : node->successors) { - max_successor_path = - std::max(max_successor_path, computeCriticalPathFrom(succ, cache)); - } +private: + // Computes the weighted critical path length from a given node to any sink. 
+ int64_t computeCriticalPathFrom(TaskGraphNode *node, + DenseMap &cache) { + auto it = cache.find(node); + if (it != cache.end()) { + return it->second; + } - int64_t path = node->estimatedLatency() + max_successor_path; - cache[node] = path; - return path; + int64_t max_successor_path = 0; + for (auto *succ : node->successors) { + max_successor_path = + std::max(max_successor_path, computeCriticalPathFrom(succ, cache)); } - // Computes the longest path from any source to the given node - // (depth_from_source). Uses dynamic programming with memoization. - int64_t computeDepthFromSource(TaskGraphNode *node, - DenseMap &cache) { - auto it = cache.find(node); - if (it != cache.end()) { - return it->second; - } + int64_t path = node->estimatedLatency() + max_successor_path; + cache[node] = path; + return path; + } - int64_t max_predecessor_depth = 0; - for (auto *pred : node->predecessors) { - max_predecessor_depth = - std::max(max_predecessor_depth, - computeDepthFromSource(pred, cache)); - } + // Computes the longest path from any source to the given node + // (depth_from_source). Uses dynamic programming with memoization. + int64_t computeDepthFromSource(TaskGraphNode *node, + DenseMap &cache) { + auto it = cache.find(node); + if (it != cache.end()) { + return it->second; + } - // depth_from_source(node) = max(depth_from_source(pred) for all preds) - // + node's own latency. - int64_t depth = max_predecessor_depth + node->estimatedLatency(); - cache[node] = depth; - return depth; + int64_t max_predecessor_depth = 0; + for (auto *pred : node->predecessors) { + max_predecessor_depth = + std::max(max_predecessor_depth, computeDepthFromSource(pred, cache)); } - // Finds the bottleneck node on the critical path using full slack analysis. 
- // - // For each node, slack is defined as: - // slack(node) = global_critical_path - // - depth_from_source(node) - // - depth_to_sink(node) - // + node->estimatedLatency() - // - // where depth_from_source includes the node's own latency, and - // depth_to_sink (computeCriticalPathFrom) also includes the node's own - // latency, so we add it back once to avoid double-counting. - // - // A node is on the critical path iff slack == 0. - // Among critical-path nodes, the one with highest individual latency - // is the bottleneck (reducing its latency most benefits the pipeline). - TaskGraphNode *findBottleneck(TaskDependencyGraph &graph, - const llvm::DenseSet &ignored) { - llvm::DenseMap to_sink_cache; - llvm::DenseMap from_source_cache; - - // Computes depth_to_sink for all nodes (via computeCriticalPathFrom). - int64_t global_critical_path = 0; - for (auto &node : graph.nodes) { - int64_t cp = computeCriticalPathFrom(node.get(), to_sink_cache); - global_critical_path = std::max(global_critical_path, cp); - } + // depth_from_source(node) = max(depth_from_source(pred) for all preds) + // + node's own latency. + int64_t depth = max_predecessor_depth + node->estimatedLatency(); + cache[node] = depth; + return depth; + } - // Computes depth_from_source for all nodes. - for (auto &node : graph.nodes) { - computeDepthFromSource(node.get(), from_source_cache); - } + // Finds the bottleneck node on the critical path using full slack analysis. + // + // For each node, slack is defined as: + // slack(node) = global_critical_path + // - depth_from_source(node) + // - depth_to_sink(node) + // + node->estimatedLatency() + // + // where depth_from_source includes the node's own latency, and + // depth_to_sink (computeCriticalPathFrom) also includes the node's own + // latency, so we add it back once to avoid double-counting. + // + // A node is on the critical path iff slack == 0. 
+ // Among critical-path nodes, the one with highest individual latency + // is the bottleneck (reducing its latency most benefits the pipeline). + TaskGraphNode * + findBottleneck(TaskDependencyGraph &graph, + const llvm::DenseSet &ignored) { + llvm::DenseMap to_sink_cache; + llvm::DenseMap from_source_cache; + + // Computes depth_to_sink for all nodes (via computeCriticalPathFrom). + int64_t global_critical_path = 0; + for (auto &node : graph.nodes) { + int64_t cp = computeCriticalPathFrom(node.get(), to_sink_cache); + global_critical_path = std::max(global_critical_path, cp); + } - // Finds the critical-path node with highest individual latency. - TaskGraphNode *bottleneck = nullptr; - int64_t max_latency = -1; + // Computes depth_from_source for all nodes. + for (auto &node : graph.nodes) { + computeDepthFromSource(node.get(), from_source_cache); + } - for (auto &node : graph.nodes) { - if (ignored.count(node.get())) continue; - if (node->cgra_count >= node->trip_count) continue; - // Per-task CGRA limit: no point trying to add more. - if (node->cgra_count >= kMaxCgrasPerTask) continue; + // Finds the critical-path node with highest individual latency. + TaskGraphNode *bottleneck = nullptr; + int64_t max_latency = -1; - int64_t depth_from = from_source_cache[node.get()]; - int64_t depth_to = to_sink_cache[node.get()]; + for (auto &node : graph.nodes) { + if (ignored.count(node.get())) + continue; + if (node->cgra_count >= node->trip_count) + continue; + // Per-task CGRA limit: no point trying to add more. + if (node->cgra_count >= kMaxCgrasPerTask) + continue; - // slack = global_cp - depth_from - depth_to + node_latency - // (because both depth_from and depth_to include node's own latency). - int64_t slack = global_critical_path - depth_from - depth_to - + node->estimatedLatency(); + int64_t depth_from = from_source_cache[node.get()]; + int64_t depth_to = to_sink_cache[node.get()]; - if (slack != 0) continue; // Not on the critical path. 
+ // slack = global_cp - depth_from - depth_to + node_latency + // (because both depth_from and depth_to include node's own latency). + int64_t slack = global_critical_path - depth_from - depth_to + + node->estimatedLatency(); - if (node->estimatedLatency() > max_latency) { - max_latency = node->estimatedLatency(); - bottleneck = node.get(); - } + if (slack != 0) + continue; // Not on the critical path. + + if (node->estimatedLatency() > max_latency) { + max_latency = node->estimatedLatency(); + bottleneck = node.get(); } - return bottleneck; } - + return bottleneck; + } }; //===----------------------------------------------------------------------===// @@ -1041,10 +1053,9 @@ class UtilizationFuser { auto [node_a, node_b] = *pair; - llvm::errs() - << " Fuse: Task " << node_a->id << " (" - << node_a->op.getTaskName().str() << ") + Task " << node_b->id - << " (" << node_b->op.getTaskName().str() << ")\n"; + llvm::errs() << " Fuse: Task " << node_a->id << " (" + << node_a->op.getTaskName().str() << ") + Task " << node_b->id + << " (" << node_b->op.getTaskName().str() << ")\n"; return performFusion(func, node_a, node_b, graph, profile_fn); } @@ -1070,8 +1081,7 @@ class UtilizationFuser { } // Fusion requires single-block task bodies (counter-mode tasks). - if (!a->op.getBody().hasOneBlock() || - !b->op.getBody().hasOneBlock()) { + if (!a->op.getBody().hasOneBlock() || !b->op.getBody().hasOneBlock()) { continue; } @@ -1107,7 +1117,8 @@ class UtilizationFuser { auto *task_a = a->op.getOperation(); auto *task_b = b->op.getOperation(); - if (task_a->getBlock() != task_b->getBlock()) return false; + if (task_a->getBlock() != task_b->getBlock()) + return false; // Ensures task_a is before task_b. if (!task_a->isBeforeInBlock(task_b)) { @@ -1117,10 +1128,12 @@ class UtilizationFuser { // Check: no other task between a and b should have an edge from/to a or b. 
for (auto &node : graph.nodes) { - if (node.get() == a || node.get() == b) continue; + if (node.get() == a || node.get() == b) + continue; auto *other_op = node->op.getOperation(); - if (other_op->getBlock() != task_a->getBlock()) continue; + if (other_op->getBlock() != task_a->getBlock()) + continue; // Is this node between task_a and task_b? if (task_a->isBeforeInBlock(other_op) && @@ -1224,6 +1237,10 @@ class UtilizationFuser { addUnique(merged_original_write_memrefs, task_b.getOriginalWriteMemrefs()); // Step 2: Builds result types. + SmallVector read_output_types; + for (Value v : merged_read_memrefs) { + read_output_types.push_back(v.getType()); + } SmallVector write_output_types; for (Value v : merged_write_memrefs) { write_output_types.push_back(v.getType()); @@ -1242,9 +1259,9 @@ class UtilizationFuser { // Step 4: Creates the fused task op. auto fused_task = builder.create( - task_a.getLoc(), write_output_types, value_output_types, - merged_read_memrefs, merged_write_memrefs, merged_value_inputs, - fused_name, merged_original_read_memrefs, + task_a.getLoc(), read_output_types, write_output_types, + value_output_types, merged_read_memrefs, merged_write_memrefs, + merged_value_inputs, fused_name, merged_original_read_memrefs, merged_original_write_memrefs); // ================================================================ @@ -1253,9 +1270,8 @@ class UtilizationFuser { // Step 5: Clones both task regions into the fused task body. // Maps source task's block args to fused task's block args. 
- auto buildTaskArgMapping = - [&](TaskflowTaskOp orig_task, Region &fused_region, - IRMapping &mapping) { + auto buildTaskArgMapping = [&](TaskflowTaskOp orig_task, + Region &fused_region, IRMapping &mapping) { Block &src_entry = orig_task.getBody().front(); unsigned src_idx = 0; unsigned read_count = orig_task.getReadMemrefs().size(); @@ -1315,7 +1331,8 @@ class UtilizationFuser { { OpBuilder ob = OpBuilder::atBlockEnd(entry_block); for (Operation &op : task_a.getBody().front()) { - if (isa(&op)) continue; + if (isa(&op)) + continue; ob.clone(op, mapping_a); } } @@ -1324,7 +1341,8 @@ class UtilizationFuser { { OpBuilder ob = OpBuilder::atBlockEnd(entry_block); for (Operation &op : task_b.getBody().front()) { - if (isa(&op)) continue; + if (isa(&op)) + continue; ob.clone(op, mapping_b); } } @@ -1374,8 +1392,7 @@ class UtilizationFuser { merged_iter_args, /*cgra_id=*/nullptr, /*kernel_name=*/nullptr, /*accelerator=*/builder.getStringAttr("neura")); - fused_kernel->setAttr("dataflow_mode", - builder.getStringAttr("predicate")); + fused_kernel->setAttr("dataflow_mode", builder.getStringAttr("predicate")); // Builds kernel entry block and block-arg mappings. Region &fused_kernel_region = fused_kernel.getBody(); @@ -1386,9 +1403,10 @@ class UtilizationFuser { kernel_body->addArgument(v.getType(), task_a.getLoc()); // Maps each original kernel's block args to the fused kernel's block args. - // iter_offset tracks where this kernel's iter_args start in the merged list. - auto buildKernelArgMapping = - [&](neura::KernelOp kernel, unsigned iter_offset) -> IRMapping { + // iter_offset tracks where this kernel's iter_args start in the merged + // list. + auto buildKernelArgMapping = [&](neura::KernelOp kernel, + unsigned iter_offset) -> IRMapping { IRMapping km; Block &src_entry = kernel.getBody().front(); unsigned src_idx = 0; @@ -1406,15 +1424,14 @@ class UtilizationFuser { // Maps iter_args. 
for (unsigned i = 0; i < kernel.getIterArgsInit().size(); ++i) { km.map(src_entry.getArgument(src_idx + i), - kernel_body->getArgument( - merged_kernel_inputs.size() + iter_offset + i)); + kernel_body->getArgument(merged_kernel_inputs.size() + + iter_offset + i)); } return km; }; - IRMapping kernel_mapping_a = buildKernelArgMapping( - cloned_kernel_a, 0); + IRMapping kernel_mapping_a = buildKernelArgMapping(cloned_kernel_a, 0); IRMapping kernel_mapping_b = buildKernelArgMapping( cloned_kernel_b, cloned_kernel_a.getIterArgsInit().size()); @@ -1422,11 +1439,13 @@ class UtilizationFuser { { OpBuilder kb = OpBuilder::atBlockEnd(kernel_body); for (auto &op : cloned_kernel_a.getBody().front().getOperations()) { - if (isa(&op)) continue; + if (isa(&op)) + continue; kb.clone(op, kernel_mapping_a); } for (auto &op : cloned_kernel_b.getBody().front().getOperations()) { - if (isa(&op)) continue; + if (isa(&op)) + continue; kb.clone(op, kernel_mapping_b); } @@ -1436,21 +1455,20 @@ class UtilizationFuser { if (auto yield_a = dyn_cast( cloned_kernel_a.getBody().front().getTerminator())) { for (Value v : yield_a.getIterArgsNext()) - merged_iter_args_next.push_back( - kernel_mapping_a.lookupOrDefault(v)); + merged_iter_args_next.push_back(kernel_mapping_a.lookupOrDefault(v)); for (Value v : yield_a.getResults()) merged_results.push_back(kernel_mapping_a.lookupOrDefault(v)); } if (auto yield_b = dyn_cast( cloned_kernel_b.getBody().front().getTerminator())) { for (Value v : yield_b.getIterArgsNext()) - merged_iter_args_next.push_back( - kernel_mapping_b.lookupOrDefault(v)); + merged_iter_args_next.push_back(kernel_mapping_b.lookupOrDefault(v)); for (Value v : yield_b.getResults()) merged_results.push_back(kernel_mapping_b.lookupOrDefault(v)); } - // Creates the combined neura.yield and preserves yield_type from kernel_a. + // Creates the combined neura.yield and preserves yield_type from + // kernel_a. 
auto fused_yield = kb.create( task_a.getLoc(), merged_iter_args_next, merged_results); if (auto yield_a = dyn_cast( @@ -1477,6 +1495,12 @@ class UtilizationFuser { // Builds and inserts the merged taskflow.yield. { + // Read outputs pass through the entry block's read-memref args. + SmallVector yield_reads; + for (size_t i = 0; i < merged_read_memrefs.size(); ++i) { + yield_reads.push_back(entry_block->getArgument(i)); + } + // Writes outputs pass through the entry block's write-memref args. SmallVector yield_writes; for (size_t i = 0; i < merged_write_memrefs.size(); ++i) { @@ -1494,13 +1518,13 @@ class UtilizationFuser { // Erases auto-inserted yield and creates the merged one. if (!entry_block->empty()) { - if (auto existing_yield = dyn_cast( - entry_block->back())) { + if (auto existing_yield = + dyn_cast(entry_block->back())) { existing_yield.erase(); } } OpBuilder tb = OpBuilder::atBlockEnd(entry_block); - tb.create(fused_task.getLoc(), yield_writes, + tb.create(fused_task.getLoc(), yield_reads, yield_writes, yield_values); } @@ -1514,27 +1538,29 @@ class UtilizationFuser { TaskGraphNode fused_node(/*id=*/0, fused_task); fused_node.trip_count = fused_trip; profile_fn(&fused_node, fused_task); - fused_task->setAttr("steps", - OpBuilder(fused_task).getI64IntegerAttr(fused_node.steps)); - fused_task->setAttr("compiled_ii", - OpBuilder(fused_task).getI64IntegerAttr(fused_node.ii)); + fused_task->setAttr( + "steps", OpBuilder(fused_task).getI64IntegerAttr(fused_node.steps)); + fused_task->setAttr( + "compiled_ii", + OpBuilder(fused_task).getI64IntegerAttr(fused_node.ii)); } - // Step 7: Replaces uses of original tasks' results. // Value outputs are ordered: task_a's value outputs first, then task_b's. 
unsigned val_offset_a = 0; unsigned val_offset_b = task_a.getValueOutputs().size(); - replaceTaskResults(task_a, fused_task, merged_write_memrefs, val_offset_a); - replaceTaskResults(task_b, fused_task, merged_write_memrefs, val_offset_b); + replaceTaskResults(task_a, fused_task, merged_read_memrefs, + merged_write_memrefs, val_offset_a); + replaceTaskResults(task_b, fused_task, merged_read_memrefs, + merged_write_memrefs, val_offset_b); // Step 8: Erases original tasks. // Verifies no remaining uses before erasing. auto verifyNoUses = [](TaskflowTaskOp task, StringRef label) { for (Value result : task->getResults()) { if (!result.use_empty()) { - llvm::errs() << "[performFusion] ERROR: " << label - << " result #" << result.cast().getResultNumber() + llvm::errs() << "[performFusion] ERROR: " << label << " result #" + << result.cast().getResultNumber() << " still has uses:\n"; for (auto &use : result.getUses()) { llvm::errs() << " used by: "; @@ -1555,7 +1581,8 @@ class UtilizationFuser { // Finds the index of a value in a list. unsigned findOperandIndex(const SmallVector &list, Value v) { for (unsigned i = 0; i < list.size(); ++i) { - if (list[i] == v) return i; + if (list[i] == v) + return i; } llvm_unreachable("Value not found in operand list"); } @@ -1564,8 +1591,17 @@ class UtilizationFuser { // fused task. Handles both write outputs (memrefs) and value outputs // (reductions, iter_args). void replaceTaskResults(TaskflowTaskOp orig_task, TaskflowTaskOp fused_task, + const SmallVector &merged_read_memrefs, const SmallVector &merged_write_memrefs, unsigned value_output_offset) { + // Read outputs: maps by matching the original read memref to its + // position in the merged read memrefs list. 
+ for (unsigned i = 0; i < orig_task.getReadOutputs().size(); ++i) { + Value orig_result = orig_task.getReadOutputs()[i]; + Value orig_read = orig_task.getReadMemrefs()[i]; + unsigned fused_idx = findOperandIndex(merged_read_memrefs, orig_read); + orig_result.replaceAllUsesWith(fused_task.getReadOutputs()[fused_idx]); + } // Writes outputs: maps by matching the original write memref to its // position in the merged write memrefs list. for (unsigned i = 0; i < orig_task.getWriteOutputs().size(); ++i) { @@ -1595,7 +1631,8 @@ struct ResourceAwareTaskOptimizationPass ResourceAwareTaskOptimizationPass) ResourceAwareTaskOptimizationPass() = default; - ResourceAwareTaskOptimizationPass(const ResourceAwareTaskOptimizationPass &other) + ResourceAwareTaskOptimizationPass( + const ResourceAwareTaskOptimizationPass &other) : PassWrapper(other) {} StringRef getArgument() const override { @@ -1647,8 +1684,7 @@ struct ResourceAwareTaskOptimizationPass bool use_analytical = (estimationMode.getValue() == "analytical"); - llvm::errs() << "=== ResourceAwareTaskOptimization on " - << func.getName() + llvm::errs() << "=== ResourceAwareTaskOptimization on " << func.getName() << " (estimation-mode=" << estimationMode.getValue() << ") ===\n"; @@ -1670,12 +1706,12 @@ struct ResourceAwareTaskOptimizationPass "Number of tasks exceeds 4x4 CGRA grid capacity! 
" "Reduce task count via streaming fusion or increase grid size."); - llvm::errs() << "[ResourceAware] Iteration " << outer << ": " - << num_tasks << " tasks\n"; + llvm::errs() << "[ResourceAware] Iteration " << outer << ": " << num_tasks + << " tasks\n"; for (auto &node : graph.nodes) { - llvm::errs() << " Task " << node->id << " (" - << node->op.getTaskName() << "): trip_count=" - << node->trip_count << ", cgra_count=" << node->cgra_count + llvm::errs() << " Task " << node->id << " (" << node->op.getTaskName() + << "): trip_count=" << node->trip_count + << ", cgra_count=" << node->cgra_count << ", est_latency=" << node->estimatedLatency() << "\n"; } @@ -1686,7 +1722,7 @@ struct ResourceAwareTaskOptimizationPass // lambda so fused tasks get real profiling. In analytical mode, the // mapper is skipped entirely (only ResMII/RecMII estimates are used). auto profile_fn = [&graph, use_analytical](TaskGraphNode *node, - TaskflowTaskOp task) { + TaskflowTaskOp task) { graph.profileTaskPublic(node, task, /*skip_mapper=*/use_analytical); }; bool fuse_changed = fuser.fuse(func, graph, profile_fn); @@ -1704,7 +1740,7 @@ struct ResourceAwareTaskOptimizationPass // Balance probes use analytical-only profiling by default. 
bool balance_skip = use_analytical || balanceSkipMapper.getValue(); auto balance_profile_fn = [&graph, balance_skip](TaskGraphNode *node, - TaskflowTaskOp task) { + TaskflowTaskOp task) { graph.profileTaskPublic(node, task, /*skip_mapper=*/balance_skip); }; PipelineBalancer balancer; @@ -1714,8 +1750,8 @@ struct ResourceAwareTaskOptimizationPass if (balance_changed || fuse_changed) { for (auto &node : graph.nodes) { OpBuilder b(node->op); - node->op->setAttr( - "cgra_count", b.getI32IntegerAttr(node->cgra_count)); + node->op->setAttr("cgra_count", + b.getI32IntegerAttr(node->cgra_count)); if (node->ii != kUnprofiled) { node->op->setAttr("compiled_ii", b.getI32IntegerAttr(node->ii)); } @@ -1757,10 +1793,8 @@ struct ResourceAwareTaskOptimizationPass node->shape = pickBestShape(node->cgra_count); node->op->setAttr("cgra_count", b.getI32IntegerAttr(node->cgra_count)); - node->op->setAttr("compiled_ii", - b.getI32IntegerAttr(node->ii)); - node->op->setAttr("steps", - b.getI32IntegerAttr(node->steps)); + node->op->setAttr("compiled_ii", b.getI32IntegerAttr(node->ii)); + node->op->setAttr("steps", b.getI32IntegerAttr(node->steps)); node->op->setAttr("trip_count", b.getI32IntegerAttr(node->trip_count)); // Writes tile_shape attribute: simple "NxM" bounding-box string. @@ -1772,7 +1806,8 @@ struct ResourceAwareTaskOptimizationPass } } - // Performs final validation and tile occupation summary with visual 4x4 grid. + // Performs final validation and tile occupation summary with visual 4x4 + // grid. 
{ TaskDependencyGraph final_graph; final_graph.build(func, use_analytical); @@ -1794,21 +1829,23 @@ struct ResourceAwareTaskOptimizationPass for (auto &node : final_graph.nodes) { auto shape = pickBestShape(node->cgra_count); int tile_rows = shape.rows * neura::getArchitecture().getPerCgraRows(); - int tile_cols = shape.cols * neura::getArchitecture().getPerCgraColumns(); + int tile_cols = + shape.cols * neura::getArchitecture().getPerCgraColumns(); - // Per-task grid (shape.rows x shape.cols bbox, filled up to cgra_count). + // Per-task grid (shape.rows x shape.cols bbox, filled up to + // cgra_count). llvm::errs() << "\n [" << task_idx << "] " << node->op.getTaskName() << " cgra_count=" << node->cgra_count << " shape=" << shape.describe(node->cgra_count) << " tile_array=" << tile_rows << "x" << tile_cols - << " ii=" << node->ii - << " steps=" << node->steps + << " ii=" << node->ii << " steps=" << node->steps << " trip_count=" << node->trip_count << "\n"; // Draws a per-task bounding-box grid (shape.rows x shape.cols). int remaining = node->cgra_count; - llvm::errs() << " +" ; - for (int c = 0; c < shape.cols; ++c) llvm::errs() << "---+"; + llvm::errs() << " +"; + for (int c = 0; c < shape.cols; ++c) + llvm::errs() << "---+"; llvm::errs() << "\n"; for (int r = 0; r < shape.rows; ++r) { llvm::errs() << " |"; @@ -1822,19 +1859,24 @@ struct ResourceAwareTaskOptimizationPass } llvm::errs() << "\n"; llvm::errs() << " +"; - for (int c = 0; c < shape.cols; ++c) llvm::errs() << "---+"; + for (int c = 0; c < shape.cols; ++c) + llvm::errs() << "---+"; llvm::errs() << "\n"; } // Places onto combined grid (pack sequentially). int placed = 0; - for (int r = next_row; r < kCgraGridRows && placed < node->cgra_count; ++r) { + for (int r = next_row; r < kCgraGridRows && placed < node->cgra_count; + ++r) { for (int c = (r == next_row ? 
next_col : 0); c < kCgraGridCols && placed < node->cgra_count; ++c) { combined_grid[r][c] = task_idx; next_row = r; next_col = c + 1; - if (next_col >= kCgraGridCols) { next_col = 0; next_row = r + 1; } + if (next_col >= kCgraGridCols) { + next_col = 0; + next_row = r + 1; + } ++placed; } } @@ -1845,7 +1887,8 @@ struct ResourceAwareTaskOptimizationPass llvm::errs() << "\n Combined 4x" << kCgraGridCols << " Grid" << " (" << final_total << "/" << kTotalCGRAs << " used):\n"; llvm::errs() << " +"; - for (int c = 0; c < kCgraGridCols; ++c) llvm::errs() << "---+"; + for (int c = 0; c < kCgraGridCols; ++c) + llvm::errs() << "---+"; llvm::errs() << "\n"; for (int r = 0; r < kCgraGridRows; ++r) { llvm::errs() << " |"; @@ -1858,7 +1901,8 @@ struct ResourceAwareTaskOptimizationPass } llvm::errs() << "\n"; llvm::errs() << " +"; - for (int c = 0; c < kCgraGridCols; ++c) llvm::errs() << "---+"; + for (int c = 0; c < kCgraGridCols; ++c) + llvm::errs() << "---+"; llvm::errs() << "\n"; } llvm::errs() << " (" << (kTotalCGRAs - final_total) << " free)\n"; diff --git a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir index b2016317..84e1cea9 100644 --- a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir @@ -11,7 +11,7 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %read_outputs:2, 
%write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> @@ -19,7 +19,7 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 // CHECK-NEXT: %2 = arith.addf %0, %1 : f32 // CHECK-NEXT: affine.store %2, %arg4[%arg5] : memref<16xf32> // CHECK-NEXT: } -// CHECK-NEXT: taskflow.yield writes(%arg4 : memref<16xf32>) +// CHECK-NEXT: taskflow.yield reads(%arg2, %arg3 : memref<16xf32>, memref<16xf32>) writes(%arg4 : memref<16xf32>) // CHECK-NEXT: } // CHECK-NEXT: return %write_outputs : memref<16xf32> // CHECK-NEXT: } diff --git a/test/e2e/tosa_e2e.mlir b/test/e2e/tosa_e2e.mlir index c6d5d310..5ef3babd 100644 --- a/test/e2e/tosa_e2e.mlir +++ b/test/e2e/tosa_e2e.mlir @@ -11,7 +11,7 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf // CHECK: func.func @test_e2e(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>) { +// CHECK-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : 
memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> @@ -20,7 +20,7 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf // CHECK-NEXT: %3 = arith.mulf %2, %2 : f32 // CHECK-NEXT: affine.store %3, %arg4[%arg5] : memref<16xf32> // CHECK-NEXT: } -// CHECK-NEXT: taskflow.yield writes(%arg4 : memref<16xf32>) +// CHECK-NEXT: taskflow.yield reads(%arg2, %arg3 : memref<16xf32>, memref<16xf32>) writes(%arg4 : memref<16xf32>) // CHECK-NEXT: } // CHECK-NEXT: return %write_outputs : memref<16xf32> // CHECK-NEXT: } diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index f70d99ca..f4122392 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -92,7 +92,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (i32) { +// TASKFLOW-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // TASKFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // TASKFLOW-NEXT: %0 = affine.for %arg6 = 0 to 
32 iter_args(%arg7 = %arg5) -> (i32) { // TASKFLOW-NEXT: %1 = affine.load %arg3[%arg6] : memref @@ -101,7 +101,7 @@ module attributes {} { // TASKFLOW-NEXT: %4 = arith.addi %arg7, %3 : i32 // TASKFLOW-NEXT: affine.yield %4 : i32 // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield values(%0 : i32) +// TASKFLOW-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%0 : i32) // TASKFLOW-NEXT: } // TASKFLOW-NEXT: return %value_outputs : i32 // TASKFLOW-NEXT: } @@ -110,7 +110,7 @@ module attributes {} { // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 -// HYPERBLOCK-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (i32) { +// HYPERBLOCK-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // HYPERBLOCK-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // HYPERBLOCK-NEXT: %1 = "taskflow.hyperblock"(%0, %arg5) <{operandSegmentSizes = array}> ({ @@ -121,7 +121,7 @@ module attributes {} { // HYPERBLOCK-NEXT: %5 = arith.addi %arg7, %4 : i32 // HYPERBLOCK-NEXT: taskflow.hyperblock.yield iter_args_next(%5 : i32) results(%5 : i32) // HYPERBLOCK-NEXT: }) : (index, i32) -> i32 -// HYPERBLOCK-NEXT: taskflow.yield values(%1 : i32) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%1 : i32) // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: return %value_outputs : i32 // HYPERBLOCK-NEXT: } @@ -130,7 +130,7 @@ module 
attributes {} { // KERNEL: module { // KERNEL-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 -// KERNEL-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (i32) { +// KERNEL-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // KERNEL-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // KERNEL-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) { @@ -142,7 +142,7 @@ module attributes {} { // KERNEL-NEXT: %6 = arith.addi %arg8, %5 : i32 // KERNEL-NEXT: neura.yield iter_args_next(%6 : i32) results(%6 : i32) // KERNEL-NEXT: } : i32 -// KERNEL-NEXT: taskflow.yield values(%1 : i32) +// KERNEL-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%1 : i32) // KERNEL-NEXT: } // KERNEL-NEXT: return %value_outputs : i32 // KERNEL-NEXT: } @@ -151,7 +151,7 @@ module attributes {} { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (i32) { +// NEURA-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : 
memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // NEURA-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura"} { @@ -163,7 +163,7 @@ module attributes {} { // NEURA-NEXT: %6 = "neura.add"(%arg8, %5) : (i32, i32) -> i32 // NEURA-NEXT: neura.yield iter_args_next(%6 : i32) results(%6 : i32) // NEURA-NEXT: } : i32 -// NEURA-NEXT: taskflow.yield values(%1 : i32) +// NEURA-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%1 : i32) // NEURA-NEXT: } // NEURA-NEXT: return %value_outputs : i32 // NEURA-NEXT: } @@ -172,7 +172,7 @@ module attributes {} { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (i32) { +// DATAFLOW-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // DATAFLOW-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : 
i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { @@ -192,7 +192,7 @@ module attributes {} { // DATAFLOW-NEXT: neura.return_value %12 : !neura.data // DATAFLOW-NEXT: neura.yield // DATAFLOW-NEXT: } : i32 -// DATAFLOW-NEXT: taskflow.yield values(%1 : i32) +// DATAFLOW-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%1 : i32) // DATAFLOW-NEXT: } // DATAFLOW-NEXT: return %value_outputs : i32 // DATAFLOW-NEXT: } @@ -201,7 +201,7 @@ module attributes {} { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (i32) { +// MAPPED-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // MAPPED-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { @@ -233,7 +233,7 @@ module attributes {} { // MAPPED-NEXT: neura.return_value %24 : !neura.data {dfg_id = 25 : i32, mapping_locs = [{id = 8 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 4 : i32, x = 
0 : i32, y = 2 : i32}]} // MAPPED-NEXT: neura.yield {dfg_id = 3 : i32} // MAPPED-NEXT: } : i32 -// MAPPED-NEXT: taskflow.yield values(%1 : i32) +// MAPPED-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%1 : i32) // MAPPED-NEXT: } // MAPPED-NEXT: return %value_outputs : i32 // MAPPED-NEXT: } diff --git a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir index 1802e538..cfcae914 100644 --- a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir +++ b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir @@ -55,7 +55,7 @@ module { func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { %c0_i32 = arith.constant 0 : i32 - %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { + %read_outputs:2, %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): %1 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) { ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -68,7 +68,7 @@ module { } neura.yield results(%0 : i32) } : i32 - taskflow.yield values(%1 : i32) + taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%1 : i32) } return %value_outputs : i32 } @@ -77,7 +77,7 @@ module { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { +// NEURA-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_o 
read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { // NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura"} { // NEURA-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -101,7 +101,7 @@ module { // NEURA-NEXT: ^bb3: // pred: ^bb1 // NEURA-NEXT: neura.yield results(%6 : i32) // NEURA-NEXT: } : i32 -// NEURA-NEXT: taskflow.yield values(%0 : i32) +// NEURA-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%0 : i32) // NEURA-NEXT: } // NEURA-NEXT: return %value_outputs : i32 // NEURA-NEXT: } @@ -111,7 +111,7 @@ module { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { +// DATAFLOW-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { // DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { // DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -140,7 +140,7 @@ module { // DATAFLOW-NEXT: neura.ctrl_mov %18 -> %5 : !neura.data !neura.data // DATAFLOW-NEXT: neura.yield // DATAFLOW-NEXT: } : i32 -// DATAFLOW-NEXT: taskflow.yield values(%0 : i32) +// DATAFLOW-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%0 : i32) // DATAFLOW-NEXT: } // DATAFLOW-NEXT: return %value_outputs : i32 // DATAFLOW-NEXT: } @@ -150,7 +150,7 @@ 
module { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (i32) { +// MAPPED-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { // MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -193,7 +193,7 @@ module { // MAPPED-NEXT: neura.ctrl_mov %32 -> %3 {dfg_id = 37 : i32, mapping_locs = [{id = 13 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}]} : !neura.data !neura.data // MAPPED-NEXT: neura.yield {dfg_id = 4 : i32} // MAPPED-NEXT: } : i32 -// MAPPED-NEXT: taskflow.yield values(%0 : i32) +// MAPPED-NEXT: taskflow.yield reads(%arg3, %arg4 : memref, memref) values(%0 : i32) // MAPPED-NEXT: } // MAPPED-NEXT: return %value_outputs : i32 // MAPPED-NEXT: } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 364bcadc..5c840ce5 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -203,7 +203,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) // 
TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { +// TASKFLOW-NEXT: %read_outputs, %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { // TASKFLOW-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): // TASKFLOW-NEXT: affine.for %arg5 = 0 to 4 { // TASKFLOW-NEXT: %1 = arith.index_cast %arg5 : index to i32 @@ -218,7 +218,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg1 : memref) +// TASKFLOW-NEXT: taskflow.yield reads(%arg0 : memref<4x8xi32>) writes(%arg1 : memref) // TASKFLOW-NEXT: } // TASKFLOW-NEXT: %0 = affine.load %write_outputs_1[] : memref // TASKFLOW-NEXT: return %0 : i32 @@ -268,7 +268,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { +// KERNEL-NEXT: %read_outputs, %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, 
%value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { // KERNEL-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): // KERNEL-NEXT: affine.for %arg5 = 0 to 4 { // KERNEL-NEXT: neura.kernel inputs(%arg0, %arg5, %arg3, %arg1, %arg4 : memref<4x8xi32>, index, i32, memref, i32) { @@ -295,7 +295,7 @@ module attributes {} { // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg1 : memref) +// KERNEL-NEXT: taskflow.yield reads(%arg0 : memref<4x8xi32>) writes(%arg1 : memref) // KERNEL-NEXT: } // KERNEL-NEXT: %0 = affine.load %write_outputs_1[] : memref // KERNEL-NEXT: return %0 : i32 @@ -339,7 +339,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index) -> () // HYPERBLOCK-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref) { +// HYPERBLOCK-NEXT: %read_outputs, %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: 
"taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ @@ -368,7 +368,7 @@ module attributes {} { // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg1 : memref) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg0 : memref<4x8xi32>) writes(%arg1 : memref) // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[] : memref // HYPERBLOCK-NEXT: return %0 : i32 diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 42f99361..8281b313 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -208,7 +208,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref) { +// TASKFLOW-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref): // TASKFLOW-NEXT: affine.for %arg12 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg13 = 0 to 8 { @@ -218,9 +218,9 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg11 : memref) +// TASKFLOW-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : 
memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // TASKFLOW-NEXT: affine.for %arg13 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg14 = 0 to 8 { @@ -232,9 +232,9 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg12 : memref) +// TASKFLOW-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: %read_outputs_2:3, %write_outputs_3 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_1, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // TASKFLOW-NEXT: affine.for %arg14 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg15 = 0 to 8 { @@ -248,9 +248,9 @@ module attributes {} { // TASKFLOW-NEXT: } // 
TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg13 : memref) +// TASKFLOW-NEXT: taskflow.yield reads(%arg10, %arg11, %arg13 : memref, memref, memref) writes(%arg13 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %write_outputs_2 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref) { +// TASKFLOW-NEXT: %read_outputs_4, %write_outputs_5 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref): // TASKFLOW-NEXT: affine.for %arg12 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg13 = 0 to 7 { @@ -258,9 +258,9 @@ module attributes {} { // TASKFLOW-NEXT: affine.store %1, %arg11[%arg13] : memref // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg11 : memref) +// TASKFLOW-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %write_outputs_3 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref) { +// TASKFLOW-NEXT: %read_outputs_6:2, %write_outputs_7 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_5 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // TASKFLOW-NEXT: affine.for %arg13 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg14 = 0 to 9 { @@ -270,16 +270,16 @@ module attributes {} { // TASKFLOW-NEXT: 
affine.store %3, %arg12[%arg14] : memref // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg12 : memref) +// TASKFLOW-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %0 = affine.load %write_outputs_1[0] : memref +// TASKFLOW-NEXT: %0 = affine.load %write_outputs_3[0] : memref // TASKFLOW-NEXT: return %0 : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // STREAM: module { // STREAM-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// STREAM-NEXT: %write_outputs = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref) { +// STREAM-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // STREAM-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // STREAM-NEXT: affine.for %arg13 = 0 to 4 { // STREAM-NEXT: affine.for %arg14 = 0 to 8 { @@ -291,9 +291,9 @@ module attributes {} { // STREAM-NEXT: } // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg12 : memref) +// STREAM-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // STREAM-NEXT: } -// STREAM-NEXT: %write_outputs_0 = taskflow.task @Task_0_Task_2_fused read_memrefs(%arg0, %write_outputs, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg0, %arg6, %arg9 : memref, memref, 
memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref) { +// STREAM-NEXT: %read_outputs_0:3, %write_outputs_1 = taskflow.task @Task_0_Task_2_fused read_memrefs(%arg0, %write_outputs, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg0, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // STREAM-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // STREAM-NEXT: affine.for %arg14 = 0 to 4 { // STREAM-NEXT: affine.for %arg15 = 0 to 8 { @@ -307,9 +307,9 @@ module attributes {} { // STREAM-NEXT: } // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg12 : memref) +// STREAM-NEXT: taskflow.yield reads(%arg10, %arg11, %arg12 : memref, memref, memref) writes(%arg12 : memref) // STREAM-NEXT: } -// STREAM-NEXT: %write_outputs_1 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref) { +// STREAM-NEXT: %read_outputs_2, %write_outputs_3 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { // STREAM-NEXT: ^bb0(%arg10: memref, %arg11: memref): // STREAM-NEXT: affine.for %arg12 = 0 to 4 { // STREAM-NEXT: affine.for %arg13 = 0 to 7 { @@ -317,9 +317,9 @@ module attributes {} { // STREAM-NEXT: affine.store %1, %arg11[%arg13] : memref // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg11 : memref) +// STREAM-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // STREAM-NEXT: } -// STREAM-NEXT: %write_outputs_2 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_1 : memref, memref) write_memrefs(%arg8 : memref) 
[original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref) { +// STREAM-NEXT: %read_outputs_4:2, %write_outputs_5 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_3 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // STREAM-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // STREAM-NEXT: affine.for %arg13 = 0 to 4 { // STREAM-NEXT: affine.for %arg14 = 0 to 9 { @@ -329,16 +329,16 @@ module attributes {} { // STREAM-NEXT: affine.store %3, %arg12[%arg14] : memref // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg12 : memref) +// STREAM-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // STREAM-NEXT: } -// STREAM-NEXT: %0 = affine.load %write_outputs_0[0] : memref +// STREAM-NEXT: %0 = affine.load %write_outputs_1[0] : memref // STREAM-NEXT: return %0 : i32 // STREAM-NEXT: } // STREAM-NEXT: } // KERNEL: module { // KERNEL-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// KERNEL-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref) { +// KERNEL-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref): // KERNEL-NEXT: affine.for %arg12 = 0 
to 4 { // KERNEL-NEXT: affine.for %arg13 = 0 to 8 { @@ -355,9 +355,9 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: } // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg11 : memref) +// KERNEL-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref) { +// KERNEL-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // KERNEL-NEXT: affine.for %arg13 = 0 to 4 { // KERNEL-NEXT: affine.for %arg14 = 0 to 8 { @@ -376,7 +376,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: } // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg12 : memref) +// KERNEL-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // KERNEL-NEXT: } // KERNEL-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): @@ -444,7 +444,7 @@ module attributes {} { // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, 
%arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -455,9 +455,9 @@ module attributes {} { // HYPERBLOCK-NEXT: memref.store %4, %arg11[%arg14] : memref // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index, index, index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg11 : memref) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : 
index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -470,7 +470,7 @@ module attributes {} { // HYPERBLOCK-NEXT: memref.store %6, %arg12[%arg15] : memref // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index, index, index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg12 : memref) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): From eb5a6779fe24264aa0d2df3a58e1361c0125c4a0 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 6 Mar 2026 23:46:00 +0800 Subject: [PATCH 4/7] modify test --- .../taskflow/multi-nested/multi-nested.mlir | 32 +++--- .../parallel-nested/parallel-nested.mlir | 16 +-- .../taskflow/resnet/simple_resnet_tosa.mlir | 98 +++++++++---------- .../resource-heavy/resource-heavy.mlir | 4 +- 4 files changed, 75 insertions(+), 75 deletions(-) diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 8281b313..71127648 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -378,7 +378,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, 
%write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref) { +// KERNEL-NEXT: %read_outputs_2:3, %write_outputs_3 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_1, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // KERNEL-NEXT: affine.for %arg14 = 0 to 4 { // KERNEL-NEXT: affine.for %arg15 = 0 to 8 { @@ -399,9 +399,9 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: } // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg13 : memref) +// KERNEL-NEXT: taskflow.yield reads(%arg10, %arg11, %arg13 : memref, memref, memref) writes(%arg13 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %write_outputs_2 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref) { +// KERNEL-NEXT: %read_outputs_4, %write_outputs_5 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref): // KERNEL-NEXT: affine.for %arg12 = 0 to 4 { // KERNEL-NEXT: neura.kernel inputs(%arg10, %arg12, %arg11 : memref, index, memref) { @@ -416,9 +416,9 @@ module attributes {} { // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg11 : memref) +// KERNEL-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // KERNEL-NEXT: } -// 
KERNEL-NEXT: %write_outputs_3 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref) { +// KERNEL-NEXT: %read_outputs_6:2, %write_outputs_7 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_5 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // KERNEL-NEXT: affine.for %arg13 = 0 to 4 { // KERNEL-NEXT: neura.kernel inputs(%arg10, %arg13, %arg11, %arg12 : memref, index, memref, memref) { @@ -435,9 +435,9 @@ module attributes {} { // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg12 : memref) +// KERNEL-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %0 = affine.load %write_outputs_1[0] : memref +// KERNEL-NEXT: %0 = affine.load %write_outputs_3[0] : memref // KERNEL-NEXT: return %0 : i32 // KERNEL-NEXT: } // KERNEL-NEXT:} @@ -472,7 +472,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index, index, index) -> () // HYPERBLOCK-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_0, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: %read_outputs_2:3, %write_outputs_3 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_1, %arg9 : memref, memref, memref) 
write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -485,13 +485,13 @@ module attributes {} { // HYPERBLOCK-NEXT: %c0 = arith.constant 0 : index // HYPERBLOCK-NEXT: %7 = memref.load %arg13[%c0] : memref // HYPERBLOCK-NEXT: %8 = arith.addi %7, %6 : i32 -// HYPERBLOCK-NEXT: %c0_4 = arith.constant 0 : index -// HYPERBLOCK-NEXT: memref.store %8, %arg13[%c0_4] : memref +// HYPERBLOCK-NEXT: %c0_8 = arith.constant 0 : index +// HYPERBLOCK-NEXT: memref.store %8, %arg13[%c0_8] : memref // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg13 : memref) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg10, %arg11, %arg13 : memref, memref, memref) writes(%arg13 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs_2 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: %read_outputs_4, %write_outputs_5 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: 
%2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index @@ -501,9 +501,9 @@ module attributes {} { // HYPERBLOCK-NEXT: memref.store %3, %arg11[%arg13] : memref // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg11 : memref) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs_3 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_2 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref) { +// HYPERBLOCK-NEXT: %read_outputs_6:2, %write_outputs_7 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_5 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index @@ -515,9 +515,9 @@ module attributes {} { // HYPERBLOCK-NEXT: memref.store %5, %arg12[%arg14] : memref // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index, index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg12 : memref) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[0] : memref +// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_3[0] : memref // HYPERBLOCK-NEXT: return %0 : i32 // 
HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT:} diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 3d63f767..6910c0ba 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -98,16 +98,16 @@ module { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>) { +// TASKFLOW-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>, memref<16xf32>) { // TASKFLOW-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: memref<16xf32>, %arg7: f32): // TASKFLOW-NEXT: affine.for %arg8 = 0 to 16 { // TASKFLOW-NEXT: %0 = affine.load %arg6[%arg8] : memref<16xf32> // TASKFLOW-NEXT: %1 = arith.mulf %0, %arg7 : f32 // TASKFLOW-NEXT: affine.store %1, %arg6[%arg8] : memref<16xf32> // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg6 : memref<16xf32>) +// TASKFLOW-NEXT: taskflow.yield reads(%arg6 : memref<16xf32>) writes(%arg6 : memref<16xf32>) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : 
memref<8x8xf32>)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>) { +// TASKFLOW-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) { // TASKFLOW-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): // TASKFLOW-NEXT: affine.for %arg8 = 0 to 8 { // TASKFLOW-NEXT: affine.for %arg9 = 0 to 8 { @@ -117,7 +117,7 @@ module { // TASKFLOW-NEXT: affine.store %2, %arg7[%arg8, %arg9] : memref<8x8xf32> // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg7 : memref<8x8xf32>) +// TASKFLOW-NEXT: taskflow.yield reads(%arg5, %arg6 : memref<8x8xf32>, memref<8x8xf32>) writes(%arg7 : memref<8x8xf32>) // TASKFLOW-NEXT: } // TASKFLOW-NEXT: return // TASKFLOW-NEXT: } @@ -125,7 +125,7 @@ module { // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>) { +// HYPERBLOCK-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>, memref<16xf32>) { // HYPERBLOCK-NEXT: 
^bb0(%arg5: memref<16xf32>, %arg6: memref<16xf32>, %arg7: f32): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index // HYPERBLOCK-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ @@ -135,9 +135,9 @@ module { // HYPERBLOCK-NEXT: memref.store %2, %arg6[%arg8] : memref<16xf32> // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg6 : memref<16xf32>) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg6 : memref<16xf32>) writes(%arg6 : memref<16xf32>) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs_0 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>) { +// HYPERBLOCK-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) { // HYPERBLOCK-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index // HYPERBLOCK-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -149,7 +149,7 @@ module { // HYPERBLOCK-NEXT: memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32> // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index, index) -> () -// 
HYPERBLOCK-NEXT: taskflow.yield writes(%arg7 : memref<8x8xf32>) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg5, %arg6 : memref<8x8xf32>, memref<8x8xf32>) writes(%arg7 : memref<8x8xf32>) // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: return // HYPERBLOCK-NEXT: } diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir index f1741b0a..a0d5d463 100644 --- a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir +++ b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -249,7 +249,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 // KERNEL-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 // KERNEL-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// KERNEL-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<1x64x8x8xf32>) write_memrefs(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<1x64x8x8xf32>) write_memrefs(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -265,7 +265,7 @@ module 
attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x8x8x64xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> // KERNEL-NEXT: %write_outputs_4 = taskflow.task @Task_1 write_memrefs(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { @@ -303,7 +303,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %write_outputs_7 = taskflow.task @Task_3 read_memrefs(%write_outputs_4, %write_outputs_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %read_outputs_7:2, %write_outputs_8 = taskflow.task @Task_3 read_memrefs(%write_outputs_4, %write_outputs_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: 
memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -330,10 +330,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %19, %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %write_outputs_9 = taskflow.task @Task_4 read_memrefs(%write_outputs_7 : memref<1x8x8x64xf32>) write_memrefs(%alloc_8 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_8 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %read_outputs_10, %write_outputs_11 = taskflow.task @Task_4 read_memrefs(%write_outputs_8 : memref<1x8x8x64xf32>) write_memrefs(%alloc_9 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_9 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 
0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -349,10 +349,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x64x8x8xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x8x8x64xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %alloc_10 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %write_outputs_11 = taskflow.task @Task_5 read_memrefs(%write_outputs_9 : memref<1x64x8x8xf32>) write_memrefs(%alloc_10 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_8 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_10 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %read_outputs_13, %write_outputs_14 = taskflow.task @Task_5 read_memrefs(%write_outputs_11 : memref<1x64x8x8xf32>) write_memrefs(%alloc_12 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_9 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter 
parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -370,10 +370,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %10, %arg8[%4, %5, %6, %7] : memref<1x64x8x8xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// KERNEL-NEXT: %write_outputs_13 = taskflow.task @Task_6 read_memrefs(%write_outputs_11 : memref<1x64x8x8xf32>) write_memrefs(%alloc_12 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_10 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %read_outputs_16, %write_outputs_17 = taskflow.task @Task_6 read_memrefs(%write_outputs_14 : memref<1x64x8x8xf32>) write_memrefs(%alloc_15 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_12 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_15 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -389,10 +389,10 @@ module attributes {torch.debug_module_name = 
"SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x8x8x64xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> -// KERNEL-NEXT: %write_outputs_15 = taskflow.task @Task_7 write_memrefs(%alloc_14 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_14 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// KERNEL-NEXT: %alloc_18 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// KERNEL-NEXT: %write_outputs_19 = taskflow.task @Task_7 write_memrefs(%alloc_18 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_18 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index @@ -409,8 +409,8 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// KERNEL-NEXT: %write_outputs_17 = taskflow.task @Task_8 write_memrefs(%alloc_16 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_16 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> 
(memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %alloc_20 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// KERNEL-NEXT: %write_outputs_21 = taskflow.task @Task_8 write_memrefs(%alloc_20 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_20 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -427,7 +427,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %write_outputs_18 = taskflow.task @Task_9 read_memrefs(%write_outputs_15, %write_outputs_17 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_17 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_14, %alloc_16 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_16 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %read_outputs_22:2, %write_outputs_23 = taskflow.task @Task_9 read_memrefs(%write_outputs_19, %write_outputs_21 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_21 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_18, %alloc_20 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_20 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, 
memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -454,10 +454,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %19, %arg6[%7, %8, %9, %10] : memref<1x8x8x64xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x8x8x64xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %write_outputs_20 = taskflow.task @Task_10 read_memrefs(%write_outputs_18 : memref<1x8x8x64xf32>) write_memrefs(%alloc_19 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_16 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_19 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %alloc_24 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %read_outputs_25, %write_outputs_26 = taskflow.task @Task_10 read_memrefs(%write_outputs_23 : memref<1x8x8x64xf32>) write_memrefs(%alloc_24 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_20 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_24 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>): // KERNEL-NEXT: %0 = 
taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -473,10 +473,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %8, %arg4[%4, %5, %6, %7] : memref<1x64x8x8xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x8x8x64xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %write_outputs_22 = taskflow.task @Task_11 read_memrefs(%write_outputs_20, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) write_memrefs(%alloc_21 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_19, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_21 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %alloc_27 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %read_outputs_28:2, %write_outputs_29 = taskflow.task @Task_11 read_memrefs(%write_outputs_26, %read_outputs : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) write_memrefs(%alloc_27 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_24, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_27 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: memref<1x64x8x8xf32>): // KERNEL-NEXT: %0 = 
taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -494,10 +494,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %10, %arg6[%4, %5, %6, %7] : memref<1x64x8x8xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg3 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1, %arg2 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) writes(%arg3 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %alloc_23 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %write_outputs_24 = taskflow.task @Task_12 read_memrefs(%write_outputs_22 : memref<1x64x8x8xf32>) write_memrefs(%alloc_23 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_21 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_23 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %alloc_30 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// KERNEL-NEXT: %read_outputs_31, %write_outputs_32 = taskflow.task @Task_12 read_memrefs(%write_outputs_29 : memref<1x64x8x8xf32>) write_memrefs(%alloc_30 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_27 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_30 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = 
"root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -515,9 +515,9 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: memref.store %10, %arg8[%4, %5, %6, %7] : memref<1x64x8x8xf32> // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: return %write_outputs_24 : memref<1x64x8x8xf32> +// KERNEL-NEXT: return %write_outputs_32 : memref<1x64x8x8xf32> // KERNEL-NEXT: } // KERNEL-NEXT: } @@ -531,7 +531,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 // STREAM-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 // STREAM-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// STREAM-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<1x64x8x8xf32>) write_memrefs(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x8x8x64xf32>) { +// STREAM-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<1x64x8x8xf32>) write_memrefs(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 8 
{ @@ -543,7 +543,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg2 : memref<1x8x8x64xf32>) +// STREAM-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) // STREAM-NEXT: } // STREAM-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> // STREAM-NEXT: %write_outputs_4 = taskflow.task @Task_1 write_memrefs(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { @@ -573,7 +573,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %write_outputs_7 = taskflow.task @Task_3 read_memrefs(%write_outputs_4, %write_outputs_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// STREAM-NEXT: %read_outputs_7:2, %write_outputs_8 = taskflow.task @Task_3 read_memrefs(%write_outputs_4, %write_outputs_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, 
%arg3: memref<1x8x8x64xf32>, %arg4: f32): // STREAM-NEXT: affine.for %arg5 = 0 to 1 { // STREAM-NEXT: affine.for %arg6 = 0 to 8 { @@ -594,10 +594,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg3 : memref<1x8x8x64xf32>) +// STREAM-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// STREAM-NEXT: %write_outputs_9 = taskflow.task @Task_4_Task_5_fused read_memrefs(%write_outputs_7 : memref<1x8x8x64xf32>) write_memrefs(%alloc_8 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_8 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>) { +// STREAM-NEXT: %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// STREAM-NEXT: %read_outputs_10, %write_outputs_11 = taskflow.task @Task_4_Task_5_fused read_memrefs(%write_outputs_8 : memref<1x8x8x64xf32>) write_memrefs(%alloc_9 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_9 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): // STREAM-NEXT: affine.for %arg5 = 0 to 1 { // STREAM-NEXT: affine.for %arg6 = 0 to 64 { @@ -611,10 +611,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg2 : memref<1x64x8x8xf32>) +// STREAM-NEXT: taskflow.yield reads(%arg1 : 
memref<1x8x8x64xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %alloc_10 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// STREAM-NEXT: %write_outputs_11 = taskflow.task @Task_6 read_memrefs(%write_outputs_9 : memref<1x64x8x8xf32>) write_memrefs(%alloc_10 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_8 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_10 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x8x8x64xf32>) { +// STREAM-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// STREAM-NEXT: %read_outputs_13, %write_outputs_14 = taskflow.task @Task_6 read_memrefs(%write_outputs_11 : memref<1x64x8x8xf32>) write_memrefs(%alloc_12 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_9 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 8 { @@ -626,10 +626,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg2 : memref<1x8x8x64xf32>) +// STREAM-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> -// STREAM-NEXT: %write_outputs_13 = taskflow.task @Task_7 write_memrefs(%alloc_12 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_12 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// STREAM-NEXT: %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> +// STREAM-NEXT: %write_outputs_16 = taskflow.task 
@Task_7 write_memrefs(%alloc_15 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_15 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 10 { @@ -642,8 +642,8 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// STREAM-NEXT: %write_outputs_15 = taskflow.task @Task_8 write_memrefs(%alloc_14 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_14 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// STREAM-NEXT: %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> +// STREAM-NEXT: %write_outputs_18 = taskflow.task @Task_8 write_memrefs(%alloc_17 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 8 { @@ -656,7 +656,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %write_outputs_16 = taskflow.task @Task_9 read_memrefs(%write_outputs_13, %write_outputs_15 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_15 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_12, %alloc_14 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_14 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, 
memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// STREAM-NEXT: %read_outputs_19:2, %write_outputs_20 = taskflow.task @Task_9 read_memrefs(%write_outputs_16, %write_outputs_18 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_18 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_15, %alloc_17 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): // STREAM-NEXT: affine.for %arg5 = 0 to 1 { // STREAM-NEXT: affine.for %arg6 = 0 to 8 { @@ -677,10 +677,10 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg3 : memref<1x8x8x64xf32>) +// STREAM-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// STREAM-NEXT: %write_outputs_18 = taskflow.task @Task_10_Task_11_Task_12_fused_fused read_memrefs(%write_outputs_16, %arg0 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) write_memrefs(%alloc_17 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_14, %arg0 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_17 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>) { +// STREAM-NEXT: %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> +// STREAM-NEXT: %read_outputs_22:2, %write_outputs_23 = taskflow.task 
@Task_10_Task_11_Task_12_fused_fused read_memrefs(%write_outputs_20, %read_outputs : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) write_memrefs(%alloc_21 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_17, %arg0 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_21 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: memref<1x64x8x8xf32>, %arg4: f32, %arg5: f32): // STREAM-NEXT: affine.for %arg6 = 0 to 1 { // STREAM-NEXT: affine.for %arg7 = 0 to 64 { @@ -696,9 +696,9 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: } // STREAM-NEXT: } -// STREAM-NEXT: taskflow.yield writes(%arg3 : memref<1x64x8x8xf32>) +// STREAM-NEXT: taskflow.yield reads(%arg1, %arg2 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) writes(%arg3 : memref<1x64x8x8xf32>) // STREAM-NEXT: } -// STREAM-NEXT: return %write_outputs_18 : memref<1x64x8x8xf32> +// STREAM-NEXT: return %write_outputs_23 : memref<1x64x8x8xf32> // STREAM-NEXT: } // STREAM-NEXT: } diff --git a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir index ffc37f2d..938ba4dd 100644 --- a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir +++ b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir @@ -181,11 +181,11 @@ module { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @stereo_cost_computation -// TASKFLOW: %write_outputs:2 = taskflow.task @Task_0 +// TASKFLOW: %read_outputs:6, %write_outputs:2 = taskflow.task @Task_0 // TASKFLOW: affine.for %arg28 = 0 to 64 { // TASKFLOW: } // TASKFLOW: taskflow.yield -// TASKFLOW: %write_outputs_0 = taskflow.task @Task_1 +// TASKFLOW: %read_outputs_0, %write_outputs_1 = 
taskflow.task @Task_1 // TASKFLOW: affine.for %arg18 = 0 to 64 { // TASKFLOW: } // TASKFLOW: taskflow.yield From e97947ba24977426b0884587dd7be0590d0796fa Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 7 Mar 2026 00:16:34 +0800 Subject: [PATCH 5/7] rename dependency-related operand names --- include/TaskflowDialect/TaskflowOps.td | 25 +++---- .../AffineToTaskflow/AffineToTaskflowPass.cpp | 4 +- lib/TaskflowDialect/TaskflowOps.cpp | 52 ++++++++------- .../MemoryAccessStreamingFusion.cpp | 66 +++++++++---------- .../ResourceAwareTaskOptimizationPass.cpp | 48 +++++++------- .../TosaToTaskflow/affine-to-taskflow.mlir | 2 +- .../TosaToTaskflow/tosa-to-taskflow.mlir | 4 +- test/e2e/tosa_e2e.mlir | 4 +- test/multi-cgra/kernel_mapping/fir/fir.mlir | 12 ++-- .../loop-in-kernel/loop-in-kernel.mlir | 8 +-- test/multi-cgra/kernel_mapping/relu/relu.mlir | 24 +++---- .../irregular-loop/irregular-loop.mlir | 18 ++--- .../taskflow/multi-nested/multi-nested.mlir | 46 ++++++------- .../parallel-nested/parallel-nested.mlir | 8 +-- .../taskflow/resnet/simple_resnet_tosa.mlir | 50 +++++++------- .../resource-heavy/resource-heavy.mlir | 4 +- 16 files changed, 191 insertions(+), 184 deletions(-) diff --git a/include/TaskflowDialect/TaskflowOps.td b/include/TaskflowDialect/TaskflowOps.td index 3e252a5a..94eb199d 100644 --- a/include/TaskflowDialect/TaskflowOps.td +++ b/include/TaskflowDialect/TaskflowOps.td @@ -46,23 +46,24 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ Example: // Memory inputs: %mem, Value inputs: %val $out_mem, %out_val = taskflow.task "Task_0" - read_inputs(%mem : memref<4xi32>) + dependency_read_in(%mem : memref<4xi32>) + dependency_write_in(%arg5 : memref) value_inputs(%val : i32) - original_read_memrefs(%arg0 : memref) - original_write_memrefs(%arg5 : memref) { - ^bb0(%a0: memref<4xi32>, %a1: i32): + [original_read_memrefs(%arg0 : memref), + original_write_memrefs(%arg5 : memref)] { + ^bb0(%a0: memref<4xi32>, %a1: memref, %a2: i32): affine.for %i = 0 to 4 { 
%v = affine.load %a0[%i] : memref<4xi32> - %sum = arith.addi %v, %a1 : i32 - affine.store %sum, %a0[%i] : memref<4xi32> + %sum = arith.addi %v, %a2 : i32 + affine.store %sum, %a1[%i] : memref } - taskflow.yield memory_outputs(%a0 : memref<4xi32>) value_outputs(%a1 : i32) - } : (memref<4xi32>, i32) -> (memref<4xi32>, i32) + taskflow.yield reads(%a0 : memref<4xi32>) writes(%a1 : memref) values(%a2 : i32) + } : (memref<4xi32>, memref, i32) -> (memref<4xi32>, memref, i32) }]; let arguments = (ins - Variadic:$read_memrefs, - Variadic:$write_memrefs, + Variadic:$dependency_read_in, + Variadic:$dependency_write_in, Variadic:$value_inputs, StrAttr:$task_name, Variadic:$original_read_memrefs, @@ -70,8 +71,8 @@ def TaskflowTaskOp : TaskflowOpBase<"task", [ ); let results = (outs - Variadic:$read_outputs, - Variadic:$write_outputs, + Variadic:$dependency_read_out, + Variadic:$dependency_write_out, Variadic:$value_outputs ); diff --git a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp index 5979edd0..98b87fad 100644 --- a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp +++ b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp @@ -338,7 +338,7 @@ static TaskflowTaskOp convertLoopToTask( // Read outputs: establishes WAR dependency chain. // Only update mapping for memrefs not already mapped by a prior write. for (auto [memref, task_read_output] : - llvm::zip(read_memrefs, task_op.getReadOutputs())) { + llvm::zip(read_memrefs, task_op.getDependencyReadOut())) { if (!value_mapping.count(memref)) { value_mapping[memref] = task_read_output; } @@ -347,7 +347,7 @@ static TaskflowTaskOp convertLoopToTask( // Memory outputs (write): establishes RAW/WAW dependency chain. // Write outputs always overwrite read outputs in the mapping. 
for (auto [memref, task_output] : - llvm::zip(output_memrefs, task_op.getWriteOutputs())) { + llvm::zip(output_memrefs, task_op.getDependencyWriteOut())) { value_mapping[memref] = task_output; } diff --git a/lib/TaskflowDialect/TaskflowOps.cpp b/lib/TaskflowDialect/TaskflowOps.cpp index 381ea0ff..8d134651 100644 --- a/lib/TaskflowDialect/TaskflowOps.cpp +++ b/lib/TaskflowDialect/TaskflowOps.cpp @@ -19,20 +19,20 @@ ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { result.addAttribute("task_name", task_name); } - // Parses read_memrefs: read_memrefs(%arg0, %arg1 : memref, + // Parses dependency_read_in: dependency_read_in(%arg0, %arg1 : memref, // memref). SmallVector read_operands; SmallVector read_types; - if (succeeded(parser.parseOptionalKeyword("read_memrefs"))) { + if (succeeded(parser.parseOptionalKeyword("dependency_read_in"))) { if (parser.parseLParen() || parser.parseOperandList(read_operands) || parser.parseColonTypeList(read_types) || parser.parseRParen()) return failure(); } - // Parses write_memrefs: write_memrefs(%arg5 : memref). + // Parses dependency_write_in: dependency_write_in(%arg5 : memref). SmallVector write_operands; SmallVector write_types; - if (succeeded(parser.parseOptionalKeyword("write_memrefs"))) { + if (succeeded(parser.parseOptionalKeyword("dependency_write_in"))) { if (parser.parseLParen() || parser.parseOperandList(write_operands) || parser.parseColonTypeList(write_types) || parser.parseRParen()) return failure(); @@ -132,7 +132,8 @@ ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { static_cast(original_write_operands.size())})); // Adds result segment sizes. - // read_outputs count matches read_memrefs count (WAR dependency tracking). + // dependency_read_out count matches dependency_read_in count (WAR dependency + // tracking). 
size_t num_read_outputs = read_operands.size(); size_t num_write_outputs = 0; size_t num_value_outputs = 0; @@ -142,7 +143,8 @@ ParseResult TaskflowTaskOp::parse(OpAsmParser &parser, OperationState &result) { else num_value_outputs++; } - // Total memref results include both read_outputs and write_outputs. + // Total memref results include both dependency_read_out and + // dependency_write_out. num_write_outputs = num_write_outputs - num_read_outputs; result.addAttribute("resultSegmentSizes", parser.getBuilder().getDenseI32ArrayAttr( @@ -157,21 +159,21 @@ void TaskflowTaskOp::print(OpAsmPrinter &printer) { // Prints task name. printer << " @" << getTaskName(); - // Prints read_memrefs. - if (!getReadMemrefs().empty()) { - printer << " read_memrefs("; - llvm::interleaveComma(getReadMemrefs(), printer); + // Prints dependency_read_in. + if (!getDependencyReadIn().empty()) { + printer << " dependency_read_in("; + llvm::interleaveComma(getDependencyReadIn(), printer); printer << " : "; - llvm::interleaveComma(getReadMemrefs().getTypes(), printer); + llvm::interleaveComma(getDependencyReadIn().getTypes(), printer); printer << ")"; } - // Prints write_memrefs. - if (!getWriteMemrefs().empty()) { - printer << " write_memrefs("; - llvm::interleaveComma(getWriteMemrefs(), printer); + // Prints dependency_write_in. + if (!getDependencyWriteIn().empty()) { + printer << " dependency_write_in("; + llvm::interleaveComma(getDependencyWriteIn(), printer); printer << " : "; - llvm::interleaveComma(getWriteMemrefs().getTypes(), printer); + llvm::interleaveComma(getDependencyWriteIn().getTypes(), printer); printer << ")"; } @@ -218,15 +220,17 @@ void TaskflowTaskOp::print(OpAsmPrinter &printer) { // Prints function type. 
printer << " : ("; - llvm::interleaveComma(llvm::concat(getReadMemrefs().getTypes(), - getWriteMemrefs().getTypes(), - getValueInputs().getTypes()), - printer); + llvm::interleaveComma( + llvm::concat(getDependencyReadIn().getTypes(), + getDependencyWriteIn().getTypes(), + getValueInputs().getTypes()), + printer); printer << ") -> ("; - llvm::interleaveComma(llvm::concat(getReadOutputs().getTypes(), - getWriteOutputs().getTypes(), - getValueOutputs().getTypes()), - printer); + llvm::interleaveComma( + llvm::concat(getDependencyReadOut().getTypes(), + getDependencyWriteOut().getTypes(), + getValueOutputs().getTypes()), + printer); printer << ")"; // Prints region. diff --git a/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp b/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp index 8c39278c..ffc95808 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/MemoryAccessStreamingFusion.cpp @@ -105,12 +105,12 @@ class MemoryDependencyAnalysis { void extractMemrefAccesses(taskflow::TaskflowTaskOp task_op, TaskInfo &task_info) { // Extracts read memrefs from the task operands. - for (Value memref : task_op.getReadMemrefs()) { + for (Value memref : task_op.getDependencyReadIn()) { task_info.read_memrefs.push_back(memref); } // Extracts write memrefs from the task operands. - for (Value memref : task_op.getWriteMemrefs()) { + for (Value memref : task_op.getDependencyWriteIn()) { task_info.write_memrefs.push_back(memref); } @@ -146,10 +146,10 @@ class MemoryDependencyAnalysis { for (auto task_op : tasks) { auto &task_info = task_map[task_op.getOperation()]; // Map read_outputs for WAR dependency tracking. 
- for (Value ro : task_op.getReadOutputs()) { + for (Value ro : task_op.getDependencyReadOut()) { write_output_to_producer[ro] = &task_info; } - for (Value wo : task_op.getWriteOutputs()) { + for (Value wo : task_op.getDependencyWriteOut()) { write_output_to_producer[wo] = &task_info; } } @@ -178,12 +178,12 @@ class MemoryDependencyAnalysis { }; // RAW: read_memrefs consuming a write_outputs value. - for (Value operand : task_op.getReadMemrefs()) { + for (Value operand : task_op.getDependencyReadIn()) { addDependencyIfProduced(operand); } // WAW/WAR: write_memrefs consuming a write_outputs value. - for (Value operand : task_op.getWriteMemrefs()) { + for (Value operand : task_op.getDependencyWriteIn()) { addDependencyIfProduced(operand); } } @@ -443,7 +443,7 @@ class TaskFuser { for (Value v : fused_read_memrefs) { read_output_types.push_back(v.getType()); } - for (Value v : reader_op.getWriteOutputs()) { + for (Value v : reader_op.getDependencyWriteOut()) { write_output_types.push_back(v.getType()); } for (Value v : reader_op.getValueOutputs()) { @@ -492,7 +492,7 @@ class TaskFuser { // read_memrefs = writer.reads ∪ reader.reads - intermediate DenseSet seen; - auto writer_reads = writer_op.getReadMemrefs(); + auto writer_reads = writer_op.getDependencyReadIn(); auto writer_orig_reads = writer_op.getOriginalReadMemrefs(); for (unsigned i = 0; i < writer_reads.size(); ++i) { Value orig = (i < writer_orig_reads.size()) ? writer_orig_reads[i] @@ -502,7 +502,7 @@ class TaskFuser { } } - auto reader_reads = reader_op.getReadMemrefs(); + auto reader_reads = reader_op.getDependencyReadIn(); auto reader_orig_reads = reader_op.getOriginalReadMemrefs(); for (unsigned i = 0; i < reader_reads.size(); ++i) { Value orig = (i < reader_orig_reads.size()) ? 
reader_orig_reads[i] @@ -514,7 +514,7 @@ class TaskFuser { // write_memrefs = reader.writes ∪ (writer.writes - intermediate) seen.clear(); - auto reader_writes = reader_op.getWriteMemrefs(); + auto reader_writes = reader_op.getDependencyWriteIn(); auto reader_orig_writes = reader_op.getOriginalWriteMemrefs(); for (unsigned i = 0; i < reader_writes.size(); ++i) { Value orig = (i < reader_orig_writes.size()) ? reader_orig_writes[i] @@ -524,7 +524,7 @@ class TaskFuser { } } - auto writer_writes = writer_op.getWriteMemrefs(); + auto writer_writes = writer_op.getDependencyWriteIn(); auto writer_orig_writes = writer_op.getOriginalWriteMemrefs(); for (unsigned i = 0; i < writer_writes.size(); ++i) { Value orig = (i < writer_orig_writes.size()) ? writer_orig_writes[i] @@ -739,8 +739,8 @@ class TaskFuser { // contain SSA results, not the raw alloc. if (auto block_arg = dyn_cast(load_memref)) { unsigned arg_num = block_arg.getArgNumber(); - unsigned total_reads = reader_op.getReadMemrefs().size(); - unsigned total_writes = reader_op.getWriteMemrefs().size(); + unsigned total_reads = reader_op.getDependencyReadIn().size(); + unsigned total_writes = reader_op.getDependencyWriteIn().size(); if (arg_num < total_reads) { // Use original_read_memrefs to check against intermediate. @@ -826,8 +826,8 @@ class TaskFuser { ArrayRef fused_values) { unsigned orig_arg_idx = 0; - unsigned num_reads = task_op.getReadMemrefs().size(); - unsigned num_writes = task_op.getWriteMemrefs().size(); + unsigned num_reads = task_op.getDependencyReadIn().size(); + unsigned num_writes = task_op.getDependencyWriteIn().size(); unsigned num_values = task_op.getValueInputs().size(); // Maps read_memrefs block args. @@ -836,14 +836,14 @@ class TaskFuser { auto orig_reads = task_op.getOriginalReadMemrefs(); for (unsigned i = 0; i < num_reads; ++i) { Value orig_memref = - (i < orig_reads.size()) ? orig_reads[i] : task_op.getReadMemrefs()[i]; + (i < orig_reads.size()) ? 
orig_reads[i] : task_op.getDependencyReadIn()[i]; if (orig_memref == intermediate) { // Intermediate memref — no corresponding fused arg. Skip. orig_arg_idx++; continue; } // Finds the fused block arg for this outer memref. - Value outer_memref = task_op.getReadMemrefs()[i]; + Value outer_memref = task_op.getDependencyReadIn()[i]; int fused_idx = findInFusedArgs(outer_memref, fused_reads, fused_writes, fused_values); if (fused_idx >= 0) { @@ -858,12 +858,12 @@ class TaskFuser { for (unsigned i = 0; i < num_writes; ++i) { Value orig_memref = (i < orig_writes.size()) ? orig_writes[i] - : task_op.getWriteMemrefs()[i]; + : task_op.getDependencyWriteIn()[i]; if (orig_memref == intermediate) { orig_arg_idx++; continue; } - Value outer_memref = task_op.getWriteMemrefs()[i]; + Value outer_memref = task_op.getDependencyWriteIn()[i]; int fused_idx = findInFusedArgs(outer_memref, fused_reads, fused_writes, fused_values); if (fused_idx >= 0) { @@ -940,8 +940,8 @@ class TaskFuser { // Helper: finds the index of an outer memref in fused_reads. auto findInFusedReads = [&](Value outer_memref) -> int { - for (unsigned i = 0; i < fused_task.getReadMemrefs().size(); ++i) { - if (fused_task.getReadMemrefs()[i] == outer_memref) + for (unsigned i = 0; i < fused_task.getDependencyReadIn().size(); ++i) { + if (fused_task.getDependencyReadIn()[i] == outer_memref) return i; } return -1; @@ -950,34 +950,34 @@ class TaskFuser { // Replaces writer's read_outputs: map each to the fused task's // corresponding read_output by finding the writer's read_memref // in the fused task's read_memrefs. 
- for (unsigned i = 0; i < writer_op.getReadOutputs().size(); ++i) { - Value writer_read_input = writer_op.getReadMemrefs()[i]; + for (unsigned i = 0; i < writer_op.getDependencyReadOut().size(); ++i) { + Value writer_read_input = writer_op.getDependencyReadIn()[i]; int fused_idx = findInFusedReads(writer_read_input); if (fused_idx >= 0) { - writer_op.getReadOutputs()[i].replaceAllUsesWith( - fused_task.getReadOutputs()[fused_idx]); + writer_op.getDependencyReadOut()[i].replaceAllUsesWith( + fused_task.getDependencyReadOut()[fused_idx]); } } // Replaces reader's read_outputs: skip intermediate, map others. - for (unsigned i = 0; i < reader_op.getReadOutputs().size(); ++i) { + for (unsigned i = 0; i < reader_op.getDependencyReadOut().size(); ++i) { Value orig = (i < reader_op.getOriginalReadMemrefs().size()) ? reader_op.getOriginalReadMemrefs()[i] - : reader_op.getReadMemrefs()[i]; + : reader_op.getDependencyReadIn()[i]; if (orig == intermediate) continue; // Intermediate read_output is dead after fusion. - Value reader_read_input = reader_op.getReadMemrefs()[i]; + Value reader_read_input = reader_op.getDependencyReadIn()[i]; int fused_idx = findInFusedReads(reader_read_input); if (fused_idx >= 0) { - reader_op.getReadOutputs()[i].replaceAllUsesWith( - fused_task.getReadOutputs()[fused_idx]); + reader_op.getDependencyReadOut()[i].replaceAllUsesWith( + fused_task.getDependencyReadOut()[fused_idx]); } } // Replaces reader's write_outputs with fused task's write_outputs. - for (unsigned i = 0; i < reader_op.getWriteOutputs().size(); ++i) { - reader_op.getWriteOutputs()[i].replaceAllUsesWith( - fused_task.getWriteOutputs()[i]); + for (unsigned i = 0; i < reader_op.getDependencyWriteOut().size(); ++i) { + reader_op.getDependencyWriteOut()[i].replaceAllUsesWith( + fused_task.getDependencyWriteOut()[i]); } // Replaces reader's value_outputs with fused task's value_outputs. 
diff --git a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp index 17e341fe..52f14f54 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/ResourceAwareTaskOptimizationPass.cpp @@ -283,7 +283,7 @@ class TaskDependencyGraph { // 3. Builds memory edges. for (auto &consumer : nodes) { // RAW: producer wrote a memref that this task reads. - for (Value memref : consumer->op.getReadMemrefs()) { + for (Value memref : consumer->op.getDependencyReadIn()) { if (auto producer_op = memref.getDefiningOp()) { if (auto *producer = op_to_node[producer_op.getOperation()]) { addEdge(producer, consumer.get()); @@ -291,7 +291,7 @@ class TaskDependencyGraph { } } // WAW/WAR: producer wrote or read a memref that this task writes. - for (Value memref : consumer->op.getWriteMemrefs()) { + for (Value memref : consumer->op.getDependencyWriteIn()) { if (auto producer_op = memref.getDefiningOp()) { if (auto *producer = op_to_node[producer_op.getOperation()]) { addEdge(producer, consumer.get()); @@ -1198,11 +1198,11 @@ class UtilizationFuser { } } }; - updateLatest(task_a.getReadMemrefs()); - updateLatest(task_a.getWriteMemrefs()); + updateLatest(task_a.getDependencyReadIn()); + updateLatest(task_a.getDependencyWriteIn()); updateLatest(task_a.getValueInputs()); - updateLatest(task_b.getReadMemrefs()); - updateLatest(task_b.getWriteMemrefs()); + updateLatest(task_b.getDependencyReadIn()); + updateLatest(task_b.getDependencyWriteIn()); updateLatest(task_b.getValueInputs()); // Inserts right after the latest operand definition. 
@@ -1225,10 +1225,10 @@ class UtilizationFuser { } }; - addUnique(merged_read_memrefs, task_a.getReadMemrefs()); - addUnique(merged_read_memrefs, task_b.getReadMemrefs()); - addUnique(merged_write_memrefs, task_a.getWriteMemrefs()); - addUnique(merged_write_memrefs, task_b.getWriteMemrefs()); + addUnique(merged_read_memrefs, task_a.getDependencyReadIn()); + addUnique(merged_read_memrefs, task_b.getDependencyReadIn()); + addUnique(merged_write_memrefs, task_a.getDependencyWriteIn()); + addUnique(merged_write_memrefs, task_b.getDependencyWriteIn()); addUnique(merged_value_inputs, task_a.getValueInputs()); addUnique(merged_value_inputs, task_b.getValueInputs()); addUnique(merged_original_read_memrefs, task_a.getOriginalReadMemrefs()); @@ -1274,11 +1274,11 @@ class UtilizationFuser { Region &fused_region, IRMapping &mapping) { Block &src_entry = orig_task.getBody().front(); unsigned src_idx = 0; - unsigned read_count = orig_task.getReadMemrefs().size(); - unsigned write_count = orig_task.getWriteMemrefs().size(); + unsigned read_count = orig_task.getDependencyReadIn().size(); + unsigned write_count = orig_task.getDependencyWriteIn().size(); for (unsigned i = 0; i < read_count; ++i) { - Value orig_memref = orig_task.getReadMemrefs()[i]; + Value orig_memref = orig_task.getDependencyReadIn()[i]; auto it = llvm::find(merged_read_memrefs, orig_memref); assert(it != merged_read_memrefs.end()); unsigned fused_idx = std::distance(merged_read_memrefs.begin(), it); @@ -1288,7 +1288,7 @@ class UtilizationFuser { src_idx += read_count; for (unsigned i = 0; i < write_count; ++i) { - Value orig_memref = orig_task.getWriteMemrefs()[i]; + Value orig_memref = orig_task.getDependencyWriteIn()[i]; auto it = llvm::find(merged_write_memrefs, orig_memref); assert(it != merged_write_memrefs.end()); unsigned fused_idx = merged_read_memrefs.size() + @@ -1560,7 +1560,7 @@ class UtilizationFuser { for (Value result : task->getResults()) { if (!result.use_empty()) { llvm::errs() << 
"[performFusion] ERROR: " << label << " result #" - << result.cast().getResultNumber() + << cast(result).getResultNumber() << " still has uses:\n"; for (auto &use : result.getUses()) { llvm::errs() << " used by: "; @@ -1596,19 +1596,21 @@ class UtilizationFuser { unsigned value_output_offset) { // Read outputs: maps by matching the original read memref to its // position in the merged read memrefs list. - for (unsigned i = 0; i < orig_task.getReadOutputs().size(); ++i) { - Value orig_result = orig_task.getReadOutputs()[i]; - Value orig_read = orig_task.getReadMemrefs()[i]; + for (unsigned i = 0; i < orig_task.getDependencyReadOut().size(); ++i) { + Value orig_result = orig_task.getDependencyReadOut()[i]; + Value orig_read = orig_task.getDependencyReadIn()[i]; unsigned fused_idx = findOperandIndex(merged_read_memrefs, orig_read); - orig_result.replaceAllUsesWith(fused_task.getReadOutputs()[fused_idx]); + orig_result.replaceAllUsesWith( + fused_task.getDependencyReadOut()[fused_idx]); } // Writes outputs: maps by matching the original write memref to its // position in the merged write memrefs list. - for (unsigned i = 0; i < orig_task.getWriteOutputs().size(); ++i) { - Value orig_result = orig_task.getWriteOutputs()[i]; - Value orig_write = orig_task.getWriteMemrefs()[i]; + for (unsigned i = 0; i < orig_task.getDependencyWriteOut().size(); ++i) { + Value orig_result = orig_task.getDependencyWriteOut()[i]; + Value orig_write = orig_task.getDependencyWriteIn()[i]; unsigned fused_idx = findOperandIndex(merged_write_memrefs, orig_write); - orig_result.replaceAllUsesWith(fused_task.getWriteOutputs()[fused_idx]); + orig_result.replaceAllUsesWith( + fused_task.getDependencyWriteOut()[fused_idx]); } // Value outputs: each original task's value_output[i] maps to // fused_task.getValueOutputs()[value_output_offset + i]. 
diff --git a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir index 15f71a8b..2baaf53f 100644 --- a/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/affine-to-taskflow.mlir @@ -16,7 +16,7 @@ module { } // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>, %arg2: memref<16xf32>) { -// CHECK-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%arg2 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { +// CHECK-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) dependency_write_in(%arg2 : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%arg2 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: memref<16xf32>, %arg5: memref<16xf32>): // CHECK-NEXT: affine.for %arg6 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg3[%arg6] : memref<16xf32> diff --git a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir index 84e1cea9..fb569bb5 100644 --- a/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir +++ b/test/Conversion/TosaToTaskflow/tosa-to-taskflow.mlir @@ -11,7 +11,7 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 // CHECK: func.func @simple_add(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: 
%read_outputs:2, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { +// CHECK-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) dependency_write_in(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> @@ -21,5 +21,5 @@ func.func @simple_add(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16 // CHECK-NEXT: } // CHECK-NEXT: taskflow.yield reads(%arg2, %arg3 : memref<16xf32>, memref<16xf32>) writes(%arg4 : memref<16xf32>) // CHECK-NEXT: } -// CHECK-NEXT: return %write_outputs : memref<16xf32> +// CHECK-NEXT: return %dependency_write_out : memref<16xf32> // CHECK-NEXT: } diff --git a/test/e2e/tosa_e2e.mlir b/test/e2e/tosa_e2e.mlir index 5ef3babd..5af7a2c8 100644 --- a/test/e2e/tosa_e2e.mlir +++ b/test/e2e/tosa_e2e.mlir @@ -11,7 +11,7 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf // CHECK: func.func @test_e2e(%arg0: memref<16xf32>, %arg1: memref<16xf32>) -> memref<16xf32> { // CHECK-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<16xf32> -// CHECK-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) write_memrefs(%alloc : memref<16xf32>) 
[original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { +// CHECK-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref<16xf32>, memref<16xf32>) dependency_write_in(%alloc : memref<16xf32>) [original_read_memrefs(%arg0, %arg1 : memref<16xf32>, memref<16xf32>), original_write_memrefs(%alloc : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, memref<16xf32>) -> (memref<16xf32>, memref<16xf32>, memref<16xf32>) { // CHECK-NEXT: ^bb0(%arg2: memref<16xf32>, %arg3: memref<16xf32>, %arg4: memref<16xf32>): // CHECK-NEXT: affine.for %arg5 = 0 to 16 { // CHECK-NEXT: %0 = affine.load %arg2[%arg5] : memref<16xf32> @@ -22,5 +22,5 @@ func.func @test_e2e(%arg0: tensor<16xf32>, %arg1: tensor<16xf32>) -> tensor<16xf // CHECK-NEXT: } // CHECK-NEXT: taskflow.yield reads(%arg2, %arg3 : memref<16xf32>, memref<16xf32>) writes(%arg4 : memref<16xf32>) // CHECK-NEXT: } -// CHECK-NEXT: return %write_outputs : memref<16xf32> +// CHECK-NEXT: return %dependency_write_out : memref<16xf32> // CHECK-NEXT: } diff --git a/test/multi-cgra/kernel_mapping/fir/fir.mlir b/test/multi-cgra/kernel_mapping/fir/fir.mlir index f4122392..69d228c7 100644 --- a/test/multi-cgra/kernel_mapping/fir/fir.mlir +++ b/test/multi-cgra/kernel_mapping/fir/fir.mlir @@ -92,7 +92,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { +// TASKFLOW-NEXT: 
%dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // TASKFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // TASKFLOW-NEXT: %0 = affine.for %arg6 = 0 to 32 iter_args(%arg7 = %arg5) -> (i32) { // TASKFLOW-NEXT: %1 = affine.load %arg3[%arg6] : memref @@ -110,7 +110,7 @@ module attributes {} { // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 -// HYPERBLOCK-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { +// HYPERBLOCK-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // HYPERBLOCK-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // HYPERBLOCK-NEXT: %1 = "taskflow.hyperblock"(%0, %arg5) <{operandSegmentSizes = array}> ({ @@ -130,7 +130,7 @@ module attributes {} { // KERNEL: module { // KERNEL-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 -// KERNEL-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, 
memref, i32) -> (memref, memref, i32) { +// KERNEL-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // KERNEL-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // KERNEL-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) { @@ -151,7 +151,7 @@ module attributes {} { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { +// NEURA-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // NEURA-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura"} { @@ -172,7 +172,7 @@ module attributes {} { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: 
%c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { +// DATAFLOW-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // DATAFLOW-NEXT: %1 = neura.kernel inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { @@ -201,7 +201,7 @@ module attributes {} { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { +// MAPPED-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_0 dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg2 : memref, memref)] : (memref, memref, i32) -> (memref, memref, i32) { // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // MAPPED-NEXT: %1 = neura.kernel 
inputs(%arg3, %arg4 : memref, memref) iter_args_init(%arg5 : i32) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 2 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { diff --git a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir index cfcae914..d6124058 100644 --- a/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir +++ b/test/multi-cgra/kernel_mapping/loop-in-kernel/loop-in-kernel.mlir @@ -55,7 +55,7 @@ module { func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { %c0_i32 = arith.constant 0 : i32 - %read_outputs:2, %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { + %dependency_read_out:2, %value_outputs = taskflow.task @Task_o dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): %1 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) { ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -77,7 +77,7 @@ module { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { +// NEURA-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_o dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { // 
NEURA-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura"} { // NEURA-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -111,7 +111,7 @@ module { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { +// DATAFLOW-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_o dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { // DATAFLOW-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = "neura", dataflow_mode = "predicate"} { // DATAFLOW-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): @@ -150,7 +150,7 @@ module { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_S_(%arg0: memref, %arg1: memref, %arg2: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %read_outputs:2, %value_outputs = taskflow.task @Task_o read_memrefs(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { +// MAPPED-NEXT: %dependency_read_out:2, %value_outputs = taskflow.task @Task_o dependency_read_in(%arg0, %arg2 : memref, memref) value_inputs(%c0_i32 : i32) : (memref, memref, i32) -> (memref, memref, i32) { // MAPPED-NEXT: ^bb0(%arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = neura.kernel inputs(%arg3, %arg4, %arg5 : memref, memref, i32) attributes {accelerator = 
"neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { // MAPPED-NEXT: ^bb0(%arg6: memref, %arg7: memref, %arg8: i32): diff --git a/test/multi-cgra/kernel_mapping/relu/relu.mlir b/test/multi-cgra/kernel_mapping/relu/relu.mlir index 309c8512..3fcc12be 100644 --- a/test/multi-cgra/kernel_mapping/relu/relu.mlir +++ b/test/multi-cgra/kernel_mapping/relu/relu.mlir @@ -96,7 +96,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // TASKFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref) { +// TASKFLOW-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref, memref) dependency_write_in(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref, memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // TASKFLOW-NEXT: affine.for %arg6 = 0 to 32 { // TASKFLOW-NEXT: %0 = affine.load %arg2[%arg6] : memref @@ -111,7 +111,7 @@ module attributes {} { // TASKFLOW-NEXT: affine.store %2, %arg4[%arg6] : memref // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: taskflow.yield writes(%arg4 : memref) +// TASKFLOW-NEXT: taskflow.yield reads(%arg2, %arg4 : memref, memref) writes(%arg4 : memref) // TASKFLOW-NEXT: } // TASKFLOW-NEXT: return // TASKFLOW-NEXT: } @@ -120,7 +120,7 @@ module 
attributes {} { // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // HYPERBLOCK-NEXT: %c0_i32 = arith.constant 0 : i32 -// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref) { +// HYPERBLOCK-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref, memref) dependency_write_in(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref, memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // HYPERBLOCK-NEXT: "taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ @@ -138,7 +138,7 @@ module attributes {} { // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: taskflow.hyperblock.yield // HYPERBLOCK-NEXT: }) : (index) -> () -// HYPERBLOCK-NEXT: taskflow.yield writes(%arg4 : memref) +// HYPERBLOCK-NEXT: taskflow.yield reads(%arg2, %arg4 : memref, memref) writes(%arg4 : memref) // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: return // HYPERBLOCK-NEXT: } @@ -147,7 +147,7 @@ module attributes {} { // KERNEL: module { // KERNEL-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // KERNEL-NEXT: %c0_i32 = arith.constant 0 : i32 -// KERNEL-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), 
original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref) { +// KERNEL-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref, memref) dependency_write_in(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref, memref, memref) { // KERNEL-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // KERNEL-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) { @@ -166,7 +166,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: neura.yield // KERNEL-NEXT: } -// KERNEL-NEXT: taskflow.yield writes(%arg4 : memref) +// KERNEL-NEXT: taskflow.yield reads(%arg2, %arg4 : memref, memref) writes(%arg4 : memref) // KERNEL-NEXT: } // KERNEL-NEXT: return // KERNEL-NEXT: } @@ -175,7 +175,7 @@ module attributes {} { // NEURA: module { // NEURA-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // NEURA-NEXT: %c0_i32 = arith.constant 0 : i32 -// NEURA-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref) { +// NEURA-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref, memref) dependency_write_in(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref, memref, memref) { // NEURA-NEXT: ^bb0(%arg2: 
memref, %arg3: memref, %arg4: memref, %arg5: i32): // NEURA-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // NEURA-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura"} { @@ -197,7 +197,7 @@ module attributes {} { // NEURA-NEXT: ^bb3: // 2 preds: ^bb1, ^bb2 // NEURA-NEXT: neura.yield // NEURA-NEXT: } -// NEURA-NEXT: taskflow.yield writes(%arg4 : memref) +// NEURA-NEXT: taskflow.yield reads(%arg2, %arg4 : memref, memref) writes(%arg4 : memref) // NEURA-NEXT: } // NEURA-NEXT: return // NEURA-NEXT: } @@ -206,7 +206,7 @@ module attributes {} { // DATAFLOW: module { // DATAFLOW-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // DATAFLOW-NEXT: %c0_i32 = arith.constant 0 : i32 -// DATAFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref) { +// DATAFLOW-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref, memref) dependency_write_in(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref, memref, memref) { // DATAFLOW-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // DATAFLOW-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // DATAFLOW-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate"} { @@ -225,7 +225,7 @@ module 
attributes {} { // DATAFLOW-NEXT: neura.store_indexed %10 to [%4 : !neura.data] {rhs_value = "%input2"} : !neura.data // DATAFLOW-NEXT: neura.yield {yield_type = "void"} // DATAFLOW-NEXT: } -// DATAFLOW-NEXT: taskflow.yield writes(%arg4 : memref) +// DATAFLOW-NEXT: taskflow.yield reads(%arg2, %arg4 : memref, memref) writes(%arg4 : memref) // DATAFLOW-NEXT: } // DATAFLOW-NEXT: return // DATAFLOW-NEXT: } @@ -234,7 +234,7 @@ module attributes {} { // MAPPED: module { // MAPPED-NEXT: func.func @_Z6kernelPiS_(%arg0: memref, %arg1: memref) attributes {llvm.linkage = #llvm.linkage} { // MAPPED-NEXT: %c0_i32 = arith.constant 0 : i32 -// MAPPED-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0, %arg1 : memref, memref) write_memrefs(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref) { +// MAPPED-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0, %arg1 : memref, memref) dependency_write_in(%arg1 : memref) value_inputs(%c0_i32 : i32) [original_read_memrefs(%arg0, %arg1 : memref, memref), original_write_memrefs(%arg1 : memref)] : (memref, memref, memref, i32) -> (memref, memref, memref) { // MAPPED-NEXT: ^bb0(%arg2: memref, %arg3: memref, %arg4: memref, %arg5: i32): // MAPPED-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "leaf", lower_bound = 0 : index, step = 1 : index, upper_bound = 32 : index} : index // MAPPED-NEXT: neura.kernel inputs(%arg2, %arg5, %arg4 : memref, i32, memref) attributes {accelerator = "neura", dataflow_mode = "predicate", mapping_info = {compiled_ii = 2 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 1 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { @@ -269,7 +269,7 @@ module attributes {} { // MAPPED-NEXT: neura.store_indexed %25 to [%26 : !neura.data] {dfg_id = 28 : i32, 
mapping_locs = [{id = 7 : i32, index_per_ii = 1 : i32, invalid_iterations = 3 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 1 : i32}], rhs_value = "%input2"} : !neura.data // MAPPED-NEXT: neura.yield {dfg_id = 1 : i32, yield_type = "void"} // MAPPED-NEXT: } -// MAPPED-NEXT: taskflow.yield writes(%arg4 : memref) +// MAPPED-NEXT: taskflow.yield reads(%arg2, %arg4 : memref, memref) writes(%arg4 : memref) // MAPPED-NEXT: } // MAPPED-NEXT: return // MAPPED-NEXT: } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 5c840ce5..ae8b8765 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -190,7 +190,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield values(%1 : i32) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// TASKFLOW-NEXT: %dependency_write_out = taskflow.task @Task_1 dependency_write_in(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { // TASKFLOW-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): // TASKFLOW-NEXT: affine.for %arg2 = 0 to 4 { // TASKFLOW-NEXT: %1 = arith.index_cast %arg2 : index to i32 @@ -203,7 +203,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %read_outputs, %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), 
original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { +// TASKFLOW-NEXT: %dependency_read_out, %dependency_write_out_1 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out : memref<4x8xi32>) dependency_write_in(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { // TASKFLOW-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): // TASKFLOW-NEXT: affine.for %arg5 = 0 to 4 { // TASKFLOW-NEXT: %1 = arith.index_cast %arg5 : index to i32 @@ -220,7 +220,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield reads(%arg0 : memref<4x8xi32>) writes(%arg1 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %0 = affine.load %write_outputs_1[] : memref +// TASKFLOW-NEXT: %0 = affine.load %dependency_write_out_1[] : memref // TASKFLOW-NEXT: return %0 : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } @@ -248,7 +248,7 @@ module attributes {} { // KERNEL-NEXT: } : i32 // KERNEL-NEXT: taskflow.yield values(%1 : i32) // KERNEL-NEXT: } -// KERNEL-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// KERNEL-NEXT: %dependency_write_out = taskflow.task @Task_1 dependency_write_in(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { // KERNEL-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): // KERNEL-NEXT: affine.for %arg2 = 0 to 4 { // KERNEL-NEXT: %1 = arith.index_cast %arg2 : index to i32 @@ -268,7 +268,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield writes(%arg0 : 
memref<4x8xi32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %read_outputs, %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { +// KERNEL-NEXT: %dependency_read_out, %dependency_write_out_1 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out : memref<4x8xi32>) dependency_write_in(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { // KERNEL-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): // KERNEL-NEXT: affine.for %arg5 = 0 to 4 { // KERNEL-NEXT: neura.kernel inputs(%arg0, %arg5, %arg3, %arg1, %arg4 : memref<4x8xi32>, index, i32, memref, i32) { @@ -297,7 +297,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield reads(%arg0 : memref<4x8xi32>) writes(%arg1 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %0 = affine.load %write_outputs_1[] : memref +// KERNEL-NEXT: %0 = affine.load %dependency_write_out_1[] : memref // KERNEL-NEXT: return %0 : i32 // KERNEL-NEXT: } // KERNEL-NEXT: } @@ -320,7 +320,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index, i32) -> i32 // HYPERBLOCK-NEXT: taskflow.yield values(%2 : i32) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %write_outputs = taskflow.task @Task_1 write_memrefs(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) [original_write_memrefs(%alloca_0 : memref<4x8xi32>)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { +// HYPERBLOCK-NEXT: %dependency_write_out = taskflow.task @Task_1 dependency_write_in(%alloca_0 : memref<4x8xi32>) value_inputs(%c8_i32 : i32) 
[original_write_memrefs(%alloca_0 : memref<4x8xi32>)] : (memref<4x8xi32>, i32) -> (memref<4x8xi32>) { // HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: i32): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ @@ -339,7 +339,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index) -> () // HYPERBLOCK-NEXT: taskflow.yield writes(%arg0 : memref<4x8xi32>) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %read_outputs, %write_outputs_1 = taskflow.task @Task_2 read_memrefs(%write_outputs : memref<4x8xi32>) write_memrefs(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { +// HYPERBLOCK-NEXT: %dependency_read_out, %dependency_write_out_1 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out : memref<4x8xi32>) dependency_write_in(%alloca : memref) value_inputs(%c8_i32, %value_outputs, %c2_i32 : i32, i32, i32) [original_read_memrefs(%alloca_0 : memref<4x8xi32>), original_write_memrefs(%alloca : memref)] : (memref<4x8xi32>, memref, i32, i32, i32) -> (memref<4x8xi32>, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg0: memref<4x8xi32>, %arg1: memref, %arg2: i32, %arg3: i32, %arg4: i32): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: "taskflow.hyperblock"(%1) <{operandSegmentSizes = array}> ({ @@ -370,7 +370,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index) -> () // HYPERBLOCK-NEXT: taskflow.yield reads(%arg0 : memref<4x8xi32>) writes(%arg1 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %0 = affine.load %write_outputs_1[] : memref +// HYPERBLOCK-NEXT: %0 = affine.load %dependency_write_out_1[] : memref 
// HYPERBLOCK-NEXT: return %0 : i32 // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT: } diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index 71127648..c40bf8cb 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -208,7 +208,7 @@ module attributes {} { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// TASKFLOW-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { +// TASKFLOW-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref) dependency_write_in(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref): // TASKFLOW-NEXT: affine.for %arg12 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg13 = 0 to 8 { @@ -220,7 +220,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { +// TASKFLOW-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, 
%arg2 : memref, memref) dependency_write_in(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // TASKFLOW-NEXT: affine.for %arg13 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg14 = 0 to 8 { @@ -234,7 +234,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %read_outputs_2:3, %write_outputs_3 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_1, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { +// TASKFLOW-NEXT: %dependency_read_out_2:3, %dependency_write_out_3 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out, %dependency_write_out_1, %arg9 : memref, memref, memref) dependency_write_in(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // TASKFLOW-NEXT: affine.for %arg14 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg15 = 0 to 8 { @@ -250,7 +250,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield reads(%arg10, %arg11, %arg13 : memref, memref, memref) writes(%arg13 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %read_outputs_4, %write_outputs_5 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { +// TASKFLOW-NEXT: 
%dependency_read_out_4, %dependency_write_out_5 = taskflow.task @Task_3 dependency_read_in(%arg3 : memref) dependency_write_in(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref): // TASKFLOW-NEXT: affine.for %arg12 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg13 = 0 to 7 { @@ -260,7 +260,7 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %read_outputs_6:2, %write_outputs_7 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_5 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { +// TASKFLOW-NEXT: %dependency_read_out_6:2, %dependency_write_out_7 = taskflow.task @Task_4 dependency_read_in(%arg4, %dependency_write_out_5 : memref, memref) dependency_write_in(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // TASKFLOW-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // TASKFLOW-NEXT: affine.for %arg13 = 0 to 4 { // TASKFLOW-NEXT: affine.for %arg14 = 0 to 9 { @@ -272,14 +272,14 @@ module attributes {} { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %0 = affine.load %write_outputs_3[0] : memref +// TASKFLOW-NEXT: %0 = affine.load %dependency_write_out_3[0] : memref // TASKFLOW-NEXT: return %0 : i32 // TASKFLOW-NEXT: } // TASKFLOW-NEXT: } // STREAM: module { // STREAM-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: 
memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// STREAM-NEXT: %read_outputs:2, %write_outputs = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { +// STREAM-NEXT: %dependency_read_out:2, %dependency_write_out = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref, memref) dependency_write_in(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // STREAM-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // STREAM-NEXT: affine.for %arg13 = 0 to 4 { // STREAM-NEXT: affine.for %arg14 = 0 to 8 { @@ -293,7 +293,7 @@ module attributes {} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // STREAM-NEXT: } -// STREAM-NEXT: %read_outputs_0:3, %write_outputs_1 = taskflow.task @Task_0_Task_2_fused read_memrefs(%arg0, %write_outputs, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg0, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { +// STREAM-NEXT: %dependency_read_out_0:3, %dependency_write_out_1 = taskflow.task @Task_0_Task_2_fused dependency_read_in(%arg0, %dependency_write_out, %arg9 : memref, memref, memref) dependency_write_in(%arg9 : memref) [original_read_memrefs(%arg0, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // STREAM-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // STREAM-NEXT: affine.for %arg14 = 0 to 4 { // STREAM-NEXT: 
affine.for %arg15 = 0 to 8 { @@ -309,7 +309,7 @@ module attributes {} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield reads(%arg10, %arg11, %arg12 : memref, memref, memref) writes(%arg12 : memref) // STREAM-NEXT: } -// STREAM-NEXT: %read_outputs_2, %write_outputs_3 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { +// STREAM-NEXT: %dependency_read_out_2, %dependency_write_out_3 = taskflow.task @Task_3 dependency_read_in(%arg3 : memref) dependency_write_in(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { // STREAM-NEXT: ^bb0(%arg10: memref, %arg11: memref): // STREAM-NEXT: affine.for %arg12 = 0 to 4 { // STREAM-NEXT: affine.for %arg13 = 0 to 7 { @@ -319,7 +319,7 @@ module attributes {} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // STREAM-NEXT: } -// STREAM-NEXT: %read_outputs_4:2, %write_outputs_5 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_3 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { +// STREAM-NEXT: %dependency_read_out_4:2, %dependency_write_out_5 = taskflow.task @Task_4 dependency_read_in(%arg4, %dependency_write_out_3 : memref, memref) dependency_write_in(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // STREAM-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // STREAM-NEXT: affine.for %arg13 = 0 to 4 { // STREAM-NEXT: affine.for %arg14 = 0 to 9 { @@ -331,14 +331,14 @@ module attributes {} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield reads(%arg10, %arg11 : 
memref, memref) writes(%arg12 : memref) // STREAM-NEXT: } -// STREAM-NEXT: %0 = affine.load %write_outputs_1[0] : memref +// STREAM-NEXT: %0 = affine.load %dependency_write_out_1[0] : memref // STREAM-NEXT: return %0 : i32 // STREAM-NEXT: } // STREAM-NEXT: } // KERNEL: module { // KERNEL-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// KERNEL-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { +// KERNEL-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref) dependency_write_in(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref): // KERNEL-NEXT: affine.for %arg12 = 0 to 4 { // KERNEL-NEXT: affine.for %arg13 = 0 to 8 { @@ -357,7 +357,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { +// KERNEL-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref, memref) dependency_write_in(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, 
memref) -> (memref, memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // KERNEL-NEXT: affine.for %arg13 = 0 to 4 { // KERNEL-NEXT: affine.for %arg14 = 0 to 8 { @@ -378,7 +378,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %read_outputs_2:3, %write_outputs_3 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_1, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { +// KERNEL-NEXT: %dependency_read_out_2:3, %dependency_write_out_3 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out, %dependency_write_out_1, %arg9 : memref, memref, memref) dependency_write_in(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // KERNEL-NEXT: affine.for %arg14 = 0 to 4 { // KERNEL-NEXT: affine.for %arg15 = 0 to 8 { @@ -401,7 +401,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield reads(%arg10, %arg11, %arg13 : memref, memref, memref) writes(%arg13 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %read_outputs_4, %write_outputs_5 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { +// KERNEL-NEXT: %dependency_read_out_4, %dependency_write_out_5 = taskflow.task @Task_3 dependency_read_in(%arg3 : memref) dependency_write_in(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : 
memref)] : (memref, memref) -> (memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref): // KERNEL-NEXT: affine.for %arg12 = 0 to 4 { // KERNEL-NEXT: neura.kernel inputs(%arg10, %arg12, %arg11 : memref, index, memref) { @@ -418,7 +418,7 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %read_outputs_6:2, %write_outputs_7 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_5 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { +// KERNEL-NEXT: %dependency_read_out_6:2, %dependency_write_out_7 = taskflow.task @Task_4 dependency_read_in(%arg4, %dependency_write_out_5 : memref, memref) dependency_write_in(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // KERNEL-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // KERNEL-NEXT: affine.for %arg13 = 0 to 4 { // KERNEL-NEXT: neura.kernel inputs(%arg10, %arg13, %arg11, %arg12 : memref, index, memref, memref) { @@ -437,14 +437,14 @@ module attributes {} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // KERNEL-NEXT: } -// KERNEL-NEXT: %0 = affine.load %write_outputs_3[0] : memref +// KERNEL-NEXT: %0 = affine.load %dependency_write_out_3[0] : memref // KERNEL-NEXT: return %0 : i32 // KERNEL-NEXT: } // KERNEL-NEXT:} // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { -// HYPERBLOCK-NEXT: 
%read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { +// HYPERBLOCK-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref) dependency_write_in(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -457,7 +457,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index, index, index) -> () // HYPERBLOCK-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref, memref) write_memrefs(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { +// HYPERBLOCK-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref, memref) dependency_write_in(%arg6 : memref) [original_read_memrefs(%arg1, %arg2 : memref, memref), original_write_memrefs(%arg6 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step 
= 1 : index, upper_bound = 8 : index} : index @@ -472,7 +472,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index, index, index) -> () // HYPERBLOCK-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %read_outputs_2:3, %write_outputs_3 = taskflow.task @Task_2 read_memrefs(%write_outputs, %write_outputs_1, %arg9 : memref, memref, memref) write_memrefs(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { +// HYPERBLOCK-NEXT: %dependency_read_out_2:3, %dependency_write_out_3 = taskflow.task @Task_2 dependency_read_in(%dependency_write_out, %dependency_write_out_1, %arg9 : memref, memref, memref) dependency_write_in(%arg9 : memref) [original_read_memrefs(%arg5, %arg6, %arg9 : memref, memref, memref), original_write_memrefs(%arg9 : memref)] : (memref, memref, memref, memref) -> (memref, memref, memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref, %arg13: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -491,7 +491,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index) -> () // HYPERBLOCK-NEXT: taskflow.yield reads(%arg10, %arg11, %arg13 : memref, memref, memref) writes(%arg13 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %read_outputs_4, %write_outputs_5 = taskflow.task @Task_3 read_memrefs(%arg3 : memref) write_memrefs(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { +// HYPERBLOCK-NEXT: %dependency_read_out_4, %dependency_write_out_5 = taskflow.task @Task_3 
dependency_read_in(%arg3 : memref) dependency_write_in(%arg7 : memref) [original_read_memrefs(%arg3 : memref), original_write_memrefs(%arg7 : memref)] : (memref, memref) -> (memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 7 : index} : index @@ -503,7 +503,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index, index) -> () // HYPERBLOCK-NEXT: taskflow.yield reads(%arg10 : memref) writes(%arg11 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %read_outputs_6:2, %write_outputs_7 = taskflow.task @Task_4 read_memrefs(%arg4, %write_outputs_5 : memref, memref) write_memrefs(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { +// HYPERBLOCK-NEXT: %dependency_read_out_6:2, %dependency_write_out_7 = taskflow.task @Task_4 dependency_read_in(%arg4, %dependency_write_out_5 : memref, memref) dependency_write_in(%arg8 : memref) [original_read_memrefs(%arg4, %arg7 : memref, memref), original_write_memrefs(%arg8 : memref)] : (memref, memref, memref) -> (memref, memref, memref) { // HYPERBLOCK-NEXT: ^bb0(%arg10: memref, %arg11: memref, %arg12: memref): // HYPERBLOCK-NEXT: %1 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 4 : index} : index // HYPERBLOCK-NEXT: %2 = taskflow.counter parent(%1 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 9 : index} : index @@ -517,7 +517,7 @@ module attributes {} { // HYPERBLOCK-NEXT: }) : (index, index) -> () // HYPERBLOCK-NEXT: taskflow.yield reads(%arg10, %arg11 : memref, memref) writes(%arg12 : memref) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %0 = affine.load 
%write_outputs_3[0] : memref +// HYPERBLOCK-NEXT: %0 = affine.load %dependency_write_out_3[0] : memref // HYPERBLOCK-NEXT: return %0 : i32 // HYPERBLOCK-NEXT: } // HYPERBLOCK-NEXT:} diff --git a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir index 6910c0ba..57deba29 100644 --- a/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir +++ b/test/multi-cgra/taskflow/parallel-nested/parallel-nested.mlir @@ -98,7 +98,7 @@ module { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// TASKFLOW-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>, memref<16xf32>) { +// TASKFLOW-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref<16xf32>) dependency_write_in(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>, memref<16xf32>) { // TASKFLOW-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: memref<16xf32>, %arg7: f32): // TASKFLOW-NEXT: affine.for %arg8 = 0 to 16 { // TASKFLOW-NEXT: %0 = affine.load %arg6[%arg8] : memref<16xf32> @@ -107,7 +107,7 @@ module { // TASKFLOW-NEXT: } // TASKFLOW-NEXT: taskflow.yield reads(%arg6 : memref<16xf32>) writes(%arg6 : memref<16xf32>) // TASKFLOW-NEXT: } -// TASKFLOW-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : 
memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) { +// TASKFLOW-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) dependency_write_in(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) { // TASKFLOW-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): // TASKFLOW-NEXT: affine.for %arg8 = 0 to 8 { // TASKFLOW-NEXT: affine.for %arg9 = 0 to 8 { @@ -125,7 +125,7 @@ module { // HYPERBLOCK: module { // HYPERBLOCK-NEXT: func.func @parallel_nested_example(%arg0: memref<16xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>, %arg4: f32) { -// HYPERBLOCK-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<16xf32>) write_memrefs(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>, memref<16xf32>) { +// HYPERBLOCK-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref<16xf32>) dependency_write_in(%arg0 : memref<16xf32>) value_inputs(%arg4 : f32) [original_read_memrefs(%arg0 : memref<16xf32>), original_write_memrefs(%arg0 : memref<16xf32>)] : (memref<16xf32>, memref<16xf32>, f32) -> (memref<16xf32>, memref<16xf32>) { // HYPERBLOCK-NEXT: ^bb0(%arg5: memref<16xf32>, %arg6: memref<16xf32>, %arg7: f32): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index // HYPERBLOCK-NEXT: 
"taskflow.hyperblock"(%0) <{operandSegmentSizes = array}> ({ @@ -137,7 +137,7 @@ module { // HYPERBLOCK-NEXT: }) : (index) -> () // HYPERBLOCK-NEXT: taskflow.yield reads(%arg6 : memref<16xf32>) writes(%arg6 : memref<16xf32>) // HYPERBLOCK-NEXT: } -// HYPERBLOCK-NEXT: %read_outputs_0:2, %write_outputs_1 = taskflow.task @Task_1 read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) write_memrefs(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) { +// HYPERBLOCK-NEXT: %dependency_read_out_0:2, %dependency_write_out_1 = taskflow.task @Task_1 dependency_read_in(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>) dependency_write_in(%arg3 : memref<8x8xf32>) [original_read_memrefs(%arg1, %arg2 : memref<8x8xf32>, memref<8x8xf32>), original_write_memrefs(%arg3 : memref<8x8xf32>)] : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) { // HYPERBLOCK-NEXT: ^bb0(%arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): // HYPERBLOCK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index // HYPERBLOCK-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index diff --git a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir index a0d5d463..80c571ee 100644 --- a/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir +++ b/test/multi-cgra/taskflow/resnet/simple_resnet_tosa.mlir @@ -249,7 +249,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 // KERNEL-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 // KERNEL-NEXT: %alloc = 
memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// KERNEL-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<1x64x8x8xf32>) write_memrefs(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref<1x64x8x8xf32>) dependency_write_in(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -268,7 +268,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> -// KERNEL-NEXT: %write_outputs_4 = taskflow.task @Task_1 write_memrefs(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// KERNEL-NEXT: %dependency_write_out_4 = taskflow.task @Task_1 dependency_write_in(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : 
memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index @@ -286,7 +286,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// KERNEL-NEXT: %write_outputs_6 = taskflow.task @Task_2 write_memrefs(%alloc_5 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %dependency_write_out_6 = taskflow.task @Task_2 dependency_write_in(%alloc_5 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -303,7 +303,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %read_outputs_7:2, %write_outputs_8 = taskflow.task @Task_3 
read_memrefs(%write_outputs_4, %write_outputs_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %dependency_read_out_7:2, %dependency_write_out_8 = taskflow.task @Task_3 dependency_read_in(%dependency_write_out_4, %dependency_write_out_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) dependency_write_in(%dependency_write_out_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -333,7 +333,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %read_outputs_10, %write_outputs_11 = taskflow.task @Task_4 read_memrefs(%write_outputs_8 : 
memref<1x8x8x64xf32>) write_memrefs(%alloc_9 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_9 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %dependency_read_out_10, %dependency_write_out_11 = taskflow.task @Task_4 dependency_read_in(%dependency_write_out_8 : memref<1x8x8x64xf32>) dependency_write_in(%alloc_9 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_9 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -352,7 +352,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x8x8x64xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %read_outputs_13, %write_outputs_14 = taskflow.task @Task_5 read_memrefs(%write_outputs_11 : memref<1x64x8x8xf32>) write_memrefs(%alloc_12 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_9 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %dependency_read_out_13, %dependency_write_out_14 = taskflow.task @Task_5 
dependency_read_in(%dependency_write_out_11 : memref<1x64x8x8xf32>) dependency_write_in(%alloc_12 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_9 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -373,7 +373,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// KERNEL-NEXT: %read_outputs_16, %write_outputs_17 = taskflow.task @Task_6 read_memrefs(%write_outputs_14 : memref<1x64x8x8xf32>) write_memrefs(%alloc_15 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_12 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_15 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %dependency_read_out_16, %dependency_write_out_17 = taskflow.task @Task_6 dependency_read_in(%dependency_write_out_14 : memref<1x64x8x8xf32>) dependency_write_in(%alloc_15 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_12 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_15 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: 
memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -392,7 +392,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_18 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> -// KERNEL-NEXT: %write_outputs_19 = taskflow.task @Task_7 write_memrefs(%alloc_18 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_18 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// KERNEL-NEXT: %dependency_write_out_19 = taskflow.task @Task_7 dependency_write_in(%alloc_18 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_18 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 10 : index} : index @@ -410,7 +410,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_20 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// KERNEL-NEXT: %write_outputs_21 = taskflow.task @Task_8 
write_memrefs(%alloc_20 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_20 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %dependency_write_out_21 = taskflow.task @Task_8 dependency_write_in(%alloc_20 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_20 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -427,7 +427,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: } // KERNEL-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: %read_outputs_22:2, %write_outputs_23 = taskflow.task @Task_9 read_memrefs(%write_outputs_19, %write_outputs_21 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_21 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_18, %alloc_20 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_20 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { +// KERNEL-NEXT: %dependency_read_out_22:2, %dependency_write_out_23 = taskflow.task @Task_9 dependency_read_in(%dependency_write_out_19, %dependency_write_out_21 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) dependency_write_in(%dependency_write_out_21 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_18, %alloc_20 : 
memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_20 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index @@ -457,7 +457,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_24 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %read_outputs_25, %write_outputs_26 = taskflow.task @Task_10 read_memrefs(%write_outputs_23 : memref<1x8x8x64xf32>) write_memrefs(%alloc_24 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_20 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_24 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %dependency_read_out_25, %dependency_write_out_26 = taskflow.task @Task_10 dependency_read_in(%dependency_write_out_23 : memref<1x8x8x64xf32>) dependency_write_in(%alloc_24 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_20 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_24 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>): // KERNEL-NEXT: %0 = 
taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -476,7 +476,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x8x8x64xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_27 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %read_outputs_28:2, %write_outputs_29 = taskflow.task @Task_11 read_memrefs(%write_outputs_26, %read_outputs : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) write_memrefs(%alloc_27 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_24, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_27 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %dependency_read_out_28:2, %dependency_write_out_29 = taskflow.task @Task_11 dependency_read_in(%dependency_write_out_26, %dependency_read_out : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) dependency_write_in(%alloc_27 : memref<1x64x8x8xf32>) [original_read_memrefs(%alloc_24, %arg0 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_27 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: memref<1x64x8x8xf32>): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = 
taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -497,7 +497,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: taskflow.yield reads(%arg1, %arg2 : memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) writes(%arg3 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } // KERNEL-NEXT: %alloc_30 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// KERNEL-NEXT: %read_outputs_31, %write_outputs_32 = taskflow.task @Task_12 read_memrefs(%write_outputs_29 : memref<1x64x8x8xf32>) write_memrefs(%alloc_30 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_27 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_30 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { +// KERNEL-NEXT: %dependency_read_out_31, %dependency_write_out_32 = taskflow.task @Task_12 dependency_read_in(%dependency_write_out_29 : memref<1x64x8x8xf32>) dependency_write_in(%alloc_30 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_27 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_30 : memref<1x64x8x8xf32>)] : (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { // KERNEL-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): // KERNEL-NEXT: %0 = taskflow.counter attributes {counter_id = 0 : i32, counter_type = "root", lower_bound = 0 : index, step = 1 : index, upper_bound = 1 : index} : index // KERNEL-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {counter_id = 1 : i32, counter_type = "relay", lower_bound = 0 : index, step = 1 : index, upper_bound = 64 : index} : index @@ -517,7 +517,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // KERNEL-NEXT: } // 
KERNEL-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // KERNEL-NEXT: } -// KERNEL-NEXT: return %write_outputs_32 : memref<1x64x8x8xf32> +// KERNEL-NEXT: return %dependency_write_out_32 : memref<1x64x8x8xf32> // KERNEL-NEXT: } // KERNEL-NEXT: } @@ -531,7 +531,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: %cst_1 = arith.constant 3.40282347E+38 : f32 // STREAM-NEXT: %cst_2 = arith.constant 0.000000e+00 : f32 // STREAM-NEXT: %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// STREAM-NEXT: %read_outputs, %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<1x64x8x8xf32>) write_memrefs(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// STREAM-NEXT: %dependency_read_out, %dependency_write_out = taskflow.task @Task_0 dependency_read_in(%arg0 : memref<1x64x8x8xf32>) dependency_write_in(%alloc : memref<1x8x8x64xf32>) [original_read_memrefs(%arg0 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 8 { @@ -546,7 +546,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) // STREAM-NEXT: } // STREAM-NEXT: %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> -// STREAM-NEXT: %write_outputs_4 = taskflow.task @Task_1 write_memrefs(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : 
memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// STREAM-NEXT: %dependency_write_out_4 = taskflow.task @Task_1 dependency_write_in(%alloc_3 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_3 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 10 { @@ -560,7 +560,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) // STREAM-NEXT: } // STREAM-NEXT: %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// STREAM-NEXT: %write_outputs_6 = taskflow.task @Task_2 write_memrefs(%alloc_5 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// STREAM-NEXT: %dependency_write_out_6 = taskflow.task @Task_2 dependency_write_in(%alloc_5 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 8 { @@ -573,7 +573,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %read_outputs_7:2, %write_outputs_8 = taskflow.task @Task_3 read_memrefs(%write_outputs_4, %write_outputs_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), 
original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { +// STREAM-NEXT: %dependency_read_out_7:2, %dependency_write_out_8 = taskflow.task @Task_3 dependency_read_in(%dependency_write_out_4, %dependency_write_out_6 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) dependency_write_in(%dependency_write_out_6 : memref<1x8x8x64xf32>) value_inputs(%cst_0 : f32) [original_read_memrefs(%alloc_3, %alloc_5 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_5 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): // STREAM-NEXT: affine.for %arg5 = 0 to 1 { // STREAM-NEXT: affine.for %arg6 = 0 to 8 { @@ -597,7 +597,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) // STREAM-NEXT: } // STREAM-NEXT: %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// STREAM-NEXT: %read_outputs_10, %write_outputs_11 = taskflow.task @Task_4_Task_5_fused read_memrefs(%write_outputs_8 : memref<1x8x8x64xf32>) write_memrefs(%alloc_9 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_9 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { +// STREAM-NEXT: %dependency_read_out_10, %dependency_write_out_11 = taskflow.task @Task_4_Task_5_fused dependency_read_in(%dependency_write_out_8 : memref<1x8x8x64xf32>) dependency_write_in(%alloc_9 
: memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_5 : memref<1x8x8x64xf32>), original_write_memrefs(%alloc_9 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: f32, %arg4: f32): // STREAM-NEXT: affine.for %arg5 = 0 to 1 { // STREAM-NEXT: affine.for %arg6 = 0 to 64 { @@ -614,7 +614,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: taskflow.yield reads(%arg1 : memref<1x8x8x64xf32>) writes(%arg2 : memref<1x64x8x8xf32>) // STREAM-NEXT: } // STREAM-NEXT: %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// STREAM-NEXT: %read_outputs_13, %write_outputs_14 = taskflow.task @Task_6 read_memrefs(%write_outputs_11 : memref<1x64x8x8xf32>) write_memrefs(%alloc_12 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_9 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { +// STREAM-NEXT: %dependency_read_out_13, %dependency_write_out_14 = taskflow.task @Task_6 dependency_read_in(%dependency_write_out_11 : memref<1x64x8x8xf32>) dependency_write_in(%alloc_12 : memref<1x8x8x64xf32>) [original_read_memrefs(%alloc_9 : memref<1x64x8x8xf32>), original_write_memrefs(%alloc_12 : memref<1x8x8x64xf32>)] : (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) -> (memref<1x64x8x8xf32>, memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x64x8x8xf32>, %arg2: memref<1x8x8x64xf32>): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 8 { @@ -629,7 +629,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: taskflow.yield reads(%arg1 : memref<1x64x8x8xf32>) writes(%arg2 : memref<1x8x8x64xf32>) // STREAM-NEXT: } // STREAM-NEXT: %alloc_15 = 
memref.alloc() {alignment = 64 : i64} : memref<1x10x10x64xf32> -// STREAM-NEXT: %write_outputs_16 = taskflow.task @Task_7 write_memrefs(%alloc_15 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_15 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { +// STREAM-NEXT: %dependency_write_out_16 = taskflow.task @Task_7 dependency_write_in(%alloc_15 : memref<1x10x10x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_15 : memref<1x10x10x64xf32>)] : (memref<1x10x10x64xf32>, f32) -> (memref<1x10x10x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: f32): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 10 { @@ -643,7 +643,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: taskflow.yield writes(%arg1 : memref<1x10x10x64xf32>) // STREAM-NEXT: } // STREAM-NEXT: %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<1x8x8x64xf32> -// STREAM-NEXT: %write_outputs_18 = taskflow.task @Task_8 write_memrefs(%alloc_17 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { +// STREAM-NEXT: %dependency_write_out_18 = taskflow.task @Task_8 dependency_write_in(%alloc_17 : memref<1x8x8x64xf32>) value_inputs(%cst_2 : f32) [original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] : (memref<1x8x8x64xf32>, f32) -> (memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: f32): // STREAM-NEXT: affine.for %arg3 = 0 to 1 { // STREAM-NEXT: affine.for %arg4 = 0 to 8 { @@ -656,7 +656,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield writes(%arg1 : memref<1x8x8x64xf32>) // STREAM-NEXT: } -// STREAM-NEXT: %read_outputs_19:2, %write_outputs_20 = taskflow.task @Task_9 read_memrefs(%write_outputs_16, %write_outputs_18 
: memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) write_memrefs(%write_outputs_18 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_15, %alloc_17 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { +// STREAM-NEXT: %dependency_read_out_19:2, %dependency_write_out_20 = taskflow.task @Task_9 dependency_read_in(%dependency_write_out_16, %dependency_write_out_18 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) dependency_write_in(%dependency_write_out_18 : memref<1x8x8x64xf32>) value_inputs(%cst : f32) [original_read_memrefs(%alloc_15, %alloc_17 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>), original_write_memrefs(%alloc_17 : memref<1x8x8x64xf32>)] : (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>, f32) -> (memref<1x10x10x64xf32>, memref<1x8x8x64xf32>, memref<1x8x8x64xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x10x10x64xf32>, %arg2: memref<1x8x8x64xf32>, %arg3: memref<1x8x8x64xf32>, %arg4: f32): // STREAM-NEXT: affine.for %arg5 = 0 to 1 { // STREAM-NEXT: affine.for %arg6 = 0 to 8 { @@ -680,7 +680,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: taskflow.yield reads(%arg1, %arg3 : memref<1x10x10x64xf32>, memref<1x8x8x64xf32>) writes(%arg3 : memref<1x8x8x64xf32>) // STREAM-NEXT: } // STREAM-NEXT: %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<1x64x8x8xf32> -// STREAM-NEXT: %read_outputs_22:2, %write_outputs_23 = taskflow.task @Task_10_Task_11_Task_12_fused_fused read_memrefs(%write_outputs_20, %read_outputs : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) write_memrefs(%alloc_21 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_17, %arg0 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_21 : 
memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { +// STREAM-NEXT: %dependency_read_out_22:2, %dependency_write_out_23 = taskflow.task @Task_10_Task_11_Task_12_fused_fused dependency_read_in(%dependency_write_out_20, %dependency_read_out : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) dependency_write_in(%alloc_21 : memref<1x64x8x8xf32>) value_inputs(%cst_1, %cst_2 : f32, f32) [original_read_memrefs(%alloc_17, %arg0 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>), original_write_memrefs(%alloc_21 : memref<1x64x8x8xf32>)] : (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>, f32, f32) -> (memref<1x8x8x64xf32>, memref<1x64x8x8xf32>, memref<1x64x8x8xf32>) { // STREAM-NEXT: ^bb0(%arg1: memref<1x8x8x64xf32>, %arg2: memref<1x64x8x8xf32>, %arg3: memref<1x64x8x8xf32>, %arg4: f32, %arg5: f32): // STREAM-NEXT: affine.for %arg6 = 0 to 1 { // STREAM-NEXT: affine.for %arg7 = 0 to 64 { @@ -698,7 +698,7 @@ module attributes {torch.debug_module_name = "SimpleResNetBlock"} { // STREAM-NEXT: } // STREAM-NEXT: taskflow.yield reads(%arg1, %arg2 : memref<1x8x8x64xf32>, memref<1x64x8x8xf32>) writes(%arg3 : memref<1x64x8x8xf32>) // STREAM-NEXT: } -// STREAM-NEXT: return %write_outputs_23 : memref<1x64x8x8xf32> +// STREAM-NEXT: return %dependency_write_out_23 : memref<1x64x8x8xf32> // STREAM-NEXT: } // STREAM-NEXT: } diff --git a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir index 938ba4dd..408bfb80 100644 --- a/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir +++ b/test/multi-cgra/taskflow/resource-heavy/resource-heavy.mlir @@ -181,11 +181,11 @@ module { // TASKFLOW: module { // TASKFLOW-NEXT: func.func @stereo_cost_computation -// TASKFLOW: %read_outputs:6, %write_outputs:2 = taskflow.task @Task_0 +// TASKFLOW: %dependency_read_out:6, %dependency_write_out:2 = 
taskflow.task @Task_0 // TASKFLOW: affine.for %arg28 = 0 to 64 { // TASKFLOW: } // TASKFLOW: taskflow.yield -// TASKFLOW: %read_outputs_0, %write_outputs_1 = taskflow.task @Task_1 +// TASKFLOW: %dependency_read_out_0, %dependency_write_out_1 = taskflow.task @Task_1 // TASKFLOW: affine.for %arg18 = 0 to 64 { // TASKFLOW: } // TASKFLOW: taskflow.yield From 0a29870d7d74246329d2191be0c8ca6db257cd18 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 7 Mar 2026 01:20:12 +0800 Subject: [PATCH 6/7] fix bugs in FuseTaskPass.cpp --- .../Transforms/FuseTaskPass.cpp | 564 ++++++++++++------ 1 file changed, 388 insertions(+), 176 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp b/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp index d55a7540..00b20c27 100644 --- a/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp +++ b/lib/TaskflowDialect/Transforms/FuseTaskPass.cpp @@ -10,15 +10,15 @@ #include "TaskflowDialect/TaskflowOps.h" #include "TaskflowDialect/TaskflowPasses.h" -#include "NeuraDialect/NeuraOps.h" -#include "NeuraDialect/NeuraPasses.h" +#include "Conversion/ConversionPasses.h" #include "NeuraDialect/Architecture/Architecture.h" #include "NeuraDialect/Mapping/mapping_util.h" -#include "Conversion/ConversionPasses.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Conversion/Passes.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" @@ -31,9 +31,9 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/Parser/Parser.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" -#include "mlir/Parser/Parser.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" 
#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -48,13 +48,32 @@ namespace { // Operand helpers //===----------------------------------------------------------------------===// +// Resolves a value through a task's WAR/RAW dependency chain. +// If v is a dependency_read_out or dependency_write_out of the task, +// returns the corresponding input; otherwise returns v unchanged. +static Value resolveThrough(Value v, TaskflowTaskOp task) { + for (unsigned i = 0; i < task.getDependencyReadOut().size(); ++i) + if (v == task.getDependencyReadOut()[i]) + return task.getDependencyReadIn()[i]; + for (unsigned i = 0; i < task.getDependencyWriteOut().size(); ++i) + if (v == task.getDependencyWriteOut()[i]) + return task.getDependencyWriteIn()[i]; + return v; +} + // Collects unique values from two ranges into a deduped vector with index map. static void collectUnique(ValueRange a, ValueRange b, SmallVectorImpl &result, llvm::SmallDenseMap &idx_map) { - for (Value v : a) { idx_map[v] = result.size(); result.push_back(v); } + for (Value v : a) { + idx_map[v] = result.size(); + result.push_back(v); + } for (Value v : b) - if (!idx_map.count(v)) { idx_map[v] = result.size(); result.push_back(v); } + if (!idx_map.count(v)) { + idx_map[v] = result.size(); + result.push_back(v); + } } //===----------------------------------------------------------------------===// @@ -62,8 +81,7 @@ static void collectUnique(ValueRange a, ValueRange b, //===----------------------------------------------------------------------===// // Extracts all TaskflowCounterOps from a task body in definition order. -static SmallVector -extractCounterChain(TaskflowTaskOp task) { +static SmallVector extractCounterChain(TaskflowTaskOp task) { SmallVector chain; for (Operation &op : task.getBody().front()) if (auto c = dyn_cast(&op)) @@ -74,14 +92,15 @@ extractCounterChain(TaskflowTaskOp task) { // Returns true if two counter chains have identical bounds and steps. 
static bool counterChainsMatch(SmallVectorImpl &a, SmallVectorImpl &b) { - if (a.size() != b.size()) return false; + if (a.size() != b.size()) + return false; for (unsigned i = 0; i < a.size(); ++i) { auto get_int = [](Operation *op, StringRef name) -> int64_t { return op->getAttrOfType(name).getInt(); }; if (get_int(a[i], "lower_bound") != get_int(b[i], "lower_bound") || get_int(a[i], "upper_bound") != get_int(b[i], "upper_bound") || - get_int(a[i], "step") != get_int(b[i], "step")) + get_int(a[i], "step") != get_int(b[i], "step")) return false; } return true; @@ -93,7 +112,8 @@ static TaskflowHyperblockOp findHyperblock(TaskflowTaskOp task) { TaskflowHyperblockOp result = nullptr; for (Operation &op : task.getBody().front()) { if (auto hb = dyn_cast(&op)) { - if (result) return nullptr; // multiple hyperblocks + if (result) + return nullptr; // multiple hyperblocks result = hb; } } @@ -106,7 +126,8 @@ static scf::ForOp findInnerForOp(TaskflowHyperblockOp hb) { scf::ForOp result = nullptr; for (auto &op : hb.getBody().front()) { if (auto f = dyn_cast(&op)) { - if (result) return nullptr; + if (result) + return nullptr; result = f; } } @@ -123,9 +144,9 @@ static bool scfForBoundsMatch(scf::ForOp a, scf::ForOp b) { }; auto al = get_const(a.getLowerBound()), bl = get_const(b.getLowerBound()); auto au = get_const(a.getUpperBound()), bu = get_const(b.getUpperBound()); - auto as = get_const(a.getStep()), bs = get_const(b.getStep()); - return al && bl && au && bu && as && bs && - *al == *bl && *au == *bu && *as == *bs; + auto as = get_const(a.getStep()), bs = get_const(b.getStep()); + return al && bl && au && bu && as && bs && *al == *bl && *au == *bu && + *as == *bs; } //===----------------------------------------------------------------------===// @@ -134,36 +155,61 @@ static bool scfForBoundsMatch(scf::ForOp a, scf::ForOp b) { // Returns true if task1 dominates task2 in the same block. 
static bool canFuseTasks(TaskflowTaskOp t1, TaskflowTaskOp t2) { - return t1 && t2 && t1 != t2 && - t1->getBlock() == t2->getBlock() && + return t1 && t2 && t1 != t2 && t1->getBlock() == t2->getBlock() && t1->isBeforeInBlock(t2); } // Returns true if any write output of the producer feeds the consumer. static bool hasProducerConsumerRelation(TaskflowTaskOp producer, - TaskflowTaskOp consumer) { - for (Value r : producer.getWriteOutputs()) { - for (Value in : consumer.getReadMemrefs()) if (r == in) return true; - for (Value in : consumer.getWriteMemrefs()) if (r == in) return true; + TaskflowTaskOp consumer) { + for (Value r : producer.getDependencyWriteOut()) { + for (Value in : consumer.getDependencyReadIn()) + if (r == in) + return true; + for (Value in : consumer.getDependencyWriteIn()) + if (r == in) + return true; } for (Value r : producer.getValueOutputs()) - for (Value in : consumer.getValueInputs()) if (r == in) return true; + for (Value in : consumer.getValueInputs()) + if (r == in) + return true; return false; } // Returns true if tasks share at least one input with no data dependency. +// Resolves WAR chains: if t2's input is t1's dependency_read_out, traces +// back to the original memref for comparison. 
static bool areSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2) { llvm::SmallPtrSet t1_mem; - for (Value v : t1.getReadMemrefs()) t1_mem.insert(v); - for (Value v : t1.getWriteMemrefs()) t1_mem.insert(v); + for (Value v : t1.getDependencyReadIn()) + t1_mem.insert(v); + for (Value v : t1.getDependencyWriteIn()) + t1_mem.insert(v); llvm::SmallPtrSet t1_val(t1.getValueInputs().begin(), t1.getValueInputs().end()); bool share = false; - for (Value v : t2.getReadMemrefs()) if (t1_mem.count(v)) { share = true; break; } + for (Value v : t2.getDependencyReadIn()) { + Value resolved = resolveThrough(v, t1); + if (t1_mem.count(resolved)) { + share = true; + break; + } + } if (!share) - for (Value v : t2.getWriteMemrefs()) if (t1_mem.count(v)) { share = true; break; } + for (Value v : t2.getDependencyWriteIn()) { + Value resolved = resolveThrough(v, t1); + if (t1_mem.count(resolved)) { + share = true; + break; + } + } if (!share) - for (Value v : t2.getValueInputs()) if (t1_val.count(v)) { share = true; break; } + for (Value v : t2.getValueInputs()) + if (t1_val.count(v)) { + share = true; + break; + } return share && !hasProducerConsumerRelation(t1, t2) && !hasProducerConsumerRelation(t2, t1); } @@ -172,25 +218,39 @@ static bool areSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2) { static bool hasInterveningUses(TaskflowTaskOp producer, TaskflowTaskOp consumer) { llvm::SmallPtrSet results; - for (Value v : producer.getWriteOutputs()) results.insert(v); - for (Value v : producer.getValueOutputs()) results.insert(v); + for (Value v : producer.getDependencyReadOut()) + results.insert(v); + for (Value v : producer.getDependencyWriteOut()) + results.insert(v); + for (Value v : producer.getValueOutputs()) + results.insert(v); bool in_range = false; for (Operation &op : *producer->getBlock()) { - if (&op == producer.getOperation()) { in_range = true; continue; } - if (&op == consumer.getOperation()) break; + if (&op == producer.getOperation()) { + in_range = true; + continue; + 
} + if (&op == consumer.getOperation()) + break; if (in_range) for (Value v : op.getOperands()) - if (results.count(v)) return true; + if (results.count(v)) + return true; } return false; } // Returns true if all outputs of the task have at most one use. static bool hasOnlySingleUseOutputs(TaskflowTaskOp task) { - for (Value r : task.getWriteOutputs()) - if (!r.hasOneUse() && !r.use_empty()) return false; + for (Value r : task.getDependencyReadOut()) + if (!r.hasOneUse() && !r.use_empty()) + return false; + for (Value r : task.getDependencyWriteOut()) + if (!r.hasOneUse() && !r.use_empty()) + return false; for (Value r : task.getValueOutputs()) - if (!r.hasOneUse() && !r.use_empty()) return false; + if (!r.hasOneUse() && !r.use_empty()) + return false; return true; } @@ -209,7 +269,8 @@ static void cloneTaskBodyMiscOps(Block &body, OpBuilder &builder, // Handles both counter-chain and scf.for-inside-hyperblock cases. static bool haveMatchingLoopStructure(TaskflowTaskOp t1, TaskflowTaskOp t2) { auto hb1 = findHyperblock(t1), hb2 = findHyperblock(t2); - if (!hb1 || !hb2) return false; + if (!hb1 || !hb2) + return false; if (hb1.getIterArgs().size() > 0 || hb2.getIterArgs().size() > 0) return false; @@ -220,9 +281,11 @@ static bool haveMatchingLoopStructure(TaskflowTaskOp t1, TaskflowTaskOp t2) { // No counter chains: compares scf.for loops in hyperblock bodies. if (c1.empty() && c2.empty()) { auto f1 = findInnerForOp(hb1), f2 = findInnerForOp(hb2); - if (f1 && f2) return scfForBoundsMatch(f1, f2); + if (f1 && f2) + return scfForBoundsMatch(f1, f2); // Both have no loops: trivially compatible. - if (!f1 && !f2) return true; + if (!f1 && !f2) + return true; } return false; } @@ -234,52 +297,71 @@ static bool haveMatchingLoopStructure(TaskflowTaskOp t1, TaskflowTaskOp t2) { // Fuses a producer into its consumer by merging their counter chains and // hyperblock bodies into a single task with direct SSA value forwarding // for the intermediate memref. 
-static TaskflowTaskOp -fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, - Value intermediate_memref, OpBuilder &builder) { +static TaskflowTaskOp fuseProducerConsumerTasks(TaskflowTaskOp producer, + TaskflowTaskOp consumer, + Value intermediate_memref, + OpBuilder &builder) { Location loc = consumer.getLoc(); - auto prod_read = producer.getReadMemrefs(); - auto prod_write = producer.getWriteMemrefs(); - auto prod_val = producer.getValueInputs(); - auto cons_read = consumer.getReadMemrefs(); - auto cons_write = consumer.getWriteMemrefs(); - auto cons_val = consumer.getValueInputs(); + auto prod_read = producer.getDependencyReadIn(); + auto prod_write = producer.getDependencyWriteIn(); + auto prod_val = producer.getValueInputs(); + auto cons_read = consumer.getDependencyReadIn(); + auto cons_write = consumer.getDependencyWriteIn(); + auto cons_val = consumer.getValueInputs(); // Collects fused inputs, keeping intermediate in write_memrefs for // block arg availability but excluding it from consumer's read side. SmallVector fused_read, fused_write, fused_val; - for (Value v : prod_read) fused_read.push_back(v); + for (Value v : prod_read) + fused_read.push_back(v); for (Value v : cons_read) - if (v != intermediate_memref) fused_read.push_back(v); - for (Value v : prod_write) fused_write.push_back(v); + if (v != intermediate_memref) + fused_read.push_back(resolveThrough(v, producer)); + for (Value v : prod_write) + fused_write.push_back(v); for (Value v : cons_write) - if (v != intermediate_memref) fused_write.push_back(v); - for (Value v : prod_val) fused_val.push_back(v); - for (Value v : cons_val) fused_val.push_back(v); - - // Output types come from the consumer only. 
- SmallVector write_out_types(consumer.getWriteOutputs().getTypes()); + if (v != intermediate_memref) + fused_write.push_back(resolveThrough(v, producer)); + for (Value v : prod_val) + fused_val.push_back(v); + for (Value v : cons_val) + fused_val.push_back(resolveThrough(v, producer)); + + // Read output types: passthrough of fused read inputs for WAR tracking. + SmallVector read_out_types; + for (Value v : fused_read) + read_out_types.push_back(v.getType()); + + // Write/value output types come from the consumer only. + SmallVector write_out_types( + consumer.getDependencyWriteOut().getTypes()); SmallVector val_out_types(consumer.getValueOutputs().getTypes()); SmallVector orig_reads, orig_writes; - for (Value v : producer.getOriginalReadMemrefs()) orig_reads.push_back(v); - for (Value v : consumer.getOriginalReadMemrefs()) orig_reads.push_back(v); - for (Value v : producer.getOriginalWriteMemrefs()) orig_writes.push_back(v); - for (Value v : consumer.getOriginalWriteMemrefs()) orig_writes.push_back(v); + for (Value v : producer.getOriginalReadMemrefs()) + orig_reads.push_back(v); + for (Value v : consumer.getOriginalReadMemrefs()) + orig_reads.push_back(v); + for (Value v : producer.getOriginalWriteMemrefs()) + orig_writes.push_back(v); + for (Value v : consumer.getOriginalWriteMemrefs()) + orig_writes.push_back(v); auto fused = builder.create( - loc, write_out_types, val_out_types, - fused_read, fused_write, fused_val, - builder.getStringAttr("fused_pc"), - orig_reads, orig_writes); + loc, read_out_types, write_out_types, val_out_types, fused_read, + fused_write, fused_val, builder.getStringAttr("fused_pc"), orig_reads, + orig_writes); Block *body = new Block(); fused.getBody().push_back(body); - for (Value v : fused_read) body->addArgument(v.getType(), loc); - for (Value v : fused_write) body->addArgument(v.getType(), loc); - for (Value v : fused_val) body->addArgument(v.getType(), loc); - - unsigned fused_read_n = fused_read.size(); + for (Value v : 
fused_read) + body->addArgument(v.getType(), loc); + for (Value v : fused_write) + body->addArgument(v.getType(), loc); + for (Value v : fused_val) + body->addArgument(v.getType(), loc); + + unsigned fused_read_n = fused_read.size(); unsigned fused_write_n = fused_write.size(); // --- Maps producer block args to fused block args --- @@ -299,8 +381,8 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, // finds which write output index it corresponds to, then gets the matching // write memref block arg. BlockArgument prod_intermediate_arg = nullptr; - for (unsigned i = 0; i < producer.getWriteOutputs().size(); ++i) - if (producer.getWriteOutputs()[i] == intermediate_memref) + for (unsigned i = 0; i < producer.getDependencyWriteOut().size(); ++i) + if (producer.getDependencyWriteOut()[i] == intermediate_memref) prod_intermediate_arg = prod_body.getArgument(prod_read.size() + i); OpBuilder::InsertionGuard guard(builder); @@ -337,7 +419,8 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, // Clones counter chain from producer. auto prod_counters = extractCounterChain(producer); - for (auto ctr : prod_counters) builder.clone(*ctr, mapping); + for (auto ctr : prod_counters) + builder.clone(*ctr, mapping); // Clones non-counter, non-hyperblock, non-yield ops from both task bodies // (e.g. arith.constant for loop bounds). 
@@ -354,8 +437,8 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, SmallVector triggers; for (Value v : prod_hb.getIndices()) triggers.push_back(mapping.lookupOrDefault(v)); - auto fused_hb = builder.create( - loc, TypeRange{}, triggers, ValueRange{}); + auto fused_hb = builder.create(loc, TypeRange{}, + triggers, ValueRange{}); Block *hb_body = &fused_hb.getBody().emplaceBlock(); for (auto arg : prod_hb_body.getArguments()) hb_body->addArgument(arg.getType(), loc); @@ -384,17 +467,19 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, // Clones non-for, non-yield ops from both hyperblock bodies. for (Operation &op : prod_hb_body) { - if (isa(&op)) continue; + if (isa(&op)) + continue; builder.clone(op, mapping); } for (Operation &op : cons_hb_body) { - if (isa(&op)) continue; + if (isa(&op)) + continue; builder.clone(op, mapping); } // Creates merged scf.for with producer's bounds. - Value lb = mapping.lookupOrDefault(prod_for.getLowerBound()); - Value ub = mapping.lookupOrDefault(prod_for.getUpperBound()); + Value lb = mapping.lookupOrDefault(prod_for.getLowerBound()); + Value ub = mapping.lookupOrDefault(prod_for.getUpperBound()); Value step = mapping.lookupOrDefault(prod_for.getStep()); auto merged_for = builder.create(loc, lb, ub, step); mapping.map(prod_for.getInductionVar(), merged_for.getInductionVar()); @@ -407,7 +492,8 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, // the forwarded value. Value forwarded = nullptr; for (Operation &op : *prod_for.getBody()) { - if (isa(&op)) continue; + if (isa(&op)) + continue; if (auto store = dyn_cast(&op)) { if (prod_intermediate_arg && store.getMemRef() == prod_intermediate_arg) { @@ -421,7 +507,8 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, // Clones consumer for-body; replaces loads from intermediate // with the forwarded value. 
for (Operation &op : *cons_for.getBody()) { - if (isa(&op)) continue; + if (isa(&op)) + continue; if (auto load = dyn_cast(&op)) { if (cons_intermediate_arg && forwarded && load.getMemRef() == cons_intermediate_arg) { @@ -436,7 +523,8 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, // --- Counter-chain path: clones hyperblock bodies sequentially --- Value forwarded = nullptr; for (Operation &op : prod_hb_body) { - if (isa(&op)) continue; + if (isa(&op)) + continue; if (auto store = dyn_cast(&op)) { if (prod_intermediate_arg && store.getMemRef() == prod_intermediate_arg) { @@ -447,7 +535,8 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, builder.clone(op, mapping); } for (Operation &op : cons_hb_body) { - if (isa(&op)) continue; + if (isa(&op)) + continue; if (auto load = dyn_cast(&op)) { if (cons_intermediate_arg && forwarded && load.getMemRef() == cons_intermediate_arg) { @@ -463,13 +552,20 @@ fuseProducerConsumerTasks(TaskflowTaskOp producer, TaskflowTaskOp consumer, } // Creates the task yield from consumer's yield operands. - auto cons_yield = cast(consumer.getBody().front().getTerminator()); + auto cons_yield = + cast(consumer.getBody().front().getTerminator()); + + // Read yield outputs: passthrough fused read block args. 
+ SmallVector yield_reads; + for (unsigned i = 0; i < fused_read.size(); ++i) + yield_reads.push_back(body->getArgument(i)); + SmallVector yield_mem, yield_val; for (Value v : cons_yield.getMemoryResults()) yield_mem.push_back(mapping.lookupOrDefault(v)); for (Value v : cons_yield.getValueResults()) yield_val.push_back(mapping.lookupOrDefault(v)); - builder.create(loc, yield_mem, yield_val); + builder.create(loc, yield_reads, yield_mem, yield_val); return fused; } @@ -483,58 +579,86 @@ static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, OpBuilder &builder) { Location loc = t1.getLoc(); + // Resolves t2's inputs through t1's WAR chains so that the fused task + // references original memrefs rather than t1's passthrough results. + SmallVector t2_read, t2_write, t2_val; + for (Value v : t2.getDependencyReadIn()) + t2_read.push_back(resolveThrough(v, t1)); + for (Value v : t2.getDependencyWriteIn()) + t2_write.push_back(resolveThrough(v, t1)); + for (Value v : t2.getValueInputs()) + t2_val.push_back(resolveThrough(v, t1)); + // Deduplicates inputs across both tasks. SmallVector fused_read, fused_write, fused_val; llvm::SmallDenseMap read_idx, write_idx, val_idx; - collectUnique(t1.getReadMemrefs(), t2.getReadMemrefs(), fused_read, read_idx); - collectUnique(t1.getWriteMemrefs(), t2.getWriteMemrefs(), fused_write, write_idx); - collectUnique(t1.getValueInputs(), t2.getValueInputs(), fused_val, val_idx); + collectUnique(t1.getDependencyReadIn(), t2_read, fused_read, read_idx); + collectUnique(t1.getDependencyWriteIn(), t2_write, fused_write, write_idx); + collectUnique(t1.getValueInputs(), t2_val, fused_val, val_idx); - // Combined output types. + // Read output types: passthrough of fused read inputs for WAR tracking. + SmallVector read_out_types; + for (Value v : fused_read) + read_out_types.push_back(v.getType()); + + // Combined write/value output types. 
SmallVector write_out_types, val_out_types; - write_out_types.append(t1.getWriteOutputs().getTypes().begin(), - t1.getWriteOutputs().getTypes().end()); - write_out_types.append(t2.getWriteOutputs().getTypes().begin(), - t2.getWriteOutputs().getTypes().end()); + write_out_types.append(t1.getDependencyWriteOut().getTypes().begin(), + t1.getDependencyWriteOut().getTypes().end()); + write_out_types.append(t2.getDependencyWriteOut().getTypes().begin(), + t2.getDependencyWriteOut().getTypes().end()); val_out_types.append(t1.getValueOutputs().getTypes().begin(), t1.getValueOutputs().getTypes().end()); val_out_types.append(t2.getValueOutputs().getTypes().begin(), t2.getValueOutputs().getTypes().end()); SmallVector orig_reads, orig_writes; - for (Value v : t1.getOriginalReadMemrefs()) orig_reads.push_back(v); - for (Value v : t2.getOriginalReadMemrefs()) orig_reads.push_back(v); - for (Value v : t1.getOriginalWriteMemrefs()) orig_writes.push_back(v); - for (Value v : t2.getOriginalWriteMemrefs()) orig_writes.push_back(v); + for (Value v : t1.getOriginalReadMemrefs()) + orig_reads.push_back(v); + for (Value v : t2.getOriginalReadMemrefs()) + orig_reads.push_back(v); + for (Value v : t1.getOriginalWriteMemrefs()) + orig_writes.push_back(v); + for (Value v : t2.getOriginalWriteMemrefs()) + orig_writes.push_back(v); auto fused = builder.create( - loc, write_out_types, val_out_types, - fused_read, fused_write, fused_val, - builder.getStringAttr("fused_sibling"), + loc, read_out_types, write_out_types, val_out_types, fused_read, + fused_write, fused_val, builder.getStringAttr("fused_sibling"), orig_reads, orig_writes); Block *body = new Block(); fused.getBody().push_back(body); - for (Value v : fused_read) body->addArgument(v.getType(), loc); - for (Value v : fused_write) body->addArgument(v.getType(), loc); - for (Value v : fused_val) body->addArgument(v.getType(), loc); + for (Value v : fused_read) + body->addArgument(v.getType(), loc); + for (Value v : fused_write) + 
body->addArgument(v.getType(), loc); + for (Value v : fused_val) + body->addArgument(v.getType(), loc); unsigned rn = fused_read.size(), wn = fused_write.size(); // Lambda to map a task's block args to fused block args via index maps. - auto mapBlockArgs = [&](TaskflowTaskOp task, IRMapping &m) { + // When is_t2 is true, resolves inputs through t1's WAR chains for lookup. + auto mapBlockArgs = [&](TaskflowTaskOp task, IRMapping &m, bool is_t2) { Block &tb = task.getBody().front(); - auto r = task.getReadMemrefs(); - auto w = task.getWriteMemrefs(); + auto r = task.getDependencyReadIn(); + auto w = task.getDependencyWriteIn(); auto v = task.getValueInputs(); - for (unsigned i = 0; i < r.size(); ++i) - m.map(tb.getArgument(i), body->getArgument(read_idx[r[i]])); - for (unsigned i = 0; i < w.size(); ++i) + for (unsigned i = 0; i < r.size(); ++i) { + Value key = is_t2 ? resolveThrough(r[i], t1) : r[i]; + m.map(tb.getArgument(i), body->getArgument(read_idx[key])); + } + for (unsigned i = 0; i < w.size(); ++i) { + Value key = is_t2 ? resolveThrough(w[i], t1) : w[i]; m.map(tb.getArgument(r.size() + i), - body->getArgument(rn + write_idx[w[i]])); - for (unsigned i = 0; i < v.size(); ++i) + body->getArgument(rn + write_idx[key])); + } + for (unsigned i = 0; i < v.size(); ++i) { + Value key = is_t2 ? resolveThrough(v[i], t1) : v[i]; m.map(tb.getArgument(r.size() + w.size() + i), - body->getArgument(rn + wn + val_idx[v[i]])); + body->getArgument(rn + wn + val_idx[key])); + } }; OpBuilder::InsertionGuard guard(builder); @@ -542,15 +666,16 @@ static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, // Clones counter chain from t1. IRMapping mapping; - mapBlockArgs(t1, mapping); + mapBlockArgs(t1, mapping, /*is_t2=*/false); auto counters = extractCounterChain(t1); - for (auto ctr : counters) builder.clone(*ctr, mapping); + for (auto ctr : counters) + builder.clone(*ctr, mapping); // Clones non-counter, non-hyperblock, non-yield ops from both task bodies. 
Block &t1_body = t1.getBody().front(); Block &t2_body = t2.getBody().front(); IRMapping mapping2; - mapBlockArgs(t2, mapping2); + mapBlockArgs(t2, mapping2, /*is_t2=*/true); cloneTaskBodyMiscOps(t1_body, builder, mapping); cloneTaskBodyMiscOps(t2_body, builder, mapping2); @@ -563,8 +688,8 @@ static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, SmallVector triggers; for (Value v : hb1.getIndices()) triggers.push_back(mapping.lookupOrDefault(v)); - auto fused_hb = builder.create( - loc, TypeRange{}, triggers, ValueRange{}); + auto fused_hb = builder.create(loc, TypeRange{}, + triggers, ValueRange{}); Block *hb_body = &fused_hb.getBody().emplaceBlock(); for (auto arg : hb1_body.getArguments()) hb_body->addArgument(arg.getType(), loc); @@ -586,17 +711,19 @@ static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, // Clones non-for, non-yield ops from both hyperblock bodies. for (Operation &op : hb1_body) { - if (isa(&op)) continue; + if (isa(&op)) + continue; builder.clone(op, mapping); } for (Operation &op : hb2_body) { - if (isa(&op)) continue; + if (isa(&op)) + continue; builder.clone(op, mapping2); } // Creates merged scf.for with t1's bounds. - Value lb = mapping.lookupOrDefault(for1.getLowerBound()); - Value ub = mapping.lookupOrDefault(for1.getUpperBound()); + Value lb = mapping.lookupOrDefault(for1.getLowerBound()); + Value ub = mapping.lookupOrDefault(for1.getUpperBound()); Value step = mapping.lookupOrDefault(for1.getStep()); auto merged_for = builder.create(loc, lb, ub, step); mapping.map(for1.getInductionVar(), merged_for.getInductionVar()); @@ -607,23 +734,27 @@ static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, // Clones t1 for-body. for (Operation &op : *for1.getBody()) { - if (isa(&op)) continue; + if (isa(&op)) + continue; builder.clone(op, mapping); } // Clones t2 for-body. 
for (Operation &op : *for2.getBody()) { - if (isa(&op)) continue; + if (isa(&op)) + continue; builder.clone(op, mapping2); } } else { // --- Counter-chain path: clones hyperblock bodies sequentially --- for (Operation &op : hb1_body) { - if (isa(&op)) continue; + if (isa(&op)) + continue; builder.clone(op, mapping); } for (Operation &op : hb2_body) { - if (isa(&op)) continue; + if (isa(&op)) + continue; builder.clone(op, mapping2); } } @@ -634,6 +765,12 @@ static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, // Creates combined task yield. auto t1_yield = cast(t1.getBody().front().getTerminator()); auto t2_yield = cast(t2.getBody().front().getTerminator()); + + // Read yield outputs: passthrough fused read block args. + SmallVector all_reads; + for (unsigned i = 0; i < fused_read.size(); ++i) + all_reads.push_back(body->getArgument(i)); + SmallVector all_mem, all_val; for (Value v : t1_yield.getMemoryResults()) all_mem.push_back(mapping.lookupOrDefault(v)); @@ -643,7 +780,7 @@ static TaskflowTaskOp fuseSiblingTasks(TaskflowTaskOp t1, TaskflowTaskOp t2, all_mem.push_back(mapping2.lookupOrDefault(v)); for (Value v : t2_yield.getValueResults()) all_val.push_back(mapping2.lookupOrDefault(v)); - builder.create(loc, all_mem, all_val); + builder.create(loc, all_reads, all_mem, all_val); return fused; } @@ -673,8 +810,9 @@ static int calculateMaxFanoutInRegion(Region ®ion) { // Runs the taskflow-to-neura pipeline on a cloned module and computes // MII metrics on the resulting "test_fused_kernel" function. 
-static FusionMetrics computeRealMetrics( - ModuleOp test_module, const neura::Architecture &architecture) { +static FusionMetrics +computeRealMetrics(ModuleOp test_module, + const neura::Architecture &architecture) { FusionMetrics metrics; auto cloned = test_module.clone(); @@ -700,7 +838,8 @@ static FusionMetrics computeRealMetrics( return; } OpBuilder builder(ret); - auto neura_ret = builder.create(ret.getLoc(), ret.getOperands()); + auto neura_ret = + builder.create(ret.getLoc(), ret.getOperands()); if (ret.getNumOperands() > 0) { neura_ret->setAttr("return_type", builder.getStringAttr("value")); } else { @@ -786,7 +925,8 @@ static ModuleOp createTestModuleForTask(TaskflowTaskOp task) { } auto func_type = builder.getFunctionType(input_types, output_types); - auto func_op = builder.create(loc, "test_fused_kernel", func_type); + auto func_op = + builder.create(loc, "test_fused_kernel", func_type); func_op->setAttr("accelerator", builder.getStringAttr("neura")); Block *entry = func_op.addEntryBlock(); @@ -812,8 +952,9 @@ static ModuleOp createTestModuleForTask(TaskflowTaskOp task) { } // Computes metrics for a single task by running the neura pipeline. -static FusionMetrics computeSingleTaskMetrics( - TaskflowTaskOp task, const neura::Architecture &architecture) { +static FusionMetrics +computeSingleTaskMetrics(TaskflowTaskOp task, + const neura::Architecture &architecture) { auto module = createTestModuleForTask(task); FusionMetrics metrics = computeRealMetrics(module, architecture); module.erase(); @@ -821,9 +962,10 @@ static FusionMetrics computeSingleTaskMetrics( } // Creates a test module with two tasks fused together. 
-static ModuleOp createFusedTestModule( - TaskflowTaskOp task1, TaskflowTaskOp task2, - bool is_producer_consumer, Value intermediate_memref) { +static ModuleOp createFusedTestModule(TaskflowTaskOp task1, + TaskflowTaskOp task2, + bool is_producer_consumer, + Value intermediate_memref) { MLIRContext *ctx = task1->getContext(); OpBuilder builder(ctx); Location loc = builder.getUnknownLoc(); @@ -852,10 +994,11 @@ static ModuleOp createFusedTestModule( // Placeholder output type; the fused task's results drive the actual return. SmallVector output_types; - output_types.push_back(builder.getI64Type()); + output_types.push_back(builder.getI64Type()); auto func_type = builder.getFunctionType(input_types, output_types); - auto func_op = builder.create(loc, "test_fused_kernel", func_type); + auto func_op = + builder.create(loc, "test_fused_kernel", func_type); func_op->setAttr("accelerator", builder.getStringAttr("neura")); Block *entry = func_op.addEntryBlock(); @@ -871,7 +1014,8 @@ static ModuleOp createFusedTestModule( auto ct1 = cast(ct1_op); // Maps task1's results so task2's operands resolve correctly. - for (auto [orig, cloned] : llvm::zip(task1->getResults(), ct1->getResults())) { + for (auto [orig, cloned] : + llvm::zip(task1->getResults(), ct1->getResults())) { mapping.map(orig, cloned); } @@ -907,7 +1051,8 @@ static ModuleOp createFusedTestModule( for (Value v : ret) { real_output_types.push_back(v.getType()); } - func_op.setFunctionType(builder.getFunctionType(input_types, real_output_types)); + func_op.setFunctionType( + builder.getFunctionType(input_types, real_output_types)); builder.create(loc, ret); // Erases un-fused clones (consumer first to drop uses of producer results). @@ -917,11 +1062,12 @@ static ModuleOp createFusedTestModule( } // Computes metrics for the fused version of two tasks. 
-static FusionMetrics computeFusedTaskMetrics( - TaskflowTaskOp task1, TaskflowTaskOp task2, - bool is_producer_consumer, Value intermediate_memref, - const neura::Architecture &architecture) { - auto module = createFusedTestModule(task1, task2, is_producer_consumer, intermediate_memref); +static FusionMetrics +computeFusedTaskMetrics(TaskflowTaskOp task1, TaskflowTaskOp task2, + bool is_producer_consumer, Value intermediate_memref, + const neura::Architecture &architecture) { + auto module = createFusedTestModule(task1, task2, is_producer_consumer, + intermediate_memref); FusionMetrics metrics = computeRealMetrics(module, architecture); module.erase(); return metrics; @@ -932,7 +1078,8 @@ static int estimateMII(const FusionMetrics &metrics, int total_tiles) { const float alpha = 0.5f; const float beta = 0.5f; int mii = std::max(metrics.rec_mii, metrics.res_mii); - float utilization_factor = 1.0f + alpha * (metrics.num_ops / static_cast(total_tiles)); + float utilization_factor = + 1.0f + alpha * (metrics.num_ops / static_cast(total_tiles)); float fanout_factor = 1.0f + beta * std::max(metrics.max_fanout - 4, 0); return static_cast(std::ceil(utilization_factor * fanout_factor * mii)); } @@ -949,7 +1096,8 @@ static bool isFusionProfitable(TaskflowTaskOp task1, TaskflowTaskOp task2, FusionMetrics m1 = computeSingleTaskMetrics(task1, architecture); FusionMetrics m2 = computeSingleTaskMetrics(task2, architecture); - FusionMetrics fused = computeFusedTaskMetrics(task1, task2, is_producer_consumer, intermediate, architecture); + FusionMetrics fused = computeFusedTaskMetrics( + task1, task2, is_producer_consumer, intermediate, architecture); int mii_1 = estimateMII(m1, total_tiles); int mii_2 = estimateMII(m2, total_tiles); @@ -961,13 +1109,20 @@ static bool isFusionProfitable(TaskflowTaskOp task1, TaskflowTaskOp task2, int raw_1 = std::max(m1.rec_mii, m1.res_mii); int raw_2 = std::max(m2.rec_mii, m2.res_mii); int raw_fused = std::max(fused.rec_mii, fused.res_mii); - int 
unfused_mii = is_producer_consumer ? (raw_1 + raw_2) : std::max(raw_1, raw_2); + int unfused_mii = + is_producer_consumer ? (raw_1 + raw_2) : std::max(raw_1, raw_2); bool profitable = raw_fused <= unfused_mii; llvm::errs() << "[fuse-task] Profitability:" - << " m1(rec=" << m1.rec_mii << " res=" << m1.res_mii << " ops=" << m1.num_ops << " fan=" << m1.max_fanout << " mii=" << mii_1 << ")" - << " m2(rec=" << m2.rec_mii << " res=" << m2.res_mii << " ops=" << m2.num_ops << " fan=" << m2.max_fanout << " mii=" << mii_2 << ")" - << " fused(rec=" << fused.rec_mii << " res=" << fused.res_mii << " ops=" << fused.num_ops << " fan=" << fused.max_fanout << " mii=" << mii_fused << ")" + << " m1(rec=" << m1.rec_mii << " res=" << m1.res_mii + << " ops=" << m1.num_ops << " fan=" << m1.max_fanout + << " mii=" << mii_1 << ")" + << " m2(rec=" << m2.rec_mii << " res=" << m2.res_mii + << " ops=" << m2.num_ops << " fan=" << m2.max_fanout + << " mii=" << mii_2 << ")" + << " fused(rec=" << fused.rec_mii << " res=" << fused.res_mii + << " ops=" << fused.num_ops << " fan=" << fused.max_fanout + << " mii=" << mii_fused << ")" << " -> " << (profitable ? "PROFITABLE" : "REJECTED") << "\n"; return profitable; } @@ -978,8 +1133,7 @@ static bool isFusionProfitable(TaskflowTaskOp task1, TaskflowTaskOp task2, // Fuses a producer task into its consumer when the producer's output feeds // directly into the consumer and the loop structures match. -struct ProducerConsumerTaskFusion - : public OpRewritePattern { +struct ProducerConsumerTaskFusion : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(TaskflowTaskOp consumer, @@ -991,8 +1145,11 @@ struct ProducerConsumerTaskFusion auto tryFindProducer = [&](ValueRange inputs) -> bool { for (Value in : inputs) { auto def = in.getDefiningOp(); - if (!canFuseTasks(def, consumer) || - !hasOnlySingleUseOutputs(def) || + // Only write outputs represent true producer-consumer (RAW) links. 
+ // Read outputs are WAR dependency chains and must be skipped. + if (!def || !llvm::is_contained(def.getDependencyWriteOut(), in)) + continue; + if (!canFuseTasks(def, consumer) || !hasOnlySingleUseOutputs(def) || hasInterveningUses(def, consumer) || !haveMatchingLoopStructure(def, consumer) || !isFusionProfitable(def, consumer, true, in)) @@ -1003,17 +1160,44 @@ struct ProducerConsumerTaskFusion } return false; }; - if (!tryFindProducer(consumer.getReadMemrefs()) && - !tryFindProducer(consumer.getWriteMemrefs())) + if (!tryFindProducer(consumer.getDependencyReadIn()) && + !tryFindProducer(consumer.getDependencyWriteIn())) return failure(); - auto fused = fuseProducerConsumerTasks(producer, consumer, - intermediate, rewriter); - for (auto [o, n] : llvm::zip(consumer.getWriteOutputs(), - fused.getWriteOutputs())) + auto fused = + fuseProducerConsumerTasks(producer, consumer, intermediate, rewriter); + + // Replaces producer's dependency_read_out results. + for (unsigned i = 0; i < producer.getDependencyReadOut().size(); ++i) { + Value orig_read = producer.getDependencyReadIn()[i]; + for (unsigned j = 0; j < fused.getDependencyReadIn().size(); ++j) { + if (fused.getDependencyReadIn()[j] == orig_read) { + producer.getDependencyReadOut()[i].replaceAllUsesWith( + fused.getDependencyReadOut()[j]); + break; + } + } + } + // Replaces consumer's dependency_read_out results. + for (unsigned i = 0; i < consumer.getDependencyReadOut().size(); ++i) { + Value orig_read = consumer.getDependencyReadIn()[i]; + if (orig_read == intermediate) + continue; + Value resolved = resolveThrough(orig_read, producer); + for (unsigned j = 0; j < fused.getDependencyReadIn().size(); ++j) { + if (fused.getDependencyReadIn()[j] == resolved) { + consumer.getDependencyReadOut()[i].replaceAllUsesWith( + fused.getDependencyReadOut()[j]); + break; + } + } + } + // Replaces consumer's write and value outputs. 
+ for (auto [o, n] : llvm::zip(consumer.getDependencyWriteOut(), + fused.getDependencyWriteOut())) o.replaceAllUsesWith(n); - for (auto [o, n] : llvm::zip(consumer.getValueOutputs(), - fused.getValueOutputs())) + for (auto [o, n] : + llvm::zip(consumer.getValueOutputs(), fused.getValueOutputs())) o.replaceAllUsesWith(n); rewriter.eraseOp(consumer); rewriter.eraseOp(producer); @@ -1030,7 +1214,8 @@ struct SiblingTaskFusion : public OpRewritePattern { TaskflowTaskOp t2 = nullptr; for (Operation *op = t1->getNextNode(); op; op = op->getNextNode()) { auto next = dyn_cast(op); - if (!next) continue; + if (!next) + continue; if (areSiblingTasks(t1, next) && canFuseTasks(t1, next) && haveMatchingLoopStructure(t1, next) && isFusionProfitable(t1, next, false)) { @@ -1038,18 +1223,45 @@ struct SiblingTaskFusion : public OpRewritePattern { break; } } - if (!t2) return failure(); + if (!t2) + return failure(); auto fused = fuseSiblingTasks(t1, t2, rewriter); - unsigned t1_wo = t1.getWriteOutputs().size(); + + // Replaces dependency_read_out for both tasks. + for (unsigned i = 0; i < t1.getDependencyReadOut().size(); ++i) { + Value orig_read = t1.getDependencyReadIn()[i]; + for (unsigned j = 0; j < fused.getDependencyReadIn().size(); ++j) { + if (fused.getDependencyReadIn()[j] == orig_read) { + t1.getDependencyReadOut()[i].replaceAllUsesWith( + fused.getDependencyReadOut()[j]); + break; + } + } + } + for (unsigned i = 0; i < t2.getDependencyReadOut().size(); ++i) { + Value orig_read = t2.getDependencyReadIn()[i]; + Value resolved = resolveThrough(orig_read, t1); + for (unsigned j = 0; j < fused.getDependencyReadIn().size(); ++j) { + if (fused.getDependencyReadIn()[j] == resolved) { + t2.getDependencyReadOut()[i].replaceAllUsesWith( + fused.getDependencyReadOut()[j]); + break; + } + } + } + + // Replaces dependency_write_out and value_outputs. 
+ unsigned t1_wo = t1.getDependencyWriteOut().size(); unsigned t1_vo = t1.getValueOutputs().size(); for (unsigned i = 0; i < t1_wo; ++i) - t1.getWriteOutputs()[i].replaceAllUsesWith(fused.getWriteOutputs()[i]); + t1.getDependencyWriteOut()[i].replaceAllUsesWith( + fused.getDependencyWriteOut()[i]); for (unsigned i = 0; i < t1_vo; ++i) t1.getValueOutputs()[i].replaceAllUsesWith(fused.getValueOutputs()[i]); - for (unsigned i = 0; i < t2.getWriteOutputs().size(); ++i) - t2.getWriteOutputs()[i].replaceAllUsesWith( - fused.getWriteOutputs()[t1_wo + i]); + for (unsigned i = 0; i < t2.getDependencyWriteOut().size(); ++i) + t2.getDependencyWriteOut()[i].replaceAllUsesWith( + fused.getDependencyWriteOut()[t1_wo + i]); for (unsigned i = 0; i < t2.getValueOutputs().size(); ++i) t2.getValueOutputs()[i].replaceAllUsesWith( fused.getValueOutputs()[t1_vo + i]); @@ -1075,19 +1287,19 @@ struct FuseTaskPass } void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry + .insert(); } void runOnOperation() override { RewritePatternSet patterns(&getContext()); patterns.add(&getContext(), /*benefit=*/10); patterns.add(&getContext(), /*benefit=*/5); - if (failed(applyPatternsGreedily(getOperation(), - FrozenRewritePatternSet(std::move(patterns))))) + if (failed(applyPatternsGreedily( + getOperation(), FrozenRewritePatternSet(std::move(patterns))))) signalPassFailure(); } }; From 8fd11b27366020132a9d36e1e82bc3606b143a80 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 7 Mar 2026 13:03:00 +0800 Subject: [PATCH 7/7] rename the variable name --- .../AffineToTaskflow/AffineToTaskflowPass.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp index 98b87fad..c1650fae 100644 --- a/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp +++ b/lib/Conversion/AffineToTaskflow/AffineToTaskflowPass.cpp @@ -301,14 
+301,14 @@ static TaskflowTaskOp convertLoopToTask( // Step 8: Creates the yield operation. //--------------------------------------------------------------- task_builder.setInsertionPointToEnd(task_body); - SmallVector read_yield_operands; - SmallVector memory_yield_operands; + SmallVector yield_for_dependency_read_out; + SmallVector yield_for_dependency_write_out; SmallVector value_yield_operands; // Read yield outputs: passthrough read memref block args for WAR tracking. for (Value memref : read_memrefs) { if (input_to_block_arg.count(memref)) { - read_yield_operands.push_back(input_to_block_arg[memref]); + yield_for_dependency_read_out.push_back(input_to_block_arg[memref]); } else { assert(false && "Read memref not in inputs!"); } @@ -317,7 +317,7 @@ static TaskflowTaskOp convertLoopToTask( // Memory yield outputs: yield the written memrefs. for (Value memref : output_memrefs) { if (input_to_block_arg.count(memref)) { - memory_yield_operands.push_back(input_to_block_arg[memref]); + yield_for_dependency_write_out.push_back(input_to_block_arg[memref]); } else { assert(false && "Written memref not in inputs!"); } @@ -327,8 +327,8 @@ static TaskflowTaskOp convertLoopToTask( for (Value result : cloned_loop->getResults()) { value_yield_operands.push_back(result); } - task_builder.create(loc, read_yield_operands, - memory_yield_operands, + task_builder.create(loc, yield_for_dependency_read_out, + yield_for_dependency_write_out, value_yield_operands); //-------------------------------------------------------------------