From e325d999db69e34d717c9f0467039a67fc8b2b06 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 18 Jul 2023 13:30:16 -0400 Subject: [PATCH 01/33] Add tests for padding to broadcast in various ways --- test/test_resize.cpp | 221 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index ca35f9d874e..8af96f149c5 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2265,4 +2265,225 @@ TEST_F(NVFuserTest, SliceVectorization) { testValidate(&fusion, cg_outputs, inputs, {ref}, __LINE__, __FILE__); } +// Concretize a symbolic pad that results in a broadcast (static pads) +// In this test, the sizes and pad widths are static, so there should be nothing +// to concretize. +TEST_F(NVFuserTest, ResizePadToBroadcastStatic_CUDA) { + /* + std::vector t0_size = {2, 3, 2, 5, 6}; + std::vector t1_size = {2, 4, 4, 3, 5}; + // Note there are only 8 input scalars for 5D input. Implicit no-pad of dim 0 + std::vector pad_widths = { + 0, + -1, // dim=4 trim last element + 0, + -4, // dim=3 pad to broadcast of first element + 1, + 1, // dim=2 pad with zeros on either side + -1, + -1, // dim=1 pad to broadcast of second element + // dim=0 is implicit 0, 0 + }; + std::vector expected_itertypes = { + IterType::Iteration, + IterType::Broadcast, + IterType::Iteration, + IterType::Broadcast, + IterType::Iteration, + }; + */ + std::vector t0_size = {2, 3}; + std::vector t1_size = {2, 4}; + // Note there are only 8 input scalars for 5D input. Implicit no-pad of dim 0 + std::vector pad_widths = { + -1, + -1, // dim=1 pad to broadcast of second element + // dim=0 is implicit 0, 0 + }; + std::vector expected_itertypes = { + IterType::Iteration, + IterType::Broadcast, + }; + + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeConcreteTensor(t0_size); + fusion->addInput(tv0); + auto tv1 = makeConcreteTensor(t1_size); + fusion->addInput(tv1); + + std::vector pad_width_vals; + pad_width_vals.reserve(pad_widths.size()); + for (auto w : pad_widths) { + pad_width_vals.push_back(IrBuilder::create(w)); + } + + auto tv2 = pad(tv0, pad_width_vals); + auto tv3 = mul(tv1, tv2); + fusion->addOutput(tv3); + + fusion->printMath(); + fusion->printTransforms(); + + EXPECT_FALSE(fusion->hasDynamicTransform()); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn(t0_size, options); + auto t1 = at::randn(t1_size, options); + std::vector aten_inputs({t0, t1}); + + FusionExecutorCache executor_cache(std::move(fusion)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); + + auto runtime = executor_cache.getMostRecentKernelRuntime(); + auto concretized_fusion = runtime->fusionSegments()->completeFusion(); + + auto conc_t2 = concretized_fusion->outputs()[0] + ->definition() + ->inputs()[1] + ->as(); + for (auto i : c10::irange(expected_itertypes.size())) { + EXPECT_EQ(conc_t2->axis(i)->getIterType(), expected_itertypes.at(i)); + } + + auto t2_padded = at::pad(t0, pad_widths); + auto ref_t2 = t1 * t2_padded; + + std::cout << "ATen: " << ref_t2 << std::endl; + std::cout << "NVFuser: " << cg_outputs[0] << std::endl; + + testValidate( + concretized_fusion, + cg_outputs, + aten_inputs, + {ref_t2}, + __LINE__, + __FILE__); +} + +// Concretize a symbolic pad that results in a broadcast (dynamic pads) +TEST_F(NVFuserTest, ResizePadToBroadcastDynamic_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeSymbolicTensor(5); + fusion->addInput(tv0); + auto tv1 = makeSymbolicTensor(5); + fusion->addInput(tv1); + + // Note there are only 8 input scalars for 5D input. Implicit no-pad of dim 0 + std::vector pad_widths = { + 0, + -1, // dim=4 trim last element + 0, + -4, // dim=3 pad to broadcast of first element + 1, + 1, // dim=2 pad with zeros on either side + -1, + -1, // dim=1 pad to broadcast of second element + // dim=0 is implicit 0, 0 + }; + std::vector pad_width_vals; + pad_width_vals.reserve(pad_widths.size()); + for ([[maybe_unused]] auto _ : pad_widths) { + auto w_val = IrBuilder::create(DataType::Int); + fusion->addInput(w_val); + pad_width_vals.push_back(w_val); + } + + auto tv2 = pad(tv0, pad_width_vals); + auto tv3 = mul(tv1, tv2); + fusion->addOutput(tv3); + + fusion->printMath(); + fusion->printTransforms(); + + EXPECT_TRUE(fusion->hasDynamicTransform()); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn({2, 3, 2, 5, 6}, options); + auto t1 = at::randn({2, 4, 4, 3, 5}, options); + // Keep dimension 0, pad to broadcast in dimension 1 and 3. Pad with zero in + // dimension 2. Trim by one element in dimension 4. + std::vector aten_inputs({ + t0, + t1, + }); + aten_inputs.insert(aten_inputs.end(), pad_widths.begin(), pad_widths.end()); + + FusionExecutorCache executor_cache(std::move(fusion)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); + + auto runtime = executor_cache.getMostRecentKernelRuntime(); + auto concretized_fusion = runtime->fusionSegments()->completeFusion(); + + auto conc_t2 = concretized_fusion->outputs()[0] + ->definition() + ->inputs()[1] + ->as(); + EXPECT_EQ(conc_t2->axis(0)->getIterType(), IterType::Iteration); + EXPECT_EQ(conc_t2->axis(1)->getIterType(), IterType::Broadcast); + EXPECT_EQ(conc_t2->axis(2)->getIterType(), IterType::Iteration); + EXPECT_EQ(conc_t2->axis(3)->getIterType(), IterType::Broadcast); + EXPECT_EQ(conc_t2->axis(4)->getIterType(), IterType::Iteration); + + auto t2_padded = at::pad(t0, pad_widths); + auto ref_t2 = t1 * t2_padded; + + std::cout << "ATen: " << ref_t2 << std::endl; + std::cout << "NVFuser: " << cg_outputs[0] << std::endl; + + testValidate( + concretized_fusion, + cg_outputs, + aten_inputs, + {ref_t2}, + __LINE__, + __FILE__); +} + +TEST_F(NVFuserTest, ResizeIssue596_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeConcreteTensor({2}); + auto tv1 = makeConcreteTensor({3}); + fusion->addInput(tv0); + fusion->addInput(tv1); + + auto tv2 = pad(tv0, {fusion->zeroVal(), IrBuilder::create(-1)}); + auto tv3 = mul(tv1, tv2); + fusion->addOutput(tv3); + + // Fusion is not dynamic + EXPECT_FALSE(fusion->hasDynamicTransform()); + + fusion->printMath(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + auto t0 = at::randn({2}, options); + auto t1 = at::randn({3}, options); + std::vector aten_inputs({t0, t1}); + + auto args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs); + FusionKernelRuntime runtime(std::move(fusion), args); + runtime.compileFusionParallel(args); + auto cg_outputs = runtime.runWithInputs(args); + + auto t2_padded = at::pad(t0, {0, -1}); + auto ref_t2 = t1 * t2_padded; + + testValidate( + runtime.fusionSegments()->completeFusion(), + cg_outputs, + aten_inputs, + {ref_t2}, + __LINE__, + __FILE__); +} + } // namespace nvfuser From 8d30c205d5ea3751bdebdaeeca4a31a26268609e Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 18 Jul 2023 13:34:31 -0400 Subject: [PATCH 02/33] Set bcast consumer IDs with no bcast producers as origins --- csrc/device_lower/analysis/trivial_broadcast.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/csrc/device_lower/analysis/trivial_broadcast.cpp b/csrc/device_lower/analysis/trivial_broadcast.cpp index 6993699632b..cab1e8e15e4 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.cpp +++ b/csrc/device_lower/analysis/trivial_broadcast.cpp @@ -78,9 +78,6 @@ void ConcretizedBroadcastDomains::handle(Expr* expr) { producer_broadcasts.insert(producer_id); } } - if (producer_broadcasts.empty()) { - continue; - } for (auto consumer : ir_utils::filterByType(expr->outputs())) { auto p2c_map = @@ -95,6 +92,14 @@ void ConcretizedBroadcastDomains::handle(Expr* expr) { const bool is_concretized = !c_id->isBroadcast() && !c_id->isReduction(); auto it = broadcast_origin_map_.find(p_id); + if (it == broadcast_origin_map_.end()) { + // The consumer might be broadcast even though the producer is not, if + // this is a pad to size 1. In such case, we set the consumer as the + // origin of the broadcast. + broadcast_origin_map_.emplace( + c_id, std::unordered_set({c_id})); + continue; + } TORCH_INTERNAL_ASSERT( it != broadcast_origin_map_.end(), "Broadcast origin info not found for producer broadcast domain: ", From e62d69242261014a06b2bac933e7462ee1d94de3 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 18 Jul 2023 13:35:16 -0400 Subject: [PATCH 03/33] Only map root->rfactor IDs with same IterType in ca root dom map builder --- csrc/root_domain_map.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index 3173cf848f7..8e76be67662 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -1185,7 +1185,9 @@ void ComputeAtRootDomainMapBuilder::handle(TensorView* tv) { if (root_set.find(id) == root_set.end() || rf_id == id) { continue; } - setMaybeMapped(td, id, td, rf_id); + if (id->getIterType() == rf_id->getIterType()) { + setMaybeMapped(td, id, td, rf_id); + } } } // Once mappings for rfactor axes are propagated to root axes, From 18cc3d121c743e237a31b655dc17f31892da01b6 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 18 Jul 2023 13:45:12 -0400 Subject: [PATCH 04/33] Switch to more challenging static pad to broadcast test --- test/test_resize.cpp | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 8af96f149c5..5a715a2c8e0 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2269,7 +2269,6 @@ TEST_F(NVFuserTest, SliceVectorization) { // In this test, the sizes and pad widths are static, so there should be nothing // to concretize. TEST_F(NVFuserTest, ResizePadToBroadcastStatic_CUDA) { - /* std::vector t0_size = {2, 3, 2, 5, 6}; std::vector t1_size = {2, 4, 4, 3, 5}; // Note there are only 8 input scalars for 5D input. Implicit no-pad of dim 0 @@ -2291,7 +2290,7 @@ TEST_F(NVFuserTest, ResizePadToBroadcastStatic_CUDA) { IterType::Broadcast, IterType::Iteration, }; - */ + /* std::vector t0_size = {2, 3}; std::vector t1_size = {2, 4}; // Note there are only 8 input scalars for 5D input. Implicit no-pad of dim 0 @@ -2304,6 +2303,7 @@ TEST_F(NVFuserTest, ResizePadToBroadcastStatic_CUDA) { IterType::Iteration, IterType::Broadcast, }; + */ auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -2323,9 +2323,6 @@ TEST_F(NVFuserTest, ResizePadToBroadcastStatic_CUDA) { auto tv3 = mul(tv1, tv2); fusion->addOutput(tv3); - fusion->printMath(); - fusion->printTransforms(); - EXPECT_FALSE(fusion->hasDynamicTransform()); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -2351,9 +2348,6 @@ TEST_F(NVFuserTest, ResizePadToBroadcastStatic_CUDA) { auto t2_padded = at::pad(t0, pad_widths); auto ref_t2 = t1 * t2_padded; - std::cout << "ATen: " << ref_t2 << std::endl; - std::cout << "NVFuser: " << cg_outputs[0] << std::endl; - testValidate( concretized_fusion, cg_outputs, @@ -2397,9 +2391,6 @@ TEST_F(NVFuserTest, ResizePadToBroadcastDynamic_CUDA) { auto tv3 = mul(tv1, tv2); fusion->addOutput(tv3); - fusion->printMath(); - fusion->printTransforms(); - EXPECT_TRUE(fusion->hasDynamicTransform()); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -2433,9 +2424,6 @@ TEST_F(NVFuserTest, ResizePadToBroadcastDynamic_CUDA) { auto t2_padded = at::pad(t0, pad_widths); auto ref_t2 = t1 * t2_padded; - std::cout << "ATen: " << ref_t2 << std::endl; - std::cout << "NVFuser: " << cg_outputs[0] << std::endl; - testValidate( concretized_fusion, cg_outputs, @@ -2461,8 +2449,6 @@ TEST_F(NVFuserTest, ResizeIssue596_CUDA) { // Fusion is not dynamic EXPECT_FALSE(fusion->hasDynamicTransform()); - fusion->printMath(); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto t0 = at::randn({2}, options); From bed1358dd67082e8e7cee4653a743b298bcc3e2c Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 18 Jul 2023 13:51:20 -0400 Subject: [PATCH 05/33] Remove commented code in static test --- test/test_resize.cpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index a3c0349a452..b94e3b75fe8 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2291,20 +2291,6 @@ TEST_F(NVFuserTest, ResizePadToBroadcastStatic_CUDA) { IterType::Broadcast, IterType::Iteration, }; - /* - std::vector t0_size = {2, 3}; - std::vector t1_size = {2, 4}; - // Note there are only 8 input scalars for 5D input. Implicit no-pad of dim 0 - std::vector pad_widths = { - -1, - -1, // dim=1 pad to broadcast of second element - // dim=0 is implicit 0, 0 - }; - std::vector expected_itertypes = { - IterType::Iteration, - IterType::Broadcast, - }; - */ auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); From ea05f072b9dff9a35659b39e49491ab00c5cbb10 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 19 Jul 2023 15:35:05 -0400 Subject: [PATCH 06/33] Always skip mapping symbolic IterDomains that have different extents --- csrc/root_domain_map.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index 8e76be67662..e74104c35a4 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -132,6 +132,7 @@ std::unordered_map PairwiseRootDomainMap::map( // domains of torch_gather) // 3. Squeeze and unsqueeze // 4. Broadcast and non broadcast + // 5. Symbolic ID with different extent from other ID // Condition 1: when the producer ID is the dim of a select-like op if (producer_id == indexed_producer_id) { @@ -182,6 +183,16 @@ std::unordered_map PairwiseRootDomainMap::map( continue; } + // Condition 5 + // At least one ID is symbolic. + // Map producer to consumer if and only if their extents are identical + if ((producer_id->isSymbolic() || consumer_id->isSymbolic()) && + (!producer_id->extent()->sameAs(consumer_id->extent()))) { + itc++; + itp++; + continue; + } + IterDomain* map_key_id = producer_id; IterDomain* map_value_id = consumer_id; if (!producer_to_consumer) { From 3fcafcbcf18799f818a40c1b20edd89138640029 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 19 Jul 2023 15:35:31 -0400 Subject: [PATCH 07/33] Use maybeMutated id for isSymbolic check. This prevents us overwriting a concretization to Broadcast with a concretization to Iteration. --- csrc/dynamic_transform.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 6889e35ebf5..0d2144d72a1 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -614,6 +614,7 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // Update the IterType of each output for (auto out_id : ir_utils::filterByType(expr->outputs())) { + out_id = maybeMutated(out_id)->as(); if (!out_id->isSymbolic()) { continue; } @@ -721,12 +722,13 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( auto c2p = root_map.mapConsumerToProducer( consumer->domain(), producer->domain()); + auto p_it = c2p.find(root_id); TORCH_INTERNAL_ASSERT( - c2p.find(root_id) != c2p.end(), + p_it != c2p.end(), "No input ID found to map with output ID: ", root_id->toString()); - auto input_id = c2p.at(root_id); + auto input_id = p_it->second; TORCH_INTERNAL_ASSERT( input_id->getIterType() != IterType::Symbolic, "Producer ID not concretized: ", From e84ffe6d605130914b8ae9934a1d7811d4cac9d0 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 24 Jul 2023 09:42:19 -0400 Subject: [PATCH 08/33] Update comment for isSymbolic check in mutate(TV) --- csrc/dynamic_transform.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 28cf7e75225..9bfa92495cf 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -640,12 +640,12 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { for (auto out_id : ir_utils::filterByType(expr->outputs())) { out_id = maybeMutated(out_id)->as(); if (!out_id->isSymbolic()) { + // We are only concretizing IterType here, so if we have already + // concretized the iter_type for this ID, we can skip this. continue; } auto concretized_out_id = - IterDomainBuilder(maybeMutated(out_id)->as()) - .iter_type(iter_type) - .build(); + IterDomainBuilder(out_id).iter_type(iter_type).build(); registerConcretization(out_id, concretized_out_id); } From c6410b40e747bbc0a4b5a93a0263e1d81e25469d Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 24 Jul 2023 09:48:53 -0400 Subject: [PATCH 09/33] Add comment to #596 test --- test/test_resize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 2cdb01af130..3799a019148 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2416,6 +2416,7 @@ TEST_F(NVFuserTest, ResizePadToBroadcastDynamic_CUDA) { __FILE__); } +// See https://github.com/NVIDIA/Fuser/issues/596 TEST_F(NVFuserTest, ResizeIssue596_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); From b877331dc69f589ed86fae71fecf93e182e97676 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 24 Jul 2023 09:51:21 -0400 Subject: [PATCH 10/33] Expand comment about condition 5 --- csrc/root_domain_map.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index e1e5522b95d..ca1253138f9 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -185,7 +185,11 @@ std::unordered_map PairwiseRootDomainMap::map( // Condition 5 // At least one ID is symbolic. - // Map producer to consumer if and only if their extents are identical + // Map producer to consumer if and only if their extents are identical. + // IterType::Symbolic reflects that the extent might evaluate to 1 for some + // inputs, in which case it may be valid to use those domains in a broadcast + // op. If the extents are exactly the same between two aligned IterDomains, + // even if one is symbolic they are mapped. if ((producer_id->isSymbolic() || consumer_id->isSymbolic()) && (!producer_id->extent()->sameAs(consumer_id->extent()))) { itc++; From dc3a1562b76a6ec594537dc46ea5d1f01b12229a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 24 Jul 2023 09:53:22 -0400 Subject: [PATCH 11/33] Add comment to itertype check in ComputeAtRootDomainMapBuilder --- csrc/root_domain_map.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index ca1253138f9..29f3f26ae3e 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -1200,6 +1200,11 @@ void ComputeAtRootDomainMapBuilder::handle(TensorView* tv) { if (root_set.find(id) == root_set.end() || rf_id == id) { continue; } + // Usually, the itertypes between IterDomain expression inputs and + // outputs will match. However, it is possible for a Resize operation to + // take an Iteration input and reduce it to size 1, after which it + // becomes Broadcast. This check avoids mapping an Iteration and + // Broadcast domain in such a case. if (id->getIterType() == rf_id->getIterType()) { setMaybeMapped(td, id, td, rf_id); } From 6020cd6417be78a8dcd54f424412e99eada497f7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 24 Jul 2023 15:44:22 -0400 Subject: [PATCH 12/33] Map Symbolic with non-Broadcast in propagateFromP2C --- csrc/dynamic_transform.cpp | 24 +++++++++++++++--------- csrc/root_domain_map.cpp | 37 +++++++++++++++++++++++++++---------- csrc/root_domain_map.h | 13 +++++++++++++ 3 files changed, 55 insertions(+), 19 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 9bfa92495cf..ab0e203a899 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -728,6 +728,14 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( auto def = consumer->definition(); + std::vector> c2p_maps; + for (auto producer : ir_utils::filterByType(def->inputs())) { + PairwiseRootDomainMap root_map(producer, consumer); + root_map.mapSymbolicNonBroadcast(true); + c2p_maps.push_back( + root_map.mapConsumerToProducer(consumer->domain(), producer->domain())); + } + bool is_concretized = false; for (const auto i : c10::irange(root_domain.size())) { @@ -741,16 +749,14 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( std::optional id_type; - for (auto producer : ir_utils::filterByType(def->inputs())) { - PairwiseRootDomainMap root_map(producer, consumer); - auto c2p = root_map.mapConsumerToProducer( - consumer->domain(), producer->domain()); - + for (const auto& c2p : c2p_maps) { auto p_it = c2p.find(root_id); - TORCH_INTERNAL_ASSERT( - p_it != c2p.end(), - "No input ID found to map with output ID: ", - root_id->toString()); + if (p_it == c2p.end()) { + // We might not have mapped this ID to this producer, for example if the + // producer is a broadcast domain. In that case just skip this producer + // when determining the output id_type. + continue; + } auto input_id = p_it->second; TORCH_INTERNAL_ASSERT( diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index 29f3f26ae3e..01abf4ed604 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -185,16 +185,33 @@ std::unordered_map PairwiseRootDomainMap::map( // Condition 5 // At least one ID is symbolic. - // Map producer to consumer if and only if their extents are identical. - // IterType::Symbolic reflects that the extent might evaluate to 1 for some - // inputs, in which case it may be valid to use those domains in a broadcast - // op. If the extents are exactly the same between two aligned IterDomains, - // even if one is symbolic they are mapped. - if ((producer_id->isSymbolic() || consumer_id->isSymbolic()) && - (!producer_id->extent()->sameAs(consumer_id->extent()))) { - itc++; - itp++; - continue; + // If map_symbolic_ is true: + // Map these as long as one is not Broadcast. This is most + // useful for propagating from consumer to producer during concretization. + // In that case, a producer rfactor domain might be concretized to + // Iteration while the corresponding consumer ID is still Symbolic. We + // need this mode in order to find which consumer IDs to propagate the + // producer IterType to. + // If map_symbolic_ is false (default): + // Map these only if their extents are identical. IterType::Symbolic + // reflects that the extent might evaluate to 1 for some inputs, in which + // case it may be valid to use those domains in a broadcast op. If the + // extents are exactly the same between two aligned IterDomains, even if + // one is symbolic they are mapped. + auto ps = producer_id->isSymbolic(); + auto cs = consumer_id->isSymbolic(); + if (ps || cs) { + if (map_symbolic_non_bcast_) { + if (producer_id->isBroadcast() || consumer_id->isBroadcast()) { + itc++; + itp++; + continue; + } + } else if (!producer_id->extent()->sameAs(consumer_id->extent())) { + itc++; + itp++; + continue; + } } IterDomain* map_key_id = producer_id; diff --git a/csrc/root_domain_map.h b/csrc/root_domain_map.h index 75aca2a5b99..5b1c7efd18b 100644 --- a/csrc/root_domain_map.h +++ b/csrc/root_domain_map.h @@ -100,6 +100,15 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { return *this; } + //! If b is true: map symbolic domains with other IterDomains as long as the + //! other is not Broadcast, even if their extents don't match. If b is false + //! (default): map symbolic domains with other IterDomains only if their + //! extents match. + PairwiseRootDomainMap& mapSymbolicNonBroadcast(bool b) { + map_symbolic_non_bcast_ = b; + return *this; + } + PairwiseRootDomainMap& mapDifferentExtents(bool b) { map_different_extents_ = b; return *this; @@ -136,6 +145,10 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { //! Map broadcast and non-broadcast domains. Note that this is on by //! default bool map_broadcast_ = true; + //! Map symbolic domains with other IterDomains as long as the other is not + //! Broadcast, even if their extents don't match. Note that this is off by + //! default, in which case they are mapped only if their extents match. + bool map_symbolic_non_bcast_ = false; //! Map domains that may have different extents, e.g., torch_gather bool map_different_extents_ = false; //! Map domains that are indirectly accessed, e.g., index_select From 9b48a6f650daec61d01489cc10cd208cb2add69f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 24 Jul 2023 15:44:48 -0400 Subject: [PATCH 13/33] Rename test --- test/test_resize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 3799a019148..84b09dab31a 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2417,7 +2417,7 @@ TEST_F(NVFuserTest, ResizePadToBroadcastDynamic_CUDA) { } // See https://github.com/NVIDIA/Fuser/issues/596 -TEST_F(NVFuserTest, ResizeIssue596_CUDA) { +TEST_F(NVFuserTest, ResizePadToBroadcastIssue596_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); From 012e8782a38a4f5462a7cfc9d93b56ca376c0399 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 25 Jul 2023 08:18:53 -0400 Subject: [PATCH 14/33] Remove mapSymbolicNonBroadcast option. On always now --- csrc/dynamic_transform.cpp | 1 - csrc/root_domain_map.cpp | 19 +++++-------------- csrc/root_domain_map.h | 13 ------------- 3 files changed, 5 insertions(+), 28 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index ab0e203a899..1882870b8fc 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -731,7 +731,6 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( std::vector> c2p_maps; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); - root_map.mapSymbolicNonBroadcast(true); c2p_maps.push_back( root_map.mapConsumerToProducer(consumer->domain(), producer->domain())); } diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index 01abf4ed604..bb33033d636 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -198,20 +198,11 @@ std::unordered_map PairwiseRootDomainMap::map( // case it may be valid to use those domains in a broadcast op. If the // extents are exactly the same between two aligned IterDomains, even if // one is symbolic they are mapped. - auto ps = producer_id->isSymbolic(); - auto cs = consumer_id->isSymbolic(); - if (ps || cs) { - if (map_symbolic_non_bcast_) { - if (producer_id->isBroadcast() || consumer_id->isBroadcast()) { - itc++; - itp++; - continue; - } - } else if (!producer_id->extent()->sameAs(consumer_id->extent())) { - itc++; - itp++; - continue; - } + if ((producer_id->isSymbolic() || consumer_id->isBroadcast()) && + (producer_id->isBroadcast() || consumer_id->isSymbolic())) { + itc++; + itp++; + continue; } IterDomain* map_key_id = producer_id; diff --git a/csrc/root_domain_map.h b/csrc/root_domain_map.h index 5b1c7efd18b..75aca2a5b99 100644 --- a/csrc/root_domain_map.h +++ b/csrc/root_domain_map.h @@ -100,15 +100,6 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { return *this; } - //! If b is true: map symbolic domains with other IterDomains as long as the - //! other is not Broadcast, even if their extents don't match. If b is false - //! (default): map symbolic domains with other IterDomains only if their - //! extents match. - PairwiseRootDomainMap& mapSymbolicNonBroadcast(bool b) { - map_symbolic_non_bcast_ = b; - return *this; - } - PairwiseRootDomainMap& mapDifferentExtents(bool b) { map_different_extents_ = b; return *this; @@ -145,10 +136,6 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { //! Map broadcast and non-broadcast domains. Note that this is on by //! default bool map_broadcast_ = true; - //! Map symbolic domains with other IterDomains as long as the other is not - //! Broadcast, even if their extents don't match. Note that this is off by - //! default, in which case they are mapped only if their extents match. - bool map_symbolic_non_bcast_ = false; //! Map domains that may have different extents, e.g., torch_gather bool map_different_extents_ = false; //! Map domains that are indirectly accessed, e.g., index_select From 1aca3afb5d86a37e626aa458d41ec01f0f89327f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 25 Jul 2023 08:29:52 -0400 Subject: [PATCH 15/33] Revert "Remove mapSymbolicNonBroadcast option. On always now" This reverts commit 012e8782a38a4f5462a7cfc9d93b56ca376c0399. --- csrc/dynamic_transform.cpp | 1 + csrc/root_domain_map.cpp | 19 ++++++++++++++----- csrc/root_domain_map.h | 13 +++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 1882870b8fc..ab0e203a899 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -731,6 +731,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( std::vector> c2p_maps; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); + root_map.mapSymbolicNonBroadcast(true); c2p_maps.push_back( root_map.mapConsumerToProducer(consumer->domain(), producer->domain())); } diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index bb33033d636..01abf4ed604 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -198,11 +198,20 @@ std::unordered_map PairwiseRootDomainMap::map( // case it may be valid to use those domains in a broadcast op. If the // extents are exactly the same between two aligned IterDomains, even if // one is symbolic they are mapped. - if ((producer_id->isSymbolic() || consumer_id->isBroadcast()) && - (producer_id->isBroadcast() || consumer_id->isSymbolic())) { - itc++; - itp++; - continue; + auto ps = producer_id->isSymbolic(); + auto cs = consumer_id->isSymbolic(); + if (ps || cs) { + if (map_symbolic_non_bcast_) { + if (producer_id->isBroadcast() || consumer_id->isBroadcast()) { + itc++; + itp++; + continue; + } + } else if (!producer_id->extent()->sameAs(consumer_id->extent())) { + itc++; + itp++; + continue; + } } IterDomain* map_key_id = producer_id; diff --git a/csrc/root_domain_map.h b/csrc/root_domain_map.h index 75aca2a5b99..5b1c7efd18b 100644 --- a/csrc/root_domain_map.h +++ b/csrc/root_domain_map.h @@ -100,6 +100,15 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { return *this; } + //! If b is true: map symbolic domains with other IterDomains as long as the + //! other is not Broadcast, even if their extents don't match. If b is false + //! (default): map symbolic domains with other IterDomains only if their + //! extents match. + PairwiseRootDomainMap& mapSymbolicNonBroadcast(bool b) { + map_symbolic_non_bcast_ = b; + return *this; + } + PairwiseRootDomainMap& mapDifferentExtents(bool b) { map_different_extents_ = b; return *this; @@ -136,6 +145,10 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { //! Map broadcast and non-broadcast domains. Note that this is on by //! default bool map_broadcast_ = true; + //! Map symbolic domains with other IterDomains as long as the other is not + //! Broadcast, even if their extents don't match. Note that this is off by + //! default, in which case they are mapped only if their extents match. + bool map_symbolic_non_bcast_ = false; //! Map domains that may have different extents, e.g., torch_gather bool map_different_extents_ = false; //! Map domains that are indirectly accessed, e.g., index_select From eeff747ae9dff5712167214710ec1a8c7c0a9d9b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 25 Jul 2023 12:42:11 -0400 Subject: [PATCH 16/33] Update mapSymbolic option. Expose it to ExactRootDomainMap Use it in ExpressionEvaluator differently than in concretization propagateP2C. --- csrc/dynamic_transform.cpp | 7 +++++- csrc/expr_evaluator.cpp | 6 ++++- csrc/root_domain_map.cpp | 46 +++++++++++++++----------------------- csrc/root_domain_map.h | 14 ++++++------ 4 files changed, 36 insertions(+), 37 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index ab0e203a899..9979ef264fd 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -731,7 +731,12 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( std::vector> c2p_maps; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); - root_map.mapSymbolicNonBroadcast(true); + // We map symbolic domains here regardless of whether their extents match. + // This is safe because we are propagating from a producer which should have + // already been concretized. The consumer might have a different extent + // which will be equivalent to (but not necessarily sameAs) the producer's, + // and we just want to use its IterType to concretize the consumer ID. + root_map.mapSymbolic(true); c2p_maps.push_back( root_map.mapConsumerToProducer(consumer->domain(), producer->domain())); } diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index 1a4526ca774..56b7b055a3e 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -160,7 +160,11 @@ void ExpressionEvaluator::print() const { } void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) { - const auto mapped_sets = ExactRootDomainMap(fusion).getMappedSets(); + // We map Symbolic IterDomains here only if their extents match. This avoids + // mapping between symbolic domains that might concretize to an (Iteration, + // Broadcast) pair from a resolved broadcast. + const auto mapped_sets = + ExactRootDomainMap(fusion, /*map_symbolic*/ false).getMappedSets(); for (const auto& set : mapped_sets.disjointSets()) { int64_t known_size = -1; diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index 01abf4ed604..9cf313fcd6b 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -185,33 +185,20 @@ std::unordered_map PairwiseRootDomainMap::map( // Condition 5 // At least one ID is symbolic. - // If map_symbolic_ is true: - // Map these as long as one is not Broadcast. This is most - // useful for propagating from consumer to producer during concretization. - // In that case, a producer rfactor domain might be concretized to - // Iteration while the corresponding consumer ID is still Symbolic. We - // need this mode in order to find which consumer IDs to propagate the - // producer IterType to. - // If map_symbolic_ is false (default): + // If map_symbolic_ is true: map these IDs regardless of other + // considerations. If map_symbolic_ is false (default): // Map these only if their extents are identical. IterType::Symbolic // reflects that the extent might evaluate to 1 for some inputs, in which // case it may be valid to use those domains in a broadcast op. If the - // extents are exactly the same between two aligned IterDomains, even if - // one is symbolic they are mapped. - auto ps = producer_id->isSymbolic(); - auto cs = consumer_id->isSymbolic(); - if (ps || cs) { - if (map_symbolic_non_bcast_) { - if (producer_id->isBroadcast() || consumer_id->isBroadcast()) { - itc++; - itp++; - continue; - } - } else if (!producer_id->extent()->sameAs(consumer_id->extent())) { - itc++; - itp++; - continue; - } + // extents are exactly the same between two aligned IterDomains, the + // Symbolic one will be concretized to the same IterType as the other, so + // they should be mapped with one another. + if (!map_symbolic_ && + (producer_id->isSymbolic() || consumer_id->isSymbolic()) && + (!producer_id->extent()->sameAs(consumer_id->extent()))) { + itc++; + itp++; + continue; } IterDomain* map_key_id = producer_id; @@ -1266,8 +1253,9 @@ class ExactRootDomainMapBuilder : private IterVisitor { public: ExactRootDomainMapBuilder( Fusion* fusion, - DisjointSets& eq_sets) - : eq_sets_(eq_sets) { + DisjointSets& eq_sets, + bool map_symbolic) + : eq_sets_(eq_sets), map_symbolic_(map_symbolic) { traverseTo(fusion, fusion->outputs()); } @@ -1280,6 +1268,7 @@ class ExactRootDomainMapBuilder : private IterVisitor { ir_utils::filterByType(expr->outputs())) { PairwiseRootDomainMap pwise_map(producer, consumer); pwise_map.mapBroadcast(false); + pwise_map.mapSymbolic(map_symbolic_); const auto mappings = pwise_map.mapProducerToConsumer( producer->domain(), consumer->domain()); for (const auto& mapping : mappings) { @@ -1291,12 +1280,13 @@ class ExactRootDomainMapBuilder : private IterVisitor { private: DisjointSets& eq_sets_; + bool map_symbolic_ = false; }; } // namespace -ExactRootDomainMap::ExactRootDomainMap(Fusion* fusion) { - ExactRootDomainMapBuilder builder(fusion, eq_sets_); +ExactRootDomainMap::ExactRootDomainMap(Fusion* fusion, bool map_symbolic) { + ExactRootDomainMapBuilder builder(fusion, eq_sets_, map_symbolic); } bool ExactRootDomainMap::areMapped( diff --git a/csrc/root_domain_map.h b/csrc/root_domain_map.h index 5b1c7efd18b..24156743cbd 100644 --- a/csrc/root_domain_map.h +++ b/csrc/root_domain_map.h @@ -104,8 +104,8 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { //! other is not Broadcast, even if their extents don't match. If b is false //! (default): map symbolic domains with other IterDomains only if their //! extents match. - PairwiseRootDomainMap& mapSymbolicNonBroadcast(bool b) { - map_symbolic_non_bcast_ = b; + PairwiseRootDomainMap& mapSymbolic(bool b) { + map_symbolic_ = b; return *this; } @@ -145,10 +145,10 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { //! Map broadcast and non-broadcast domains. Note that this is on by //! default bool map_broadcast_ = true; - //! Map symbolic domains with other IterDomains as long as the other is not - //! Broadcast, even if their extents don't match. Note that this is off by - //! default, in which case they are mapped only if their extents match. - bool map_symbolic_non_bcast_ = false; + //! Map symbolic domains with other IterDomains, even if their extents don't + //! match. Note that this is off by default, in which case they are mapped + //! only if their extents match. + bool map_symbolic_ = false; //! Map domains that may have different extents, e.g., torch_gather bool map_different_extents_ = false; //! Map domains that are indirectly accessed, e.g., index_select @@ -543,7 +543,7 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder //! domains with non-broadcast domains. class TORCH_CUDA_CU_API ExactRootDomainMap : public RootDomainMap { public: - ExactRootDomainMap(Fusion* fusion); + ExactRootDomainMap(Fusion* fusion, bool map_symbolic = false); bool areMapped(const IterDomain* id_a, const IterDomain* id_b) const; From 3e974ba9f0ddc7bd508633c1a358b650d607e641 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 25 Jul 2023 12:47:42 -0400 Subject: [PATCH 17/33] Fix DynamicTransform4_CUDA, add long comment --- test/test_dynamic_transform.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index fcd3b59f9b9..24c1f668b1a 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -250,6 +250,13 @@ TEST_F(NVFuserTest, DynamicTransform4_CUDA) { for (const auto i : c10::irange(after_shape.size())) { expr_eval.bind(tv2->axis((int)i)->extent(), after_shape.at(i)); + // We must bind tv1's extents, since they cannot be inferred until after + // concretization. Because tv2 is a dynamic reshape both its IterDomains + // are Symbolic, which means both of tv3's IterDomains are also Symbolic. + // tv1 has both IterDomains of type Iteration, but it since we add tv3 to + // it to get tv4, we do not know whether this will resolve broadcasts from + // tv3 or not until concretization. + expr_eval.bind(tv1->axis((int)i)->extent(), after_shape.at(i)); } auto initial_info = DynamicTransform::getInitialInfo(&fusion); From dbb7157f189a9d3d01cbab0c2cfa9775b07f9ec2 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 25 Jul 2023 12:53:30 -0400 Subject: [PATCH 18/33] Fix DynamicTransform3_CUDA --- test/test_dynamic_transform.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 24c1f668b1a..229f2b9098e 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -186,6 +186,11 @@ TEST_F(NVFuserTest, DynamicTransform3_CUDA) { expr_eval.bind(tv0->axis(1)->extent(), shape_before.at(1)); expr_eval.bind(tv1->axis(0)->extent(), shape_after.at(0)); expr_eval.bind(tv1->axis(1)->extent(), shape_after.at(1)); + // We cannot infer reshape_shape0 and reshape_shape1 from tv0's and tv1's + // extents alone, since either of these reshaped extents could either match + // that of tv1 or be 1, resulting in a broadcast. + expr_eval.bind(reshape_shape0, shape_after.at(0)); + expr_eval.bind(reshape_shape1, shape_after.at(1)); auto initial_info = DynamicTransform::getInitialInfo(&fusion); auto info = DynamicTransformConcretizationInfo(&initial_info, &expr_eval); From a5f254386b409e5ce9670ab945229aa382145579 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 25 Jul 2023 12:58:30 -0400 Subject: [PATCH 19/33] Fix DynamicTransform1_CUDA --- test/test_dynamic_transform.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 229f2b9098e..2aabecb9aad 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -62,6 +62,10 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { expr_eval.bind(tv0->axis(1)->extent(), 3L); expr_eval.bind(reshape_shape0, 3L); expr_eval.bind(reshape_shape1, 4L); + // We cannot infer the shape of tv1 from the above bound values, since + // either axis of tv2 might be broadcast against one from tv1. + expr_eval.bind(tv1->axis(0)->extent(), 3L); + expr_eval.bind(tv1->axis(1)->extent(), 4L); auto initial_info = DynamicTransform::getInitialInfo(&fusion); auto info = DynamicTransformConcretizationInfo(&initial_info, &expr_eval); From 84f9b69816db528df8259fb59466671d40102a78 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 25 Jul 2023 13:44:53 -0400 Subject: [PATCH 20/33] Update doxygen comment for mapSymbolic --- csrc/root_domain_map.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/csrc/root_domain_map.h b/csrc/root_domain_map.h index 24156743cbd..338d8c9c32b 100644 --- a/csrc/root_domain_map.h +++ b/csrc/root_domain_map.h @@ -100,10 +100,9 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { return *this; } - //! If b is true: map symbolic domains with other IterDomains as long as the - //! other is not Broadcast, even if their extents don't match. If b is false - //! (default): map symbolic domains with other IterDomains only if their - //! extents match. + //! If b is true: map symbolic domains with other IterDomains even if their + //! extents don't match. If b is false (default): map symbolic domains with + //! other IterDomains only if their extents match. PairwiseRootDomainMap& mapSymbolic(bool b) { map_symbolic_ = b; return *this; From 7ff24e6ed9a515df6fe93766fd90a5342bec253f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 13 Sep 2023 12:21:34 -0400 Subject: [PATCH 21/33] Register concretization from unmutated ID in root->rfactor --- csrc/dynamic_transform.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index b304dcdef63..e60d73e81a8 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -678,8 +678,8 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { } // Update the IterType of each output for (auto out_id : ir_utils::filterByType(expr->outputs())) { - out_id = maybeMutated(out_id)->as(); - if (!out_id->isSymbolic()) { + auto mut_id = maybeMutated(out_id)->as(); + if (!mut_id->isSymbolic()) { // We are only concretizing IterType here, so if we have already // concretized the iter_type for this ID, we can skip this. continue; @@ -693,7 +693,7 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { expr->toString()); auto concretized_out_id = - IterDomainBuilder(out_id).iter_type(iter_type).build(); + IterDomainBuilder(mut_id).iter_type(iter_type).build(); registerConcretization(out_id, concretized_out_id); } From 1fe841a271a6c8a73f2e3b68aa704c5e0f220ed1 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 13 Sep 2023 12:26:51 -0400 Subject: [PATCH 22/33] Clean up comment in condition 5 --- csrc/root_domain_map.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index 9855f481358..cc28dd90d08 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -185,8 +185,11 @@ std::unordered_map PairwiseRootDomainMap::map( // Condition 5 // At least one ID is symbolic. - // If map_symbolic_ is true: map these IDs regardless of other - // considerations. If map_symbolic_ is false (default): + // + // If map_symbolic_ is true: + // Map these IDs regardless of other considerations. + // + // If map_symbolic_ is false (default): // Map these only if their extents are identical. IterType::Symbolic // reflects that the extent might evaluate to 1 for some inputs, in which // case it may be valid to use those domains in a broadcast op. If the From 4cd57bfb9a12d15a045327e8aeac8ed54c302973 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 08:42:44 -0400 Subject: [PATCH 23/33] Remove map_symbolic from ExactRootDomainMap --- csrc/root_domain_map.cpp | 11 ++++------- csrc/root_domain_map.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/csrc/root_domain_map.cpp b/csrc/root_domain_map.cpp index cc28dd90d08..a0bab3cf35e 100644 --- a/csrc/root_domain_map.cpp +++ b/csrc/root_domain_map.cpp @@ -1256,9 +1256,8 @@ class ExactRootDomainMapBuilder : private IterVisitor { public: ExactRootDomainMapBuilder( Fusion* fusion, - DisjointSets& eq_sets, - bool map_symbolic) - : eq_sets_(eq_sets), map_symbolic_(map_symbolic) { + DisjointSets& eq_sets) + : eq_sets_(eq_sets) { traverseTo(fusion, fusion->outputs()); } @@ -1271,7 +1270,6 @@ class ExactRootDomainMapBuilder : private IterVisitor { ir_utils::filterByType(expr->outputs())) { PairwiseRootDomainMap pwise_map(producer, consumer); pwise_map.mapBroadcast(false); - pwise_map.mapSymbolic(map_symbolic_); const auto mappings = pwise_map.mapProducerToConsumer( producer->domain(), consumer->domain()); for (const auto& mapping : mappings) { @@ -1283,13 +1281,12 @@ class ExactRootDomainMapBuilder : private IterVisitor { private: DisjointSets& eq_sets_; - bool map_symbolic_ = false; }; } // namespace -ExactRootDomainMap::ExactRootDomainMap(Fusion* fusion, bool map_symbolic) { - ExactRootDomainMapBuilder builder(fusion, eq_sets_, map_symbolic); +ExactRootDomainMap::ExactRootDomainMap(Fusion* fusion) { + ExactRootDomainMapBuilder builder(fusion, eq_sets_); } bool ExactRootDomainMap::areMapped( diff --git a/csrc/root_domain_map.h b/csrc/root_domain_map.h index 2a042c9e2fe..a2a9a37d8d7 100644 --- a/csrc/root_domain_map.h +++ b/csrc/root_domain_map.h @@ -543,7 +543,7 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder //! domains with non-broadcast domains. class TORCH_CUDA_CU_API ExactRootDomainMap : public RootDomainMap { public: - ExactRootDomainMap(Fusion* fusion, bool map_symbolic = false); + ExactRootDomainMap(Fusion* fusion); bool areMapped(const IterDomain* id_a, const IterDomain* id_b) const; From 25e27c147057a039139d7687ce327126efaa8735 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 08:47:16 -0400 Subject: [PATCH 24/33] Asset instead of ignoring missing c2p mapping --- csrc/dynamic_transform.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index e60d73e81a8..3016ebcc682 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -795,6 +795,10 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( auto def = consumer->definition(); + // We will loop over IterDomains in the consumer root. For each, we need to + // inspect the consumer to producer map to all producers. Instead of + // recomputing these for each root IterDomain, we precompute them for each + // producer here then re-use them in the following loop. std::vector> c2p_maps; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); @@ -823,13 +827,10 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( for (const auto& c2p : c2p_maps) { auto p_it = c2p.find(root_id); - if (p_it == c2p.end()) { - // We might not have mapped this ID to this producer, for example if the - // producer is a broadcast domain. In that case just skip this producer - // when determining the output id_type. - continue; - } - + NVF_ERROR( + p_it != c2p.end(), + "No input ID found to map with output ID: ", + root_id->toString()); auto input_id = p_it->second; NVF_ERROR( input_id == maybeMutated(input_id), From 15be1854625c79103e174b4c936862b489fe6cf2 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 08:52:50 -0400 Subject: [PATCH 25/33] Clean up --- csrc/expr_evaluator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index b9a1c6c4522..e17f8da8956 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -241,7 +241,7 @@ void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) { // mapping between symbolic domains that might concretize to an (Iteration, // Broadcast) pair from a resolved broadcast. const auto mapped_sets = - ExactRootDomainMap(fusion, /*map_symbolic*/ false).getMappedSets(); + ExactRootDomainMap(fusion).getMappedSets(); for (const auto& set : mapped_sets.disjointSets()) { int64_t known_size = -1; From 0f7952fe1e1ed845ce56481d38e2e7cfe231e156 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 10:28:03 -0400 Subject: [PATCH 26/33] clang-format --- csrc/expr_evaluator.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp index e17f8da8956..676c869fd28 100644 --- a/csrc/expr_evaluator.cpp +++ b/csrc/expr_evaluator.cpp @@ -240,8 +240,7 @@ void ExpressionEvaluator::propagateBoundValuesThroughExactMaps(Fusion* fusion) { // We map Symbolic IterDomains here only if their extents match. This avoids // mapping between symbolic domains that might concretize to an (Iteration, // Broadcast) pair from a resolved broadcast. - const auto mapped_sets = - ExactRootDomainMap(fusion).getMappedSets(); + const auto mapped_sets = ExactRootDomainMap(fusion).getMappedSets(); for (const auto& set : mapped_sets.disjointSets()) { int64_t known_size = -1; From 166ff69a401e4372e87e605e072a660f38fb2551 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 15:47:34 -0400 Subject: [PATCH 27/33] Handle TensorView instead of BroadcastOp --- .../analysis/trivial_broadcast.cpp | 31 ++++++++++--------- .../device_lower/analysis/trivial_broadcast.h | 2 +- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/csrc/device_lower/analysis/trivial_broadcast.cpp b/csrc/device_lower/analysis/trivial_broadcast.cpp index daa0a7d338c..eddfe6316ba 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.cpp +++ b/csrc/device_lower/analysis/trivial_broadcast.cpp @@ -52,14 +52,16 @@ std::unordered_set ConcretizedBroadcastDomains:: return {}; } -void ConcretizedBroadcastDomains::handle(BroadcastOp* bop) { - // Create a new entry for each of new broadcast domains - auto out = bop->out()->as(); - for (const auto i : c10::irange(out->getRootDomain().size())) { - if (bop->getBroadcastDimFlags().at(i)) { - auto new_bcast_id = out->getRootDomain().at(i); +void ConcretizedBroadcastDomains::handle(TensorView* tv) { + if (!tv->hasRFactor()) { + return; + } + for (auto id : tv->getMaybeRFactorDomain()) { + // broadcast rfactor domains with definitions are not root domains. We + // register these as new broadcast origins. + if (id->isBroadcast() && id->definition() != nullptr) { broadcast_origin_map_.emplace( - new_bcast_id, std::unordered_set({new_bcast_id})); + id, std::unordered_set({id})); } } } @@ -78,6 +80,9 @@ void ConcretizedBroadcastDomains::dispatch(Expr* expr) { producer_broadcasts.insert(producer_id); } } + if (producer_broadcasts.empty()) { + continue; + } for (auto consumer : ir_utils::filterByType(expr->outputs())) { auto p2c_map = @@ -87,15 +92,11 @@ void ConcretizedBroadcastDomains::dispatch(Expr* expr) { for (const auto& kv : p2c_map) { auto p_id = kv.first; auto c_id = kv.second; - // If the consumer ID is a reduction (i.e., a trivial - // reduction), do not consider it's concretized. - const bool is_concretized = - !c_id->isBroadcast() && !c_id->isReduction(); auto it = broadcast_origin_map_.find(p_id); if (it == broadcast_origin_map_.end()) { // The consumer might be broadcast even though the producer is not, if - // this is a pad to size 1. In such case, we set the consumer as the - // origin of the broadcast. + // this is a pad or slice to size 1. In such case, we set the consumer + // as the origin of the broadcast. broadcast_origin_map_.emplace( c_id, std::unordered_set({c_id})); continue; @@ -107,7 +108,9 @@ void ConcretizedBroadcastDomains::dispatch(Expr* expr) { " of ", producer->toString()); const auto& producer_origins = it->second; - if (is_concretized) { + // If the consumer ID is a reduction (i.e., a trivial + // reduction), do not consider it's concretized. + if (!c_id->isBroadcast() && !c_id->isReduction()) { // Keep track of all the origin domains as concretized for (auto origin : producer_origins) { markAsConcretized(origin, c_id); diff --git a/csrc/device_lower/analysis/trivial_broadcast.h b/csrc/device_lower/analysis/trivial_broadcast.h index a9a3598ed72..c389c8ee028 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.h +++ b/csrc/device_lower/analysis/trivial_broadcast.h @@ -46,7 +46,7 @@ class ConcretizedBroadcastDomains : private IterVisitor { private: using IterVisitor::handle; - void handle(BroadcastOp* bop) final; + void handle(TensorView* tv) final; void dispatch(Expr* expr) final; From 480dee3a0ea86bce17e747355fe52b79662a8096 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 16:06:25 -0400 Subject: [PATCH 28/33] Check directly that rfactor bcast is not root --- csrc/device_lower/analysis/trivial_broadcast.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/csrc/device_lower/analysis/trivial_broadcast.cpp b/csrc/device_lower/analysis/trivial_broadcast.cpp index eddfe6316ba..627a8ef76b2 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.cpp +++ b/csrc/device_lower/analysis/trivial_broadcast.cpp @@ -57,11 +57,12 @@ void ConcretizedBroadcastDomains::handle(TensorView* tv) { return; } for (auto id : tv->getMaybeRFactorDomain()) { - // broadcast rfactor domains with definitions are not root domains. We - // register these as new broadcast origins. - if (id->isBroadcast() && id->definition() != nullptr) { - broadcast_origin_map_.emplace( - id, std::unordered_set({id})); + // Register broadcast rfactor domains that are not root domains as new + // broadcast origins. + if (id->isBroadcast() && + std::find(tv->getRootDomain().begin(), tv->getRootDomain().end(), id) == + tv->getRootDomain().end()) { + broadcast_origin_map_.emplace(id, std::unordered_set({id})); } } } From 8cc4076c5352609e156010d22ec45525c6d278db Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 16:09:47 -0400 Subject: [PATCH 29/33] Remove check for missing entry in origin map --- csrc/device_lower/analysis/trivial_broadcast.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/csrc/device_lower/analysis/trivial_broadcast.cpp b/csrc/device_lower/analysis/trivial_broadcast.cpp index 627a8ef76b2..6d390361439 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.cpp +++ b/csrc/device_lower/analysis/trivial_broadcast.cpp @@ -94,14 +94,6 @@ void ConcretizedBroadcastDomains::dispatch(Expr* expr) { auto p_id = kv.first; auto c_id = kv.second; auto it = broadcast_origin_map_.find(p_id); - if (it == broadcast_origin_map_.end()) { - // The consumer might be broadcast even though the producer is not, if - // this is a pad or slice to size 1. In such case, we set the consumer - // as the origin of the broadcast. - broadcast_origin_map_.emplace( - c_id, std::unordered_set({c_id})); - continue; - } NVF_ERROR( it != broadcast_origin_map_.end(), "Broadcast origin info not found for producer broadcast domain: ", From d8fce7b74219ff1d5aadfb0354060c95402384ea Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 16:21:16 -0400 Subject: [PATCH 30/33] Restore handle(BroadcastOp*) --- csrc/device_lower/analysis/trivial_broadcast.cpp | 12 ++++++++++++ csrc/device_lower/analysis/trivial_broadcast.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/csrc/device_lower/analysis/trivial_broadcast.cpp b/csrc/device_lower/analysis/trivial_broadcast.cpp index 6d390361439..76ceea984d4 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.cpp +++ b/csrc/device_lower/analysis/trivial_broadcast.cpp @@ -67,6 +67,18 @@ void ConcretizedBroadcastDomains::handle(TensorView* tv) { } } +void ConcretizedBroadcastDomains::handle(BroadcastOp* bop) { + // Create a new entry for each of new broadcast domains + auto out = bop->out()->as(); + for (const auto i : c10::irange(out->getRootDomain().size())) { + if (bop->getBroadcastDimFlags().at(i)) { + auto new_bcast_id = out->getRootDomain().at(i); + broadcast_origin_map_.emplace( + new_bcast_id, std::unordered_set({new_bcast_id})); + } + } +} + void ConcretizedBroadcastDomains::dispatch(Expr* expr) { IterVisitor::dispatch(expr); diff --git a/csrc/device_lower/analysis/trivial_broadcast.h b/csrc/device_lower/analysis/trivial_broadcast.h index c389c8ee028..a952df7aaff 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.h +++ b/csrc/device_lower/analysis/trivial_broadcast.h @@ -48,6 +48,8 @@ class ConcretizedBroadcastDomains : private IterVisitor { void handle(TensorView* tv) final; + void handle(BroadcastOp* bop) final; + void dispatch(Expr* expr) final; void markAsConcretized( From f43a4b5b6dab469a4904eba42b3a1f6f0ee5a63f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 16:22:59 -0400 Subject: [PATCH 31/33] Undo trivial prior change to minimize diff --- csrc/device_lower/analysis/trivial_broadcast.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/csrc/device_lower/analysis/trivial_broadcast.cpp b/csrc/device_lower/analysis/trivial_broadcast.cpp index 76ceea984d4..8e9c133a175 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.cpp +++ b/csrc/device_lower/analysis/trivial_broadcast.cpp @@ -105,6 +105,10 @@ void ConcretizedBroadcastDomains::dispatch(Expr* expr) { for (const auto& kv : p2c_map) { auto p_id = kv.first; auto c_id = kv.second; + // If the consumer ID is a reduction (i.e., a trivial + // reduction), do not consider it's concretized. + const bool is_concretized = + !c_id->isBroadcast() && !c_id->isReduction(); auto it = broadcast_origin_map_.find(p_id); NVF_ERROR( it != broadcast_origin_map_.end(), @@ -113,9 +117,7 @@ void ConcretizedBroadcastDomains::dispatch(Expr* expr) { " of ", producer->toString()); const auto& producer_origins = it->second; - // If the consumer ID is a reduction (i.e., a trivial - // reduction), do not consider it's concretized. - if (!c_id->isBroadcast() && !c_id->isReduction()) { + if (is_concretized) { // Keep track of all the origin domains as concretized for (auto origin : producer_origins) { markAsConcretized(origin, c_id); From 3da217b3632f03c0afdf3898a5331e6d635ea415 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 25 Sep 2023 11:01:49 -0400 Subject: [PATCH 32/33] Add comments in trivial_broadcast.cpp --- csrc/device_lower/analysis/trivial_broadcast.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/csrc/device_lower/analysis/trivial_broadcast.cpp b/csrc/device_lower/analysis/trivial_broadcast.cpp index 8e9c133a175..34ae04df99e 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.cpp +++ b/csrc/device_lower/analysis/trivial_broadcast.cpp @@ -52,6 +52,12 @@ std::unordered_set ConcretizedBroadcastDomains:: return {}; } +// In some cases an op like pad or slice will introduce a broadcast domain by +// truncating a longer dimension or expanding an empty dimension to size 1. In +// these cases tv will have an RFactor Broadcast IterDomains that are not +// present in the root domain. Contrast this case with BroadcastOp, whose output +// does not have RFactor domains and instead places new broadcast domains in the +// output root domain. void ConcretizedBroadcastDomains::handle(TensorView* tv) { if (!tv->hasRFactor()) { return; @@ -67,6 +73,9 @@ void ConcretizedBroadcastDomains::handle(TensorView* tv) { } } +// Most broadcasts are handled with this method, since Broadcast domains are +// usually introduced through a BroadcastOp. Others are handled by the +// handle(TensorView*) method. void ConcretizedBroadcastDomains::handle(BroadcastOp* bop) { // Create a new entry for each of new broadcast domains auto out = bop->out()->as(); From b30622dfbd6cabfe5d9d26dbc2a2a83225b3b72a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 25 Sep 2023 11:03:39 -0400 Subject: [PATCH 33/33] Typos --- csrc/device_lower/analysis/trivial_broadcast.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/csrc/device_lower/analysis/trivial_broadcast.cpp b/csrc/device_lower/analysis/trivial_broadcast.cpp index 34ae04df99e..37729c63739 100644 --- a/csrc/device_lower/analysis/trivial_broadcast.cpp +++ b/csrc/device_lower/analysis/trivial_broadcast.cpp @@ -54,10 +54,10 @@ std::unordered_set ConcretizedBroadcastDomains:: // In some cases an op like pad or slice will introduce a broadcast domain by // truncating a longer dimension or expanding an empty dimension to size 1. In -// these cases tv will have an RFactor Broadcast IterDomains that are not -// present in the root domain. Contrast this case with BroadcastOp, whose output -// does not have RFactor domains and instead places new broadcast domains in the -// output root domain. +// these cases tv will have RFactor Broadcast IterDomains that are not present +// in the root domain. Contrast this with BroadcastOp, whose output does not +// have RFactor domains and instead places new broadcast domains in the output +// root domain. void ConcretizedBroadcastDomains::handle(TensorView* tv) { if (!tv->hasRFactor()) { return;