From 757f685494dcbf0d468e812fad0621f6b55d3481 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 30 May 2023 10:56:52 -0400 Subject: [PATCH 01/22] Add C++ repro for #418 --- test/test_dynamic_transform.cpp | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index dc0201d0833..1b9003f5c3b 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1001,4 +1001,51 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { reductionDynamicPadAddFusion(invocations); } +// Repro of https://github.com/NVIDIA/Fuser/issues/418 +TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto s0 = IrBuilder::create(); + fusion->addInput(s0); + + auto v00 = tv0->axis(0)->extent(); + auto v01 = tv0->axis(1)->extent(); + auto v02 = tv0->axis(2)->extent(); + auto v03 = tv0->axis(3)->extent(); + + auto tv1 = reshape(tv0, {v00, div(v01, s0), s0, v02, v03}); + auto vm = variance_mean(tv1, {2, 3, 4}, 0, true); + fusion->addOutput(vm.mean); + fusion->addOutput(vm.var); + + // tv1 has symbolic axes as reshape is dynamic + TORCH_CHECK( + tv1->domain()->hasSymbolicAxis(), + "Expected to have symbolic axes: ", + tv1->toString()); + + FusionExecutorCache executor_cache(std::move(fusion)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + { // trivial reshape + auto t0 = at::randn({256, 128, 28, 28}, options); + std::vector inputs = {t0, 32}; + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); + auto t0_resh = t0.reshape({256, 4, 32, 28, 28}); + auto mu = t0_resh.mean({2, 3, 4}); + auto v = t0_resh.var({2, 3, 4}); + testValidate( + executor_cache.fusion(), + cg_outputs, + inputs, + {mu, v}, + __LINE__, + __FILE__); + } +} + } // namespace nvfuser From 03ac754e40d711c781d6daa249165a87b9f8fafb Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 30 May 2023 10:58:22 -0400 Subject: [PATCH 02/22] Process all outputs of each Expr in concretization --- csrc/dynamic_transform.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index f838e07a1aa..a31ca3d9edd 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -404,8 +404,15 @@ void DynamicTransformConcretizer::concretize() { // Finally, propagate concretized domains auto all_stmts = StmtSort::getStmts(info_.fusion(), true); for (auto stmt : all_stmts) { - if (stmt->isA()) { + if (stmt->isA() && !stmt->asVal()->definition()) { + // vals with definitions will be processed as exprs along with _all_ + // outputs mutate(stmt); + } else if (stmt->isA()) { + // concretize all outputs of each expression that will be evaluated + for (auto o : stmt->as()->outputs()) { + mutate(o); + } } } } From 36c70c5bd62f3388b51c5b8c4a13440a207a8a3f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 31 May 2023 19:31:02 -0400 Subject: [PATCH 03/22] Add fusion_ir_dynamic debug dump option --- csrc/kernel_cache.cpp | 5 +++++ csrc/utils.cpp | 1 + csrc/utils.h | 1 + 3 files changed, 7 insertions(+) diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index 6cc6dc37ed3..9abbd35ed7b 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -554,6 +554,11 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( } } + if (isDebugDumpEnabled(DebugDumpOption::FusionIrDynamic)) { + std::cout << "Dynamic Fusion (before concretization):" << std::endl; + fusion()->printMath(); + } + // Compute or get cached initial concretization info auto& initial_info = initialInfo(); diff --git a/csrc/utils.cpp b/csrc/utils.cpp index 644f9337196..4c35f247cd7 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -109,6 +109,7 @@ auto parseDebugDumpOptions() { {"fusion_ir", DebugDumpOption::FusionIr}, {"fusion_ir_math", DebugDumpOption::FusionIrMath}, {"fusion_ir_presched", DebugDumpOption::FusionIrPresched}, + {"fusion_ir_dynamic", DebugDumpOption::FusionIrDynamic}, {"fusion_ir_concretized", DebugDumpOption::FusionIrConcretized}, {"kernel_ir", DebugDumpOption::KernelIr}, {"ca_map", DebugDumpOption::ComputeAtMap}, diff --git a/csrc/utils.h b/csrc/utils.h index e5a0171f9f0..83362f51acc 100644 --- a/csrc/utils.h +++ b/csrc/utils.h @@ -55,6 +55,7 @@ enum class DebugDumpOption { FusionIr, //!< Dump the Fusion IR before lowering FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR FusionIrPresched, //!< Dump the Fusion IR before it is scheduled. + FusionIrDynamic, //!< Dump the Fusion IR before concretization FusionIrConcretized, //!< Dump the Fusion IR after concretization KernelIr, //!< Dump the compiler Kernel IR ComputeAtMap, //!< Dump the computeAt map From 9727388594eea9371422faa0899a113156e4e89e Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 31 May 2023 20:09:52 -0400 Subject: [PATCH 04/22] Propagate extent from P2C during concretization --- csrc/dynamic_transform.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index a31ca3d9edd..b6d3dc14aad 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -645,6 +645,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( // corresponding producer IDs std::optional id_type; + Val* extent = nullptr; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); @@ -667,6 +668,11 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( } else { id_type = input_id->getIterType(); } + + // Set extent expression based on producer, overwriting that of consumer + if (!extent) { + extent = input_id->extent(); + } } TORCH_INTERNAL_ASSERT( @@ -684,7 +690,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( consumer->toString()); auto concretized_id = - IterDomainBuilder(root_id).iter_type(*id_type).build(); + IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build(); registerConcretization(root_id, concretized_id); is_concretized = true; From 031f327f90136160a8faf226d33092fb53685939 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 1 Jun 2023 07:51:34 -0400 Subject: [PATCH 05/22] Revert "Propagate extent from P2C during concretization" This reverts commit 9727388594eea9371422faa0899a113156e4e89e. --- csrc/dynamic_transform.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index b6d3dc14aad..a31ca3d9edd 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -645,7 +645,6 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( // corresponding producer IDs std::optional id_type; - Val* extent = nullptr; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); @@ -668,11 +667,6 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( } else { id_type = input_id->getIterType(); } - - // Set extent expression based on producer, overwriting that of consumer - if (!extent) { - extent = input_id->extent(); - } } TORCH_INTERNAL_ASSERT( @@ -690,7 +684,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( consumer->toString()); auto concretized_id = - IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build(); + IterDomainBuilder(root_id).iter_type(*id_type).build(); registerConcretization(root_id, concretized_id); is_concretized = true; From 3724a004657183a657228614ea02156ba6fc4900 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 1 Jun 2023 11:28:57 -0400 Subject: [PATCH 06/22] Match concretized reshape extents to desired extents. This is still failing due to segmentation not being able to infer output shapes. --- csrc/dynamic_transform.cpp | 198 +++++++++++++++++++++++++++++++--- csrc/ir/internal_base_nodes.h | 3 +- csrc/ir/nodes.cpp | 18 ++-- csrc/kernel_cache.cpp | 2 + 4 files changed, 200 insertions(+), 21 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index a31ca3d9edd..1db9ab0ea82 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -417,6 +418,166 @@ void DynamicTransformConcretizer::concretize() { } } +//! Given a concrete rfactor domain and a symbolic rfactor domain, create a new +//! rfactor domain with the same history as the concrete one, but with the +//! extent expressions of the symbolic IterDomains. +std::vector matchReshapeRFactorExtents( + const std::vector& old_rfactor, + const std::vector& desired_extents_rfactor) { + TORCH_INTERNAL_ASSERT( + old_rfactor.size() == desired_extents_rfactor.size(), + "New RFactor length does not match symbolic RFactor length: found ", + old_rfactor.size(), + " but expected ", + desired_extents_rfactor.size()); + + std::cout << "desired_extents_rfactor:" << std::endl; + for (auto id : desired_extents_rfactor) { + std::cout << id->toString() << " has extent " << id->extent()->toString() + << std::endl; + } + std::cout << std::endl; + + std::vector new_rfactor(old_rfactor.size(), nullptr); + + // Note: we do not use c10::irange here so that we can advance i if we see a + // Split + for (size_t i = 0; i < old_rfactor.size(); i++) { + auto old_id = old_rfactor.at(i); + // create new Split or Merge expression + auto def = old_id->definition(); + if (def) { + if (def->isA()) { + auto spl = def->as(); + // For Split, we will replace both this rfactor ID and the next at once + // Note: I think this is safe since we don't have any broadcasts at this + // point. + auto desired_outer_extent = desired_extents_rfactor.at(i)->extent(); + auto new_outer = + IterDomainBuilder(old_id).extent(desired_outer_extent).build(); + std::cout << "desired_extent = " << desired_outer_extent->toString() + << std::endl; + new_rfactor.at(i) = new_outer; + i++; + TORCH_INTERNAL_ASSERT( + i < old_rfactor.size(), "Found outer split output without inner"); + auto desired_inner_extent = desired_extents_rfactor.at(i)->extent(); + std::cout << "desired_inner_extent = " + << desired_inner_extent->toString() << std::endl; + auto old_inner = old_rfactor.at(i); + auto new_inner = + IterDomainBuilder(old_inner).extent(desired_inner_extent).build(); + new_rfactor.at(i) = new_inner; + std::cout << "new_outer = " << new_outer->toString() + << " new_inner = " << new_inner->toString() << std::endl; + IrBuilder::create( + new_outer, + new_inner, + spl->in(), + spl->factor(), + spl->innerSplit(), + spl->startOffset(), + spl->stopOffset()); + // If we do not remove the original expression, there will be multiple + // uses of the same root ID, which will cause an error in + // BestEffortReplay. So we remove the old expression here. + spl->fusion()->removeExpr(spl); + } else if (def->isA()) { + auto m = def->as(); + auto desired_extent = desired_extents_rfactor.at(i)->extent(); + auto new_id = IterDomainBuilder(old_id).extent(desired_extent).build(); + new_rfactor.at(i) = new_id; + IrBuilder::create(new_id, m->outer(), m->inner()); + m->fusion()->removeExpr(m); // see above + } else { + TORCH_INTERNAL_ASSERT( + false, + "Unhandled definition for reshape rfactor ID ", + old_id->toString(), + ". Definition is ", + def->toString()); + } + } else { + new_rfactor.at(i) = old_id; + } + } + return new_rfactor; +} + +//! Given a concretized reshape (concrete_tv) and its symbolic stand-in +//! (symbolic_tv), generate a new TensorView having the same concretization as +//! concrete_tv with the same final rfactor extents as symbolic_tv. +//! +//! Currently, reshape works by applying squeeze, split/merge, then broadcasts +//! (in that order). Each of these three phases produces a new TensorView. We +//! re-use the squeezed TV, then replace the applyViewTransforms TV and replay +//! the broadcast. +TensorView* matchSymbolicReshapeExtents( + TensorView* concrete_tv, + TensorView* symbolic_tv) { + TORCH_INTERNAL_ASSERT( + concrete_tv->getMaybeRFactorDomain().size() == + symbolic_tv->getMaybeRFactorDomain().size(), + "Mismatch between concrete and symbolic rfactor domain sizes"); + // We will skip axes in the symbolic rfactor domain matching broadcast axes + // in the concrete rfactor for now. + std::vector symbolic_rfactor_nobcast; + for (auto i : c10::irange(concrete_tv->getMaybeRFactorDomain().size())) { + if (concrete_tv->getMaybeRFactorDomain().at(i)->isBroadcast()) { + continue; + } + symbolic_rfactor_nobcast.push_back( + symbolic_tv->getMaybeRFactorDomain().at(i)); + } + + // First determine if there is a broadcast + auto def = concrete_tv->definition(); + TORCH_INTERNAL_ASSERT( + def, "matchSymbolicReshapeExtents input has no definition."); + BroadcastOp* bcast = nullptr; + if (def->isA()) { + // There were broadcasts. Move upstream but hold onto the op so we can + // replay it later + bcast = def->as(); + concrete_tv = bcast->in()->as(); + def = concrete_tv->definition(); + TORCH_INTERNAL_ASSERT( + def, "Reshape's internal broadcast has no definition"); + } + TORCH_INTERNAL_ASSERT( + def->isA(), + "matchSymbolicReshapeExtents expects a ViewOp output"); + + // We can now ignore output broadcast axes and just replace the rfactor + // extents in the ViewOp output + auto new_rfactor = matchReshapeRFactorExtents( + concrete_tv->getMaybeRFactorDomain(), symbolic_rfactor_nobcast); + // NOTE: we avoid the validate_domain_equivalence check when creating this + // TensorDomain since we cannot easily prove that the new rfactor will match + // the root. However, this should be guaranteed by analyzeView in this case. + auto new_td = IrBuilder::create( + concrete_tv->getRootDomain(), + new_rfactor, + new_rfactor, + concrete_tv->domain()->contiguity(), + /*validate_domain_equivalence*/ false); + auto new_tv = IrBuilder::create( + concrete_tv->container(), new_td, concrete_tv->getDataType().value()); + + // Create a new ViewOp linking the input to this new output + IrBuilder::create( + concrete_tv->container(), + new_tv, + concrete_tv->definition()->input(0)->as()); + + // Replay broadcast + if (bcast) { + new_tv = broadcast(new_tv, bcast->getBroadcastDimFlags()); + } + + return new_tv; +} + void DynamicTransformConcretizer::concretizeReshape() { // Concretize each reshape op. for (const auto& kv : info_.getReshapeTransforms()) { @@ -427,19 +588,29 @@ void DynamicTransformConcretizer::concretizeReshape() { auto concrete_reshape_out_tv = reshape(inp_tv, view_analysis); + // The above call to reshape uses static sizes and recomputes the output + // extents based on the decomposed Split and Merge operations. This means + // that it will produce a TensorView with different extent expressions + // (though equal for the given inputs) than what the user requested in the + // original symbolic reshape invocation. This can cause subtle issues as + // it introduces a mismatch with downstream extents; e.g. + // https://github.com/NVIDIA/Fuser/issues/418 + // Here, we take the output above and replace the rfactor extents with the + // original ones in order to prevent such mismatches. + auto new_tv = + matchSymbolicReshapeExtents(concrete_reshape_out_tv, incomplete_out_tv); + // We do the replacement directly here, but we must still check that the // replacement is valid - checkConcretizedUses(incomplete_out_tv, concrete_reshape_out_tv); + checkConcretizedUses(incomplete_out_tv, new_tv); // Replace the old tensor with the new concretized tensor for (auto use_of_old_tv : incomplete_out_tv->uses()) { - ir_utils::replaceValInExpr( - use_of_old_tv, incomplete_out_tv, concrete_reshape_out_tv); + ir_utils::replaceValInExpr(use_of_old_tv, incomplete_out_tv, new_tv); } if (incomplete_out_tv->isFusionOutput()) { - incomplete_out_tv->fusion()->replaceOutput( - incomplete_out_tv, concrete_reshape_out_tv); + incomplete_out_tv->fusion()->replaceOutput(incomplete_out_tv, new_tv); } incomplete_out_tv->fusion()->removeVal(incomplete_out_tv); @@ -486,9 +657,9 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { propagateFromProducerToConsumer(tv); // If no root domain is altered by producer, we don't need to propagate back - // up to rfactor. We could return early, but instead we go ahead and check the - // root to rfactor transforms to be sure we have concretized any intermediate - // IterDomains. + // up to rfactor. We could return early, but instead we go ahead and check + // the root to rfactor transforms to be sure we have concretized any + // intermediate IterDomains. // At this point, there should be no expr beyond rfactor root TORCH_INTERNAL_ASSERT( @@ -525,16 +696,17 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // padded with constant pad widths (1, 1), in which case although we do // not know the exact extent of the output, we know it is at least as // large as the sum of the pad widths, 2. In such cases, the output - // IterDomain is concrete at definition, since if the extent is >1 we know - // the IterType is Iteration. In these cases, we must continue to + // IterDomain is concrete at definition, since if the extent is >1 we + // know the IterType is Iteration. In these cases, we must continue to // concretize intermediate expressions between the root and R-factor // domain. See test DynamicTransform5_CUDA which demonstrates this // behavior. // NOTE: We also do not assume that if one output ID is symbolic, that // they all must be. See test FusionSliceForNanoGPT3_CUDA for an example - // that does a static split by a factor of 16 of a symbolic input domain. - // The static split in that case results in a concrete IterDomain with - // extent 16 along with a symbolic one (extent ceilDiv(n / 16)). + // that does a static split by a factor of 16 of a symbolic input + // domain. The static split in that case results in a concrete + // IterDomain with extent 16 along with a symbolic one (extent ceilDiv(n + // / 16)). // Determine the output IterType IterType iter_type = IterType::Symbolic; diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h index b13caaa9dba..4f4b1e7da10 100644 --- a/csrc/ir/internal_base_nodes.h +++ b/csrc/ir/internal_base_nodes.h @@ -444,7 +444,8 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { std::vector root_domain, std::vector rfactor_domain, std::vector leaf_domain, - std::vector> contiguity = {}); + std::vector> contiguity = {}, + bool validate_domain_equivalence = true); TensorDomain( IrBuilderPasskey, diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index 3fb3ebaca3e..802757b374e 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -2726,7 +2726,8 @@ TensorDomain::TensorDomain( std::vector root_domain, std::vector rfactor_domain, std::vector leaf_domain, - std::vector> contiguity) + std::vector> contiguity, + bool validate_domain_equivalence) : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), rfactor_domain_(std::move(rfactor_domain)), @@ -2736,12 +2737,15 @@ TensorDomain::TensorDomain( : std::move(contiguity)) { validateContiguity(maybeAllocation(), contiguity_); - if (!root_domain_.empty()) { - TORCH_CHECK(!leaf_domain_.empty(), "Root domain is not empty but leaf is"); - ir_utils::validateDomainEquivalence(root_domain_, leaf_domain_); - if (!rfactor_domain_.empty()) { - ir_utils::validateDomainEquivalence(root_domain_, rfactor_domain_); - ir_utils::validateDomainEquivalence(rfactor_domain_, leaf_domain_); + if (validate_domain_equivalence) { + if (!root_domain_.empty()) { + TORCH_CHECK( + !leaf_domain_.empty(), "Root domain is not empty but leaf is"); + ir_utils::validateDomainEquivalence(root_domain_, leaf_domain_); + if (!rfactor_domain_.empty()) { + ir_utils::validateDomainEquivalence(root_domain_, rfactor_domain_); + ir_utils::validateDomainEquivalence(rfactor_domain_, leaf_domain_); + } } } diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index 9abbd35ed7b..a9c7670e2a5 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -557,6 +557,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( if (isDebugDumpEnabled(DebugDumpOption::FusionIrDynamic)) { std::cout << "Dynamic Fusion (before concretization):" << std::endl; fusion()->printMath(); + fusion()->printTransforms(); } // Compute or get cached initial concretization info @@ -640,6 +641,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) { std::cout << "Concretized Fusion:" << std::endl; fusion->printMath(); + fusion->printTransforms(); } kernel_runtimes.emplace_back(std::make_unique( std::move(fusion), args, forced_index_type)); From 72b29eefd737eac4805091dfaed048a088282aa9 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 1 Jun 2023 11:40:11 -0400 Subject: [PATCH 07/22] Remove debugging print statements --- csrc/dynamic_transform.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 1db9ab0ea82..258f82c46ce 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -431,13 +431,6 @@ std::vector matchReshapeRFactorExtents( " but expected ", desired_extents_rfactor.size()); - std::cout << "desired_extents_rfactor:" << std::endl; - for (auto id : desired_extents_rfactor) { - std::cout << id->toString() << " has extent " << id->extent()->toString() - << std::endl; - } - std::cout << std::endl; - std::vector new_rfactor(old_rfactor.size(), nullptr); // Note: we do not use c10::irange here so that we can advance i if we see a @@ -455,21 +448,15 @@ std::vector matchReshapeRFactorExtents( auto desired_outer_extent = desired_extents_rfactor.at(i)->extent(); auto new_outer = IterDomainBuilder(old_id).extent(desired_outer_extent).build(); - std::cout << "desired_extent = " << desired_outer_extent->toString() - << std::endl; new_rfactor.at(i) = new_outer; i++; TORCH_INTERNAL_ASSERT( i < old_rfactor.size(), "Found outer split output without inner"); auto desired_inner_extent = desired_extents_rfactor.at(i)->extent(); - std::cout << "desired_inner_extent = " - << desired_inner_extent->toString() << std::endl; auto old_inner = old_rfactor.at(i); auto new_inner = IterDomainBuilder(old_inner).extent(desired_inner_extent).build(); new_rfactor.at(i) = new_inner; - std::cout << "new_outer = " << new_outer->toString() - << " new_inner = " << new_inner->toString() << std::endl; IrBuilder::create( new_outer, new_inner, From 78ad7e262f6855cbd4e5ffddf0f8236c8d77d47c Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 1 Jun 2023 11:42:01 -0400 Subject: [PATCH 08/22] Add output extents as ViewOp inputs --- csrc/ir/nodes.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index 802757b374e..6ac62e46220 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -1963,6 +1963,14 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(ViewAsScalar) ViewOp::ViewOp(IrBuilderPasskey passkey, Val* out, Val* in) : Expr(passkey) { addOutput(out); addInput(in); + // Note: we also add the output extents as INPUTS here. This helps to ensure + // that we include necessary scalars during segmentation to determine outputs. + // Without this, the user may pass expressions for the desired shape that are + // not available in a given segment. See + // https://github.com/NVIDIA/Fuser/issues/418 + for (auto id : out->as()->getMaybeRFactorDomain()) { + addInput(id->extent()); + } } std::string ViewOp::toString(int indent_size) const { From 788f500ab894848d54fecaf9c67fac1aa7fb25e0 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 1 Jun 2023 12:57:39 -0400 Subject: [PATCH 09/22] Remove concrete_reshape_out_tv after replacing --- csrc/dynamic_transform.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 258f82c46ce..53a27a7b3f2 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -584,8 +584,14 @@ void DynamicTransformConcretizer::concretizeReshape() { // https://github.com/NVIDIA/Fuser/issues/418 // Here, we take the output above and replace the rfactor extents with the // original ones in order to prevent such mismatches. - auto new_tv = - matchSymbolicReshapeExtents(concrete_reshape_out_tv, incomplete_out_tv); + TensorView* new_tv = concrete_reshape_out_tv; + if (!new_tv->sameAs(inp_tv)) { + new_tv = matchSymbolicReshapeExtents(new_tv, incomplete_out_tv); + // Remove the concrete TV after we've replaced its extents and generated a + // new TV + new_tv->fusion()->removeVal(concrete_reshape_out_tv); + concrete_reshape_out_tv = nullptr; + } // We do the replacement directly here, but we must still check that the // replacement is valid From bf9aa1e5f6ccfa825d784085beb7715b1b7983d8 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 1 Jun 2023 12:58:00 -0400 Subject: [PATCH 10/22] Find expr outputs before calling StmtSort again --- csrc/dynamic_transform.cpp | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 53a27a7b3f2..2f44759cd51 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -403,17 +403,36 @@ void DynamicTransformConcretizer::concretize() { concretizeResize(); // Finally, propagate concretized domains - auto all_stmts = StmtSort::getStmts(info_.fusion(), true); + + // We need to concretize all immediate outputs of all intermediate + // expressions; even those leading to dead code branches. To do this, we + // insert all outputs from all intermediate expressions to "leaves". If we + // don't insert anything new, we are done. Otherwise, we traverse again using + // these as outputs as well, which ensures the output is sorted. + auto leaves = info_.fusion()->getTerminatingOutputs(); + auto leaves_set = + std::unordered_set(leaves.begin(), leaves.end()); + std::vector all_stmts; + bool inserted = true; + while (inserted) { + all_stmts = StmtSort::getStmts(info_.fusion(), leaves, true); + inserted = false; + for (auto stmt : all_stmts) { + if (stmt->isExpr()) { + for (auto o : stmt->as()->outputs()) { + if (leaves_set.find(o) == leaves_set.end()) { + leaves.push_back(o); + leaves_set.insert(o); + inserted = true; + } + } + } + } + } + // Concretize all vals in the final vector for (auto stmt : all_stmts) { - if (stmt->isA() && !stmt->asVal()->definition()) { - // vals with definitions will be processed as exprs along with _all_ - // outputs + if (stmt->isVal()) { mutate(stmt); - } else if (stmt->isA()) { - // concretize all outputs of each expression that will be evaluated - for (auto o : stmt->as()->outputs()) { - mutate(o); - } } } } From 4f05ad50aa3f1e3cfc8c03680d167481dfc1c44f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 1 Jun 2023 15:48:05 -0400 Subject: [PATCH 11/22] Revert to d675161 but keep bf9aa1e5 --- csrc/dynamic_transform.cpp | 199 ++++------------------------------ csrc/ir/internal_base_nodes.h | 3 +- csrc/ir/nodes.cpp | 26 ++--- csrc/kernel_cache.cpp | 2 - 4 files changed, 28 insertions(+), 202 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 2f44759cd51..a34fac679e3 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -437,153 +436,6 @@ void DynamicTransformConcretizer::concretize() { } } -//! Given a concrete rfactor domain and a symbolic rfactor domain, create a new -//! rfactor domain with the same history as the concrete one, but with the -//! extent expressions of the symbolic IterDomains. -std::vector matchReshapeRFactorExtents( - const std::vector& old_rfactor, - const std::vector& desired_extents_rfactor) { - TORCH_INTERNAL_ASSERT( - old_rfactor.size() == desired_extents_rfactor.size(), - "New RFactor length does not match symbolic RFactor length: found ", - old_rfactor.size(), - " but expected ", - desired_extents_rfactor.size()); - - std::vector new_rfactor(old_rfactor.size(), nullptr); - - // Note: we do not use c10::irange here so that we can advance i if we see a - // Split - for (size_t i = 0; i < old_rfactor.size(); i++) { - auto old_id = old_rfactor.at(i); - // create new Split or Merge expression - auto def = old_id->definition(); - if (def) { - if (def->isA()) { - auto spl = def->as(); - // For Split, we will replace both this rfactor ID and the next at once - // Note: I think this is safe since we don't have any broadcasts at this - // point. - auto desired_outer_extent = desired_extents_rfactor.at(i)->extent(); - auto new_outer = - IterDomainBuilder(old_id).extent(desired_outer_extent).build(); - new_rfactor.at(i) = new_outer; - i++; - TORCH_INTERNAL_ASSERT( - i < old_rfactor.size(), "Found outer split output without inner"); - auto desired_inner_extent = desired_extents_rfactor.at(i)->extent(); - auto old_inner = old_rfactor.at(i); - auto new_inner = - IterDomainBuilder(old_inner).extent(desired_inner_extent).build(); - new_rfactor.at(i) = new_inner; - IrBuilder::create( - new_outer, - new_inner, - spl->in(), - spl->factor(), - spl->innerSplit(), - spl->startOffset(), - spl->stopOffset()); - // If we do not remove the original expression, there will be multiple - // uses of the same root ID, which will cause an error in - // BestEffortReplay. So we remove the old expression here. - spl->fusion()->removeExpr(spl); - } else if (def->isA()) { - auto m = def->as(); - auto desired_extent = desired_extents_rfactor.at(i)->extent(); - auto new_id = IterDomainBuilder(old_id).extent(desired_extent).build(); - new_rfactor.at(i) = new_id; - IrBuilder::create(new_id, m->outer(), m->inner()); - m->fusion()->removeExpr(m); // see above - } else { - TORCH_INTERNAL_ASSERT( - false, - "Unhandled definition for reshape rfactor ID ", - old_id->toString(), - ". Definition is ", - def->toString()); - } - } else { - new_rfactor.at(i) = old_id; - } - } - return new_rfactor; -} - -//! Given a concretized reshape (concrete_tv) and its symbolic stand-in -//! (symbolic_tv), generate a new TensorView having the same concretization as -//! concrete_tv with the same final rfactor extents as symbolic_tv. -//! -//! Currently, reshape works by applying squeeze, split/merge, then broadcasts -//! (in that order). Each of these three phases produces a new TensorView. We -//! re-use the squeezed TV, then replace the applyViewTransforms TV and replay -//! the broadcast. -TensorView* matchSymbolicReshapeExtents( - TensorView* concrete_tv, - TensorView* symbolic_tv) { - TORCH_INTERNAL_ASSERT( - concrete_tv->getMaybeRFactorDomain().size() == - symbolic_tv->getMaybeRFactorDomain().size(), - "Mismatch between concrete and symbolic rfactor domain sizes"); - // We will skip axes in the symbolic rfactor domain matching broadcast axes - // in the concrete rfactor for now. - std::vector symbolic_rfactor_nobcast; - for (auto i : c10::irange(concrete_tv->getMaybeRFactorDomain().size())) { - if (concrete_tv->getMaybeRFactorDomain().at(i)->isBroadcast()) { - continue; - } - symbolic_rfactor_nobcast.push_back( - symbolic_tv->getMaybeRFactorDomain().at(i)); - } - - // First determine if there is a broadcast - auto def = concrete_tv->definition(); - TORCH_INTERNAL_ASSERT( - def, "matchSymbolicReshapeExtents input has no definition."); - BroadcastOp* bcast = nullptr; - if (def->isA()) { - // There were broadcasts. Move upstream but hold onto the op so we can - // replay it later - bcast = def->as(); - concrete_tv = bcast->in()->as(); - def = concrete_tv->definition(); - TORCH_INTERNAL_ASSERT( - def, "Reshape's internal broadcast has no definition"); - } - TORCH_INTERNAL_ASSERT( - def->isA(), - "matchSymbolicReshapeExtents expects a ViewOp output"); - - // We can now ignore output broadcast axes and just replace the rfactor - // extents in the ViewOp output - auto new_rfactor = matchReshapeRFactorExtents( - concrete_tv->getMaybeRFactorDomain(), symbolic_rfactor_nobcast); - // NOTE: we avoid the validate_domain_equivalence check when creating this - // TensorDomain since we cannot easily prove that the new rfactor will match - // the root. However, this should be guaranteed by analyzeView in this case. - auto new_td = IrBuilder::create( - concrete_tv->getRootDomain(), - new_rfactor, - new_rfactor, - concrete_tv->domain()->contiguity(), - /*validate_domain_equivalence*/ false); - auto new_tv = IrBuilder::create( - concrete_tv->container(), new_td, concrete_tv->getDataType().value()); - - // Create a new ViewOp linking the input to this new output - IrBuilder::create( - concrete_tv->container(), - new_tv, - concrete_tv->definition()->input(0)->as()); - - // Replay broadcast - if (bcast) { - new_tv = broadcast(new_tv, bcast->getBroadcastDimFlags()); - } - - return new_tv; -} - void DynamicTransformConcretizer::concretizeReshape() { // Concretize each reshape op. for (const auto& kv : info_.getReshapeTransforms()) { @@ -594,35 +446,19 @@ void DynamicTransformConcretizer::concretizeReshape() { auto concrete_reshape_out_tv = reshape(inp_tv, view_analysis); - // The above call to reshape uses static sizes and recomputes the output - // extents based on the decomposed Split and Merge operations. This means - // that it will produce a TensorView with different extent expressions - // (though equal for the given inputs) than what the user requested in the - // original symbolic reshape invocation. This can cause subtle issues as - // it introduces a mismatch with downstream extents; e.g. - // https://github.com/NVIDIA/Fuser/issues/418 - // Here, we take the output above and replace the rfactor extents with the - // original ones in order to prevent such mismatches. - TensorView* new_tv = concrete_reshape_out_tv; - if (!new_tv->sameAs(inp_tv)) { - new_tv = matchSymbolicReshapeExtents(new_tv, incomplete_out_tv); - // Remove the concrete TV after we've replaced its extents and generated a - // new TV - new_tv->fusion()->removeVal(concrete_reshape_out_tv); - concrete_reshape_out_tv = nullptr; - } - // We do the replacement directly here, but we must still check that the // replacement is valid - checkConcretizedUses(incomplete_out_tv, new_tv); + checkConcretizedUses(incomplete_out_tv, concrete_reshape_out_tv); // Replace the old tensor with the new concretized tensor for (auto use_of_old_tv : incomplete_out_tv->uses()) { - ir_utils::replaceValInExpr(use_of_old_tv, incomplete_out_tv, new_tv); + ir_utils::replaceValInExpr( + use_of_old_tv, incomplete_out_tv, concrete_reshape_out_tv); } if (incomplete_out_tv->isFusionOutput()) { - incomplete_out_tv->fusion()->replaceOutput(incomplete_out_tv, new_tv); + incomplete_out_tv->fusion()->replaceOutput( + incomplete_out_tv, concrete_reshape_out_tv); } incomplete_out_tv->fusion()->removeVal(incomplete_out_tv); @@ -669,9 +505,9 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { propagateFromProducerToConsumer(tv); // If no root domain is altered by producer, we don't need to propagate back - // up to rfactor. We could return early, but instead we go ahead and check - // the root to rfactor transforms to be sure we have concretized any - // intermediate IterDomains. + // up to rfactor. We could return early, but instead we go ahead and check the + // root to rfactor transforms to be sure we have concretized any intermediate + // IterDomains. // At this point, there should be no expr beyond rfactor root TORCH_INTERNAL_ASSERT( @@ -708,17 +544,16 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // padded with constant pad widths (1, 1), in which case although we do // not know the exact extent of the output, we know it is at least as // large as the sum of the pad widths, 2. In such cases, the output - // IterDomain is concrete at definition, since if the extent is >1 we - // know the IterType is Iteration. In these cases, we must continue to + // IterDomain is concrete at definition, since if the extent is >1 we know + // the IterType is Iteration. In these cases, we must continue to // concretize intermediate expressions between the root and R-factor // domain. See test DynamicTransform5_CUDA which demonstrates this // behavior. // NOTE: We also do not assume that if one output ID is symbolic, that // they all must be. See test FusionSliceForNanoGPT3_CUDA for an example - // that does a static split by a factor of 16 of a symbolic input - // domain. The static split in that case results in a concrete - // IterDomain with extent 16 along with a symbolic one (extent ceilDiv(n - // / 16)). + // that does a static split by a factor of 16 of a symbolic input domain. + // The static split in that case results in a concrete IterDomain with + // extent 16 along with a symbolic one (extent ceilDiv(n / 16)). // Determine the output IterType IterType iter_type = IterType::Symbolic; @@ -829,6 +664,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( // corresponding producer IDs std::optional id_type; + Val* extent = nullptr; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); @@ -851,6 +687,11 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( } else { id_type = input_id->getIterType(); } + + // Set extent expression based on producer, overwriting that of consumer + if (!extent) { + extent = input_id->extent(); + } } TORCH_INTERNAL_ASSERT( @@ -868,7 +709,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( consumer->toString()); auto concretized_id = - IterDomainBuilder(root_id).iter_type(*id_type).build(); + IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build(); registerConcretization(root_id, concretized_id); is_concretized = true; diff --git a/csrc/ir/internal_base_nodes.h b/csrc/ir/internal_base_nodes.h index 4f4b1e7da10..b13caaa9dba 100644 --- a/csrc/ir/internal_base_nodes.h +++ b/csrc/ir/internal_base_nodes.h @@ -444,8 +444,7 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { std::vector root_domain, std::vector rfactor_domain, std::vector leaf_domain, - std::vector> contiguity = {}, - bool validate_domain_equivalence = true); + std::vector> contiguity = {}); TensorDomain( IrBuilderPasskey, diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index 6ac62e46220..3fb3ebaca3e 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -1963,14 +1963,6 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(ViewAsScalar) ViewOp::ViewOp(IrBuilderPasskey passkey, Val* out, Val* in) : Expr(passkey) { addOutput(out); addInput(in); - // Note: we also add the output extents as INPUTS here. This helps to ensure - // that we include necessary scalars during segmentation to determine outputs. - // Without this, the user may pass expressions for the desired shape that are - // not available in a given segment. See - // https://github.com/NVIDIA/Fuser/issues/418 - for (auto id : out->as()->getMaybeRFactorDomain()) { - addInput(id->extent()); - } } std::string ViewOp::toString(int indent_size) const { @@ -2734,8 +2726,7 @@ TensorDomain::TensorDomain( std::vector root_domain, std::vector rfactor_domain, std::vector leaf_domain, - std::vector> contiguity, - bool validate_domain_equivalence) + std::vector> contiguity) : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), rfactor_domain_(std::move(rfactor_domain)), @@ -2745,15 +2736,12 @@ TensorDomain::TensorDomain( : std::move(contiguity)) { validateContiguity(maybeAllocation(), contiguity_); - if (validate_domain_equivalence) { - if (!root_domain_.empty()) { - TORCH_CHECK( - !leaf_domain_.empty(), "Root domain is not empty but leaf is"); - ir_utils::validateDomainEquivalence(root_domain_, leaf_domain_); - if (!rfactor_domain_.empty()) { - ir_utils::validateDomainEquivalence(root_domain_, rfactor_domain_); - ir_utils::validateDomainEquivalence(rfactor_domain_, leaf_domain_); - } + if (!root_domain_.empty()) { + TORCH_CHECK(!leaf_domain_.empty(), "Root domain is not empty but leaf is"); + ir_utils::validateDomainEquivalence(root_domain_, leaf_domain_); + if (!rfactor_domain_.empty()) { + ir_utils::validateDomainEquivalence(root_domain_, rfactor_domain_); + ir_utils::validateDomainEquivalence(rfactor_domain_, leaf_domain_); } } diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index a9c7670e2a5..9abbd35ed7b 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -557,7 +557,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( if (isDebugDumpEnabled(DebugDumpOption::FusionIrDynamic)) { std::cout << "Dynamic Fusion (before concretization):" << std::endl; fusion()->printMath(); - fusion()->printTransforms(); } // Compute or get cached initial concretization info @@ -641,7 +640,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) { std::cout << "Concretized Fusion:" << std::endl; fusion->printMath(); - fusion->printTransforms(); } kernel_runtimes.emplace_back(std::make_unique( std::move(fusion), args, forced_index_type)); From 8160c6fa8b2e802c749701921c103e188263166b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 1 Jun 2023 15:49:23 -0400 Subject: [PATCH 12/22] Fix DynamicTransformIssue418_CUDA ATen keepdim args --- test/test_dynamic_transform.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 1b9003f5c3b..ea257f81558 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1036,8 +1036,8 @@ TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) { std::vector inputs = {t0, 32}; auto cg_outputs = executor_cache.runFusionWithInputs(inputs); auto t0_resh = t0.reshape({256, 4, 32, 28, 28}); - auto mu = t0_resh.mean({2, 3, 4}); - auto v = t0_resh.var({2, 3, 4}); + auto mu = t0_resh.mean({2, 3, 4}, true); + auto v = t0_resh.var({2, 3, 4}, true, true); testValidate( executor_cache.fusion(), cg_outputs, From 83062b878571d3d696c5b79b0c9a46127442f755 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 2 Jun 2023 10:10:38 -0400 Subject: [PATCH 13/22] Add failing full groupnorm test --- test/test_dynamic_transform.cpp | 51 +++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index ea257f81558..17d70028771 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1048,4 +1048,55 @@ TEST_F(NVFuserTest, DynamicTransformIssue418_CUDA) { } } +// Repro of https://github.com/NVIDIA/Fuser/issues/418 (full GroupNorm example) +TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto weight = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto bias = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto s0 = IrBuilder::create(); + fusion->addInput(s0); + + auto v00 = tv0->axis(0)->extent(); + auto v01 = tv0->axis(1)->extent(); + auto v02 = tv0->axis(2)->extent(); + auto v03 = tv0->axis(3)->extent(); + + auto tv1 = reshape(tv0, {v00, div(v01, s0), s0, v02, v03}); + auto vm = variance_mean(tv1, {2, 3, 4}, 0, true); + auto eps = IrBuilder::create(1e-5); + auto tv2 = mul(sub(tv1, vm.mean), rsqrt(add(vm.var, eps))); + auto tv3 = reshape(tv2, {v00, v01, v02, v03}); + auto tv4 = add(mul(tv3, weight), bias); + fusion->addOutput(tv4); + + // tv1 has symbolic axes as reshape is dynamic + TORCH_CHECK( + tv1->domain()->hasSymbolicAxis(), + "Expected to have symbolic axes: ", + tv1->toString()); + + FusionExecutorCache executor_cache(std::move(fusion)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({256, 128, 28, 28}, options); + auto w = at::randn({1, 128, 1, 1}, options); + auto b = at::randn({1, 128, 1, 1}, options); + std::vector inputs = {t0, w, b, 32}; + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); + auto t0_resh = t0.reshape({256, 4, 32, 28, 28}); + auto mu = t0_resh.mean({2, 3, 4}, true); + auto v = t0_resh.var({2, 3, 4}, true, true); + auto gn = + ((t0_resh - mu) * (v + 1e-5).rsqrt()).reshape({256, 128, 28, 28}) * w + b; + testValidate( + executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__); +} + } // namespace nvfuser From 82e7ee8f7f78a9eefef3e328e2c4ee05eea8424c Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 2 Jun 2023 10:12:47 -0400 Subject: [PATCH 14/22] Fix bug in failing test --- test/test_dynamic_transform.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 17d70028771..c54cd1b5543 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1056,9 +1056,9 @@ TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) { auto tv0 = makeSymbolicTensor(4); fusion->addInput(tv0); auto weight = makeSymbolicTensor(4); - fusion->addInput(tv0); + fusion->addInput(weight); auto bias = makeSymbolicTensor(4); - fusion->addInput(tv0); + fusion->addInput(bias); auto s0 = IrBuilder::create(); fusion->addInput(s0); From 5c8a4c2ea60141112f0a885f4a246d12a042b6f3 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 2 Jun 2023 12:07:56 -0400 Subject: [PATCH 15/22] Properly set broadcast axes in inputs to 418Full test --- test/test_dynamic_transform.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index c54cd1b5543..e367e9c49b3 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1055,9 +1055,9 @@ TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) { auto tv0 = makeSymbolicTensor(4); fusion->addInput(tv0); - auto weight = makeSymbolicTensor(4); + auto weight = makeSymbolicTensor({1, -1, 1, 1}); fusion->addInput(weight); - auto bias = makeSymbolicTensor(4); + auto bias = makeSymbolicTensor({1, -1, 1, 1}); fusion->addInput(bias); auto s0 = IrBuilder::create(); fusion->addInput(s0); From 9ffadb9b990f53254f0058ed907c4c71ce142c0a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 2 Jun 2023 12:27:53 -0400 Subject: [PATCH 16/22] Add channels-last test --- test/test_dynamic_transform.cpp | 51 +++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index e367e9c49b3..4ba73961ddc 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1099,4 +1099,55 @@ TEST_F(NVFuserTest, DynamicTransformIssue418Full_CUDA) { executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__); } +// Repro of https://github.com/NVIDIA/Fuser/issues/418 (channels-last) +TEST_F(NVFuserTest, DynamicTransformIssue418FullNHWC_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto weight = makeSymbolicTensor({1, 1, 1, -1}); + fusion->addInput(weight); + auto bias = makeSymbolicTensor({1, 1, 1, -1}); + fusion->addInput(bias); + auto s0 = IrBuilder::create(); + fusion->addInput(s0); + + auto v00 = tv0->axis(0)->extent(); + auto v01 = tv0->axis(1)->extent(); + auto v02 = tv0->axis(2)->extent(); + auto v03 = tv0->axis(3)->extent(); + + auto tv1 = reshape(tv0, {v00, v01, v02, div(v03, s0), s0}); + auto vm = variance_mean(tv1, {1, 2, 4}, 0, true); + auto eps = IrBuilder::create(1e-5); + auto tv2 = mul(sub(tv1, vm.mean), rsqrt(add(vm.var, eps))); + auto tv3 = reshape(tv2, {v00, v01, v02, v03}); + auto tv4 = add(mul(tv3, weight), bias); + fusion->addOutput(tv4); + + // tv1 has symbolic axes as reshape is dynamic + TORCH_CHECK( + tv1->domain()->hasSymbolicAxis(), + "Expected to have symbolic axes: ", + tv1->toString()); + + FusionExecutorCache executor_cache(std::move(fusion)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({256, 28, 28, 128}, options); + auto w = at::randn({1, 1, 1, 128}, options); + auto b = at::randn({1, 1, 1, 128}, options); + std::vector inputs = {t0, w, b, 32}; + auto cg_outputs = executor_cache.runFusionWithInputs(inputs); + auto t0_resh = t0.reshape({256, 28, 28, 4, 32}); + auto mu = t0_resh.mean({1, 2, 4}, true); + auto v = t0_resh.var({1, 2, 4}, true, true); + auto gn = + ((t0_resh - mu) * (v + 1e-5).rsqrt()).reshape({256, 28, 28, 128}) * w + b; + testValidate( + executor_cache.fusion(), cg_outputs, inputs, {gn}, __LINE__, __FILE__); +} + } // namespace nvfuser From 596ceab387c8733188dbcf7770dc0a0d7dc1ee84 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 2 Jun 2023 13:12:31 -0400 Subject: [PATCH 17/22] Grab iter_type fix in newOutputDomain from #358 --- csrc/ops/utils.cpp | 66 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp index 43b7ba18206..31b00f72af4 100644 --- a/csrc/ops/utils.cpp +++ b/csrc/ops/utils.cpp @@ -223,9 +223,10 @@ std::vector newOutputDomain( std::vector start_offsets(out_domain.size(), 0); std::vector stop_offsets(out_domain.size(), 0); std::vector extent_vals(out_domain.size(), nullptr); + std::vector mismatched_symbolic_extents(out_domain.size(), false); std::vector expanded_extent_vals(out_domain.size(), nullptr); - std::vector> iter_types( - out_domain.size(), c10::nullopt); + std::vector> iter_types( + out_domain.size(), std::nullopt); for (auto tv : tvs) { auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); @@ -236,6 +237,53 @@ std::vector newOutputDomain( " dimensions but expected ", out_domain.size()); for (const auto i : c10::irange(dom.size())) { + auto iter_type = dom[i]->getIterType(); + auto prev_iter_type = iter_types[i]; + if (prev_iter_type.has_value()) { + // Clang-tidy complains about unchecked access to optional value here + if (iter_type == IterType::Iteration && + prev_iter_type.value() == IterType::Symbolic) { + // Prefer the Iteration extent, since Symbolic could be broadcast + extent_vals[i] = nullptr; + } else if (iter_type == IterType::Symbolic) { + switch (prev_iter_type.value()) { + case IterType::Iteration: + // Previously found Iteration domain, so ignore all Symbolic + // domains + continue; + case IterType::Symbolic: + if (extent_vals[i]->sameAs(dom[i]->extent())) { + // matching symbolic extent + continue; + } else { + // Mismatched symbolic input extents. Any one of the symbolic + // inputs could be a Broadcast or Iteration domain. Until + // concretization, we will not know which one holds the true + // extent (or whether they all are Broadcast, so that the output + // is also Broadcast). We record that these symbolic extents + // mismatched so that we can introduce a new symbolic extent + // later. + mismatched_symbolic_extents[i] = true; + } + break; + case IterType::Broadcast: + // Previously found only broadcast, so this will either also + // broadcast or resolve those broadcasts. If the expanded + // extent of any of the broadcasts is not 1, then it will need to + // match that of the dom[i]. In either case, prefer dom[i]'s + // extent, so clear iter_types[i] and extent_vals[i] so that the + // rest of this iteration will mark output as Symbolic. + iter_types[i] = std::nullopt; + extent_vals[i] = nullptr; + break; + default: + TORCH_CHECK( + false, + "Encountered unexpected IterType when creating new output domain: ", + prev_iter_type.value()); + } + } + } if (dom[i]->isBroadcast()) { if (dom[i]->hasExpandedExtent()) { expanded_extent_vals[i] = @@ -244,9 +292,9 @@ std::vector newOutputDomain( continue; } extent_vals[i] = promoteSize(extent_vals[i], dom[i]->extent()); - if (iter_types[i].has_value()) { + if (prev_iter_type.has_value()) { iter_types[i] = - promoteIterType(iter_types[i].value(), dom[i]->getIterType()); + promoteIterType(prev_iter_type.value(), dom[i]->getIterType()); } else { iter_types[i] = dom[i]->getIterType(); } @@ -268,15 +316,21 @@ std::vector newOutputDomain( } } for (const auto dim_i : c10::irange(out_domain.size())) { + auto iter_type = iter_types[dim_i]; + if (iter_type == IterType::Symbolic && mismatched_symbolic_extents[dim_i]) { + // if we have a symbolic output but the input symbolic extents did not + // match, create a new extent + extent_vals[dim_i] = IrBuilder::create(); + } if (extent_vals[dim_i] != nullptr) { TORCH_INTERNAL_ASSERT( - iter_types[dim_i].has_value(), + iter_type.has_value(), "Could not deduce iter type for new tensor view."); out_domain[dim_i] = IterDomainBuilder( IrBuilder::create(start_offsets[dim_i]), extent_vals[dim_i]) .stop_offset(IrBuilder::create(stop_offsets[dim_i])) - .iter_type(iter_types[dim_i].value()) + .iter_type(iter_type.value()) .build(); } else { out_domain[dim_i] = IterDomainBuilder( From 6b85cf48145d9d36c7db750c1d472f9ee9eb54fc Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 2 Jun 2023 15:48:11 -0400 Subject: [PATCH 18/22] Propagate replacements for placeholder extents --- csrc/dynamic_transform.cpp | 39 ++++++++++++++++++++++++-------------- test/test_resize.cpp | 10 ++++++++-- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index dd379e6c293..779736f271f 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -392,6 +392,8 @@ class DynamicTransformConcretizer : public OptOutMutator { private: const DynamicTransformConcretizationInfo& info_; + + std::unordered_map replaced_extents_; }; void DynamicTransformConcretizer::concretize() { @@ -496,10 +498,6 @@ void DynamicTransformConcretizer::checkConcretizedUses( // concretized. Since symbolic IDs may be propagated down to // consumers, those domains need to be concretized accordingly. void DynamicTransformConcretizer::mutate(TensorView* tv) { - if (!tv->domain()->hasSymbolicAxis()) { - return; - } - // First, try to concretize the root domain as there may be symbolic // axes inherited from the producers propagateFromProducerToConsumer(tv); @@ -646,11 +644,6 @@ void DynamicTransformConcretizer::mutate(TensorDomain* td) { bool DynamicTransformConcretizer::propagateFromProducerToConsumer( TensorView* consumer) { - if (consumer->definition() == nullptr || - !consumer->domain()->hasSymbolicAxis()) { - return false; - } - const auto& root_domain = consumer->getRootDomain(); auto def = consumer->definition(); @@ -659,15 +652,25 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( for (const auto i : c10::irange(root_domain.size())) { auto root_id = root_domain.at(i); - if (root_id->getIterType() != IterType::Symbolic) { - continue; + Val* extent = nullptr; + + auto replaced_extent_it = replaced_extents_.find(root_id->extent()); + if (replaced_extent_it == replaced_extents_.end()) { + if (root_id->getIterType() != IterType::Symbolic) { + continue; + } + } else { + extent = replaced_extent_it->second; } // Figure out the right IterType of this consumer root ID from its // corresponding producer IDs - std::optional id_type; - Val* extent = nullptr; + std::optional id_type = std::nullopt; + + if (root_id->isReduction()) { + id_type = IterType::Reduction; + } for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); @@ -686,7 +689,9 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( input_id->toString()); if (id_type.has_value()) { - id_type = ops::promoteIterType(*id_type, input_id->getIterType()); + if (*id_type != IterType::Reduction) { + id_type = ops::promoteIterType(*id_type, input_id->getIterType()); + } } else { id_type = input_id->getIterType(); } @@ -714,6 +719,12 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( auto concretized_id = IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build(); + if (!extent->sameAs(root_id->extent())) { + replaced_extents_[root_id->extent()] = extent; + // In case this value is used elsewhere, register for mutation + registerMutation(root_id->extent(), extent); + } + registerConcretization(root_id, concretized_id); is_concretized = true; } diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 29b16d6ed74..252fc10018f 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -1391,7 +1391,10 @@ TEST_F(NVFuserTest, FusionResizeCatReduceScheduler1_CUDA) { auto ref = at::cat({t0, t1}, 1).sum({1}); testValidate( - executor_cache.fusion(), + // Reference the most recent concretized Fusion, not the dynamic Fusion + executor_cache.getMostRecentKernelRuntime() + ->fusionSegments() + ->completeFusion(), cg_outputs, aten_inputs, {ref}, @@ -1429,7 +1432,10 @@ TEST_F(NVFuserTest, FusionResizeCatSoftmaxScheduler1_CUDA) { auto ref = at::_softmax(t2.to(at::kDouble), -1, false); testValidate( - executor_cache.fusion(), + // Reference the most recent concretized Fusion, not the dynamic Fusion + executor_cache.getMostRecentKernelRuntime() + ->fusionSegments() + ->completeFusion(), cg_outputs, aten_inputs, {ref}, From f8a0610634ff8df83503084cf0baa47e6897d1a7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 2 Jun 2023 15:59:06 -0400 Subject: [PATCH 19/22] Temporarily revert "Propagate replacements for placeholder extents" This reverts commit 6b85cf48145d9d36c7db750c1d472f9ee9eb54fc. --- csrc/dynamic_transform.cpp | 39 ++++++++++++++------------------------ test/test_resize.cpp | 10 ++-------- 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 779736f271f..dd379e6c293 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -392,8 +392,6 @@ class DynamicTransformConcretizer : public OptOutMutator { private: const DynamicTransformConcretizationInfo& info_; - - std::unordered_map replaced_extents_; }; void DynamicTransformConcretizer::concretize() { @@ -498,6 +496,10 @@ void DynamicTransformConcretizer::checkConcretizedUses( // concretized. Since symbolic IDs may be propagated down to // consumers, those domains need to be concretized accordingly. void DynamicTransformConcretizer::mutate(TensorView* tv) { + if (!tv->domain()->hasSymbolicAxis()) { + return; + } + // First, try to concretize the root domain as there may be symbolic // axes inherited from the producers propagateFromProducerToConsumer(tv); @@ -644,6 +646,11 @@ void DynamicTransformConcretizer::mutate(TensorDomain* td) { bool DynamicTransformConcretizer::propagateFromProducerToConsumer( TensorView* consumer) { + if (consumer->definition() == nullptr || + !consumer->domain()->hasSymbolicAxis()) { + return false; + } + const auto& root_domain = consumer->getRootDomain(); auto def = consumer->definition(); @@ -652,25 +659,15 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( for (const auto i : c10::irange(root_domain.size())) { auto root_id = root_domain.at(i); - Val* extent = nullptr; - - auto replaced_extent_it = replaced_extents_.find(root_id->extent()); - if (replaced_extent_it == replaced_extents_.end()) { - if (root_id->getIterType() != IterType::Symbolic) { - continue; - } - } else { - extent = replaced_extent_it->second; + if (root_id->getIterType() != IterType::Symbolic) { + continue; } // Figure out the right IterType of this consumer root ID from its // corresponding producer IDs - std::optional id_type = std::nullopt; - - if (root_id->isReduction()) { - id_type = IterType::Reduction; - } + std::optional id_type; + Val* extent = nullptr; for (auto producer : ir_utils::filterByType(def->inputs())) { PairwiseRootDomainMap root_map(producer, consumer); @@ -689,9 +686,7 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( input_id->toString()); if (id_type.has_value()) { - if (*id_type != IterType::Reduction) { - id_type = ops::promoteIterType(*id_type, input_id->getIterType()); - } + id_type = ops::promoteIterType(*id_type, input_id->getIterType()); } else { id_type = input_id->getIterType(); } @@ -719,12 +714,6 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( auto concretized_id = IterDomainBuilder(root_id).extent(extent).iter_type(*id_type).build(); - if (!extent->sameAs(root_id->extent())) { - replaced_extents_[root_id->extent()] = extent; - // In case this value is used elsewhere, register for mutation - registerMutation(root_id->extent(), extent); - } - registerConcretization(root_id, concretized_id); is_concretized = true; } diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 252fc10018f..29b16d6ed74 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -1391,10 +1391,7 @@ TEST_F(NVFuserTest, FusionResizeCatReduceScheduler1_CUDA) { auto ref = at::cat({t0, t1}, 1).sum({1}); testValidate( - // Reference the most recent concretized Fusion, not the dynamic Fusion - executor_cache.getMostRecentKernelRuntime() - ->fusionSegments() - ->completeFusion(), + executor_cache.fusion(), cg_outputs, aten_inputs, {ref}, @@ -1432,10 +1429,7 @@ TEST_F(NVFuserTest, FusionResizeCatSoftmaxScheduler1_CUDA) { auto ref = at::_softmax(t2.to(at::kDouble), -1, false); testValidate( - // Reference the most recent concretized Fusion, not the dynamic Fusion - executor_cache.getMostRecentKernelRuntime() - ->fusionSegments() - ->completeFusion(), + executor_cache.fusion(), cg_outputs, aten_inputs, {ref}, From bd17931ac81001cf9487d169a04309b6cfd039ae Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 6 Jun 2023 14:32:39 -0400 Subject: [PATCH 20/22] Print dynamic fusion if fusion_ir_concretized is given --- csrc/kernel_cache.cpp | 10 ++++------ csrc/utils.cpp | 1 - csrc/utils.h | 1 - 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index 3dc989eafe7..dc4288a97c0 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -555,11 +555,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( } } - if (isDebugDumpEnabled(DebugDumpOption::FusionIrDynamic)) { - std::cout << "Dynamic Fusion (before concretization):" << std::endl; - fusion()->printMath(); - } - // Compute or get cached initial concretization info auto& initial_info = initialInfo(); @@ -616,7 +611,10 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( kernel_runtime->updateHeuristicsLaunchParams(new_heuristics.get()); } else { // cache miss, need to re-build an optimized graph for this case - + if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) { + std::cout << "Fusion Before Concretization:" << std::endl; + fusion()->printMath(); + } // concretize fusion_ for use in this runtime auto fusion = std::make_unique(*fusion_); FusionGuard fg(fusion.get()); diff --git a/csrc/utils.cpp b/csrc/utils.cpp index 7fa703b1eea..8e2d030476f 100644 --- a/csrc/utils.cpp +++ b/csrc/utils.cpp @@ -109,7 +109,6 @@ auto parseDebugDumpOptions() { {"fusion_ir", DebugDumpOption::FusionIr}, {"fusion_ir_math", DebugDumpOption::FusionIrMath}, {"fusion_ir_presched", DebugDumpOption::FusionIrPresched}, - {"fusion_ir_dynamic", DebugDumpOption::FusionIrDynamic}, {"fusion_ir_concretized", DebugDumpOption::FusionIrConcretized}, {"kernel_ir", DebugDumpOption::KernelIr}, {"ca_map", DebugDumpOption::ComputeAtMap}, diff --git a/csrc/utils.h b/csrc/utils.h index e5885ecd965..a21e94a986c 100644 --- a/csrc/utils.h +++ b/csrc/utils.h @@ -55,7 +55,6 @@ enum class DebugDumpOption { FusionIr, //!< Dump the Fusion IR before lowering FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR FusionIrPresched, //!< Dump the Fusion IR before it is scheduled. - FusionIrDynamic, //!< Dump the Fusion IR before concretization FusionIrConcretized, //!< Dump the Fusion IR after concretization KernelIr, //!< Dump the compiler Kernel IR ComputeAtMap, //!< Dump the computeAt map From 9aaedec806b4378b2f49833f60b7809a7f6b2894 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 6 Jun 2023 14:33:00 -0400 Subject: [PATCH 21/22] Set proper output extent in dynamic cat --- csrc/ops/alias.cpp | 17 +++++++++++++++-- csrc/ops/utils.cpp | 6 ++---- csrc/ops/utils.h | 4 +--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp index 49ed3b9912d..99f3071a4da 100644 --- a/csrc/ops/alias.cpp +++ b/csrc/ops/alias.cpp @@ -116,7 +116,7 @@ TensorView* reshape(TensorView* inp_tv, const std::vector& new_sizes) { return static_reshape_output; } - auto root_domain = ops::newOutputDomain({inp_tv}, inp_tv->dtype()); + auto root_domain = ops::newOutputDomain({inp_tv}); // Create placeholder rfactor domain. Note it's not connected with the root // domain. @@ -632,7 +632,20 @@ TensorView* cat(const std::vector& inputs, int64_t cat_dim) { } // Now all of resized_inputs have the same shape as the out tensor - auto out = ops::newOutputTV(resized_inputs, dtype); + // NOTE: ops::newOutputTV would not necessarily be able to infer that the + // padded dimensions are all of the same size. However, we know that they are + // constructed such that that is the case, so we can use + auto out_domain = ops::newOutputDomain(resized_inputs); + // Override the concatenated dimension and insert an IterDomain with the true + // extent, if needed + if (!out_domain.at(cat_dim)->extent()->sameAs(concat_ext)) { + out_domain[cat_dim] = + IterDomainBuilder(out_domain.at(cat_dim)).extent(concat_ext).build(); + } + auto out = IrBuilder::create( + IrBuilder::create( + out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)), + dtype); IrBuilder::create(out, resized_inputs, cat_dim); diff --git a/csrc/ops/utils.cpp b/csrc/ops/utils.cpp index 31b00f72af4..effa5d617a7 100644 --- a/csrc/ops/utils.cpp +++ b/csrc/ops/utils.cpp @@ -198,9 +198,7 @@ IterType promoteIterType(IterType type1, IterType type2) { } } -std::vector newOutputDomain( - const std::vector& vals, - DataType dtype) { +std::vector newOutputDomain(const std::vector& vals) { std::vector tvs; for (auto val : vals) { if (val->getValType() == ValType::TensorView) { @@ -346,7 +344,7 @@ std::vector newOutputDomain( } TensorView* newOutputTV(const std::vector& vals, DataType dtype) { - auto out_domain = newOutputDomain(vals, dtype); + auto out_domain = newOutputDomain(vals); return IrBuilder::create( IrBuilder::create( out_domain, TensorDomain::getContiguityFilledWith(out_domain, true)), diff --git a/csrc/ops/utils.h b/csrc/ops/utils.h index e9dc563d4fe..7af3a606986 100644 --- a/csrc/ops/utils.h +++ b/csrc/ops/utils.h @@ -31,9 +31,7 @@ Val* newScalar(ValType vtype, DataType dtype); IterType promoteIterType(IterType type1, IterType type2); -std::vector newOutputDomain( - const std::vector& vals, - DataType dtype); +std::vector newOutputDomain(const std::vector& vals); TensorView* newOutputTV(const std::vector& vals, DataType dtype); From 140298fe00f5aaacf53c3b3472f7fbfa27a8a261 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Sat, 10 Jun 2023 11:19:23 -0400 Subject: [PATCH 22/22] Add failing PadBroadcast test --- test/test_dynamic_transform.cpp | 61 +++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 55bd7c5c1fa..0d5096bf156 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1181,4 +1181,65 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { testValidate(&fusion, outputs, aten_inputs, {at2}, __LINE__, __FILE__); } +// Test dynamic pad followed by broadcast resolution +TEST_F(NVFuserTest, DynamicPadBroadcast_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + TensorView* tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + TensorView* tv1 = makeSymbolicTensor(2); + fusion.addInput(tv1); + + // 2d axis order here is YX + auto ypad = IrBuilder::create(); + fusion.addInput(ypad); + auto xpad = IrBuilder::create(); + fusion.addInput(xpad); + + // two-way resizes to cut square tv down to broadcastable size in each axis + auto tv0_pad = pad(tv0, {fusion.zeroVal(), xpad, fusion.zeroVal(), ypad}); + + // This will potentially resolve the y or x broadcast + auto p = mul(tv0_pad, tv1); + fusion.addOutput(p); + fusion.addOutput(tv0_pad); + + fusion.printMath(); + + FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn({5, 5}, options); + at::Tensor at_y = at::randn({5, 5}, options); + + // trivial resize + std::vector aten_inputs({at_x, at_y, 0, 0}); + std::vector outputs; + + /* + aten_inputs[2] = 0; + aten_inputs[3] = 0; + outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + testValidate(fusion_executor_cache.fusion(), outputs, aten_inputs, {at_x * + at_y}, __LINE__, __FILE__); + */ + + // shrink first axis + aten_inputs[2] = -4; + aten_inputs[3] = 0; + outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + std::cout << outputs << std::endl; + std::cout << at_x.slice(0, 0, 1) * at_y << std::endl; + std::cout << at_x.slice(0, 0, 1) << std::endl; + testValidate( + fusion_executor_cache.fusion(), + outputs, + aten_inputs, + {at_x.slice(0, 0, 1) * at_y, at_x.slice(0, 0, 1)}, + __LINE__, + __FILE__); +} + } // namespace nvfuser