From 55b0dfdf619e00a34fdcd156e74cae9d584af9f5 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Sat, 3 Jun 2023 21:01:28 -0400 Subject: [PATCH 01/63] Introduce info_.has_possible_empty_tensor_ --- csrc/dynamic_transform.cpp | 6 ++++-- csrc/dynamic_transform.h | 12 +++++++++--- csrc/kernel_cache.cpp | 6 +++--- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index f838e07a1aa..9cc9c9643c5 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -128,12 +128,14 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { if (!id->definition() || id->getIterType() != IterType::Symbolic) { continue; } + auto extent_opt = info_.expr_eval_.evaluate(id->extent()); + if (!extent_opt.has_value() || extent_opt.value().as() == 0) { + info_.has_possible_empty_tensor_ = true; + } if (auto op = dynamic_cast(id->definition())) { info_.dynamic_resizes_.push_back(op); // extent of output determines its IterType leaf_dynamic_vals_.push_back(id->extent()); - // warm up extent evaluation - info_.expr_eval_.evaluate(id->extent()); } } } diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 842ab83a06a..29003af02a7 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -38,9 +38,13 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { return fusion_; } - //! Return whether any dynamic transforms exist in the Fusion - bool hasDynamicTransforms() const { - return !dynamic_reshapes_.empty() || !dynamic_resizes_.empty(); + //! Return whether any dynamic transforms exist in the Fusion, or whether + //! there are any tensors which could potentially be empty (size-0 extent) + //! given some user input. In either of these cases, concretization may change + //! the structure of the Fusion. + bool isDynamic() const { + return has_possible_empty_tensor_ || !dynamic_reshapes_.empty() || + !dynamic_resizes_.empty(); } //! 
Return a set of scalars that are inputs or extents of input TensorViews @@ -89,6 +93,8 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { std::vector dynamic_resizes_; + bool has_possible_empty_tensor_ = false; + // Root Vals that determine concretization std::unordered_set root_dynamic_vals_; diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index 6cc6dc37ed3..e5dfc7a1bfb 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -562,7 +562,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // will be used only as a cache key. std::optional conc_info = std::nullopt; size_t conc_info_index = 0; - if (initial_info.hasDynamicTransforms()) { + if (initial_info.isDynamic()) { conc_info = DynamicTransform::getConcretizationInfo( fusion_.get(), &initial_info, &args); TORCH_CHECK( @@ -614,7 +614,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // concretize fusion_ for use in this runtime auto fusion = std::make_unique(*fusion_); FusionGuard fg(fusion.get()); - if (initial_info.hasDynamicTransforms()) { + if (initial_info.isDynamic()) { const auto& cloned_conc_info = fusion->getManagedSafe( conc_info_index); @@ -644,7 +644,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( } } - if (initial_info.hasDynamicTransforms()) { + if (initial_info.isDynamic()) { // In the case of cache hits, we tend to accumulate managed data in // fusion_. Here we release the concretization info we created to avoid // cloning more and more entries. 
From 759d803e1eba1099c777467055cb3eaecddfb5dd Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Sat, 3 Jun 2023 21:13:57 -0400 Subject: [PATCH 02/63] Remove fusion arg from concretizeFusion --- csrc/dynamic_transform.cpp | 10 ++-------- csrc/dynamic_transform.h | 4 +--- csrc/kernel_cache.cpp | 3 +-- test/test_dynamic_transform.cpp | 10 +++++----- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 9cc9c9643c5..0b3226ed185 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -353,13 +353,8 @@ std::string DynamicTransformConcretizationInfo::toString() const { //! Concretize a symbolic fusion with concrete transformation info class DynamicTransformConcretizer : public OptOutMutator { public: - DynamicTransformConcretizer( - Fusion* fusion, - const DynamicTransformConcretizationInfo& info) + DynamicTransformConcretizer(const DynamicTransformConcretizationInfo& info) : info_(info) { - TORCH_INTERNAL_ASSERT( - fusion == info.fusion(), - "Invalid DynamicTransformInitialInfo. The associated Fusion is different from the given Fusion"); concretize(); } @@ -747,9 +742,8 @@ DynamicTransformConcretizationInfo DynamicTransform::getConcretizationInfo( } void DynamicTransform::concretizeFusion( - Fusion* fusion, const DynamicTransformConcretizationInfo& info) { - DynamicTransformConcretizer concretizer(fusion, info); + DynamicTransformConcretizer concretizer(info); } size_t DynamicTransformConcretizationInfo::hash() const { diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 29003af02a7..dcfd96df967 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -200,9 +200,7 @@ class TORCH_CUDA_CU_API DynamicTransform { //! Concretizes a given fusion. Note that the concretization is //! in-place and the given fusion is modified. 
- static void concretizeFusion( - Fusion*, - const DynamicTransformConcretizationInfo& info); + static void concretizeFusion(const DynamicTransformConcretizationInfo& info); }; } // namespace nvfuser diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index e5dfc7a1bfb..42ca1680d83 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -621,8 +621,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( TORCH_INTERNAL_ASSERT( cloned_conc_info.has_value(), "Copied Fusion is missing managed concretization info"); - DynamicTransform::concretizeFusion( - fusion.get(), cloned_conc_info.value()); + DynamicTransform::concretizeFusion(cloned_conc_info.value()); // The information in initial_info and cloned_conc_info refers to // variables in the copied symbolic fusion which get replaced during // concretization. Keeping these around during a subsequent fusion copy diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index dc0201d0833..1d7a6ad82bc 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -192,7 +192,7 @@ TEST_F(NVFuserTest, DynamicTransform3_CUDA) { auto info = DynamicTransform::getConcretizationInfo( &fusion, &initial_info, &expr_eval); - DynamicTransform::concretizeFusion(&fusion, info); + DynamicTransform::concretizeFusion(info); TORCH_CHECK( !fusion.hasDynamicTransform(), "Expected to have no dynamic transform"); @@ -258,7 +258,7 @@ TEST_F(NVFuserTest, DynamicTransform4_CUDA) { auto info = DynamicTransform::getConcretizationInfo( &fusion, &initial_info, &expr_eval); - DynamicTransform::concretizeFusion(&fusion, info); + DynamicTransform::concretizeFusion(info); TORCH_CHECK( !fusion.hasDynamicTransform(), "Expected to have no dynamic transform"); @@ -307,7 +307,7 @@ TEST_F(NVFuserTest, DynamicTransform5_CUDA) { auto info = DynamicTransform::getConcretizationInfo( &fusion, &initial_info, &expr_eval); - DynamicTransform::concretizeFusion(&fusion, info); + 
DynamicTransform::concretizeFusion(info); TORCH_CHECK( !fusion.hasDynamicTransform(), "Expected to have no dynamic transform"); @@ -361,7 +361,7 @@ TEST_F(NVFuserTest, DynamicTransform6_CUDA) { auto info = DynamicTransform::getConcretizationInfo( &fusion, &initial_info, &expr_eval); - DynamicTransform::concretizeFusion(&fusion, info); + DynamicTransform::concretizeFusion(info); TORCH_CHECK( !fusion.hasDynamicTransform(), "Expected to have no dynamic transform"); @@ -586,7 +586,7 @@ TEST_F(NVFuserTest, DynamicTransform10_CUDA) { auto info = DynamicTransform::getConcretizationInfo( &fusion, &initial_info, &expr_eval); - DynamicTransform::concretizeFusion(&fusion, info); + DynamicTransform::concretizeFusion(info); TORCH_CHECK( !fusion.hasDynamicTransform(), "Expected to have no dynamic transform"); From e0f4eb137f936b1feecb25808b6ad5211d7da145 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Sun, 4 Jun 2023 11:33:17 -0400 Subject: [PATCH 03/63] Sketch of empty branch finding in conc info --- csrc/dynamic_transform.cpp | 78 ++++++++++++++++++++++++++++++++++++++ csrc/dynamic_transform.h | 11 ++++++ 2 files changed, 89 insertions(+) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 0b3226ed185..62cd839b652 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -160,6 +161,74 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { std::vector leaf_dynamic_vals_; }; +class EmptyBranchFinder : public OptOutDispatch { + public: + EmptyBranchFinder(Fusion* fusion, ExpressionEvaluator& expr_eval) + : fusion_(fusion), expr_eval_(expr_eval) { + mutate(fusion_->outputs()); + } + + bool isTVEmpty(TensorView* tv) { + for (auto id : tv->getRootDomain()) { + auto extent_opt = expr_eval_.evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + extent_opt.has_value(), + "Cannot evaluate extent ", + id->extent(), + " of ", + tv->toString()); + if 
(extent_opt.value().as() == 0) { + return true; + } + } + return false; + } + + using OptOutMutator::mutate; + + void mutate(std::vector vals) { + for (auto v : vals) { + mutate(v); + } + } + + void mutate(TensorView* tv) final { + if (isTVEmpty(tv)) { + if (tv->definition() && !tv->definition()->isA()) { + // Replace with full + std::vector shape; + shape.reserve(tv->getRootDomain().size()); + for (auto id : tv->getRootDomain()) { + shape.push_back(id->extent()); + } + auto full_output = full( + shape, fusion_->zeroVal(), tv->Statement::getDataType().value()); + registerMutation(tv, full_output); + } + return; + } + if (tv->definition()) { + mutate(tv->definition()->inputs()); + } + } + + private: + Fusion* fusion_; + ExpressionEvaluator expr_eval_; +}; + +void DynamicTransformConcretizer::findEmptyBranches( + const DynamicTransformInitialInfo* info, + ExpressionEvaluator* expr_eval) { + for (auto tv : info_.empty_tensors_) { + // TODO: record if empty, with which dimensions are zero + // TODO: re-traverse to find dynamic reshapes and resizes in case we find + // any empty branches since they may be on removed branches. We may trigger + // unnecessary recompilations when the cache misses so we try to include the + // minimal amount of information possible. + } +} + void DynamicTransformConcretizationInfo::analyzeReshapes( const DynamicTransformInitialInfo* info, ExpressionEvaluator* expr_eval) { @@ -338,6 +407,10 @@ std::string DynamicTransformConcretizationInfo::toString() const { std::stringstream ss; ss << "DynamicTransformConcretizationInfo\n"; std::string indent = " "; + ss << indent << "Empty tensors:\n"; + for (const auto& tv : empty_tensors_) { + ss << indent << indent << tv->toString() << "\n"; + } ss << indent << "Reshape:\n"; for (const auto& kv : reshape_transforms_) { ss << indent << indent << kv.first->toString() << ", " @@ -361,6 +434,9 @@ class DynamicTransformConcretizer : public OptOutMutator { private: void concretize(); + //! 
Set definitions of empty tensors to full() calls. + void removeEmptyBranches(); + void concretizeReshape(); void concretizeResize(); @@ -392,6 +468,8 @@ class DynamicTransformConcretizer : public OptOutMutator { }; void DynamicTransformConcretizer::concretize() { + removeEmptyBranches(); + // First, concretize all dynamic reshape ops concretizeReshape(); diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index dcfd96df967..6d1a7fe46e0 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -121,6 +121,9 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { // evaluator when any one of the IDs has a known value expr_eval->propagateBoundValuesThroughExactMaps(fusion); + // Find a minimal set of empty tensors to replace with full() calls + findEmptyTensors(info, expr_eval); + analyzeReshapes(info, expr_eval); analyzeResizes(info, expr_eval); @@ -142,6 +145,10 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { return !(*this == other); } + void findEmptyTensors( + const DynamicTransformInitialInfo* info, + ExpressionEvaluator* expr_eval); + void analyzeReshapes( const DynamicTransformInitialInfo* info, ExpressionEvaluator* expr_eval); @@ -166,6 +173,10 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { private: Fusion* fusion_ = nullptr; + // Holds, for each empty tensor, a pointer to the tensor along with a vector + // of positions in its rfactor domain which are size 0 + std::vector < std::pair> empty_tensors_; + // Holds, for each dynamic reshape, the output TensorView, and the result of // analyzeView std::vector> reshape_transforms_; From 5b56be74283ea96caa6850d3387c316e1df21ec1 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 5 Jun 2023 11:15:39 -0400 Subject: [PATCH 04/63] Cleanup and fix assumptions in a couple tests --- csrc/dynamic_transform.cpp | 115 +++++++++++++++++++++++--------- csrc/dynamic_transform.h | 36 +++++----- test/test_dynamic_transform.cpp | 6 ++ 3 files changed, 105 
insertions(+), 52 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 62cd839b652..d539db091f0 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -126,13 +126,13 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { void handle(TensorView* tv) override { const auto& rfd = tv->getMaybeRFactorDomain(); for (auto id : rfd) { - if (!id->definition() || id->getIterType() != IterType::Symbolic) { - continue; - } auto extent_opt = info_.expr_eval_.evaluate(id->extent()); if (!extent_opt.has_value() || extent_opt.value().as() == 0) { info_.has_possible_empty_tensor_ = true; } + if (!id->definition() || id->getIterType() != IterType::Symbolic) { + continue; + } if (auto op = dynamic_cast(id->definition())) { info_.dynamic_resizes_.push_back(op); // extent of output determines its IterType @@ -161,16 +161,20 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { std::vector leaf_dynamic_vals_; }; -class EmptyBranchFinder : public OptOutDispatch { +class EmptyBranchFinder : public BackwardVisitor { public: - EmptyBranchFinder(Fusion* fusion, ExpressionEvaluator& expr_eval) + EmptyBranchFinder(Fusion* fusion, ExpressionEvaluator* expr_eval) : fusion_(fusion), expr_eval_(expr_eval) { - mutate(fusion_->outputs()); + // We do not require the traversal to cover all outputs, because if we + // replace some outputs with calls to full() then any unused outputs will be + // ignored entirely. 
+ must_cover_all_expr_outputs_ = false; + traverseTo(fusion, fusion->outputs(), false); } bool isTVEmpty(TensorView* tv) { - for (auto id : tv->getRootDomain()) { - auto extent_opt = expr_eval_.evaluate(id->extent()); + for (auto id : tv->getMaybeRFactorDomain()) { + auto extent_opt = expr_eval_->evaluate(id->extent()); TORCH_INTERNAL_ASSERT( extent_opt.has_value(), "Cannot evaluate extent ", @@ -184,49 +188,72 @@ class EmptyBranchFinder : public OptOutDispatch { return false; } - using OptOutMutator::mutate; + std::vector getEmptyTensors() const { + return empty_tensors_; + } + + private: + using BackwardVisitor::handle; - void mutate(std::vector vals) { + void handle(std::vector vals) { for (auto v : vals) { - mutate(v); + handle(v); } } - void mutate(TensorView* tv) final { + void handle(TensorView* tv) final { if (isTVEmpty(tv)) { if (tv->definition() && !tv->definition()->isA()) { // Replace with full - std::vector shape; - shape.reserve(tv->getRootDomain().size()); - for (auto id : tv->getRootDomain()) { - shape.push_back(id->extent()); + std::vector empty_axes; + auto rfactor = tv->getMaybeRFactorDomain(); + for (size_t i : c10::irange(rfactor.size())) { + auto id = rfactor.at(i); + auto extent_eval = expr_eval_->evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + extent_eval.has_value(), + "When finding empty tensors: could not evaluate extent of ", + id->toString()); + if (extent_eval.value().as() == 0) { + empty_axes.push_back(i); + } } - auto full_output = full( - shape, fusion_->zeroVal(), tv->Statement::getDataType().value()); - registerMutation(tv, full_output); + empty_tensors_.push_back(EmptyTensorDescriptor{tv, empty_axes}); } return; } if (tv->definition()) { - mutate(tv->definition()->inputs()); + handle(tv->definition()->inputs()); } } private: Fusion* fusion_; - ExpressionEvaluator expr_eval_; + ExpressionEvaluator* expr_eval_; + std::vector empty_tensors_; }; -void DynamicTransformConcretizer::findEmptyBranches( 
+DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( + Fusion* fusion, const DynamicTransformInitialInfo* info, - ExpressionEvaluator* expr_eval) { - for (auto tv : info_.empty_tensors_) { - // TODO: record if empty, with which dimensions are zero - // TODO: re-traverse to find dynamic reshapes and resizes in case we find - // any empty branches since they may be on removed branches. We may trigger - // unnecessary recompilations when the cache misses so we try to include the - // minimal amount of information possible. - } + ExpressionEvaluator* expr_eval) + : fusion_(fusion) { + TORCH_INTERNAL_ASSERT( + !fusion->isA(), + "Invalid container. Kernel container not allowed.\n"); + + // Make sure all exactly mapped IDs have the same value in the + // evaluator when any one of the IDs has a known value + expr_eval->propagateBoundValuesThroughExactMaps(fusion); + + analyzeReshapes(info, expr_eval); + + analyzeResizes(info, expr_eval); + + // Find a minimal set of empty tensors to replace with full() calls + // NOTE: this does a backward traversal from outputs. 
+ empty_tensors_ = + EmptyBranchFinder(info->fusion(), expr_eval).getEmptyTensors(); } void DynamicTransformConcretizationInfo::analyzeReshapes( @@ -408,8 +435,13 @@ std::string DynamicTransformConcretizationInfo::toString() const { ss << "DynamicTransformConcretizationInfo\n"; std::string indent = " "; ss << indent << "Empty tensors:\n"; - for (const auto& tv : empty_tensors_) { - ss << indent << indent << tv->toString() << "\n"; + for (const auto& kv : empty_tensors_) { + ss << indent << indent << kv.tv->toString() + << " has zero extent in these axes:"; + for (auto i : kv.empty_axes) { + ss << " " << i; + } + ss << "\n"; } ss << indent << "Reshape:\n"; for (const auto& kv : reshape_transforms_) { @@ -485,6 +517,25 @@ void DynamicTransformConcretizer::concretize() { } } +void DynamicTransformConcretizer::removeEmptyBranches() { + for (auto empty_tv_descr : info_.getEmptyTensors()) { + auto tv = empty_tv_descr.tv; + auto rfactor = tv->getMaybeRFactorDomain(); + std::vector new_shape; + new_shape.reserve(rfactor.size()); + for (auto id : rfactor) { + new_shape.push_back(id->extent()); + } + for (auto ax : empty_tv_descr.empty_axes) { + new_shape[ax] = tv->fusion()->zeroVal(); + } + auto mut_tv = + full(new_shape, tv->fusion()->zeroVal(), tv->getDataType().value()); + registerConcretization(tv, mut_tv); + mutate(tv); + } +} + void DynamicTransformConcretizer::concretizeReshape() { // Concretize each reshape op. for (const auto& kv : info_.getReshapeTransforms()) { diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 6d1a7fe46e0..50e5ab16620 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -47,6 +47,12 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { !dynamic_resizes_.empty(); } + //! Return whether there are any tensors with unknown extent in some + //! dimension, so that they might be empty + bool hasPossibleEmptyTensor() const { + return has_possible_empty_tensor_; + } + //! 
Return a set of scalars that are inputs or extents of input TensorViews //! and that appear in inputs to dynamic expressions. Any Vals not in this //! list do not affect concretization. @@ -104,6 +110,12 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { friend class DynamicTransformInitialInfoBuilder; }; +//! Describes known empty dimensions in a TensorView's maybe RFactor domain +struct TORCH_CUDA_CU_API EmptyTensorDescriptor { + TensorView* tv; + std::vector empty_axes; +}; + //! A set of transformations for a symbolic fusion with concrete sizes //! of the fusion inputs class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { @@ -111,22 +123,10 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { DynamicTransformConcretizationInfo( Fusion* fusion, const DynamicTransformInitialInfo* info, - ExpressionEvaluator* expr_eval) - : fusion_(fusion) { - TORCH_INTERNAL_ASSERT( - !fusion->isA(), - "Invalid container. Kernel container not allowed.\n"); - - // Make sure all exactly mapped IDs have the same value in the - // evaluator when any one of the IDs has a known value - expr_eval->propagateBoundValuesThroughExactMaps(fusion); - - // Find a minimal set of empty tensors to replace with full() calls - findEmptyTensors(info, expr_eval); - - analyzeReshapes(info, expr_eval); + ExpressionEvaluator* expr_eval); - analyzeResizes(info, expr_eval); + const std::vector& getEmptyTensors() const { + return empty_tensors_; } const std::vector>& @@ -145,10 +145,6 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { return !(*this == other); } - void findEmptyTensors( - const DynamicTransformInitialInfo* info, - ExpressionEvaluator* expr_eval); - void analyzeReshapes( const DynamicTransformInitialInfo* info, ExpressionEvaluator* expr_eval); @@ -175,7 +171,7 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { // Holds, for each empty tensor, a pointer to the tensor along with a vector // of positions in its rfactor domain which are size 0 - 
std::vector < std::pair> empty_tensors_; + std::vector empty_tensors_; // Holds, for each dynamic reshape, the output TensorView, and the result of // analyzeView diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 1d7a6ad82bc..67746520aaf 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -60,6 +60,8 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 3, 4 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); + expr_eval.bind(tv1->axis(0)->extent(), 3); + expr_eval.bind(tv1->axis(1)->extent(), 4); expr_eval.bind(reshape_shape0, 3); expr_eval.bind(reshape_shape1, 4); @@ -79,6 +81,8 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 3, -1 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); + expr_eval.bind(tv1->axis(0)->extent(), 3); + expr_eval.bind(tv1->axis(1)->extent(), 4); expr_eval.bind(reshape_shape0, 3); expr_eval.bind(reshape_shape1, -1); @@ -98,6 +102,8 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 5, -1 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); + expr_eval.bind(tv1->axis(0)->extent(), 5); + expr_eval.bind(tv1->axis(1)->extent(), -1); expr_eval.bind(reshape_shape0, 5); expr_eval.bind(reshape_shape1, -1); From 7d01339c391bddb49c580713937ec0a0d02592ff Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 5 Jun 2023 12:07:48 -0400 Subject: [PATCH 05/63] noReductions, replaceOutput, add tests --- csrc/dynamic_transform.cpp | 25 ++++++++-- csrc/dynamic_transform.h | 8 ++++ test/test_dynamic_transform.cpp | 82 +++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 4 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index d539db091f0..03b21500d9f 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -173,7 +173,7 @@ class EmptyBranchFinder : public BackwardVisitor { } bool 
isTVEmpty(TensorView* tv) { - for (auto id : tv->getMaybeRFactorDomain()) { + for (auto id : TensorDomain::noReductions(tv->getMaybeRFactorDomain())) { auto extent_opt = expr_eval_->evaluate(id->extent()); TORCH_INTERNAL_ASSERT( extent_opt.has_value(), @@ -206,7 +206,7 @@ class EmptyBranchFinder : public BackwardVisitor { if (tv->definition() && !tv->definition()->isA()) { // Replace with full std::vector empty_axes; - auto rfactor = tv->getMaybeRFactorDomain(); + auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); for (size_t i : c10::irange(rfactor.size())) { auto id = rfactor.at(i); auto extent_eval = expr_eval_->evaluate(id->extent()); @@ -405,6 +405,14 @@ bool DynamicTransformConcretizationInfo::operator==( } } + for (const auto i : c10::irange(empty_tensors_.size())) { + const auto& et = empty_tensors_.at(i); + const auto& other_et = other.empty_tensors_.at(i); + if (et != other_et) { + return false; + } + } + return true; } @@ -427,6 +435,10 @@ DynamicTransformConcretizationInfo DynamicTransformConcretizationInfo::clone( // resize_transforms_ iter_type); } + for (const auto& [tv, empty_axes] : empty_tensors_) { + cloned_info.empty_tensors_.emplace_back( + EmptyTensorDescriptor{ir_cloner.clone(tv), empty_axes}); + } return cloned_info; } @@ -520,7 +532,7 @@ void DynamicTransformConcretizer::concretize() { void DynamicTransformConcretizer::removeEmptyBranches() { for (auto empty_tv_descr : info_.getEmptyTensors()) { auto tv = empty_tv_descr.tv; - auto rfactor = tv->getMaybeRFactorDomain(); + auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); std::vector new_shape; new_shape.reserve(rfactor.size()); for (auto id : rfactor) { @@ -532,7 +544,12 @@ void DynamicTransformConcretizer::removeEmptyBranches() { auto mut_tv = full(new_shape, tv->fusion()->zeroVal(), tv->getDataType().value()); registerConcretization(tv, mut_tv); - mutate(tv); + OptOutMutator::mutate(tv); + // Replace tv in Fusion outputs() if present + auto 
outputs = tv->fusion()->outputs(); + if (std::find(outputs.begin(), outputs.end(), tv) != outputs.end()) { + tv->fusion()->replaceOutput(tv, mut_tv); + } } } diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 50e5ab16620..43131689a7c 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -114,6 +114,14 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { struct TORCH_CUDA_CU_API EmptyTensorDescriptor { TensorView* tv; std::vector empty_axes; + + bool operator==(const EmptyTensorDescriptor& other) const { + return tv == other.tv && empty_axes == other.empty_axes; + } + + bool operator!=(const EmptyTensorDescriptor& other) const { + return !operator==(other); + } }; //! A set of transformations for a symbolic fusion with concrete sizes diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 67746520aaf..483b5bc7820 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1007,4 +1007,86 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { reductionDynamicPadAddFusion(invocations); } +// Test that zero-element tensors are removed from Fusion (replaced by calls to +// full()) +TEST_F(NVFuserTest, DynamicRewriteEmptyTensors1_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + auto x = makeSymbolicTensor({-1, 1}); + auto y = makeSymbolicTensor({1, -1}); + fusion.addInput(x); + fusion.addInput(y); + + auto z = mul(x, y); + fusion.addOutput(z); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor at_x = at::randn({3, 1}, options); + at::Tensor at_y = at::randn({1, 0}, options); + std::vector aten_inputs = {at_x, at_y}; + + FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + + testValidate( + fusion_executor_cache.fusion(), + outputs, + aten_inputs, + {at_x * at_y}, + __LINE__, + 
__FILE__); + + auto runtime = fusion_executor_cache.getMostRecentKernelRuntime(); + auto runtime_outputs = runtime->fusionSegments()->outputs(); + EXPECT_EQ(runtime_outputs.size(), 1); + + auto tv_output = runtime_outputs.at(0); + auto def = tv_output->definition(); + EXPECT_NE(def, nullptr); + std::cout << def->toString() << std::endl; + EXPECT_EQ(def->isA(), true); +} + +// Similar to above, but no broadcasts. Instead perform a reduction over +// non-empty axes first. +TEST_F(NVFuserTest, DynamicRewriteEmptyTensors2_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + auto x = makeSymbolicTensor({-1, -1}); + fusion.addInput(x); + + auto y = sum(x, {1}); + fusion.addOutput(y); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor at_x = at::randn({0, 3}, options); + std::vector aten_inputs = {at_x}; + + FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + + testValidate( + fusion_executor_cache.fusion(), + outputs, + aten_inputs, + {at_x.sum(1)}, + __LINE__, + __FILE__); + + auto runtime = fusion_executor_cache.getMostRecentKernelRuntime(); + auto runtime_outputs = runtime->fusionSegments()->outputs(); + EXPECT_EQ(runtime_outputs.size(), 1); + + auto tv_output = runtime_outputs.at(0); + auto def = tv_output->definition(); + EXPECT_NE(def, nullptr); + EXPECT_EQ(def->isA(), true); +} + } // namespace nvfuser From c16c362d973caa4c6da2f2928ffe968727393eaa Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 5 Jun 2023 12:51:08 -0400 Subject: [PATCH 06/63] Clean up clang-tidy --- csrc/dynamic_transform.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 03b21500d9f..be41a221139 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -164,7 +164,7 @@ class 
DynamicTransformInitialInfoBuilder : public IterVisitor { class EmptyBranchFinder : public BackwardVisitor { public: EmptyBranchFinder(Fusion* fusion, ExpressionEvaluator* expr_eval) - : fusion_(fusion), expr_eval_(expr_eval) { + : expr_eval_(expr_eval) { // We do not require the traversal to cover all outputs, because if we // replace some outputs with calls to full() then any unused outputs will be // ignored entirely. @@ -228,7 +228,6 @@ class EmptyBranchFinder : public BackwardVisitor { } private: - Fusion* fusion_; ExpressionEvaluator* expr_eval_; std::vector empty_tensors_; }; @@ -530,7 +529,7 @@ void DynamicTransformConcretizer::concretize() { } void DynamicTransformConcretizer::removeEmptyBranches() { - for (auto empty_tv_descr : info_.getEmptyTensors()) { + for (const auto& empty_tv_descr : info_.getEmptyTensors()) { auto tv = empty_tv_descr.tv; auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); std::vector new_shape; From b15929fb35ea6fac5f0f5f8d4feee8034c973874 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 6 Jun 2023 08:45:08 -0400 Subject: [PATCH 07/63] Fix FusionMagicSchedulerInstanceNormalizationBackward_CUDA --- test/test_gpu1.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_gpu1.cpp b/test/test_gpu1.cpp index f06f51f2402..5d6d6354ed8 100644 --- a/test/test_gpu1.cpp +++ b/test/test_gpu1.cpp @@ -8106,8 +8106,8 @@ TEST_F(NVFuserTest, FusionMagicSchedulerInstanceNormalizationBackward_CUDA) { at_input_nvfuser, at_grad_nvfuser, at_weight_nvfuser, - at::empty({}), - at::empty({}), + at::empty({}, options), + at::empty({}, options), outputs_forward[1], outputs_forward[2]}; auto outputs_backward = From 2a2eef7ea3791c13810e37c6a277bcee9cc03dee Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 6 Jun 2023 08:45:39 -0400 Subject: [PATCH 08/63] Bind to expanded extent if needed, and only if non-const --- csrc/dynamic_transform.cpp | 25 ++++++++++++++++++++++--- test/test_dynamic_transform.cpp | 
1 - 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index f5d9b15ba74..ba938da46c5 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -879,10 +879,29 @@ DynamicTransformConcretizationInfo DynamicTransform::getConcretizationInfo( i, " but found ", argTypeToString(argi->type())); - const TensorArgAbstract* targ = - reinterpret_cast(argi); + const auto* targ = reinterpret_cast(argi); for (auto j : c10::irange(dom.size())) { - expr_eval.bind(dom[j]->extent(), targ->getSize((int64_t)j)); + auto size_j = targ->getSize((int64_t)j); + // Input can be expanded. See test FusionExpandRepro1860_CUDA + auto ext = dom[j]->hasExpandedExtent() ? dom[j]->expandedExtent() : dom[j]->extent(); + // Extents can be concrete, in which case we should just check that the + // input size matches, but not try to bind them. + if (ext->isConstInt()) { + TORCH_INTERNAL_ASSERT( + ext->getInt().value() == size_j, + "Provided argument ", + targ->toString(), + " for input ", + i, + " (", + inpi->toString(), + ") does not match constant extent of IterDomain ", + dom[j]->toString(), + " at position ", + j); + } else { + expr_eval.bind(ext, size_j); + } } } } diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index c6f6e0a4466..16636edfc8e 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1046,7 +1046,6 @@ TEST_F(NVFuserTest, DynamicRewriteEmptyTensors1_CUDA) { auto tv_output = runtime_outputs.at(0); auto def = tv_output->definition(); EXPECT_NE(def, nullptr); - std::cout << def->toString() << std::endl; EXPECT_EQ(def->isA(), true); } From 3270f1598ab95d5193b26eaf62704038cdc82692 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 6 Jun 2023 09:12:21 -0400 Subject: [PATCH 09/63] Add all symbolic tensor extents to leaf_dynamic_vals_ This might make the lookups there a bit slower, as previously these would be very small sets. 
However, this is necessary in order to find differences in empty-tensor concretization. Note that the actual keys won't commonly change but this will find input vals that affect output sizes, such as in FusionStandaloneIota_CUDA, which was failing before this fix. --- csrc/dynamic_transform.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index ba938da46c5..1a3f3071179 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -129,6 +129,7 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { auto extent_opt = info_.expr_eval_.evaluate(id->extent()); if (!extent_opt.has_value() || extent_opt.value().as() == 0) { info_.has_possible_empty_tensor_ = true; + leaf_dynamic_vals_.push_back(id->extent()); } if (!id->definition() || id->getIterType() != IterType::Symbolic) { continue; @@ -883,7 +884,8 @@ DynamicTransformConcretizationInfo DynamicTransform::getConcretizationInfo( for (auto j : c10::irange(dom.size())) { auto size_j = targ->getSize((int64_t)j); // Input can be expanded. See test FusionExpandRepro1860_CUDA - auto ext = dom[j]->hasExpandedExtent() ? dom[j]->expandedExtent() : dom[j]->extent(); + auto ext = dom[j]->hasExpandedExtent() ? dom[j]->expandedExtent() + : dom[j]->extent(); // Extents can be concrete, in which case we should just check that the // input size matches, but not try to bind them. 
if (ext->isConstInt()) { From aee5a2f9610b8924c8b77bede8c73f99f0d80ee1 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 6 Jun 2023 09:13:55 -0400 Subject: [PATCH 10/63] Print pre-concretization fusion for fusion_ir_concretized --- csrc/kernel_cache.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index bc46b61713b..98d5947624b 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -621,6 +621,10 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // concretize fusion_ for use in this runtime auto fusion = std::make_unique(*fusion_); + if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) { + std::cout << "Fusion Before Concretization:" << std::endl; + fusion->printMath(); + } FusionGuard fg(fusion.get()); if (initial_info.isDynamic()) { const auto& cloned_conc_info = @@ -640,7 +644,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( fusion->stopManaging("initial_info"); } if (isDebugDumpEnabled(DebugDumpOption::FusionIrConcretized)) { - std::cout << "Concretized Fusion:" << std::endl; + std::cout << "\nConcretized Fusion:" << std::endl; fusion->printMath(); } kernel_runtimes.emplace_back(std::make_unique( From d41decc87da113bf4d61068656540bdbc64e7489 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 11:09:13 -0400 Subject: [PATCH 11/63] Remove empty reductions --- csrc/dynamic_transform.cpp | 193 +++++++++++++++++++------------------ csrc/dynamic_transform.h | 38 +++++--- 2 files changed, 123 insertions(+), 108 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 1a3f3071179..03c1e278351 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,6 @@ DynamicTransformInitialInfo DynamicTransformInitialInfo::clone( cloned_info.dynamic_resizes_.push_back(ir_cloner.clone(op)); } } - 
cloned_info.expr_eval_ = expr_eval_.clone(ir_cloner); return cloned_info; } @@ -102,22 +102,12 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { info_.dynamic_reshapes_.push_back(op); // Input and output extent expressions both affect concretization - const auto& inp_dom = - TensorDomain::noReductions(inp_tv->getMaybeRFactorDomain()); - for (const auto id : inp_dom) { - // Try and evaluate the extent so that intermediate expressions are - // cached in expr_eval_ - auto ext = info_.expr_eval_.evaluate(id->extent()); - if (!ext.has_value()) { - leaf_dynamic_vals_.push_back(id->extent()); - } + for (const auto id : + TensorDomain::noReductions(inp_tv->getMaybeRFactorDomain())) { + leaf_dynamic_vals_.push_back(id->extent()); } - const auto& out_dom = out_tv->getMaybeRFactorDomain(); - for (const auto id : out_dom) { - auto ext = info_.expr_eval_.evaluate(id->extent()); - if (!ext.has_value()) { - leaf_dynamic_vals_.push_back(id->extent()); - } + for (const auto id : out_tv->getMaybeRFactorDomain()) { + leaf_dynamic_vals_.push_back(id->extent()); } } } @@ -126,9 +116,9 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { void handle(TensorView* tv) override { const auto& rfd = tv->getMaybeRFactorDomain(); for (auto id : rfd) { - auto extent_opt = info_.expr_eval_.evaluate(id->extent()); - if (!extent_opt.has_value() || extent_opt.value().as() == 0) { - info_.has_possible_empty_tensor_ = true; + if (!id->extent()->isConstScalar() || + id->extent()->getInt().value() == 0) { + info_.dynamic_extent_vals_.insert(id->extent()); leaf_dynamic_vals_.push_back(id->extent()); } if (!id->definition() || id->getIterType() != IterType::Symbolic) { @@ -242,18 +232,23 @@ DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( !fusion->isA(), "Invalid container. 
Kernel container not allowed.\n"); - // Make sure all exactly mapped IDs have the same value in the - // evaluator when any one of the IDs has a known value - expr_eval->propagateBoundValuesThroughExactMaps(fusion); - analyzeReshapes(info, expr_eval); analyzeResizes(info, expr_eval); + bool has_empty_tensor = false; + for (auto ext : info->getDynamicExtentVals()) { + if (expr_eval->evaluate(ext).value().as() == 0) { + has_empty_tensor = true; + break; + } + } // Find a minimal set of empty tensors to replace with full() calls // NOTE: this does a backward traversal from outputs. - empty_tensors_ = - EmptyBranchFinder(info->fusion(), expr_eval).getEmptyTensors(); + if (has_empty_tensor) { + empty_tensors_ = + EmptyBranchFinder(info->fusion(), expr_eval).getEmptyTensors(); + } } void DynamicTransformConcretizationInfo::analyzeReshapes( @@ -478,9 +473,12 @@ class DynamicTransformConcretizer : public OptOutMutator { private: void concretize(); - //! Set definitions of empty tensors to full() calls. + //! Set definitions of empty tensors to full() calls, replace reductions over + //! empty axes with full calls. void removeEmptyBranches(); + void replaceByFull(TensorView* tv, std::vector& new_shape); + void concretizeReshape(); void concretizeResize(); @@ -512,9 +510,7 @@ class DynamicTransformConcretizer : public OptOutMutator { }; void DynamicTransformConcretizer::concretize() { - removeEmptyBranches(); - - // First, concretize all dynamic reshape ops + // Concretize all dynamic reshape ops concretizeReshape(); // Set output IterTypes for dynamic resize ops @@ -527,6 +523,9 @@ void DynamicTransformConcretizer::concretize() { mutate(stmt); } } + + // Concretize empty tensors last. + removeEmptyBranches(); } void DynamicTransformConcretizer::removeEmptyBranches() { @@ -539,17 +538,74 @@ void DynamicTransformConcretizer::removeEmptyBranches() { new_shape.push_back(id->extent()); } for (auto ax : empty_tv_descr.empty_axes) { + // Hard-code zero extent for empty axes. 
This lets us detect empty input + // and output tensors during scheduling/execution. new_shape[ax] = tv->fusion()->zeroVal(); } - auto mut_tv = - full(new_shape, tv->fusion()->zeroVal(), tv->getDataType().value()); - registerConcretization(tv, mut_tv); - OptOutMutator::mutate(tv); - // Replace tv in Fusion outputs() if present - auto outputs = tv->fusion()->outputs(); - if (std::find(outputs.begin(), outputs.end(), tv) != outputs.end()) { - tv->fusion()->replaceOutput(tv, mut_tv); + + // If expr is a ReductionOp or WelfordOp over some empty axes, replace it + // with a call to full(). + for (auto use : tv->uses()) { + if (auto rop = dynamic_cast(use)) { + auto out = rop->out()->as(); + if (std::any_of( + empty_tv_descr.empty_axes.begin(), + empty_tv_descr.empty_axes.end(), + [&out](size_t ax) { + return out->getRootDomain().at(ax)->isReduction(); + })) { + auto nored_axes = + TensorDomain::noReductions(out->getMaybeRFactorDomain()); + // Output shape is simply the same as the original reduction. If there + // were zeros in the non-Reduction axes, it would be replaced by + // full() directly. 
+ std::vector out_shape(nored_axes.size()); + std::transform( + nored_axes.begin(), + nored_axes.end(), + out_shape.begin(), + [](IterDomain* id) -> Val* { return id->extent(); }); + replaceByFull(out, out_shape); + } + } else if (auto wop = dynamic_cast(use)) { + auto avg = wop->outAvg()->as(); + auto var = wop->outVar()->as(); + auto N = wop->outN()->as(); + if (std::any_of( + empty_tv_descr.empty_axes.begin(), + empty_tv_descr.empty_axes.end(), + [&avg](size_t ax) { + return avg->getRootDomain().at(ax)->isReduction(); + })) { + auto nored_axes = + TensorDomain::noReductions(avg->getMaybeRFactorDomain()); + std::vector out_shape(nored_axes.size()); + std::transform( + nored_axes.begin(), + nored_axes.end(), + out_shape.begin(), + [](IterDomain* id) -> Val* { return id->extent(); }); + replaceByFull(avg, out_shape); + replaceByFull(var, out_shape); + replaceByFull(N, out_shape); + } + } } + replaceByFull(tv, new_shape); + } +} + +void DynamicTransformConcretizer::replaceByFull( + TensorView* tv, + std::vector& new_shape) { + auto mut_tv = + full(new_shape, tv->fusion()->zeroVal(), tv->getDataType().value()); + registerConcretization(tv, mut_tv); + OptOutMutator::mutate(tv); + // Replace tv in Fusion outputs() if present + auto outputs = tv->fusion()->outputs(); + if (std::find(outputs.begin(), outputs.end(), tv) != outputs.end()) { + tv->fusion()->replaceOutput(tv, mut_tv); } } @@ -848,65 +904,10 @@ DynamicTransformConcretizationInfo DynamicTransform::getConcretizationInfo( Fusion* fusion, const DynamicTransformInitialInfo* info, const KernelArgumentHolder* args) { - // Copy the expression evaluator that has some values precomputed - auto expr_eval = info->getExpressionEvaluator(); - - // Bind input scalars and tensor metadata to symbolic scalars - TORCH_CHECK( - args->size() == fusion->inputs().size(), - "Received ", - args->size(), - " inputs but expected ", - fusion->inputs().size()); - for (auto i : c10::irange(args->size())) { - const auto& inpi = 
fusion->inputs()[i]; - const auto argi = (*args)[i]; - if (inpi->isIntegralScalar()) { - TORCH_CHECK( - argi->isType(ArgType::Long), - "Expected integer input at position ", - i, - " but found ", - argTypeToString(argi->type())); - - const int64_t arg_val = *reinterpret_cast(argi->arg()); - expr_eval.bind(inpi, arg_val); - } else if (inpi->isA()) { - const auto& tv = inpi->as(); - const auto& dom = tv->domain()->maybeRFactor(); - TORCH_CHECK( - argi->isType(ArgType::Tensor), - "Expected CUDA tensor at position ", - i, - " but found ", - argTypeToString(argi->type())); - const auto* targ = reinterpret_cast(argi); - for (auto j : c10::irange(dom.size())) { - auto size_j = targ->getSize((int64_t)j); - // Input can be expanded. See test FusionExpandRepro1860_CUDA - auto ext = dom[j]->hasExpandedExtent() ? dom[j]->expandedExtent() - : dom[j]->extent(); - // Extents can be concrete, in which case we should just check that the - // input size matches, but not try to bind them. - if (ext->isConstInt()) { - TORCH_INTERNAL_ASSERT( - ext->getInt().value() == size_j, - "Provided argument ", - targ->toString(), - " for input ", - i, - " (", - inpi->toString(), - ") does not match constant extent of IterDomain ", - dom[j]->toString(), - " at position ", - j); - } else { - expr_eval.bind(ext, size_j); - } - } - } - } + ExpressionEvaluator expr_eval = executor_utils::bindInputs(*args, fusion); + // Make sure all exactly mapped IDs have the same value in the + // evaluator when any one of the IDs has a known value + expr_eval.propagateBoundValuesThroughExactMaps(fusion); return DynamicTransformConcretizationInfo(fusion, info, &expr_eval); } diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 43131689a7c..d6d1187aeb4 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -43,23 +44,30 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { //! given some user input. 
In either of these cases, concretization may change //! the structure of the Fusion. bool isDynamic() const { - return has_possible_empty_tensor_ || !dynamic_reshapes_.empty() || + return hasPossibleEmptyTensor() || !dynamic_reshapes_.empty() || !dynamic_resizes_.empty(); } //! Return whether there are any tensors with unknown extent in some //! dimension, so that they might be empty bool hasPossibleEmptyTensor() const { - return has_possible_empty_tensor_; + return !dynamic_extent_vals_.empty(); } //! Return a set of scalars that are inputs or extents of input TensorViews //! and that appear in inputs to dynamic expressions. Any Vals not in this //! list do not affect concretization. - const std::unordered_set getRootDynamicVals() const { + const std::unordered_set& getRootDynamicVals() const { return root_dynamic_vals_; } + //! Return a set of scalars that appear as extents in TensorViews in the + //! Fusion. If any of these evaluate to zero, there is at least one empty + //! TensorView present. + const std::unordered_set& getDynamicExtentVals() const { + return dynamic_extent_vals_; + } + //! Return a vector of ViewOp expressions that have dynamic output shapes const std::vector& getDynamicReshapes() const { return dynamic_reshapes_; @@ -70,10 +78,6 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { return dynamic_resizes_; } - const ExpressionEvaluator& getExpressionEvaluator() const { - return expr_eval_; - } - std::string toString() const; DynamicTransformInitialInfo clone(IrCloner& ir_cloner) const; @@ -99,14 +103,13 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { std::vector dynamic_resizes_; - bool has_possible_empty_tensor_ = false; + // This is a minimal set of scalars to check for empty tensors. If any are + // zero, we should traverse to find empty tensors. 
+ std::unordered_set dynamic_extent_vals_; // Root Vals that determine concretization std::unordered_set root_dynamic_vals_; - // ExpressionEvaluator that we use to pre-compute as much as possible - ExpressionEvaluator expr_eval_; - friend class DynamicTransformInitialInfoBuilder; }; @@ -122,6 +125,18 @@ struct TORCH_CUDA_CU_API EmptyTensorDescriptor { bool operator!=(const EmptyTensorDescriptor& other) const { return !operator==(other); } + + size_t hash() const { + size_t hash = 0; + for (auto ax : empty_axes) { + hash <<= 3; + hash ^= ax; + } + // We need to hash the tv address here, since we could conceivably find two + // different tensors that are empty in the same axes. + hash ^= std::hash()(tv); + return hash; + } }; //! A set of transformations for a symbolic fusion with concrete sizes @@ -160,7 +175,6 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { void analyzeResizes( const DynamicTransformInitialInfo* info, ExpressionEvaluator* expr_eval); - Fusion* fusion() const { return fusion_; } From 96e105a410bc0c0fd1614cbf4d059c0f7ab8c726 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 11:57:43 -0400 Subject: [PATCH 12/63] Clean up EmptyBranchFinder --- csrc/dynamic_transform.cpp | 74 +++++++++++++++----------------------- 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 03c1e278351..9cf6c0a9474 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -163,22 +163,6 @@ class EmptyBranchFinder : public BackwardVisitor { traverseTo(fusion, fusion->outputs(), false); } - bool isTVEmpty(TensorView* tv) { - for (auto id : TensorDomain::noReductions(tv->getMaybeRFactorDomain())) { - auto extent_opt = expr_eval_->evaluate(id->extent()); - TORCH_INTERNAL_ASSERT( - extent_opt.has_value(), - "Cannot evaluate extent ", - id->extent(), - " of ", - tv->toString()); - if (extent_opt.value().as() == 0) { - return true; - } - } - return false; - } - 
std::vector getEmptyTensors() const { return empty_tensors_; } @@ -186,35 +170,34 @@ class EmptyBranchFinder : public BackwardVisitor { private: using BackwardVisitor::handle; - void handle(std::vector vals) { - for (auto v : vals) { - handle(v); - } - } - void handle(TensorView* tv) final { - if (isTVEmpty(tv)) { - if (tv->definition() && !tv->definition()->isA()) { - // Replace with full - std::vector empty_axes; - auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); - for (size_t i : c10::irange(rfactor.size())) { - auto id = rfactor.at(i); - auto extent_eval = expr_eval_->evaluate(id->extent()); - TORCH_INTERNAL_ASSERT( - extent_eval.has_value(), - "When finding empty tensors: could not evaluate extent of ", - id->toString()); - if (extent_eval.value().as() == 0) { - empty_axes.push_back(i); - } - } + std::vector empty_axes; + auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); + bool empty = false; + for (size_t i : c10::irange(rfactor.size())) { + auto id = rfactor.at(i); + auto extent_eval = expr_eval_->evaluate(id->extent()); + TORCH_INTERNAL_ASSERT( + extent_eval.has_value(), + "When finding empty tensors: could not evaluate extent of ", + id->toString()); + if (extent_eval.value().as() == 0) { + empty_axes.push_back(i); + empty = true; + } + } + if (empty) { + if (tv->definition()) { + // Replace with full. Note that even if the definition was a FullOp, we + // still mark this tensor for replacement, so that we can ensure the + // empty axes are marked with constant zeroes empty_tensors_.push_back(EmptyTensorDescriptor{tv, empty_axes}); } return; - } - if (tv->definition()) { - handle(tv->definition()->inputs()); + } else if (tv->definition()) { + for (auto v : tv->definition()->inputs()) { + handle(v); + } } } @@ -516,6 +499,9 @@ void DynamicTransformConcretizer::concretize() { // Set output IterTypes for dynamic resize ops concretizeResize(); + // Concretize empty tensors last. 
+ removeEmptyBranches(); + // Finally, propagate concretized domains auto all_stmts = StmtSort::getStmts(info_.fusion(), true); for (auto stmt : all_stmts) { @@ -523,9 +509,6 @@ void DynamicTransformConcretizer::concretize() { mutate(stmt); } } - - // Concretize empty tensors last. - removeEmptyBranches(); } void DynamicTransformConcretizer::removeEmptyBranches() { @@ -603,8 +586,7 @@ void DynamicTransformConcretizer::replaceByFull( registerConcretization(tv, mut_tv); OptOutMutator::mutate(tv); // Replace tv in Fusion outputs() if present - auto outputs = tv->fusion()->outputs(); - if (std::find(outputs.begin(), outputs.end(), tv) != outputs.end()) { + if (tv->isFusionOutput()) { tv->fusion()->replaceOutput(tv, mut_tv); } } From 38925784f0eaf10dcff0376326dbcddf953724ee Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 13:18:04 -0400 Subject: [PATCH 13/63] Simplify removeEmptyBranches --- csrc/dynamic_transform.cpp | 87 ++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 9cf6c0a9474..0d4bf0860bb 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -215,6 +215,9 @@ DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( !fusion->isA(), "Invalid container. Kernel container not allowed.\n"); + // Ensure we have propagated known values before evaluating extents + expr_eval->propagateBoundValuesThroughExactMaps(fusion); + analyzeReshapes(info, expr_eval); analyzeResizes(info, expr_eval); @@ -460,7 +463,12 @@ class DynamicTransformConcretizer : public OptOutMutator { //! empty axes with full calls. void removeEmptyBranches(); - void replaceByFull(TensorView* tv, std::vector& new_shape); + //! Modify the Fusion by replacing tv with output of full() expression in + //! outputs and all uses. 
+ void replaceByFull( + TensorView* tv, + std::vector& new_shape, + Val* fill_value = nullptr); void concretizeReshape(); @@ -526,50 +534,49 @@ void DynamicTransformConcretizer::removeEmptyBranches() { new_shape[ax] = tv->fusion()->zeroVal(); } + auto hasEmptyRootReductionAxis = [&empty_tv_descr](TensorView* out_tv) { + return std::any_of( + empty_tv_descr.empty_axes.begin(), + empty_tv_descr.empty_axes.end(), + [&out_tv](size_t ax) { + return out_tv->getRootDomain().at(ax)->isReduction(); + }); + }; + + // Given a TensorView, get a shape with hard-coded zeroes + auto reduction_shape = [](TensorView* out_tv) -> std::vector { + auto nored_axes = + TensorDomain::noReductions(out_tv->getMaybeRFactorDomain()); + // Output shape is simply the same as the original reduction. If there + // were zeros in the non-Reduction axes, it would be replaced by + // full() directly. + std::vector out_shape(nored_axes.size()); + std::transform( + nored_axes.begin(), + nored_axes.end(), + out_shape.begin(), + [](IterDomain* id) -> Val* { return id->extent(); }); + return out_shape; + }; + // If expr is a ReductionOp or WelfordOp over some empty axes, replace it // with a call to full(). for (auto use : tv->uses()) { if (auto rop = dynamic_cast(use)) { auto out = rop->out()->as(); - if (std::any_of( - empty_tv_descr.empty_axes.begin(), - empty_tv_descr.empty_axes.end(), - [&out](size_t ax) { - return out->getRootDomain().at(ax)->isReduction(); - })) { - auto nored_axes = - TensorDomain::noReductions(out->getMaybeRFactorDomain()); - // Output shape is simply the same as the original reduction. If there - // were zeros in the non-Reduction axes, it would be replaced by - // full() directly. 
- std::vector out_shape(nored_axes.size()); - std::transform( - nored_axes.begin(), - nored_axes.end(), - out_shape.begin(), - [](IterDomain* id) -> Val* { return id->extent(); }); + if (hasEmptyRootReductionAxis(out)) { + auto out_shape = reduction_shape(out); replaceByFull(out, out_shape); } } else if (auto wop = dynamic_cast(use)) { auto avg = wop->outAvg()->as(); auto var = wop->outVar()->as(); auto N = wop->outN()->as(); - if (std::any_of( - empty_tv_descr.empty_axes.begin(), - empty_tv_descr.empty_axes.end(), - [&avg](size_t ax) { - return avg->getRootDomain().at(ax)->isReduction(); - })) { - auto nored_axes = - TensorDomain::noReductions(avg->getMaybeRFactorDomain()); - std::vector out_shape(nored_axes.size()); - std::transform( - nored_axes.begin(), - nored_axes.end(), - out_shape.begin(), - [](IterDomain* id) -> Val* { return id->extent(); }); - replaceByFull(avg, out_shape); - replaceByFull(var, out_shape); + if (hasEmptyRootReductionAxis(avg)) { + auto out_shape = reduction_shape(avg); + auto nan = IrBuilder::create(0.0 / 0.0); + replaceByFull(avg, out_shape, nan); + replaceByFull(var, out_shape, nan); replaceByFull(N, out_shape); } } @@ -580,9 +587,15 @@ void DynamicTransformConcretizer::removeEmptyBranches() { void DynamicTransformConcretizer::replaceByFull( TensorView* tv, - std::vector& new_shape) { - auto mut_tv = - full(new_shape, tv->fusion()->zeroVal(), tv->getDataType().value()); + std::vector& new_shape, + Val* fill_value) { + if (!fill_value) { + fill_value = tv->fusion()->zeroVal(); + } + if (fill_value->getDataType().value() != tv->getDataType().value()) { + fill_value = castOp(tv->getDataType().value(), fill_value); + } + auto mut_tv = full(new_shape, fill_value, tv->getDataType().value()); registerConcretization(tv, mut_tv); OptOutMutator::mutate(tv); // Replace tv in Fusion outputs() if present From e38d02772da02e7aad7859412848cd12ad75f0fa Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 14:02:46 -0400 Subject: [PATCH 
14/63] Evaluate extents instead of shallow getInt --- csrc/dynamic_transform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 0d4bf0860bb..661c579cf0d 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -117,7 +117,7 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { const auto& rfd = tv->getMaybeRFactorDomain(); for (auto id : rfd) { if (!id->extent()->isConstScalar() || - id->extent()->getInt().value() == 0) { + id->extent()->evaluateInt() == 0) { info_.dynamic_extent_vals_.insert(id->extent()); leaf_dynamic_vals_.push_back(id->extent()); } From 6099940b89ebefab50dde242bddae5ad5b780ca5 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 14:18:28 -0400 Subject: [PATCH 15/63] Add FusionResizeMultiSliceEmpty_CUDA test Verifies that this branch fixes #365 --- csrc/dynamic_transform.cpp | 3 +-- test/test_resize.cpp | 45 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 661c579cf0d..a5e0fa2eddf 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -116,8 +116,7 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { void handle(TensorView* tv) override { const auto& rfd = tv->getMaybeRFactorDomain(); for (auto id : rfd) { - if (!id->extent()->isConstScalar() || - id->extent()->evaluateInt() == 0) { + if (!id->extent()->isConstScalar() || id->extent()->evaluateInt() == 0) { info_.dynamic_extent_vals_.insert(id->extent()); leaf_dynamic_vals_.push_back(id->extent()); } diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 29b16d6ed74..b7545e42f12 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2168,4 +2168,49 @@ TEST_F(NVFuserTest, FusionSqueezeSymbolic_CUDA) { "must concretize to IterType::Broadcast but found"))); } +TEST_F(NVFuserTest, 
FusionResizeMultiSliceEmpty_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + std::vector shape({9}); + // concrete shapes to avoid dynamic Fusion + auto tv0 = makeConcreteTensor(shape); + fusion->addInput(tv0); + + // Perform a size-1 slice and a size-0 slice on tv0. The size-1 slice + // could be size >1 with no change in the error. The order does not + // matter. Performing only one of these slices does not trigger the + // error and the output is correct in that case. If there are + // multiple size-0 slices the error is not triggered. It only seems + // to appear when there are both size-0 and size non-zero slices of + // the same tensor. + auto tv1 = slice( + tv0, + {{IrBuilder::create(0), + IrBuilder::create(1), + IrBuilder::create(1)}}); + fusion->addOutput(tv1); + auto tv2 = slice( + tv0, + {{IrBuilder::create(0), + IrBuilder::create(0), + IrBuilder::create(1)}}); + fusion->addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + + auto t0 = at::randn(shape, options); + std::vector aten_inputs({t0}); + + FusionExecutorCache executor_cache(std::move(fusion)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); + + auto ref0 = t0.index({at::indexing::Slice(0, 1)}); + auto ref1 = t0.index({at::indexing::Slice(0, 0)}); + + TORCH_CHECK(ref0.equal(cg_outputs[0])); + TORCH_CHECK(ref1.equal(cg_outputs[1])); +} + } // namespace nvfuser From 478ed4ac40947142694c4efeeaff87adb05fdfba Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 14:46:52 -0400 Subject: [PATCH 16/63] Add FusionReduceZeroElementTensor_CUDA --- csrc/dynamic_transform.cpp | 38 ++++++++++++++++++--------------- test/test_dynamic_transform.cpp | 21 ++++++++++++++++++ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index a5e0fa2eddf..bc60ed5d5c5 100644 --- a/csrc/dynamic_transform.cpp +++ 
b/csrc/dynamic_transform.cpp @@ -186,17 +186,11 @@ class EmptyBranchFinder : public BackwardVisitor { } } if (empty) { - if (tv->definition()) { - // Replace with full. Note that even if the definition was a FullOp, we - // still mark this tensor for replacement, so that we can ensure the - // empty axes are marked with constant zeroes - empty_tensors_.push_back(EmptyTensorDescriptor{tv, empty_axes}); - } + // Replace with full. Note that even if the definition was a FullOp, we + // still mark this tensor for replacement, so that we can ensure the + // empty axes are marked with constant zeroes + empty_tensors_.push_back(EmptyTensorDescriptor{tv, empty_axes}); return; - } else if (tv->definition()) { - for (auto v : tv->definition()->inputs()) { - handle(v); - } } } @@ -464,7 +458,7 @@ class DynamicTransformConcretizer : public OptOutMutator { //! Modify the Fusion by replacing tv with output of full() expression in //! outputs and all uses. - void replaceByFull( + TensorView* replaceEmpty( TensorView* tv, std::vector& new_shape, Val* fill_value = nullptr); @@ -500,6 +494,7 @@ class DynamicTransformConcretizer : public OptOutMutator { }; void DynamicTransformConcretizer::concretize() { + std::cout << "Concretizing with " << info_.toString() << std::endl; // Concretize all dynamic reshape ops concretizeReshape(); @@ -565,7 +560,7 @@ void DynamicTransformConcretizer::removeEmptyBranches() { auto out = rop->out()->as(); if (hasEmptyRootReductionAxis(out)) { auto out_shape = reduction_shape(out); - replaceByFull(out, out_shape); + replaceEmpty(out, out_shape); } } else if (auto wop = dynamic_cast(use)) { auto avg = wop->outAvg()->as(); @@ -574,20 +569,28 @@ void DynamicTransformConcretizer::removeEmptyBranches() { if (hasEmptyRootReductionAxis(avg)) { auto out_shape = reduction_shape(avg); auto nan = IrBuilder::create(0.0 / 0.0); - replaceByFull(avg, out_shape, nan); - replaceByFull(var, out_shape, nan); - replaceByFull(N, out_shape); + replaceEmpty(avg, out_shape, 
nan); + replaceEmpty(var, out_shape, nan); + replaceEmpty(N, out_shape); } } } - replaceByFull(tv, new_shape); + replaceEmpty(tv, new_shape); } } -void DynamicTransformConcretizer::replaceByFull( +TensorView* DynamicTransformConcretizer::replaceEmpty( TensorView* tv, std::vector& new_shape, Val* fill_value) { + if (!tv->definition()) { + TORCH_INTERNAL_ASSERT( + tv->isFusionInput(), + "Found TensorView ", + tv->toString(), + " which does not have a definition and is not a Fusion input."); + return tv; + } if (!fill_value) { fill_value = tv->fusion()->zeroVal(); } @@ -601,6 +604,7 @@ void DynamicTransformConcretizer::replaceByFull( if (tv->isFusionOutput()) { tv->fusion()->replaceOutput(tv, mut_tv); } + return mut_tv; } void DynamicTransformConcretizer::concretizeReshape() { diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 16636edfc8e..8fbd761f9d9 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1119,4 +1119,25 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { testValidate(&fusion, outputs, aten_inputs, {at2}, __LINE__, __FILE__); } +// Test that 0-dimensional tensors do not break reduction scheduler +TEST_F(NVFuserTest, FusionReduceZeroElementTensor_CUDA) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + std::vector input_shape{3, 4, 0, 5}; + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto tv1 = sum(tv0, {2}); + fusion->addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({at_x}); + auto t2 = at_x.sum({2}); + + testValidate( + executor_cache.fusion(), outputs, {at_x}, {t2}, __LINE__, __FILE__); +} + } // namespace nvfuser From 5ce87fed9ef3684f26bb4d4297657be3706d8123 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 14:51:54 
-0400 Subject: [PATCH 17/63] Sweep reduction dims in reduce zero elt test --- test/test_dynamic_transform.cpp | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 8fbd761f9d9..55761ea7677 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1119,25 +1119,27 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { testValidate(&fusion, outputs, aten_inputs, {at2}, __LINE__, __FILE__); } -// Test that 0-dimensional tensors do not break reduction scheduler +// Test that 0-dimensional tensors can be used in reductions TEST_F(NVFuserTest, FusionReduceZeroElementTensor_CUDA) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); + for (int reduction_dim : {1, 2}) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); - std::vector input_shape{3, 4, 0, 5}; - auto tv0 = makeSymbolicTensor(4); - fusion->addInput(tv0); - auto tv1 = sum(tv0, {2}); - fusion->addOutput(tv1); + std::vector input_shape{3, 4, 0, 5}; + auto tv0 = makeSymbolicTensor(4); + fusion->addInput(tv0); + auto tv1 = sum(tv0, {reduction_dim}); + fusion->addOutput(tv1); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor at_x = at::randn(input_shape, options); - FusionExecutorCache executor_cache(std::move(fusion)); - auto outputs = executor_cache.runFusionWithInputs({at_x}); - auto t2 = at_x.sum({2}); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_x = at::randn(input_shape, options); + FusionExecutorCache executor_cache(std::move(fusion)); + auto outputs = executor_cache.runFusionWithInputs({at_x}); + auto t2 = at_x.sum({reduction_dim}); - testValidate( - executor_cache.fusion(), outputs, {at_x}, {t2}, __LINE__, __FILE__); + testValidate( + executor_cache.fusion(), outputs, {at_x}, {t2}, __LINE__, __FILE__); + } } } // namespace nvfuser From 
13a5b5722d1256ca481a653ce0be937cdf3cf8b7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 14:57:15 -0400 Subject: [PATCH 18/63] Remove debug print --- csrc/dynamic_transform.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index bc60ed5d5c5..e34432d1d1c 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -494,7 +494,6 @@ class DynamicTransformConcretizer : public OptOutMutator { }; void DynamicTransformConcretizer::concretize() { - std::cout << "Concretizing with " << info_.toString() << std::endl; // Concretize all dynamic reshape ops concretizeReshape(); From c56bd8639c5d51c29e364c15e00cb001ebfc1282 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 15:22:29 -0400 Subject: [PATCH 19/63] Add failing tests --- test/test_dynamic_transform.cpp | 53 +++++++++++++++++++++++++++++++-- test/test_resize.cpp | 1 + 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 55761ea7677..0681c242878 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1079,13 +1079,61 @@ TEST_F(NVFuserTest, DynamicRewriteEmptyTensors2_CUDA) { __FILE__); auto runtime = fusion_executor_cache.getMostRecentKernelRuntime(); - auto runtime_outputs = runtime->fusionSegments()->outputs(); + auto runtime_fusion = runtime->fusionSegments(); + auto runtime_outputs = runtime_fusion->outputs(); EXPECT_EQ(runtime_outputs.size(), 1); - auto tv_output = runtime_outputs.at(0); + auto tv_output = runtime_outputs.at(0)->as(); auto def = tv_output->definition(); EXPECT_NE(def, nullptr); EXPECT_EQ(def->isA(), true); + + // Fusion output should have hardcoded zero extent after concretization + auto output_extent = tv_output->axis(0)->extent(); + EXPECT_TRUE(output_extent->isConstInt()); + EXPECT_EQ(output_extent->getInt(), 0); + + // Fusion input should have hardcoded zero extent after 
concretization + auto input_extent = + runtime_fusion->inputs().at(0)->as()->axis(0)->extent(); + EXPECT_TRUE(input_extent->isConstInt()); + EXPECT_EQ(input_extent->getInt(), 0); +} + +// Test that the vector of empty tensors is minimal +TEST_F(NVFuserTest, DynamicRewriteEmptyTensors3_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + auto x = makeSymbolicTensor({-1, -1}); + fusion.addInput(x); + + // Each of x, y, and z should be empty, but only z (and possibly x) needs to + // be marked empty. + auto y = add(x, x); + auto z = add(y, y); + fusion.addOutput(z); + + { + ExpressionEvaluator expr_eval; + + // input: 0, 2 + expr_eval.bind(x->axis(0)->extent(), 0); + expr_eval.bind(x->axis(1)->extent(), 2); + + auto initial_info = DynamicTransform::getInitialInfo(&fusion); + auto info = DynamicTransform::getConcretizationInfo( + &fusion, &initial_info, &expr_eval); + for (const auto& empty_desc : info.getEmptyTensors()) { + TORCH_CHECK( + y != empty_desc.tv, + "Expected ", + y->toString(), + " to not be marked empty as it is on a dead branch: ", + info.toString()); + } + } } // Test that a Symbolic root/Broadcast rfactor is not concretized to @@ -1120,6 +1168,7 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { } // Test that 0-dimensional tensors can be used in reductions +// See https://github.com/NVIDIA/Fuser/issues/264 TEST_F(NVFuserTest, FusionReduceZeroElementTensor_CUDA) { for (int reduction_dim : {1, 2}) { auto fusion = std::make_unique(); diff --git a/test/test_resize.cpp b/test/test_resize.cpp index b7545e42f12..89a3415bc66 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2168,6 +2168,7 @@ TEST_F(NVFuserTest, FusionSqueezeSymbolic_CUDA) { "must concretize to IterType::Broadcast but found"))); } +// See https://github.com/NVIDIA/Fuser/issues/365 TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) { auto fusion = std::make_unique(); FusionGuard 
fg(fusion.get()); From 145f4deca71e34bdd0e54fc8255726b23cf2048f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 16:24:37 -0400 Subject: [PATCH 20/63] Add Fusion::replaceInput With this the only failing test is due to not stopping traversal at empty tensor definitions. --- csrc/dynamic_transform.cpp | 49 +++++++++++++++++++++++---------- csrc/fusion.cpp | 29 +++++++++++++++++++ csrc/fusion.h | 3 ++ test/test_dynamic_transform.cpp | 30 ++++++++++++++++++++ 4 files changed, 96 insertions(+), 15 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index e34432d1d1c..b067fa58b15 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -519,7 +519,7 @@ void DynamicTransformConcretizer::removeEmptyBranches() { std::vector new_shape; new_shape.reserve(rfactor.size()); for (auto id : rfactor) { - new_shape.push_back(id->extent()); + new_shape.push_back(id->getMaybeExpandedExtent()); } for (auto ax : empty_tv_descr.empty_axes) { // Hard-code zero extent for empty axes. This lets us detect empty input @@ -548,7 +548,7 @@ void DynamicTransformConcretizer::removeEmptyBranches() { nored_axes.begin(), nored_axes.end(), out_shape.begin(), - [](IterDomain* id) -> Val* { return id->extent(); }); + [](IterDomain* id) -> Val* { return id->getMaybeExpandedExtent(); }); return out_shape; }; @@ -582,27 +582,46 @@ TensorView* DynamicTransformConcretizer::replaceEmpty( TensorView* tv, std::vector& new_shape, Val* fill_value) { + TensorView* mut_tv = nullptr; if (!tv->definition()) { + // No definition. Probably an input. 
TORCH_INTERNAL_ASSERT( - tv->isFusionInput(), - "Found TensorView ", - tv->toString(), - " which does not have a definition and is not a Fusion input."); - return tv; - } - if (!fill_value) { - fill_value = tv->fusion()->zeroVal(); - } - if (fill_value->getDataType().value() != tv->getDataType().value()) { - fill_value = castOp(tv->getDataType().value(), fill_value); + !tv->hasRFactor(), + "Found RFactor in input TensorView ", + tv->toString()); + std::vector expanded(tv->nDims()); + for (auto i : c10::irange(tv->nDims())) { + expanded[i] = tv->axis(i)->hasExpandedExtent(); + } + mut_tv = TensorViewBuilder() + .ndims(tv->nDims()) + .dtype(tv->getDataType().value()) + .contiguity(tv->getContiguity()) + .shape(new_shape) + .expanded(expanded) + .build(); + mut_tv->setMemoryType(MemoryType::Global); + } else { + if (!fill_value) { + fill_value = tv->fusion()->zeroVal(); + } + if (fill_value->getDataType().value() != tv->getDataType().value()) { + fill_value = castOp(tv->getDataType().value(), fill_value); + } + mut_tv = full(new_shape, fill_value, tv->getDataType().value()); } - auto mut_tv = full(new_shape, fill_value, tv->getDataType().value()); + registerConcretization(tv, mut_tv); OptOutMutator::mutate(tv); - // Replace tv in Fusion outputs() if present + + if (tv->isFusionInput()) { + tv->fusion()->replaceInput(tv, mut_tv); + } + if (tv->isFusionOutput()) { tv->fusion()->replaceOutput(tv, mut_tv); } + return mut_tv; } diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index 1adccf41021..50b97c76302 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -285,6 +285,35 @@ void Fusion::removeOutput(Val* output) { all_tv_uses_valid_ = false; } +void Fusion::replaceInput(Val* input, Val* replacement) { + auto find_input = std::find(inputs_.begin(), inputs_.end(), input); + TORCH_CHECK(find_input != inputs_.end(), "Unable to find input in Fusion"); + + std::replace_if( + inputs_.begin(), + inputs_.end(), + [&input](Val* v) { return v == input; }, + replacement); + + if 
(replacement->getValType().value() == ValType::TensorView) { + replacement->setIsFusionInput(true); + replacement->as()->setMemoryType(MemoryType::Global); + } + if (input->getValType().value() == ValType::TensorView) { + input->setIsFusionInput(false); + input->as()->setMemoryType(MemoryType::Local); + } + // Mark uses invalid so that they will be reset next time uses() is called + invalidateTvUses(); + + // Maintain aliased inputs + for (auto [aliased_output, aliased_input] : io_alias_) { + if (aliased_input == input) { + io_alias_[aliased_output] = replacement; + } + } +} + void Fusion::replaceOutput(Val* output, Val* replacement) { auto find_output = std::find(outputs_.begin(), outputs_.end(), output); TORCH_CHECK(find_output != outputs_.end(), "Unable to find output in Fusion"); diff --git a/csrc/fusion.h b/csrc/fusion.h index e06c1c12778..334833a5974 100644 --- a/csrc/fusion.h +++ b/csrc/fusion.h @@ -126,6 +126,9 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer { //! Deregister output as an output of the fusion void removeOutput(Val* output); + //! Replace input with another value + void replaceInput(Val* input, Val* replacement); + //! 
Replace output with another value void replaceOutput(Val* output, Val* replacement); diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 0681c242878..5252bb66d1e 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -1133,6 +1133,36 @@ TEST_F(NVFuserTest, DynamicRewriteEmptyTensors3_CUDA) { " to not be marked empty as it is on a dead branch: ", info.toString()); } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor at_x = at::randn({0, 3}, options); + std::vector aten_inputs = {at_x}; + + FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + + testValidate( + fusion_executor_cache.fusion(), + outputs, + aten_inputs, + {at_x + at_x + at_x}, + __LINE__, + __FILE__); + + // Expect input and output to have hardcoded zero extent. + auto runtime = fusion_executor_cache.getMostRecentKernelRuntime(); + auto runtime_fusion = runtime->fusionSegments(); + + auto input_extent = + runtime_fusion->inputs().at(0)->as()->axis(0)->extent(); + EXPECT_TRUE(input_extent->isConstInt()); + EXPECT_EQ(input_extent->getInt(), 0); + + auto output_extent = + runtime_fusion->outputs().at(0)->as()->axis(0)->extent(); + EXPECT_TRUE(output_extent->isConstInt()); + EXPECT_EQ(output_extent->getInt(), 0); } } From 91101b90776529a6695763a91d2fbb09521bef20 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 16:30:00 -0400 Subject: [PATCH 21/63] Silence clang-tidy --- csrc/dynamic_transform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index b067fa58b15..60e95965ade 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -590,7 +590,7 @@ TensorView* DynamicTransformConcretizer::replaceEmpty( "Found RFactor in input TensorView ", tv->toString()); std::vector expanded(tv->nDims()); - for (auto i : 
c10::irange(tv->nDims())) { + for (int i : c10::irange((int)tv->nDims())) { expanded[i] = tv->axis(i)->hasExpandedExtent(); } mut_tv = TensorViewBuilder() From 9a4f857e6d732f54e63f889aff6d850dd3c8eaf0 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 16:55:27 -0400 Subject: [PATCH 22/63] Fix test by switching from BackwardVisitor to standalone function --- csrc/dynamic_transform.cpp | 62 +++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 60e95965ade..3fa73932a60 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -151,31 +151,36 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { std::vector leaf_dynamic_vals_; }; -class EmptyBranchFinder : public BackwardVisitor { - public: - EmptyBranchFinder(Fusion* fusion, ExpressionEvaluator* expr_eval) - : expr_eval_(expr_eval) { - // We do not require the traversal to cover all outputs, because if we - // replace some outputs with calls to full() then any unused outputs will be - // ignored entirely. - must_cover_all_expr_outputs_ = false; - traverseTo(fusion, fusion->outputs(), false); - } - - std::vector getEmptyTensors() const { - return empty_tensors_; - } - - private: - using BackwardVisitor::handle; +//! This performs a depth-first search from outputs toward inputs for empty +//! tensors. It does not traverse past any zero tensors it finds; this is why +//! this is implemented as a single function instead of with BackwardVisitor. +//! Additionally, we check inputs since they might actually be disconnected from +//! outputs. 
+std::vector findEmptyTensors( + Fusion* fusion, + ExpressionEvaluator* expr_eval) { + std::vector empty_tensors; + std::vector vals(fusion->inputs()); + vals.insert(vals.end(), fusion->outputs().begin(), fusion->outputs().end()); + std::unordered_set visited; + + while (!vals.empty()) { + auto val = vals.back(); + vals.pop_back(); + if (!val->isA()) { + continue; + } + auto tv = val->as(); + if (visited.find(tv) != visited.end()) { + continue; + } - void handle(TensorView* tv) final { std::vector empty_axes; auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); bool empty = false; for (size_t i : c10::irange(rfactor.size())) { auto id = rfactor.at(i); - auto extent_eval = expr_eval_->evaluate(id->extent()); + auto extent_eval = expr_eval->evaluate(id->extent()); TORCH_INTERNAL_ASSERT( extent_eval.has_value(), "When finding empty tensors: could not evaluate extent of ", @@ -189,15 +194,17 @@ class EmptyBranchFinder : public BackwardVisitor { // Replace with full. Note that even if the definition was a FullOp, we // still mark this tensor for replacement, so that we can ensure the // empty axes are marked with constant zeroes - empty_tensors_.push_back(EmptyTensorDescriptor{tv, empty_axes}); - return; + empty_tensors.push_back(EmptyTensorDescriptor{tv, empty_axes}); + continue; + } + if (tv->definition()) { + for (auto inp : tv->definition()->inputs()) { + vals.push_back(inp); + } } } - - private: - ExpressionEvaluator* expr_eval_; - std::vector empty_tensors_; -}; + return empty_tensors; +} DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( Fusion* fusion, @@ -225,8 +232,7 @@ DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( // Find a minimal set of empty tensors to replace with full() calls // NOTE: this does a backward traversal from outputs. 
if (has_empty_tensor) { - empty_tensors_ = - EmptyBranchFinder(info->fusion(), expr_eval).getEmptyTensors(); + empty_tensors_ = findEmptyTensors(info->fusion(), expr_eval); } } From c56f16391670d258bd938e1ff4748cee7fe486b5 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 8 Jun 2023 17:00:24 -0400 Subject: [PATCH 23/63] Fix length check in conc info operator== --- csrc/dynamic_transform.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 3fa73932a60..441c28d4345 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -365,7 +365,8 @@ bool DynamicTransformConcretizationInfo::operator==( } if (reshape_transforms_.size() != other.reshape_transforms_.size() || - resize_transforms_.size() != other.resize_transforms_.size()) { + resize_transforms_.size() != other.resize_transforms_.size() || + empty_tensors_.size() != other.empty_tensors_.size()) { return false; } From 9f28007ef44325d4440af740836a55cb177f5b55 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 14 Jun 2023 12:47:28 -0400 Subject: [PATCH 24/63] Look up TVs by name() instead of holding ptrs in conc_info --- csrc/dynamic_transform.cpp | 20 +++++++++++++++++--- csrc/dynamic_transform.h | 23 ++++++++++++++++++++--- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 00c943c5d38..7d3566ee633 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -39,6 +39,18 @@ DynamicTransformInitialInfo DynamicTransformInitialInfo::clone( cloned_info.dynamic_resized_ids_.push_back(ir_cloner.clone(op)); } } + cloned_info.dynamic_extent_vals_.reserve(dynamic_extent_vals_.size()); + for (const auto v : dynamic_extent_vals_) { + if (v) { + cloned_info.dynamic_extent_vals_.insert(ir_cloner.clone(v)); + } + } + cloned_info.name_to_tensorview_.reserve(name_to_tensorview_.size()); + for (const auto kv : name_to_tensorview_) { + if 
(kv.second) { + cloned_info.name_to_tensorview_[kv.first] = ir_cloner.clone(kv.second); + } + } cloned_info.root_dynamic_vals_.reserve(root_dynamic_vals_.size()); for (const auto v : root_dynamic_vals_) { if (v) { @@ -109,6 +121,7 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { //! Detect dynamic IterDomain transforms when handling TensorViews void handle(TensorView* tv) override { + info_.name_to_tensorview_[tv->name()] = tv; const auto& rfd = tv->getMaybeRFactorDomain(); for (auto id : rfd) { if (!id->extent()->isConstScalar() || id->extent()->evaluateInt() == 0) { @@ -201,7 +214,7 @@ std::vector findEmptyTensors( // Replace with full. Note that even if the definition was a FullOp, we // still mark this tensor for replacement, so that we can ensure the // empty axes are marked with constant zeroes - empty_tensors.push_back(EmptyTensorDescriptor{tv, empty_axes}); + empty_tensors.push_back(EmptyTensorDescriptor{tv->name(), empty_axes}); continue; } if (tv->definition()) { @@ -408,7 +421,7 @@ std::string DynamicTransformConcretizationInfo::toString() const { std::string indent = " "; ss << indent << "Empty tensors:\n"; for (const auto& kv : empty_tensors_) { - ss << indent << indent << kv.tv->toString() + ss << indent << indent << initial_info_->lookUpTV(kv.tv_name)->toString() << " has zero extent in these axes:"; for (auto i : kv.empty_axes) { ss << " " << i; @@ -507,8 +520,9 @@ void DynamicTransformConcretizer::concretize() { } void DynamicTransformConcretizer::removeEmptyBranches() { + info_->initialInfo()->fusion()->printMath(); for (const auto& empty_tv_descr : info_->getEmptyTensors()) { - auto tv = empty_tv_descr.tv; + auto tv = info_->initialInfo()->lookUpTV(empty_tv_descr.tv_name); auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); std::vector new_shape; new_shape.reserve(rfactor.size()); diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index f96c32b9800..23255129b63 100644 --- 
a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -80,6 +80,15 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { return dynamic_resized_ids_; } + TensorView* lookUpTV(size_t tv_name) const { + auto it = name_to_tensorview_.find(tv_name); + TORCH_INTERNAL_ASSERT( + it != name_to_tensorview_.end(), + "Could not find TensorView with name ", + tv_name); + return it->second; + } + std::string toString() const; DynamicTransformInitialInfo clone(IrCloner& ir_cloner) const; @@ -98,6 +107,14 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { private: DynamicTransformInitialInfo(Fusion* fusion) : fusion_(fusion) {} + // Holds mapping from the name() of a TensorView to its value. This is so that + // we can hold only the name of a tensor in conc_info and still be able to + // access a cloned TensorView. Holding pointers directly would not work in + // such a case since after cloning we no longer have a mapping between + // original Vals and cloned Vals. Note that the functionality offered by this + // map probably belongs in Fusion instead. + std::unordered_map name_to_tensorview_; + private: Fusion* fusion_ = nullptr; @@ -123,11 +140,11 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { //! Describes known empty dimensions in a TensorView's maybe RFactor domain struct TORCH_CUDA_CU_API EmptyTensorDescriptor { - TensorView* tv; + size_t tv_name; std::vector empty_axes; bool operator==(const EmptyTensorDescriptor& other) const { - return tv == other.tv && empty_axes == other.empty_axes; + return tv_name == other.tv_name && empty_axes == other.empty_axes; } bool operator!=(const EmptyTensorDescriptor& other) const { @@ -142,7 +159,7 @@ struct TORCH_CUDA_CU_API EmptyTensorDescriptor { } // We need to hash the tv address here, since we could conceivably find two // different tensors that are empty in the same axes. 
- hash ^= std::hash()(tv); + hash ^= std::hash()(tv_name); return hash; } }; From c80a93bc09508d6e4701cf7f566e2174b761c63f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 14 Jun 2023 12:48:23 -0400 Subject: [PATCH 25/63] Change assumption in PadShmoo due to empty concretization change --- test/test_dynamic_transform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index f7ab55b1a09..3a6f3111b34 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -956,7 +956,7 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { // Test zero-dimensional input //{{3, 0}, {0, 0}, false}, // SIGFPE (see #264 above) - {{3, 0}, {1, 1}, false}, + {{3, 0}, {1, 1}, true}, // zero-dimensional concretizes differently //{{3, 0}, {-1, 1}, false}, // SIGFPE (see #264 above) }; // NOLINTEND(bugprone-implicit-widening-of-multiplication-result) From 08c4edc88b3daad28036db3507c0376cf63c2356 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 14 Jun 2023 12:49:51 -0400 Subject: [PATCH 26/63] Print name to TV mapping and dyn extent vals --- csrc/dynamic_transform.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 7d3566ee633..505d2bcb059 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -72,6 +72,15 @@ std::string DynamicTransformInitialInfo::toString() const { for (const auto& op : dynamic_resized_ids_) { ss << indent << indent << op->toString() << "\n"; } + ss << indent << "Dynamic extent Vals:\n"; + for (const auto& v : dynamic_extent_vals_) { + ss << indent << indent << v->toString() << "\n"; + } + ss << indent << "Name to TensorView mapping:\n"; + for (const auto& kv : name_to_tensorview_) { + ss << indent << indent << kv.first << " => " << kv.second->toString() + << "\n"; + } ss << indent << "Root dynamic Vals:\n"; for (const auto& v : root_dynamic_vals_) { ss << indent << 
indent << v->toString() << "\n"; From 7e770ace3a3df4eda5692ccf3711c00eb582fdd9 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 14 Jun 2023 12:50:49 -0400 Subject: [PATCH 27/63] Change placement of FusionGuard in getKernelRuntimeFor --- csrc/kernel_cache.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/csrc/kernel_cache.cpp b/csrc/kernel_cache.cpp index 0480ca3eb66..a8093d0a19f 100644 --- a/csrc/kernel_cache.cpp +++ b/csrc/kernel_cache.cpp @@ -620,9 +620,6 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( // Clone fusion_ so that we can safely use an ExpressionEvaluator on it, for // the purposes of computing the concretization info. auto conc_fusion = std::make_unique(*fusion_); - - // concretize fusion_ for use in this runtime - FusionGuard fg(conc_fusion.get()); if (initial_info.isDynamic()) { const auto& conc_initial_info = conc_fusion->getManaged("initial_info"); @@ -645,6 +642,7 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( conc_fusion->printMath(); } } + FusionGuard fg(conc_fusion.get()); kernel_runtimes.emplace_back(std::make_unique( std::move(conc_fusion), args, forced_index_type)); kernel_runtime = kernel_runtimes.back().get(); From 5c0a9e053954672d748c87b4b1fe460cb93c7aaa Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 14 Jun 2023 12:51:41 -0400 Subject: [PATCH 28/63] Handle PadOp Use FusionGuard instead of passing fusion around. Properly mark visited TVs. --- csrc/dynamic_transform.cpp | 93 ++++++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 505d2bcb059..121e8176077 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -186,8 +186,8 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { //! Additionally, we check inputs since they might actually be disconnected from //! outputs. 
std::vector findEmptyTensors( - Fusion* fusion, ExpressionEvaluator* expr_eval) { + auto fusion = FusionGuard::getCurFusion(); std::vector empty_tensors; std::vector vals(fusion->inputs()); vals.insert(vals.end(), fusion->outputs().begin(), fusion->outputs().end()); @@ -203,6 +203,7 @@ std::vector findEmptyTensors( if (visited.find(tv) != visited.end()) { continue; } + visited.insert(tv); std::vector empty_axes; auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); @@ -253,7 +254,12 @@ DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( bool has_empty_tensor = false; for (auto ext : initial_info_->getDynamicExtentVals()) { - if (expr_eval->evaluate(ext).value().as() == 0) { + auto ext_opt = expr_eval->evaluate(ext); + TORCH_INTERNAL_ASSERT( + ext_opt.has_value(), + "Could not evaluate dynamic extent: ", + ext->toString()); + if (ext_opt.value().as() == 0) { has_empty_tensor = true; break; } @@ -261,7 +267,7 @@ DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( // Find a minimal set of empty tensors to replace with full() calls // NOTE: this does a backward traversal from outputs. if (has_empty_tensor) { - empty_tensors_ = findEmptyTensors(initial_info_->fusion(), expr_eval); + empty_tensors_ = findEmptyTensors(expr_eval); } } @@ -462,6 +468,7 @@ class DynamicTransformConcretizer : public OptOutMutator { TORCH_INTERNAL_ASSERT( fusion == info->fusion(), "Invalid DynamicTransformInitialInfo. The associated Fusion is different from the given Fusion"); + FusionGuard fg(fusion); concretize(); } @@ -474,7 +481,7 @@ class DynamicTransformConcretizer : public OptOutMutator { //! Modify the Fusion by replacing tv with output of full() expression in //! outputs and all uses. 
- TensorView* replaceEmpty( + TensorView* replaceWithFull( TensorView* tv, std::vector& new_shape, Val* fill_value = nullptr); @@ -516,7 +523,8 @@ void DynamicTransformConcretizer::concretize() { // Set output IterTypes for dynamic resize ops concretizeResize(); - // Concretize empty tensors last. + // Concretize empty tensors last in case some empty tensor are fed into + // replaced dynamic ops. removeEmptyBranches(); // Finally, propagate concretized domains @@ -541,6 +549,7 @@ void DynamicTransformConcretizer::removeEmptyBranches() { for (auto ax : empty_tv_descr.empty_axes) { // Hard-code zero extent for empty axes. This lets us detect empty input // and output tensors during scheduling/execution. + registerConcretization(new_shape[ax], tv->fusion()->zeroVal()); new_shape[ax] = tv->fusion()->zeroVal(); } @@ -554,7 +563,7 @@ void DynamicTransformConcretizer::removeEmptyBranches() { }; // Given a TensorView, get a shape with hard-coded zeroes - auto reduction_shape = [](TensorView* out_tv) -> std::vector { + auto orig_shape = [](TensorView* out_tv) -> std::vector { auto nored_axes = TensorDomain::noReductions(out_tv->getMaybeRFactorDomain()); // Output shape is simply the same as the original reduction. If there @@ -569,33 +578,73 @@ void DynamicTransformConcretizer::removeEmptyBranches() { return out_shape; }; - // If expr is a ReductionOp or WelfordOp over some empty axes, replace it - // with a call to full(). + std::unordered_map replaced; + auto maybeReplaced = [&replaced](TensorView* tv) -> TensorView* { + auto it = replaced.find(tv); + if (it == replaced.end()) { + return tv; + } + return it->second; + }; + + // Replace uses whose outputs might not be empty. Many expressions are + // guaranteed to have empty outputs if any of the inputs are empty; for + // example simple unary or binary ops. 
In those cases, we don't need to + // doctor the Fusion since they will have an empty tensor downstream which + // will cut off their dependence, resulting in those uses becoming dead + // code. + // + // Other expressions can convert an empty tensor into a non-empty tensor; + // particularly pad, cat, and reduction ops. These ops might have + // non-empty outputs so in order to guarantee that all (non- input or + // output) tensors are removed, we need to replace those ops with an + // equivalent that does not have any empty inputs. for (auto use : tv->uses()) { + // If use is a ReductionOp or WelfordOp over some empty axes, replace it + // with a call to full(). if (auto rop = dynamic_cast(use)) { - auto out = rop->out()->as(); + auto out = maybeReplaced(rop->out()->as()); if (hasEmptyRootReductionAxis(out)) { - auto out_shape = reduction_shape(out); - replaceEmpty(out, out_shape); + auto out_shape = orig_shape(out); + replaced[out] = replaceWithFull(out, out_shape); } } else if (auto wop = dynamic_cast(use)) { - auto avg = wop->outAvg()->as(); - auto var = wop->outVar()->as(); - auto N = wop->outN()->as(); + auto avg = maybeReplaced(wop->outAvg()->as()); + auto var = maybeReplaced(wop->outVar()->as()); + auto N = maybeReplaced(wop->outN()->as()); if (hasEmptyRootReductionAxis(avg)) { - auto out_shape = reduction_shape(avg); + auto out_shape = orig_shape(avg); auto nan = IrBuilder::create(0.0 / 0.0); - replaceEmpty(avg, out_shape, nan); - replaceEmpty(var, out_shape, nan); - replaceEmpty(N, out_shape); + replaced[avg] = replaceWithFull(avg, out_shape, nan); + replaced[var] = replaceWithFull(var, out_shape, nan); + replaced[N] = replaceWithFull(N, out_shape); } + } else if (auto pop = dynamic_cast(use)) { + auto out = maybeReplaced(pop->out()->as()); + auto out_shape = orig_shape(out); + // Wherever there is a zero in the input, we will replace the original + // output extent so that we no longer reference the now-zero input + // extent + for (auto i : 
empty_tv_descr.empty_axes) { + auto pad_widths = pop->getPadWidths((int)i); + out_shape[i] = add(pad_widths.first, pad_widths.second); + } + replaced[out] = replaceWithFull(out, out_shape, pop->value()); } + //} else if (auto cop = dynamic_cast(use)) { + //} + } + if (tv->isFusionInput()) { + // OptOutMutator::mutate(tv) merely changes the TensorDomain of tv without + // actually replacing tv itself. + OptOutMutator::mutate(tv); + } else { + replaced[tv] = replaceWithFull(tv, new_shape); } - replaceEmpty(tv, new_shape); } } -TensorView* DynamicTransformConcretizer::replaceEmpty( +TensorView* DynamicTransformConcretizer::replaceWithFull( TensorView* tv, std::vector& new_shape, Val* fill_value) { @@ -631,6 +680,10 @@ TensorView* DynamicTransformConcretizer::replaceEmpty( registerConcretization(tv, mut_tv); OptOutMutator::mutate(tv); + for (auto use : tv->uses()) { + ir_utils::replaceValInExpr(use, tv, mut_tv); + } + if (tv->isFusionInput()) { tv->fusion()->replaceInput(tv, mut_tv); } From b52df95f7cf1604d1131bb55a09296411d7fc8f7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 14 Jun 2023 13:04:31 -0400 Subject: [PATCH 29/63] Remove stray debugging printMath --- csrc/dynamic_transform.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 121e8176077..4a10fb741ee 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -537,7 +537,6 @@ void DynamicTransformConcretizer::concretize() { } void DynamicTransformConcretizer::removeEmptyBranches() { - info_->initialInfo()->fusion()->printMath(); for (const auto& empty_tv_descr : info_->getEmptyTensors()) { auto tv = info_->initialInfo()->lookUpTV(empty_tv_descr.tv_name); auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); From a6644641a0f49492d9f6194e380a32151a7edc39 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 21 Jun 2023 11:23:39 -0400 Subject: [PATCH 30/63] Replace cats with empty inputs. 
--- csrc/dynamic_transform.cpp | 106 ++++++++++++++++++++++++++------ test/test_dynamic_transform.cpp | 34 ++++++++++ 2 files changed, 120 insertions(+), 20 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 4a10fb741ee..7f6ed669d99 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -486,6 +487,10 @@ class DynamicTransformConcretizer : public OptOutMutator { std::vector& new_shape, Val* fill_value = nullptr); + //! Replace a TensorView with a new one in all uses, and in inputs and + //! outputs. + void replaceTV(TensorView* old_tv, TensorView* new_tv); + void concretizeReshape(); void concretizeResize(); @@ -620,18 +625,74 @@ void DynamicTransformConcretizer::removeEmptyBranches() { } } else if (auto pop = dynamic_cast(use)) { auto out = maybeReplaced(pop->out()->as()); - auto out_shape = orig_shape(out); - // Wherever there is a zero in the input, we will replace the original - // output extent so that we no longer reference the now-zero input - // extent - for (auto i : empty_tv_descr.empty_axes) { - auto pad_widths = pop->getPadWidths((int)i); - out_shape[i] = add(pad_widths.first, pad_widths.second); + + // A cat op can have input empty tensors and still output a non-empty + // tensor. This is only possible if there is more than one input, so we + // only need to handle those cases. We find the non-empty inputs to cat + // then replace with another cat (or `set` if n=1). + // + // [Detecting cat ops] + // The `cat` function creates a CatOp object, but its inputs() are not + // the original inputs. Rather, they are the inputs after padding to the + // output extent in the concatenated dimension. 
Thus, in the IR graph, + // instead of the following: + // + // T0 T1 T2 + // \ | / + // CatOp + // | + // T3 + // + // a cat is represented as: + // T0 T1 T2 + // | | | + // PadOp PadOp PadOp + // \ | / + // CatOp + // | + // T3 + if (pop->out()->uses().size() == 1 && + pop->out()->uses()[0]->isA()) { + auto cop = pop->out()->uses()[0]->as(); + std::vector nonempty_inputs; + for (auto inp : cop->inputs()) { + // Each "input" to CatOp is a pad() of the corresponding _actual_ + // input. Here we peel off the pad op to collect the non-padded cat + // inputs. + auto padded_inp_tv = inp->as(); + TORCH_INTERNAL_ASSERT( + padded_inp_tv->definition() && + padded_inp_tv->definition()->isA(), + "Input to cat should have definition that is a PadOp"); + auto inp_tv = padded_inp_tv->definition() + ->as() + ->in() + ->as(); + + if (inp_tv != tv) { + // we could remove other empty tensors here while we're at it. + // They will get removed by further passes anyway though as tv + // ranges over all empty tensors. + nonempty_inputs.push_back(inp_tv); + } + } + auto old_cat = cop->output(0)->as(); + auto new_cat = nonempty_inputs.size() == 1 + ? 
set(nonempty_inputs[0]) + : cat(nonempty_inputs, cop->concatenatedDim()); + replaceTV(old_cat, new_cat); + } else { // Replace pads that are not part of CatOps with full() + auto out_shape = orig_shape(out); + // Wherever there is a zero in the input, we will replace the original + // output extent so that we no longer reference the now-zero input + // extent + for (auto i : empty_tv_descr.empty_axes) { + auto pad_widths = pop->getPadWidths((int)i); + out_shape[i] = add(pad_widths.first, pad_widths.second); + } + replaced[out] = replaceWithFull(out, out_shape, pop->value()); } - replaced[out] = replaceWithFull(out, out_shape, pop->value()); } - //} else if (auto cop = dynamic_cast(use)) { - //} } if (tv->isFusionInput()) { // OptOutMutator::mutate(tv) merely changes the TensorDomain of tv without @@ -675,23 +736,28 @@ TensorView* DynamicTransformConcretizer::replaceWithFull( } mut_tv = full(new_shape, fill_value, tv->getDataType().value()); } + replaceTV(tv, mut_tv); - registerConcretization(tv, mut_tv); - OptOutMutator::mutate(tv); + return mut_tv; +} - for (auto use : tv->uses()) { - ir_utils::replaceValInExpr(use, tv, mut_tv); - } +void DynamicTransformConcretizer::replaceTV( + TensorView* old_tv, + TensorView* new_tv) { + registerConcretization(old_tv, new_tv); + OptOutMutator::mutate(old_tv); - if (tv->isFusionInput()) { - tv->fusion()->replaceInput(tv, mut_tv); + for (auto use : old_tv->uses()) { + ir_utils::replaceValInExpr(use, old_tv, new_tv); } - if (tv->isFusionOutput()) { - tv->fusion()->replaceOutput(tv, mut_tv); + if (old_tv->isFusionInput()) { + old_tv->fusion()->replaceInput(old_tv, new_tv); } - return mut_tv; + if (old_tv->isFusionOutput()) { + old_tv->fusion()->replaceOutput(old_tv, new_tv); + } } void DynamicTransformConcretizer::concretizeReshape() { diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 3a6f3111b34..d9b06a05b71 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ 
-994,4 +994,38 @@ TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { testValidate(&fusion, outputs, aten_inputs, {at2}, __LINE__, __FILE__); } +// Test that empty input to cat is concretized away +TEST_F(NVFuserTest, FusionDynamicEmptyCat_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(fusion_ptr.get()); + + auto tv0 = makeSymbolicTensor(1); + fusion.addInput(tv0); + auto tv1 = makeSymbolicTensor(1); + fusion.addInput(tv1); + + auto tv2 = cat({tv0, tv1}, 0); + + fusion.addOutput(tv2); + + // Check correctness + FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at0 = at::randn({5}, options); + at::Tensor at1 = at::randn({0}, options); + std::vector aten_inputs = {at0, at1}; + auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto at2 = at::cat({at0, at1}, 0); + testValidate(&fusion, outputs, aten_inputs, {at2}, __LINE__, __FILE__); + + // Check that fusion consists only of tv2 = set(tv0) + auto fkr = fusion_executor_cache.getMostRecentKernelRuntime(); + auto seg_fusion = fkr->fusionSegments(); + auto output_def = seg_fusion->outputs()[0]->definition(); + EXPECT_TRUE(output_def->isA()); + EXPECT_EQ(output_def->as()->opType(), LoadStoreOpType::Set); + EXPECT_EQ(output_def->input(0), seg_fusion->inputs()[0]); +} + } // namespace nvfuser From c10f56b07c478aa3d6d177e30cb70704daac7908 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 21 Jun 2023 11:27:46 -0400 Subject: [PATCH 31/63] Add test with three catted tensors, only one empty --- test/test_dynamic_transform.cpp | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index d9b06a05b71..616d2c408ed 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -995,7 +995,36 @@ 
TEST_F(NVFuserTest, FusionDynamicSliceToBroadcast_CUDA) { } // Test that empty input to cat is concretized away -TEST_F(NVFuserTest, FusionDynamicEmptyCat_CUDA) { +TEST_F(NVFuserTest, FusionDynamicEmptyCat1_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(fusion_ptr.get()); + + auto tv0 = makeSymbolicTensor(1); + fusion.addInput(tv0); + auto tv1 = makeSymbolicTensor(1); + fusion.addInput(tv1); + auto tv2 = makeSymbolicTensor(1); + fusion.addInput(tv2); + + auto tv3 = cat({tv0, tv1, tv2}, 0); + + fusion.addOutput(tv3); + + // Check correctness + FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr)); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at0 = at::randn({5}, options); + at::Tensor at1 = at::randn({0}, options); + at::Tensor at2 = at::randn({3}, options); + std::vector aten_inputs = {at0, at1, at2}; + auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs); + auto at3 = at::cat({at0, at1, at2}, 0); + testValidate(&fusion, outputs, aten_inputs, {at3}, __LINE__, __FILE__); +} + +// Test that empty input to cat is concretized away +TEST_F(NVFuserTest, FusionDynamicEmptyCat2_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(fusion_ptr.get()); From 97cf4416264b85aa11df530f857d470c5263561c Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 21 Jun 2023 11:38:11 -0400 Subject: [PATCH 32/63] Bind tv1 extents in DynamicTransform1_CUDA This is necessary now for empty checking during concretization. 
--- test/test_dynamic_transform.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 616d2c408ed..50862aa886f 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -60,6 +60,8 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 3, 4 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); + expr_eval.bind(tv1->axis(0)->extent(), 3); + expr_eval.bind(tv1->axis(1)->extent(), 4); expr_eval.bind(reshape_shape0, 3); expr_eval.bind(reshape_shape1, 4); @@ -78,6 +80,8 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 3, -1 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); + expr_eval.bind(tv1->axis(0)->extent(), 3); + expr_eval.bind(tv1->axis(1)->extent(), 4); expr_eval.bind(reshape_shape0, 3); expr_eval.bind(reshape_shape1, -1); @@ -96,6 +100,8 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 5, -1 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); + expr_eval.bind(tv1->axis(0)->extent(), 5); + expr_eval.bind(tv1->axis(1)->extent(), 3); expr_eval.bind(reshape_shape0, 5); expr_eval.bind(reshape_shape1, -1); From 9bdb140140e4d72aa88500612fc7d12e3f2dd8a8 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 21 Jun 2023 12:10:46 -0400 Subject: [PATCH 33/63] Only bind tv1 extents in the one case. 
See comment --- test/test_dynamic_transform.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index 50862aa886f..3438a25cf8c 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -60,8 +60,6 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 3, 4 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); - expr_eval.bind(tv1->axis(0)->extent(), 3); - expr_eval.bind(tv1->axis(1)->extent(), 4); expr_eval.bind(reshape_shape0, 3); expr_eval.bind(reshape_shape1, 4); @@ -80,11 +78,16 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 3, -1 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); - expr_eval.bind(tv1->axis(0)->extent(), 3); - expr_eval.bind(tv1->axis(1)->extent(), 4); expr_eval.bind(reshape_shape0, 3); expr_eval.bind(reshape_shape1, -1); + // In this case, if we do not bind tv1->axis(1)->extent(), we get a failure + // to evaluate it when checking whether tv1 is empty. It is possible to + // infer that it is not empty in this case, but it would require replicating + // some of the ExpressionEvaluator::propagateBoundValuesThroughExactMaps() + // functionality inside concretization, which is not implemented. 
+ expr_eval.bind(tv1->axis(1)->extent(), 4); + auto initial_info = DynamicTransform::getInitialInfo(&fusion); auto info = DynamicTransformConcretizationInfo(&initial_info, &expr_eval); TORCH_CHECK( @@ -100,8 +103,6 @@ TEST_F(NVFuserTest, DynamicTransform1_CUDA) { // output: 5, -1 expr_eval.bind(tv0->axis(0)->extent(), 4); expr_eval.bind(tv0->axis(1)->extent(), 3); - expr_eval.bind(tv1->axis(0)->extent(), 5); - expr_eval.bind(tv1->axis(1)->extent(), 3); expr_eval.bind(reshape_shape0, 5); expr_eval.bind(reshape_shape1, -1); From b45638d2459b9c21b9d6b2712434c34870a443ad Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 21 Jun 2023 12:36:17 -0400 Subject: [PATCH 34/63] Minor cleanup Make explicit that we do not replace inputs --- csrc/dynamic_transform.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 7f6ed669d99..7e0aeb6e359 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -542,6 +542,7 @@ void DynamicTransformConcretizer::concretize() { } void DynamicTransformConcretizer::removeEmptyBranches() { + auto fusion = FusionGuard::getCurFusion(); for (const auto& empty_tv_descr : info_->getEmptyTensors()) { auto tv = info_->initialInfo()->lookUpTV(empty_tv_descr.tv_name); auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); @@ -553,8 +554,8 @@ void DynamicTransformConcretizer::removeEmptyBranches() { for (auto ax : empty_tv_descr.empty_axes) { // Hard-code zero extent for empty axes. This lets us detect empty input // and output tensors during scheduling/execution. 
- registerConcretization(new_shape[ax], tv->fusion()->zeroVal()); - new_shape[ax] = tv->fusion()->zeroVal(); + registerConcretization(new_shape[ax], fusion->zeroVal()); + new_shape[ax] = fusion->zeroVal(); } auto hasEmptyRootReductionAxis = [&empty_tv_descr](TensorView* out_tv) { @@ -694,11 +695,7 @@ void DynamicTransformConcretizer::removeEmptyBranches() { } } } - if (tv->isFusionInput()) { - // OptOutMutator::mutate(tv) merely changes the TensorDomain of tv without - // actually replacing tv itself. - OptOutMutator::mutate(tv); - } else { + if (!tv->isFusionInput()) { replaced[tv] = replaceWithFull(tv, new_shape); } } From 3026cc6c7910d6ff5d7bb72f68b2b3e855b723a1 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Jun 2023 09:52:17 -0400 Subject: [PATCH 35/63] Update comment on initial info handling of TVs --- csrc/dynamic_transform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 7e0aeb6e359..ecc1a670600 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -129,7 +129,7 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { } } - //! Detect dynamic IterDomain transforms when handling TensorViews + //! 
Detect possibly empty TensorViews and dynamic IterDomain transforms void handle(TensorView* tv) override { info_.name_to_tensorview_[tv->name()] = tv; const auto& rfd = tv->getMaybeRFactorDomain(); From 8b8524e5a34c84e7ef428f875add3d2a0aaacc16 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Jun 2023 09:54:54 -0400 Subject: [PATCH 36/63] Rename dynamic_extent_vals to maybe_zero_extents_ --- csrc/dynamic_transform.cpp | 10 +++++----- csrc/dynamic_transform.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index ecc1a670600..30550dc5586 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -40,10 +40,10 @@ DynamicTransformInitialInfo DynamicTransformInitialInfo::clone( cloned_info.dynamic_resized_ids_.push_back(ir_cloner.clone(op)); } } - cloned_info.dynamic_extent_vals_.reserve(dynamic_extent_vals_.size()); - for (const auto v : dynamic_extent_vals_) { + cloned_info.maybe_zero_extents_.reserve(maybe_zero_extents_.size()); + for (const auto v : maybe_zero_extents_) { if (v) { - cloned_info.dynamic_extent_vals_.insert(ir_cloner.clone(v)); + cloned_info.maybe_zero_extents_.insert(ir_cloner.clone(v)); } } cloned_info.name_to_tensorview_.reserve(name_to_tensorview_.size()); @@ -74,7 +74,7 @@ std::string DynamicTransformInitialInfo::toString() const { ss << indent << indent << op->toString() << "\n"; } ss << indent << "Dynamic extent Vals:\n"; - for (const auto& v : dynamic_extent_vals_) { + for (const auto& v : maybe_zero_extents_) { ss << indent << indent << v->toString() << "\n"; } ss << indent << "Name to TensorView mapping:\n"; @@ -135,7 +135,7 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { const auto& rfd = tv->getMaybeRFactorDomain(); for (auto id : rfd) { if (!id->extent()->isConstScalar() || id->extent()->evaluateInt() == 0) { - info_.dynamic_extent_vals_.insert(id->extent()); + info_.maybe_zero_extents_.insert(id->extent()); 
leaf_dynamic_vals_.push_back(id->extent()); } if (!id->definition() || id->getIterType() != IterType::Symbolic) { diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 23255129b63..899035fc96b 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -51,7 +51,7 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { //! Return whether there are any tensors with unknown extent in some //! dimension, so that they might be empty bool hasPossibleEmptyTensor() const { - return !dynamic_extent_vals_.empty(); + return !maybe_zero_extents_.empty(); } //! Return a set of scalars that are inputs or extents of input TensorViews @@ -65,7 +65,7 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { //! Fusion. If any of these evaluate to zero, there is at least one empty //! TensorView present. const std::unordered_set& getDynamicExtentVals() const { - return dynamic_extent_vals_; + return maybe_zero_extents_; } //! Return a vector of outputs of ViewOp expressions that have dynamic output @@ -130,7 +130,7 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { // This is a minimal set of scalars to check for empty tensors. If any are // zero, we should traverse to find empty tensors. 
- std::unordered_set dynamic_extent_vals_; + std::unordered_set maybe_zero_extents_; // Root Vals that determine concretization std::unordered_set root_dynamic_vals_; From 0faf0ef4844eabd4fa0dbf91517394f160ac2855 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Jun 2023 10:02:19 -0400 Subject: [PATCH 37/63] Place findEmptyTensors in anonymous namespace --- csrc/dynamic_transform.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 30550dc5586..d53782f3ad8 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -181,6 +181,8 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { std::vector leaf_dynamic_vals_; }; +namespace { // Anonymous namespace for local function findEmptyTensors + //! This performs a depth-first search from outputs toward inputs for empty //! tensors. It does not traverse past any zero tensors it finds; this is why //! this is implemented as a single function instead of with BackwardVisitor. 
@@ -237,6 +239,8 @@ std::vector findEmptyTensors( return empty_tensors; } +} // namespace for findEmptyTensors + DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( const DynamicTransformInitialInfo* initial_info, ExpressionEvaluator* expr_eval) From c5dfe02fb4a1c92517607f79675fccbc0cc439fb Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Jun 2023 10:16:17 -0400 Subject: [PATCH 38/63] Improve comments and recurse in maybeReplaced --- csrc/dynamic_transform.cpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index d53782f3ad8..b43d7a13e19 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -239,7 +239,7 @@ std::vector findEmptyTensors( return empty_tensors; } -} // namespace for findEmptyTensors +} // namespace DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( const DynamicTransformInitialInfo* initial_info, @@ -571,29 +571,28 @@ void DynamicTransformConcretizer::removeEmptyBranches() { }); }; - // Given a TensorView, get a shape with hard-coded zeroes + // Given a TensorView get a vector of its maybeRFactor maybeExpandedExtents auto orig_shape = [](TensorView* out_tv) -> std::vector { - auto nored_axes = + const auto& rfactor = TensorDomain::noReductions(out_tv->getMaybeRFactorDomain()); - // Output shape is simply the same as the original reduction. If there - // were zeros in the non-Reduction axes, it would be replaced by - // full() directly. 
- std::vector out_shape(nored_axes.size()); - std::transform( - nored_axes.begin(), - nored_axes.end(), - out_shape.begin(), - [](IterDomain* id) -> Val* { return id->getMaybeExpandedExtent(); }); + std::vector out_shape; + out_shape.reserve(rfactor.size()); + for (const auto id :) { + out_shape.push_back(id->getMaybeExpandedExtent()); + } return out_shape; }; + // As we replace TensorViews, we want to operate on the replaced values + // instead of the originals. This map lets use keep track of multiple + // replacements and get the latest one. std::unordered_map replaced; auto maybeReplaced = [&replaced](TensorView* tv) -> TensorView* { auto it = replaced.find(tv); if (it == replaced.end()) { return tv; } - return it->second; + return maybeReplaced(it->second); }; // Replace uses whose outputs might not be empty. Many expressions are From 9811fa8cf4ee2914271141bdedf9675981a5ad1a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Jun 2023 11:03:06 -0400 Subject: [PATCH 39/63] Fix typo and add example to comment --- csrc/dynamic_transform.cpp | 75 +++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 5 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index b43d7a13e19..d85f24bf916 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -480,12 +480,77 @@ class DynamicTransformConcretizer : public OptOutMutator { private: void concretize(); - //! Set definitions of empty tensors to full() calls, replace reductions over - //! empty axes with full calls. + //! removeEmptyBranches sets definitions of empty tensors to full(), and + //! replaces uses like reductions over empty axes with full calls. + //! + //! Consider the following Fusion with input T0, T1 and output T3: + //! + //! T0 + //! | + //! sum + //! | + //! T2 T1 + //! \ / + //! mul + //! | + //! T3 + //! + //! If T1 has any size-zero dimensions, then we know that T3 is also empty, + //! 
and T2 may be empty as well (unless it's broadcasting in all the empty + //! dimensions of T1). In this case, we can replace the entire Fusion with a + //! single call to full(): + //! + //! T0 T1 + //! + //! full + //! | + //! T3 + //! + //! Notice that the graph is now disconnected since T0 and T1 remain as Fusion + //! inputs. + //! + //! If instead, T1 is not empty, but T0 is, then there are two possibilities: + //! a) If any empty axes of T0 are not reduced, then T2 shares those empty + //! axes, in which case T3 must also be empty, so we can rewrite the Fusion + //! the same way as above, by redefining T3 = full(shape) + //! + //! b) If instead the empty axes of T0 are all being reduced in the sum, + //! then T2 is not empty. In this case, since T0 is an input, rewriting it + //! as a full() output is not helpful. However, we know that any use of an + //! empty tensor does not require computation over T0, so we can rewrite it. + //! In this case, we can rewrite the sum as a full(shape, 0) since the sum + //! over an empty tensor is 0 (more generally, the initial value of the + //! reduction). This leads to the following rewritten Fusion: + //! + //! T0 + //! + //! full + //! | + //! T2 T1 + //! \ / + //! mul + //! | + //! T3 + //! + //! After this call, the Fusion will only contain empty tensors if they are + //! Fusion inputs or outputs. Furthermore, output tensors will have constant + //! zeros for the extents of empty axes. + //! + //! Instead of sum, we may encounter pad or cat ops. These are handled as + //! follows: + //! + //! Pads of empty tensors are replaced with full() using a fill value equal + //! to the pad value. + //! + //! Cat of tensors including some that are empty in the cat dimension are + //! simply replaced with a call to cat() that excludes the empty tensors. + //! Note that if any non-cat dimensions are empty, then the output will be + //! empty as well and the cat becomes dead code, as in the second example + //! 
with empty T0 from above. void removeEmptyBranches(); - //! Modify the Fusion by replacing tv with output of full() expression in - //! outputs and all uses. + //! replaceWithFull modifies the Fusion by replacing tv with output of full() + //! expression in outputs and all uses. TensorView* replaceWithFull( TensorView* tv, std::vector& new_shape, @@ -577,7 +642,7 @@ void DynamicTransformConcretizer::removeEmptyBranches() { TensorDomain::noReductions(out_tv->getMaybeRFactorDomain()); std::vector out_shape; out_shape.reserve(rfactor.size()); - for (const auto id :) { + for (const auto id : rfactor) { out_shape.push_back(id->getMaybeExpandedExtent()); } return out_shape; From c09691c9fc0691be58825ad66fb90ffb5882b6d7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Jun 2023 11:08:51 -0400 Subject: [PATCH 40/63] Fix stuff I broke when trying to write a recursive lambda. --- csrc/dynamic_transform.cpp | 39 ++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index d85f24bf916..1885132781d 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -586,8 +586,21 @@ class DynamicTransformConcretizer : public OptOutMutator { //! its producer domains. Returns true if any root ID is concretized. bool propagateFromProducerToConsumer(TensorView* consumer); + TensorView* maybeReplaced(TensorView* tv) { + auto it = replaced_tvs_.find(tv); + if (it == replaced_tvs_.end()) { + return tv; + } + return maybeReplaced(it->second); + }; + private: const DynamicTransformConcretizationInfo* info_; + + //! As we replace TensorViews, we want to operate on the replaced values + //! instead of the originals. This map lets use keep track of multiple + //! replacements and get the latest one. 
+ std::unordered_map replaced_tvs_; }; void DynamicTransformConcretizer::concretize() { @@ -648,18 +661,6 @@ void DynamicTransformConcretizer::removeEmptyBranches() { return out_shape; }; - // As we replace TensorViews, we want to operate on the replaced values - // instead of the originals. This map lets use keep track of multiple - // replacements and get the latest one. - std::unordered_map replaced; - auto maybeReplaced = [&replaced](TensorView* tv) -> TensorView* { - auto it = replaced.find(tv); - if (it == replaced.end()) { - return tv; - } - return maybeReplaced(it->second); - }; - // Replace uses whose outputs might not be empty. Many expressions are // guaranteed to have empty outputs if any of the inputs are empty; for // example simple unary or binary ops. In those cases, we don't need to @@ -679,7 +680,7 @@ void DynamicTransformConcretizer::removeEmptyBranches() { auto out = maybeReplaced(rop->out()->as()); if (hasEmptyRootReductionAxis(out)) { auto out_shape = orig_shape(out); - replaced[out] = replaceWithFull(out, out_shape); + replaceWithFull(out, out_shape); } } else if (auto wop = dynamic_cast(use)) { auto avg = maybeReplaced(wop->outAvg()->as()); @@ -688,9 +689,9 @@ void DynamicTransformConcretizer::removeEmptyBranches() { if (hasEmptyRootReductionAxis(avg)) { auto out_shape = orig_shape(avg); auto nan = IrBuilder::create(0.0 / 0.0); - replaced[avg] = replaceWithFull(avg, out_shape, nan); - replaced[var] = replaceWithFull(var, out_shape, nan); - replaced[N] = replaceWithFull(N, out_shape); + replaceWithFull(avg, out_shape, nan); + replaceWithFull(var, out_shape, nan); + replaceWithFull(N, out_shape); } } else if (auto pop = dynamic_cast(use)) { auto out = maybeReplaced(pop->out()->as()); @@ -759,12 +760,12 @@ void DynamicTransformConcretizer::removeEmptyBranches() { auto pad_widths = pop->getPadWidths((int)i); out_shape[i] = add(pad_widths.first, pad_widths.second); } - replaced[out] = replaceWithFull(out, out_shape, pop->value()); + 
replaceWithFull(out, out_shape, pop->value()); } } } if (!tv->isFusionInput()) { - replaced[tv] = replaceWithFull(tv, new_shape); + replaceWithFull(tv, new_shape); } } } @@ -823,6 +824,8 @@ void DynamicTransformConcretizer::replaceTV( if (old_tv->isFusionOutput()) { old_tv->fusion()->replaceOutput(old_tv, new_tv); } + + replaced_tvs_[old_tv] = new_tv; } void DynamicTransformConcretizer::concretizeReshape() { From 9a0ba86fe266d3be1ecc90f6b753fd2abb052895 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Jun 2023 11:43:28 -0400 Subject: [PATCH 41/63] Update comment in removeEmptyBranches() --- csrc/dynamic_transform.cpp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 1885132781d..2517d2cc8c0 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -666,13 +666,28 @@ void DynamicTransformConcretizer::removeEmptyBranches() { // example simple unary or binary ops. In those cases, we don't need to // doctor the Fusion since they will have an empty tensor downstream which // will cut off their dependence, resulting in those uses becoming dead - // code. + // code. For example, suppose we determined tv2 is empty, and we have the + // following Fusion: // - // Other expressions can convert an empty tensor into a non-empty tensor; - // particularly pad, cat, and reduction ops. These ops might have - // non-empty outputs so in order to guarantee that all (non- input or - // output) tensors are removed, we need to replace those ops with an - // equivalent that does not have any empty inputs. + // auto tv4 = add(tv2, tv3); + // fusion.addOutput(tv4); + // + // If we know that tv2 is empty in any dimension, then either tv3 has a + // matching empty dimension or it is broadcast in that dimension. Either + // way, the corresponding dimension in tv4 will be empty, so tv4 is an empty + // tensor. 
If we replace this expression with + // + // auto tv4 = full(shape, zeroVal()); + // + // Then the tensors tv2 and tv3 will become dead code if they have no other + // live uses. In this case tv4 is an output tensor, so we must keep it in + // the Fusion. + // + // Some special expressions can convert an empty tensor into a non-empty + // tensor; particularly pad, cat, and reduction ops. These ops might have + // non-empty outputs so in order to guarantee that all non- input or + // output tensors are removed, we need to replace those ops with an + // equivalent that does not have any empty inputs. For example for (auto use : tv->uses()) { // If use is a ReductionOp or WelfordOp over some empty axes, replace it // with a call to full(). From c34caa70e07916516f1dcc235f7b988ce22f0d10 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 23 Jun 2023 12:00:56 -0400 Subject: [PATCH 42/63] Update comments, don't replace inputs --- csrc/dynamic_transform.cpp | 82 +++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 32 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 2517d2cc8c0..3fcde499777 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -550,14 +550,39 @@ class DynamicTransformConcretizer : public OptOutMutator { void removeEmptyBranches(); //! replaceWithFull modifies the Fusion by replacing tv with output of full() - //! expression in outputs and all uses. + //! expression in outputs and all uses. This is used to replace pads of empty + //! inputs with full tensors containing only the pad value, and it is used to + //! replace empty output tensors in order to eliminate dead code in their + //! definitions. For example, if we have the following Fusion: + //! + //! T0 (input) + //! | + //! foo (some heavy computation) + //! | + //! T2 T1 (input) + //! \ / + //! mul + //! | + //! T3 (output) + //! + //! Consider if T2 is Broadcast in dimension i but T2 is zero in dimension i. + //! 
Then T3 is also zero in dimension i, so T3 is empty. By replacing T3 with + //! full(), we still have an empty output, but its definition avoids the heavy + //! computation of foo: + //! + //! T0 T1 (inputs) + //! + //! full + //! | + //! T3 (output) TensorView* replaceWithFull( TensorView* tv, std::vector& new_shape, Val* fill_value = nullptr); - //! Replace a TensorView with a new one in all uses, and in inputs and - //! outputs. + //! Replace a TensorView with a new one in all uses and in Fusion outputs. + //! Note that we do not replace Fusion inputs, since doing so may remove + //! extent Vals that are used in hard-to-predict places. void replaceTV(TensorView* old_tv, TensorView* new_tv); void concretizeReshape(); @@ -736,6 +761,19 @@ void DynamicTransformConcretizer::removeEmptyBranches() { // CatOp // | // T3 + // + // If we determine that one of the inputs, T1, is empty in the cat + // dimension, then we rewrite this as: + // + // T0 T2 + // | | + // PadOp PadOp + // \ / + // CatOp + // | + // T3 + // + // This is done by simply calling the cat() command with only {T0, T2}. if (pop->out()->uses().size() == 1 && pop->out()->uses()[0]->isA()) { auto cop = pop->out()->uses()[0]->as(); @@ -791,32 +829,16 @@ TensorView* DynamicTransformConcretizer::replaceWithFull( Val* fill_value) { TensorView* mut_tv = nullptr; if (!tv->definition()) { - // No definition. Probably an input. 
- TORCH_INTERNAL_ASSERT( - !tv->hasRFactor(), - "Found RFactor in input TensorView ", - tv->toString()); - std::vector expanded(tv->nDims()); - for (int i : c10::irange((int)tv->nDims())) { - expanded[i] = tv->axis(i)->hasExpandedExtent(); - } - mut_tv = TensorViewBuilder() - .ndims(tv->nDims()) - .dtype(tv->getDataType().value()) - .contiguity(tv->getContiguity()) - .shape(new_shape) - .expanded(expanded) - .build(); - mut_tv->setMemoryType(MemoryType::Global); - } else { - if (!fill_value) { - fill_value = tv->fusion()->zeroVal(); - } - if (fill_value->getDataType().value() != tv->getDataType().value()) { - fill_value = castOp(tv->getDataType().value(), fill_value); - } - mut_tv = full(new_shape, fill_value, tv->getDataType().value()); + return tv; + } + if (!fill_value) { + fill_value = tv->fusion()->zeroVal(); } + if (fill_value->getDataType().value() != tv->getDataType().value()) { + fill_value = castOp(tv->getDataType().value(), fill_value); + } + mut_tv = full(new_shape, fill_value, tv->getDataType().value()); + replaceTV(tv, mut_tv); return mut_tv; @@ -832,10 +854,6 @@ void DynamicTransformConcretizer::replaceTV( ir_utils::replaceValInExpr(use, old_tv, new_tv); } - if (old_tv->isFusionInput()) { - old_tv->fusion()->replaceInput(old_tv, new_tv); - } - if (old_tv->isFusionOutput()) { old_tv->fusion()->replaceOutput(old_tv, new_tv); } From ef53abeb67e4a1cbf794b9fbe5852c159d61291b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Sun, 25 Jun 2023 20:00:35 -0400 Subject: [PATCH 43/63] Use quiet_NaN instead of 0.0 / 0.0 --- csrc/dynamic_transform.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index c36e2a63a8d..4d902499f38 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -20,6 +20,7 @@ #include #include +#include #include namespace nvfuser { @@ -728,7 +729,8 @@ void DynamicTransformConcretizer::removeEmptyBranches() { auto N = 
maybeReplaced(wop->outN()->as()); if (hasEmptyRootReductionAxis(avg)) { auto out_shape = orig_shape(avg); - auto nan = IrBuilder::create(0.0 / 0.0); + auto nan = IrBuilder::create( + std::numeric_limits::quiet_NaN()); replaceWithFull(avg, out_shape, nan); replaceWithFull(var, out_shape, nan); replaceWithFull(N, out_shape); From ba7e9dd84d38a9fb76c6ed24a378624618bb5af7 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Sun, 25 Jun 2023 20:12:40 -0400 Subject: [PATCH 44/63] Rename getDynamicExtentVals -> getMaybeZeroExtents --- csrc/dynamic_transform.cpp | 2 +- csrc/dynamic_transform.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 4d902499f38..7d189700082 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -259,7 +259,7 @@ DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( analyzeResizes(expr_eval); bool has_empty_tensor = false; - for (auto ext : initial_info_->getDynamicExtentVals()) { + for (auto ext : initial_info_->getMaybeZeroExtents()) { auto ext_opt = expr_eval->evaluate(ext); TORCH_INTERNAL_ASSERT( ext_opt.hasValue(), diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 899035fc96b..5a3b34bd0c2 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -64,7 +64,7 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { //! Return a set of scalars that appear as extents in TensorViews in the //! Fusion. If any of these evaluate to zero, there is at least one empty //! TensorView present. 
- const std::unordered_set& getDynamicExtentVals() const { + const std::unordered_set& getMaybeZeroExtents() const { return maybe_zero_extents_; } From 8483d14b63b86f7e1a8b39fa43a3919927038e7f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 26 Jun 2023 07:42:28 -0400 Subject: [PATCH 45/63] Skip replacing reductions with empty outputs --- csrc/dynamic_transform.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 7d189700082..6fe711d980c 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -674,6 +674,14 @@ void DynamicTransformConcretizer::removeEmptyBranches() { return out_tv->getRootDomain().at(ax)->isReduction(); }); }; + auto hasEmptyRootNonReductionAxis = [&empty_tv_descr](TensorView* out_tv) { + return std::any_of( + empty_tv_descr.empty_axes.begin(), + empty_tv_descr.empty_axes.end(), + [&out_tv](size_t ax) { + return !out_tv->getRootDomain().at(ax)->isReduction(); + }); + }; // Given a TensorView get a vector of its maybeRFactor maybeExpandedExtents auto orig_shape = [](TensorView* out_tv) -> std::vector { @@ -719,7 +727,11 @@ void DynamicTransformConcretizer::removeEmptyBranches() { // with a call to full(). if (auto rop = dynamic_cast(use)) { auto out = maybeReplaced(rop->out()->as()); - if (hasEmptyRootReductionAxis(out)) { + // If a reduction has empty non-reduced axes, then its output will be + // empty so it should already be dead code. In those cases we skip + // replacing the reduction with full as that should be redundant. 
+ if (hasEmptyRootReductionAxis(out) && + !hasEmptyRootNonReductionAxis(out)) { auto out_shape = orig_shape(out); replaceWithFull(out, out_shape); } @@ -727,7 +739,8 @@ void DynamicTransformConcretizer::removeEmptyBranches() { auto avg = maybeReplaced(wop->outAvg()->as()); auto var = maybeReplaced(wop->outVar()->as()); auto N = maybeReplaced(wop->outN()->as()); - if (hasEmptyRootReductionAxis(avg)) { + if (hasEmptyRootReductionAxis(avg) && + !hasEmptyRootNonReductionAxis(avg)) { auto out_shape = orig_shape(avg); auto nan = IrBuilder::create( std::numeric_limits::quiet_NaN()); From 7fb9428e58fbfb42b33c171eceaf298e4e107eff Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Mon, 26 Jun 2023 10:35:17 -0400 Subject: [PATCH 46/63] Refactor to more clearly show dispatch of replaceEmptyUse --- csrc/dynamic_transform.cpp | 359 ++++++++++++++++++++----------------- 1 file changed, 195 insertions(+), 164 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 6fe711d980c..0d01d17b6da 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -550,6 +550,46 @@ class DynamicTransformConcretizer : public OptOutMutator { //! with empty T0 from above. void removeEmptyBranches(); + //! Replace uses whose outputs might not be empty. Many expressions are + //! guaranteed to have empty outputs if any of the inputs are empty; for + //! example simple unary or binary ops. In those cases, we don't need to + //! doctor the Fusion since they will have an empty tensor downstream which + //! will cut off their dependence, resulting in those uses becoming dead + //! code. For example, suppose we determined tv2 is empty, and we have the + //! following Fusion: + //! + //! auto tv4 = add(tv2, tv3); + //! fusion.addOutput(tv4); + //! + //! If we know that tv2 is empty in any dimension, then either tv3 has a + //! matching empty dimension or it is broadcast in that dimension. Either + //! 
way, the corresponding dimension in tv4 will be empty, so tv4 is an empty
+  //! tensor. If we replace this expression with
+  //!
+  //!   auto tv4 = full(shape, zeroVal());
+  //!
+  //! Then the tensors tv2 and tv3 will become dead code if they have no other
+  //! live uses. In this case tv4 is an output tensor, so we must keep it in
+  //! the Fusion.
+  //!
+  //! Some special expressions can convert an empty tensor into a non-empty
+  //! tensor; particularly pad, cat, and reduction ops. These ops might have
+  //! non-empty outputs, so to guarantee that all empty tensors other than
+  //! Fusion inputs and outputs are removed, we replace those ops with an
+  //! equivalent that has no empty inputs, e.g. a reduction becomes full().
+  void replaceEmptyUse(ReductionOp* rop, const std::vector<size_t>& empty_axes);
+  void replaceEmptyUse(WelfordOp* wop, const std::vector<size_t>& empty_axes);
+  void replaceEmptyUse(PadOp* pop, const std::vector<size_t>& empty_axes);
+  void replaceEmptyUse(Expr* e, const std::vector<size_t>& empty_axes) {
+    if (auto rop = dynamic_cast<ReductionOp*>(e)) {
+      replaceEmptyUse(rop, empty_axes);
+    } else if (auto wop = dynamic_cast<WelfordOp*>(e)) {
+      replaceEmptyUse(wop, empty_axes);
+    } else if (auto pop = dynamic_cast<PadOp*>(e)) {
+      replaceEmptyUse(pop, empty_axes);
+    }
+  }
+
   //! replaceWithFull modifies the Fusion by replacing tv with output of full()
   //! expression in outputs and all uses. This is used to replace pads of empty
   //! inputs with full tensors containing only the pad value, and it is used to
@@ -649,6 +689,160 @@ void DynamicTransformConcretizer::concretize() {
   }
 }
 
+namespace {
+
+//! This function simply extracts the maybeExpandedExtent Vals from the
+//! noReductions(maybeRFactorDomain) of the provided tensorview.
+std::vector<Val*> tensor_shape(TensorView* tv) {
+  const auto& rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
+  std::vector<Val*> shape;
+  shape.reserve(rfactor.size());
+  for (const auto id : rfactor) {
+    shape.push_back(id->getMaybeExpandedExtent());
+  }
+  return shape;
+}
+
+//! 
Return whether or not we should replace a reduction or Welford op that is
+//! given an empty input tensor. Returns false if any non-reduction axes are
+//! empty, and otherwise returns true if any reduction axis is empty.
+bool reductionOutputShouldBeReplaced(
+    TensorView* out,
+    const std::vector<size_t>& empty_axes) {
+  bool has_empty_reduction = false;
+  bool has_empty_nonreduction = false;
+  // A reduction op itself should not generate an R-Factor domain anyway, but
+  // note that we use the root domain explicitly here for clarity. This is
+  // because empty_axes refers to the _input_ of the reduction op we are
+  // considering, so it maps to the root domain of the TensorView "out" in this
+  // function.
+  auto dom = out->getRootDomain();
+  for (auto ax : empty_axes) {
+    if (dom.at(ax)->isReduction()) {
+      has_empty_reduction = true;
+    } else {
+      has_empty_nonreduction = true;
+    }
+  };
+  // If a reduction has empty non-reduced axes, then its output will be
+  // empty, so it should already be dead code. In those cases we skip
+  // replacing the reduction with full as that should be redundant. 
+ return has_empty_reduction && !has_empty_nonreduction; +} + +} // namespace + +void DynamicTransformConcretizer::replaceEmptyUse( + ReductionOp* rop, + const std::vector& empty_axes) { + auto out = maybeReplaced(rop->out()->as()); + if (reductionOutputShouldBeReplaced(out, empty_axes)) { + auto out_shape = tensor_shape(out); + replaceWithFull(out, out_shape); + } +} + +void DynamicTransformConcretizer::replaceEmptyUse( + WelfordOp* wop, + const std::vector& empty_axes) { + auto avg = maybeReplaced(wop->outAvg()->as()); + auto var = maybeReplaced(wop->outVar()->as()); + auto N = maybeReplaced(wop->outN()->as()); + if (reductionOutputShouldBeReplaced(avg, empty_axes)) { + auto out_shape = tensor_shape(avg); + auto nan = + IrBuilder::create(std::numeric_limits::quiet_NaN()); + replaceWithFull(avg, out_shape, nan); + replaceWithFull(var, out_shape, nan); + replaceWithFull(N, out_shape); + } +} + +void DynamicTransformConcretizer::replaceEmptyUse( + PadOp* pop, + const std::vector& empty_axes) { + auto out = maybeReplaced(pop->out()->as()); + + // A cat op can have input empty tensors and still output a non-empty + // tensor. This is only possible if there is more than one input, so we + // only need to handle those cases. We find the non-empty inputs to cat + // then replace with another cat (or `set` if n=1). + // + // [Detecting cat ops] + // The `cat` function creates a CatOp object, but its inputs() are not + // the original inputs. Rather, they are the inputs after padding to the + // output extent in the concatenated dimension. 
Thus, in the IR graph, + // instead of the following: + // + // T0 T1 T2 + // \ | / + // CatOp + // | + // T3 + // + // a cat is represented as: + // + // T0 T1 T2 + // | | | + // PadOp PadOp PadOp + // \ | / + // CatOp + // | + // T3 + // + // If we determine that one of the inputs, T1, is empty in the cat + // dimension, then we rewrite this as: + // + // T0 T2 + // | | + // PadOp PadOp + // \ / + // CatOp + // | + // T3 + // + // This is done by simply calling the cat() command with only {T0, T2}. + auto pad_uses = pop->out()->uses(); + if (pad_uses.size() == 1 && pad_uses[0]->isA()) { + auto cop = pad_uses[0]->as(); + std::vector nonempty_inputs; + for (auto inp : cop->inputs()) { + // Each "input" to CatOp is a pad() of the corresponding _actual_ + // input. Here we peel off the pad op to collect the non-padded cat + // inputs. + auto padded_inp_tv = inp->as(); + TORCH_INTERNAL_ASSERT( + padded_inp_tv->definition() && + padded_inp_tv->definition()->isA(), + "Input to cat should have definition that is a PadOp"); + auto inp_pad_op = padded_inp_tv->definition()->as(); + auto inp_tv = inp_pad_op->in()->as(); + + if (inp_pad_op != pop) { + // We could remove other empty tensors here while we're at it. They will + // get removed by further passes anyway though as tv ranges over all + // empty tensors. + nonempty_inputs.push_back(inp_tv); + } + } + auto old_cat = cop->output(0)->as(); + auto new_cat = nonempty_inputs.size() == 1 + ? 
set(nonempty_inputs[0]) + : cat(nonempty_inputs, cop->concatenatedDim()); + replaceTV(old_cat, new_cat); + } else { // Replace pads that are not part of CatOps with full() + auto out_shape = tensor_shape(out); + // Wherever there is a zero in the input, we will replace the original + // output extent so that we no longer reference the now-zero input + // extent + for (auto ax : empty_axes) { + auto pad_widths = pop->getPadWidths((int)ax); + out_shape[ax] = add(pad_widths.first, pad_widths.second); + } + replaceWithFull(out, out_shape, pop->value()); + } +} + void DynamicTransformConcretizer::removeEmptyBranches() { auto fusion = FusionGuard::getCurFusion(); for (const auto& empty_tv_descr : info_->getEmptyTensors()) { @@ -666,171 +860,8 @@ void DynamicTransformConcretizer::removeEmptyBranches() { new_shape[ax] = fusion->zeroVal(); } - auto hasEmptyRootReductionAxis = [&empty_tv_descr](TensorView* out_tv) { - return std::any_of( - empty_tv_descr.empty_axes.begin(), - empty_tv_descr.empty_axes.end(), - [&out_tv](size_t ax) { - return out_tv->getRootDomain().at(ax)->isReduction(); - }); - }; - auto hasEmptyRootNonReductionAxis = [&empty_tv_descr](TensorView* out_tv) { - return std::any_of( - empty_tv_descr.empty_axes.begin(), - empty_tv_descr.empty_axes.end(), - [&out_tv](size_t ax) { - return !out_tv->getRootDomain().at(ax)->isReduction(); - }); - }; - - // Given a TensorView get a vector of its maybeRFactor maybeExpandedExtents - auto orig_shape = [](TensorView* out_tv) -> std::vector { - const auto& rfactor = - TensorDomain::noReductions(out_tv->getMaybeRFactorDomain()); - std::vector out_shape; - out_shape.reserve(rfactor.size()); - for (const auto id : rfactor) { - out_shape.push_back(id->getMaybeExpandedExtent()); - } - return out_shape; - }; - - // Replace uses whose outputs might not be empty. Many expressions are - // guaranteed to have empty outputs if any of the inputs are empty; for - // example simple unary or binary ops. 
In those cases, we don't need to - // doctor the Fusion since they will have an empty tensor downstream which - // will cut off their dependence, resulting in those uses becoming dead - // code. For example, suppose we determined tv2 is empty, and we have the - // following Fusion: - // - // auto tv4 = add(tv2, tv3); - // fusion.addOutput(tv4); - // - // If we know that tv2 is empty in any dimension, then either tv3 has a - // matching empty dimension or it is broadcast in that dimension. Either - // way, the corresponding dimension in tv4 will be empty, so tv4 is an empty - // tensor. If we replace this expression with - // - // auto tv4 = full(shape, zeroVal()); - // - // Then the tensors tv2 and tv3 will become dead code if they have no other - // live uses. In this case tv4 is an output tensor, so we must keep it in - // the Fusion. - // - // Some special expressions can convert an empty tensor into a non-empty - // tensor; particularly pad, cat, and reduction ops. These ops might have - // non-empty outputs so in order to guarantee that all non- input or - // output tensors are removed, we need to replace those ops with an - // equivalent that does not have any empty inputs. For example for (auto use : tv->uses()) { - // If use is a ReductionOp or WelfordOp over some empty axes, replace it - // with a call to full(). - if (auto rop = dynamic_cast(use)) { - auto out = maybeReplaced(rop->out()->as()); - // If a reduction has empty non-reduced axes, then its output will be - // empty so it should already be dead code. In those cases we skip - // replacing the reduction with full as that should be redundant. 
- if (hasEmptyRootReductionAxis(out) && - !hasEmptyRootNonReductionAxis(out)) { - auto out_shape = orig_shape(out); - replaceWithFull(out, out_shape); - } - } else if (auto wop = dynamic_cast(use)) { - auto avg = maybeReplaced(wop->outAvg()->as()); - auto var = maybeReplaced(wop->outVar()->as()); - auto N = maybeReplaced(wop->outN()->as()); - if (hasEmptyRootReductionAxis(avg) && - !hasEmptyRootNonReductionAxis(avg)) { - auto out_shape = orig_shape(avg); - auto nan = IrBuilder::create( - std::numeric_limits::quiet_NaN()); - replaceWithFull(avg, out_shape, nan); - replaceWithFull(var, out_shape, nan); - replaceWithFull(N, out_shape); - } - } else if (auto pop = dynamic_cast(use)) { - auto out = maybeReplaced(pop->out()->as()); - - // A cat op can have input empty tensors and still output a non-empty - // tensor. This is only possible if there is more than one input, so we - // only need to handle those cases. We find the non-empty inputs to cat - // then replace with another cat (or `set` if n=1). - // - // [Detecting cat ops] - // The `cat` function creates a CatOp object, but its inputs() are not - // the original inputs. Rather, they are the inputs after padding to the - // output extent in the concatenated dimension. Thus, in the IR graph, - // instead of the following: - // - // T0 T1 T2 - // \ | / - // CatOp - // | - // T3 - // - // a cat is represented as: - // T0 T1 T2 - // | | | - // PadOp PadOp PadOp - // \ | / - // CatOp - // | - // T3 - // - // If we determine that one of the inputs, T1, is empty in the cat - // dimension, then we rewrite this as: - // - // T0 T2 - // | | - // PadOp PadOp - // \ / - // CatOp - // | - // T3 - // - // This is done by simply calling the cat() command with only {T0, T2}. 
- if (pop->out()->uses().size() == 1 && - pop->out()->uses()[0]->isA()) { - auto cop = pop->out()->uses()[0]->as(); - std::vector nonempty_inputs; - for (auto inp : cop->inputs()) { - // Each "input" to CatOp is a pad() of the corresponding _actual_ - // input. Here we peel off the pad op to collect the non-padded cat - // inputs. - auto padded_inp_tv = inp->as(); - TORCH_INTERNAL_ASSERT( - padded_inp_tv->definition() && - padded_inp_tv->definition()->isA(), - "Input to cat should have definition that is a PadOp"); - auto inp_tv = padded_inp_tv->definition() - ->as() - ->in() - ->as(); - - if (inp_tv != tv) { - // we could remove other empty tensors here while we're at it. - // They will get removed by further passes anyway though as tv - // ranges over all empty tensors. - nonempty_inputs.push_back(inp_tv); - } - } - auto old_cat = cop->output(0)->as(); - auto new_cat = nonempty_inputs.size() == 1 - ? set(nonempty_inputs[0]) - : cat(nonempty_inputs, cop->concatenatedDim()); - replaceTV(old_cat, new_cat); - } else { // Replace pads that are not part of CatOps with full() - auto out_shape = orig_shape(out); - // Wherever there is a zero in the input, we will replace the original - // output extent so that we no longer reference the now-zero input - // extent - for (auto i : empty_tv_descr.empty_axes) { - auto pad_widths = pop->getPadWidths((int)i); - out_shape[i] = add(pad_widths.first, pad_widths.second); - } - replaceWithFull(out, out_shape, pop->value()); - } - } + replaceEmptyUse(use, empty_tv_descr.empty_axes); } if (!tv->isFusionInput()) { replaceWithFull(tv, new_shape); From 2175ed81a65e0f38e75349b687c2c638cdcc551f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 7 Jul 2023 12:00:58 -0400 Subject: [PATCH 47/63] Remove TV replacement code. Only mutate extents. The goal now is to replace all zero extents with zeroVal() everywhere, so that the pre-segmentation RemoveEmptyPass will be able to remove empty tensors effectively. 
--- csrc/dynamic_transform.cpp | 456 ++++--------------------------------- csrc/dynamic_transform.h | 59 +---- 2 files changed, 57 insertions(+), 458 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 0d01d17b6da..354466bab98 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -41,16 +41,16 @@ DynamicTransformInitialInfo DynamicTransformInitialInfo::clone( cloned_info.dynamic_resized_ids_.push_back(ir_cloner.clone(op)); } } - cloned_info.maybe_zero_extents_.reserve(maybe_zero_extents_.size()); - for (const auto v : maybe_zero_extents_) { + cloned_info.maybe_zero_extents_set_.reserve(maybe_zero_extents_set_.size()); + for (const auto v : maybe_zero_extents_set_) { if (v) { - cloned_info.maybe_zero_extents_.insert(ir_cloner.clone(v)); + cloned_info.maybe_zero_extents_set_.insert(ir_cloner.clone(v)); } } - cloned_info.name_to_tensorview_.reserve(name_to_tensorview_.size()); - for (const auto kv : name_to_tensorview_) { - if (kv.second) { - cloned_info.name_to_tensorview_[kv.first] = ir_cloner.clone(kv.second); + cloned_info.maybe_zero_extents_.reserve(maybe_zero_extents_.size()); + for (const auto v : maybe_zero_extents_) { + if (v) { + cloned_info.maybe_zero_extents_.push_back(ir_cloner.clone(v)); } } cloned_info.root_dynamic_vals_.reserve(root_dynamic_vals_.size()); @@ -78,11 +78,6 @@ std::string DynamicTransformInitialInfo::toString() const { for (const auto& v : maybe_zero_extents_) { ss << indent << indent << v->toString() << "\n"; } - ss << indent << "Name to TensorView mapping:\n"; - for (const auto& kv : name_to_tensorview_) { - ss << indent << indent << kv.first << " => " << kv.second->toString() - << "\n"; - } ss << indent << "Root dynamic Vals:\n"; for (const auto& v : root_dynamic_vals_) { ss << indent << indent << v->toString() << "\n"; @@ -102,6 +97,8 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { traverseTo(fusion, fusion->getTerminatingOutputs(), false, false); 
finalizeDynamicVals();
+
+    finalizeMaybeEmptyExtents();
   }
 
   const auto& getInfo() const {
@@ -132,11 +129,10 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor {
 
   //! Detect possibly empty TensorViews and dynamic IterDomain transforms
   void handle(TensorView* tv) override {
-    info_.name_to_tensorview_[tv->name()] = tv;
     const auto& rfd = tv->getMaybeRFactorDomain();
     for (auto id : rfd) {
       if (!id->extent()->isConstScalar() || id->extent()->evaluateInt() == 0) {
-        info_.maybe_zero_extents_.insert(id->extent());
+        info_.maybe_zero_extents_set_.insert(id->extent());
         leaf_dynamic_vals_.push_back(id->extent());
       }
       if (!id->definition() || id->getIterType() != IterType::Symbolic) {
@@ -169,6 +165,16 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor {
     }
   }
 
+  //! Convert the maybe_zero_extents_set_ to a vector so that we can index it.
+  void finalizeMaybeEmptyExtents() {
+    info_.maybe_zero_extents_.reserve(info_.maybe_zero_extents_set_.size());
+    for (auto val : info_.maybe_zero_extents_set_) {
+      info_.maybe_zero_extents_.push_back(val);
+    }
+    // Clear the corresponding set to free memory and speed up cloning
+    info_.maybe_zero_extents_set_.clear();
+  }
+
  private:
   DynamicTransformInitialInfo info_;
 
@@ -182,66 +188,6 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor {
   std::vector<Val*> leaf_dynamic_vals_;
 };
 
-namespace { // Anonymous namespace for local function findEmptyTensors
-
-//! This performs a depth-first search from outputs toward inputs for empty
-//! tensors. It does not traverse past any zero tensors it finds; this is why
-//! this is implemented as a single function instead of with BackwardVisitor.
-//! Additionally, we check inputs since they might actually be disconnected from
-//! outputs. 
-std::vector findEmptyTensors( - ExpressionEvaluator* expr_eval) { - auto fusion = FusionGuard::getCurFusion(); - std::vector empty_tensors; - std::vector vals(fusion->inputs()); - vals.insert(vals.end(), fusion->outputs().begin(), fusion->outputs().end()); - std::unordered_set visited; - - while (!vals.empty()) { - auto val = vals.back(); - vals.pop_back(); - if (!val->isA()) { - continue; - } - auto tv = val->as(); - if (visited.find(tv) != visited.end()) { - continue; - } - visited.insert(tv); - - std::vector empty_axes; - auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); - bool empty = false; - for (size_t i : c10::irange(rfactor.size())) { - auto id = rfactor.at(i); - auto extent_eval = expr_eval->evaluate(id->extent()); - TORCH_INTERNAL_ASSERT( - extent_eval.hasValue(), - "When finding empty tensors: could not evaluate extent of ", - id->toString()); - if (extent_eval == 0) { - empty_axes.push_back(i); - empty = true; - } - } - if (empty) { - // Replace with full. 
Note that even if the definition was a FullOp, we - // still mark this tensor for replacement, so that we can ensure the - // empty axes are marked with constant zeroes - empty_tensors.push_back(EmptyTensorDescriptor{tv->name(), empty_axes}); - continue; - } - if (tv->definition()) { - for (auto inp : tv->definition()->inputs()) { - vals.push_back(inp); - } - } - } - return empty_tensors; -} - -} // namespace - DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( const DynamicTransformInitialInfo* initial_info, ExpressionEvaluator* expr_eval) @@ -258,23 +204,18 @@ DynamicTransformConcretizationInfo::DynamicTransformConcretizationInfo( analyzeResizes(expr_eval); - bool has_empty_tensor = false; - for (auto ext : initial_info_->getMaybeZeroExtents()) { + auto maybe_zero_extents = initial_info_->getMaybeZeroExtents(); + for (auto i : c10::irange(maybe_zero_extents.size())) { + auto ext = maybe_zero_extents.at(i); auto ext_opt = expr_eval->evaluate(ext); TORCH_INTERNAL_ASSERT( ext_opt.hasValue(), "Could not evaluate dynamic extent: ", ext->toString()); if (ext_opt == 0) { - has_empty_tensor = true; - break; + empty_extents_.push_back(i); } } - // Find a minimal set of empty tensors to replace with full() calls - // NOTE: this does a backward traversal from outputs. 
- if (has_empty_tensor) { - empty_tensors_ = findEmptyTensors(expr_eval); - } } void DynamicTransformConcretizationInfo::analyzeReshapes( @@ -405,7 +346,7 @@ bool DynamicTransformConcretizationInfo::operator==( if (reshape_transforms_.size() != other.reshape_transforms_.size() || resize_itertypes_.size() != other.resize_itertypes_.size() || - empty_tensors_.size() != other.empty_tensors_.size()) { + empty_extents_.size() != other.empty_extents_.size()) { return false; } @@ -425,10 +366,10 @@ bool DynamicTransformConcretizationInfo::operator==( } } - for (const auto i : c10::irange(empty_tensors_.size())) { - const auto& et = empty_tensors_.at(i); - const auto& other_et = other.empty_tensors_.at(i); - if (et != other_et) { + for (const auto i : c10::irange(empty_extents_.size())) { + const auto& ee = empty_extents_.at(i); + const auto& other_ee = other.empty_extents_.at(i); + if (ee != other_ee) { return false; } } @@ -440,14 +381,10 @@ std::string DynamicTransformConcretizationInfo::toString() const { std::stringstream ss; ss << "DynamicTransformConcretizationInfo\n"; std::string indent = " "; - ss << indent << "Empty tensors:\n"; - for (const auto& kv : empty_tensors_) { - ss << indent << indent << initial_info_->lookUpTV(kv.tv_name)->toString() - << " has zero extent in these axes:"; - for (auto i : kv.empty_axes) { - ss << " " << i; - } - ss << "\n"; + ss << indent << "Empty tensor extents:\n"; + for (const auto& i : empty_extents_) { + auto ext = initial_info_->getMaybeZeroExtents().at(i); + ss << indent << indent << ext->toString() << " is zero\n"; } ss << indent << "Reshape:\n"; for (const auto& [tv_index, analyze_result] : reshape_transforms_) { @@ -548,88 +485,14 @@ class DynamicTransformConcretizer : public OptOutMutator { //! Note that if any non-cat dimensions are empty, then the output will be //! empty as well and the cat becomes dead code, as in the second example //! with empty T0 from above. - void removeEmptyBranches(); - - //! 
Replace uses whose outputs might not be empty. Many expressions are - //! guaranteed to have empty outputs if any of the inputs are empty; for - //! example simple unary or binary ops. In those cases, we don't need to - //! doctor the Fusion since they will have an empty tensor downstream which - //! will cut off their dependence, resulting in those uses becoming dead - //! code. For example, suppose we determined tv2 is empty, and we have the - //! following Fusion: - //! - //! auto tv4 = add(tv2, tv3); - //! fusion.addOutput(tv4); - //! - //! If we know that tv2 is empty in any dimension, then either tv3 has a - //! matching empty dimension or it is broadcast in that dimension. Either - //! way, the corresponding dimension in tv4 will be empty, so tv4 is an empty - //! tensor. If we replace this expression with - //! - //! auto tv4 = full(shape, zeroVal()); - //! - //! Then the tensors tv2 and tv3 will become dead code if they have no other - //! live uses. In this case tv4 is an output tensor, so we must keep it in - //! the Fusion. - //! - //! Some special expressions can convert an empty tensor into a non-empty - //! tensor; particularly pad, cat, and reduction ops. These ops might have - //! non-empty outputs so in order to guarantee that all non- input or - //! output tensors are removed, we need to replace those ops with an - //! equivalent that does not have any empty inputs. For example - void replaceEmptyUse(ReductionOp* rop, const std::vector& empty_axes); - void replaceEmptyUse(WelfordOp* wop, const std::vector& empty_axes); - void replaceEmptyUse(PadOp* pop, const std::vector& empty_axes); - void replaceEmptyUse(Expr* e, const std::vector& empty_axes) { - if (auto rop = dynamic_cast(e)) { - replaceEmptyUse(rop, empty_axes); - } else if (auto wop = dynamic_cast(e)) { - replaceEmptyUse(wop, empty_axes); - } else if (auto pop = dynamic_cast(e)) { - replaceEmptyUse(pop, empty_axes); - } - } - - //! 
replaceWithFull modifies the Fusion by replacing tv with output of full() - //! expression in outputs and all uses. This is used to replace pads of empty - //! inputs with full tensors containing only the pad value, and it is used to - //! replace empty output tensors in order to eliminate dead code in their - //! definitions. For example, if we have the following Fusion: - //! - //! T0 (input) - //! | - //! foo (some heavy computation) - //! | - //! T2 T1 (input) - //! \ / - //! mul - //! | - //! T3 (output) - //! - //! Consider if T2 is Broadcast in dimension i but T2 is zero in dimension i. - //! Then T3 is also zero in dimension i, so T3 is empty. By replacing T3 with - //! full(), we still have an empty output, but its definition avoids the heavy - //! computation of foo: - //! - //! T0 T1 (inputs) - //! - //! full - //! | - //! T3 (output) - TensorView* replaceWithFull( - TensorView* tv, - std::vector& new_shape, - Val* fill_value = nullptr); - - //! Replace a TensorView with a new one in all uses and in Fusion outputs. - //! Note that we do not replace Fusion inputs, since doing so may remove - //! extent Vals that are used in hard-to-predict places. - void replaceTV(TensorView* old_tv, TensorView* new_tv); + // void removeEmptyBranches(); void concretizeReshape(); void concretizeResize(); + void concretizeEmptyExtents(); + //! Use this instead of calling registerMutation directly, since it will also //! check that the concretized value is a valid input to all of its uses. void registerConcretization(Val* old_val, Val* new_val) { @@ -652,259 +515,36 @@ class DynamicTransformConcretizer : public OptOutMutator { //! its producer domains. Returns true if any root ID is concretized. 
bool propagateFromProducerToConsumer(TensorView* consumer); - TensorView* maybeReplaced(TensorView* tv) { - auto it = replaced_tvs_.find(tv); - if (it == replaced_tvs_.end()) { - return tv; - } - return maybeReplaced(it->second); - }; - private: const DynamicTransformConcretizationInfo* info_; - - //! As we replace TensorViews, we want to operate on the replaced values - //! instead of the originals. This map lets use keep track of multiple - //! replacements and get the latest one. - std::unordered_map replaced_tvs_; }; void DynamicTransformConcretizer::concretize() { - // Concretize all dynamic reshape ops + //! Concretize all dynamic reshape ops concretizeReshape(); - // Set output IterTypes for dynamic resize ops + //! Set output IterTypes for dynamic resize ops concretizeResize(); - // Concretize empty tensors last in case some empty tensor are fed into - // replaced dynamic ops. - removeEmptyBranches(); + //! Registers replacement of all empty extents with zeroVal() + concretizeEmptyExtents(); // Finally, propagate concretized domains auto all_stmts = StmtSort::getStmts(info_->fusion(), true); for (auto stmt : all_stmts) { - if (stmt->isA()) { - mutate(stmt); - } - } -} - -namespace { - -//! This function simply extracts the maybeExpandedExtent Vals from the -//! noReductions(maybeRFactorDomain) of the provided tensorview. -std::vector tensor_shape(TensorView* tv) { - const auto& rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); - std::vector shape; - shape.reserve(rfactor.size()); - for (const auto id : rfactor) { - shape.push_back(id->getMaybeExpandedExtent()); - } - return shape; -} - -//! Return whether or not we should replace a reduction or Welford op that is -//! given an empty input tensor. Returns false if any non-reduction axes are -//! empty, and otherwise return true if any reduction axis is empty. 
-bool reductionOutputShouldBeReplaced( - TensorView* out, - const std::vector& empty_axes) { - bool has_empty_reduction = false; - bool has_empty_nonreduction = false; - // A reduction op itself should not generate an R-Factor domain anyway, but - // note that we use the root domain explicitly here for clarity. This is - // because empty_axes refers to the _input_ of the reduction op we are - // considering, so it maps to the root domain of the TensorView "out" in this - // function. - auto dom = out->getRootDomain(); - for (auto ax : empty_axes) { - if (dom.at(ax)->isReduction()) { - has_empty_reduction = true; - } else { - has_empty_nonreduction = true; - } - }; - // If a reduction has empty non-reduced axes, then its output will be - // empty so it should already be dead code. In those cases we skip - // replacing the reduction with full as that should be redundant. - return has_empty_reduction && !has_empty_nonreduction; -} - -} // namespace - -void DynamicTransformConcretizer::replaceEmptyUse( - ReductionOp* rop, - const std::vector& empty_axes) { - auto out = maybeReplaced(rop->out()->as()); - if (reductionOutputShouldBeReplaced(out, empty_axes)) { - auto out_shape = tensor_shape(out); - replaceWithFull(out, out_shape); - } -} - -void DynamicTransformConcretizer::replaceEmptyUse( - WelfordOp* wop, - const std::vector& empty_axes) { - auto avg = maybeReplaced(wop->outAvg()->as()); - auto var = maybeReplaced(wop->outVar()->as()); - auto N = maybeReplaced(wop->outN()->as()); - if (reductionOutputShouldBeReplaced(avg, empty_axes)) { - auto out_shape = tensor_shape(avg); - auto nan = - IrBuilder::create(std::numeric_limits::quiet_NaN()); - replaceWithFull(avg, out_shape, nan); - replaceWithFull(var, out_shape, nan); - replaceWithFull(N, out_shape); - } -} - -void DynamicTransformConcretizer::replaceEmptyUse( - PadOp* pop, - const std::vector& empty_axes) { - auto out = maybeReplaced(pop->out()->as()); - - // A cat op can have input empty tensors and still output a 
non-empty - // tensor. This is only possible if there is more than one input, so we - // only need to handle those cases. We find the non-empty inputs to cat - // then replace with another cat (or `set` if n=1). - // - // [Detecting cat ops] - // The `cat` function creates a CatOp object, but its inputs() are not - // the original inputs. Rather, they are the inputs after padding to the - // output extent in the concatenated dimension. Thus, in the IR graph, - // instead of the following: - // - // T0 T1 T2 - // \ | / - // CatOp - // | - // T3 - // - // a cat is represented as: - // - // T0 T1 T2 - // | | | - // PadOp PadOp PadOp - // \ | / - // CatOp - // | - // T3 - // - // If we determine that one of the inputs, T1, is empty in the cat - // dimension, then we rewrite this as: - // - // T0 T2 - // | | - // PadOp PadOp - // \ / - // CatOp - // | - // T3 - // - // This is done by simply calling the cat() command with only {T0, T2}. - auto pad_uses = pop->out()->uses(); - if (pad_uses.size() == 1 && pad_uses[0]->isA()) { - auto cop = pad_uses[0]->as(); - std::vector nonempty_inputs; - for (auto inp : cop->inputs()) { - // Each "input" to CatOp is a pad() of the corresponding _actual_ - // input. Here we peel off the pad op to collect the non-padded cat - // inputs. - auto padded_inp_tv = inp->as(); - TORCH_INTERNAL_ASSERT( - padded_inp_tv->definition() && - padded_inp_tv->definition()->isA(), - "Input to cat should have definition that is a PadOp"); - auto inp_pad_op = padded_inp_tv->definition()->as(); - auto inp_tv = inp_pad_op->in()->as(); - - if (inp_pad_op != pop) { - // We could remove other empty tensors here while we're at it. They will - // get removed by further passes anyway though as tv ranges over all - // empty tensors. - nonempty_inputs.push_back(inp_tv); - } - } - auto old_cat = cop->output(0)->as(); - auto new_cat = nonempty_inputs.size() == 1 - ? 
set(nonempty_inputs[0]) - : cat(nonempty_inputs, cop->concatenatedDim()); - replaceTV(old_cat, new_cat); - } else { // Replace pads that are not part of CatOps with full() - auto out_shape = tensor_shape(out); - // Wherever there is a zero in the input, we will replace the original - // output extent so that we no longer reference the now-zero input - // extent - for (auto ax : empty_axes) { - auto pad_widths = pop->getPadWidths((int)ax); - out_shape[ax] = add(pad_widths.first, pad_widths.second); - } - replaceWithFull(out, out_shape, pop->value()); + mutate(stmt); } } -void DynamicTransformConcretizer::removeEmptyBranches() { +void DynamicTransformConcretizer::concretizeEmptyExtents() { auto fusion = FusionGuard::getCurFusion(); - for (const auto& empty_tv_descr : info_->getEmptyTensors()) { - auto tv = info_->initialInfo()->lookUpTV(empty_tv_descr.tv_name); - auto rfactor = TensorDomain::noReductions(tv->getMaybeRFactorDomain()); - std::vector new_shape; - new_shape.reserve(rfactor.size()); - for (auto id : rfactor) { - new_shape.push_back(id->getMaybeExpandedExtent()); - } - for (auto ax : empty_tv_descr.empty_axes) { - // Hard-code zero extent for empty axes. This lets us detect empty input - // and output tensors during scheduling/execution. 
- registerConcretization(new_shape[ax], fusion->zeroVal()); - new_shape[ax] = fusion->zeroVal(); - } - - for (auto use : tv->uses()) { - replaceEmptyUse(use, empty_tv_descr.empty_axes); - } - if (!tv->isFusionInput()) { - replaceWithFull(tv, new_shape); - } - } -} - -TensorView* DynamicTransformConcretizer::replaceWithFull( - TensorView* tv, - std::vector& new_shape, - Val* fill_value) { - TensorView* mut_tv = nullptr; - if (!tv->definition()) { - return tv; + for (const auto& ext_index : info_->getEmptyExtents()) { + auto ext = info_->initialInfo()->getMaybeZeroExtents().at(ext_index); + auto zero = fusion->zeroVal(ext->getDataType().value()); + // Register the concretization of this scalar, which allows us to replace it + // whenever it is used as an extent member of an IterDomain. + registerConcretization(ext, zero); } - if (!fill_value) { - fill_value = tv->fusion()->zeroVal(); - } - if (fill_value->getDataType().value() != tv->getDataType().value()) { - fill_value = castOp(tv->getDataType().value(), fill_value); - } - mut_tv = full(new_shape, fill_value, tv->getDataType().value()); - - replaceTV(tv, mut_tv); - - return mut_tv; -} - -void DynamicTransformConcretizer::replaceTV( - TensorView* old_tv, - TensorView* new_tv) { - registerConcretization(old_tv, new_tv); - OptOutMutator::mutate(old_tv); - - for (auto use : old_tv->uses()) { - ir_utils::replaceValInExpr(use, old_tv, new_tv); - } - - if (old_tv->isFusionOutput()) { - old_tv->fusion()->replaceOutput(old_tv, new_tv); - } - - replaced_tvs_[old_tv] = new_tv; } void DynamicTransformConcretizer::concretizeReshape() { diff --git a/csrc/dynamic_transform.h b/csrc/dynamic_transform.h index 5a3b34bd0c2..0cb1af5f9d3 100644 --- a/csrc/dynamic_transform.h +++ b/csrc/dynamic_transform.h @@ -64,7 +64,7 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { //! Return a set of scalars that appear as extents in TensorViews in the //! Fusion. If any of these evaluate to zero, there is at least one empty //! 
TensorView present. - const std::unordered_set& getMaybeZeroExtents() const { + const std::vector& getMaybeZeroExtents() const { return maybe_zero_extents_; } @@ -80,15 +80,6 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { return dynamic_resized_ids_; } - TensorView* lookUpTV(size_t tv_name) const { - auto it = name_to_tensorview_.find(tv_name); - TORCH_INTERNAL_ASSERT( - it != name_to_tensorview_.end(), - "Could not find TensorView with name ", - tv_name); - return it->second; - } - std::string toString() const; DynamicTransformInitialInfo clone(IrCloner& ir_cloner) const; @@ -107,14 +98,6 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { private: DynamicTransformInitialInfo(Fusion* fusion) : fusion_(fusion) {} - // Holds mapping from the name() of a TensorView to its value. This is so that - // we can hold only the name of a tensor in conc_info and still be able to - // access a cloned TensorView. Holding pointers directly would not work in - // such a case since after cloning we no longer have a mapping between - // original Vals and cloned Vals. Note that the functionality offered by this - // map probably belongs in Fusion instead. - std::unordered_map name_to_tensorview_; - private: Fusion* fusion_ = nullptr; @@ -130,7 +113,9 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { // This is a minimal set of scalars to check for empty tensors. If any are // zero, we should traverse to find empty tensors. - std::unordered_set maybe_zero_extents_; + std::unordered_set maybe_zero_extents_set_; + // The set above is populated then used to create this unique vector + std::vector maybe_zero_extents_; // Root Vals that determine concretization std::unordered_set root_dynamic_vals_; @@ -138,32 +123,6 @@ class TORCH_CUDA_CU_API DynamicTransformInitialInfo { friend class DynamicTransformInitialInfoBuilder; }; -//! 
Describes known empty dimensions in a TensorView's maybe RFactor domain -struct TORCH_CUDA_CU_API EmptyTensorDescriptor { - size_t tv_name; - std::vector empty_axes; - - bool operator==(const EmptyTensorDescriptor& other) const { - return tv_name == other.tv_name && empty_axes == other.empty_axes; - } - - bool operator!=(const EmptyTensorDescriptor& other) const { - return !operator==(other); - } - - size_t hash() const { - size_t hash = 0; - for (auto ax : empty_axes) { - hash <<= 3; - hash ^= ax; - } - // We need to hash the tv address here, since we could conceivably find two - // different tensors that are empty in the same axes. - hash ^= std::hash()(tv_name); - return hash; - } -}; - //! A set of transformations for a symbolic fusion with concrete sizes //! of the fusion inputs class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { @@ -172,8 +131,8 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { const DynamicTransformInitialInfo* initial_info, ExpressionEvaluator* expr_eval); - const std::vector& getEmptyTensors() const { - return empty_tensors_; + const std::vector& getEmptyExtents() const { + return empty_extents_; } //! Return a vector of pairs holding the index of each reshaped TensorView in @@ -241,9 +200,9 @@ class TORCH_CUDA_CU_API DynamicTransformConcretizationInfo { //! result of analyzeView std::vector> reshape_transforms_; - //! Holds, for each empty tensor, a pointer to the tensor along with a vector - //! of positions in its rfactor domain which are size 0 - std::vector empty_tensors_; + //! Holds a vector of indices into initial_info_.getMaybeZeroExtents() which + //! evaluate to 0 + std::vector empty_extents_; //! Holds the index of the resized IterDomain (output of the Resize op) in the //! 
vector returned by initial_info_->getDynamicResizedIterDomains() along From bdbb31ca4a224fd38dc06082ce6f3cbecac3fa10 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 7 Jul 2023 12:09:18 -0400 Subject: [PATCH 48/63] Skip check for symbolic axis in mutate(TensorView) --- csrc/dynamic_transform.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 354466bab98..27e8c9db117 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -608,10 +608,6 @@ void DynamicTransformConcretizer::checkConcretizedUses( // concretized. Since symbolic IDs may be propagated down to // consumers, those domains need to be concretized accordingly. void DynamicTransformConcretizer::mutate(TensorView* tv) { - if (!tv->domain()->hasSymbolicAxis()) { - return; - } - // First, try to concretize the root domain as there may be symbolic // axes inherited from the producers propagateFromProducerToConsumer(tv); From 42392d34b5fb7bcf957b1a779bf5c6999cea9818 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 7 Jul 2023 12:21:14 -0400 Subject: [PATCH 49/63] Specify iter_type in cat() during RemoveEmptyPass This fixes a bug where we were re-introducing symbolic extents when doing empty replacements. 
--- csrc/ops/alias.cpp | 14 ++++++++++---- csrc/ops/alias.h | 6 ++++-- csrc/optimization/remove_empty.cpp | 12 ++++++++++-- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp index 57ea4714b18..c3e6526606b 100644 --- a/csrc/ops/alias.cpp +++ b/csrc/ops/alias.cpp @@ -441,7 +441,8 @@ bool hasSimilarDtype(DataType base, DataType dt) { TensorView* pad( TensorView* inp, const std::vector& pad_widths, - Val* value) { + Val* value, + std::optional iter_type_opt) { DataType dt = inp->getDataType().value(); if (!value) { // Create a zero of the appropriate type @@ -510,7 +511,8 @@ TensorView* pad( out_root_id = IterDomainBuilder(inp_root_id).is_rfactor_domain(true).build(); // Expand the root domain and mark it as a rfactor domain - out_rf_id = IterDomain::resize(out_root_id, left_pad, right_pad, true); + out_rf_id = IterDomain::resize( + out_root_id, left_pad, right_pad, true, iter_type_opt); is_padded_any = true; } root_ids.at(idx) = out_root_id; @@ -539,7 +541,10 @@ TensorView* pad( // account for the size difference between each of the inputs and the // output. All of the inputs to CatOp have the same shape as the // output shape. 
-TensorView* cat(const std::vector& inputs, int64_t cat_dim) { +TensorView* cat( + const std::vector& inputs, + int64_t cat_dim, + std::optional iter_type_opt) { TORCH_CHECK(!inputs.empty(), "No input tensor given"); const auto dtype = inputs.at(0)->getDataType().value(); @@ -642,7 +647,8 @@ TensorView* cat(const std::vector& inputs, int64_t cat_dim) { pad_widths.at((ndims - dim - 1) * 2 + 1) = right_pad_i; } - resized_inputs.at(input_idx) = pad(inputs.at(input_idx), pad_widths); + resized_inputs.at(input_idx) = + pad(inputs.at(input_idx), pad_widths, nullptr, iter_type_opt); } // Now all of resized_inputs have the same shape as the out tensor diff --git a/csrc/ops/alias.h b/csrc/ops/alias.h index 1d3299fd6e5..a9e5ea1eff7 100644 --- a/csrc/ops/alias.h +++ b/csrc/ops/alias.h @@ -97,12 +97,14 @@ TORCH_CUDA_CU_API TensorView* transpose(TensorView* x); TORCH_CUDA_CU_API TensorView* pad( TensorView* x, const std::vector& pad_widths, - Val* value = nullptr); + Val* value = nullptr, + std::optional iter_type_opt = std::nullopt); //! Concatenate tensors in the given dimension TORCH_CUDA_CU_API TensorView* cat( const std::vector& inputs, - int64_t dim); + int64_t dim, + std::optional iter_type_opt = std::nullopt); //! Return a tensor where each dimension is sliced as specified by the //! ranges parameter. Stepping must be one at this moment. diff --git a/csrc/optimization/remove_empty.cpp b/csrc/optimization/remove_empty.cpp index b133422d8ad..4c6532618e7 100644 --- a/csrc/optimization/remove_empty.cpp +++ b/csrc/optimization/remove_empty.cpp @@ -260,8 +260,16 @@ class EmptyTensorRemover : public DeadCodeRemover { if (non_empty_inputs.size() != cop->inputs().size()) { // Replace this op with a new cat op auto old_tv = cop->outputs()[0]->as(); - // NOTE: cat() will translate to set() if non_empty_inputs.size() == 1 - auto new_tv = cat(non_empty_inputs, dim); + // NOTE: cat() will translate to set() if non_empty_inputs.size() == 1. 
+ // Also note that unless we're careful this call to cat() might result in + // a symbolic axis, since the inputs have potentially symbolic extents in + // the cat dimension. However, since we have already undergone + // concretization at this point, we can trust the original IterType, + // so we pass it here to avoid creating new Symbolic axes. + auto iter_type = old_tv->getMaybeRFactorDomain() + .at(cop->concatenatedDim()) + ->getIterType(); + auto new_tv = cat(non_empty_inputs, dim, iter_type); registerReplacement(old_tv, new_tv); } } From 7f9b588fa955f8a98dd594ea5df871ca9f8987ee Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 15:03:33 -0400 Subject: [PATCH 50/63] Fix up this PR based on lessons from #576 --- csrc/dynamic_transform.cpp | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 27e8c9db117..68054c33504 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -520,19 +520,19 @@ class DynamicTransformConcretizer : public OptOutMutator { }; void DynamicTransformConcretizer::concretize() { - //! Concretize all dynamic reshape ops + // Concretize all dynamic reshape ops concretizeReshape(); - //! Set output IterTypes for dynamic resize ops + // Set output IterTypes for dynamic resize ops concretizeResize(); - //! 
Registers replacement of all empty extents with zeroVal() + // Registers replacement of all empty extents with zeroVal() concretizeEmptyExtents(); // Finally, propagate concretized domains - auto all_stmts = StmtSort::getStmts(info_->fusion(), true); - for (auto stmt : all_stmts) { - mutate(stmt); + auto all_stmts = StmtSort::getStmts(info_->fusion()); + for (auto tv : ir_utils::filterByType(all_stmts)) { + mutate(tv); } } @@ -541,6 +541,10 @@ void DynamicTransformConcretizer::concretizeEmptyExtents() { for (const auto& ext_index : info_->getEmptyExtents()) { auto ext = info_->initialInfo()->getMaybeZeroExtents().at(ext_index); auto zero = fusion->zeroVal(ext->getDataType().value()); + auto uses = ext->uses(); + for (auto use : uses) { + ir_utils::replaceValInExpr(use, ext, zero); + } // Register the concretization of this scalar, which allows us to replace it // whenever it is used as an extent member of an IterDomain. registerConcretization(ext, zero); @@ -562,7 +566,8 @@ void DynamicTransformConcretizer::concretizeReshape() { checkConcretizedUses(incomplete_out_tv, concrete_reshape_out_tv); // Replace the old tensor with the new concretized tensor - for (auto use_of_old_tv : incomplete_out_tv->uses()) { + auto uses = incomplete_out_tv->uses(); + for (auto use_of_old_tv : uses) { ir_utils::replaceValInExpr( use_of_old_tv, incomplete_out_tv, concrete_reshape_out_tv); } @@ -608,6 +613,12 @@ void DynamicTransformConcretizer::checkConcretizedUses( // concretized. Since symbolic IDs may be propagated down to // consumers, those domains need to be concretized accordingly. 
void DynamicTransformConcretizer::mutate(TensorView* tv) { + for (auto root_id : tv->getRootDomain()) { + // This will register root_id for mutation if its extent, start, or + // stop_offset is registered for mutation + OptOutMutator::mutate(root_id); + } + // First, try to concretize the root domain as there may be symbolic // axes inherited from the producers propagateFromProducerToConsumer(tv); @@ -680,12 +691,14 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { continue; } auto concretized_out_id = - IterDomainBuilder(out_id).iter_type(iter_type).build(); + IterDomainBuilder(maybeMutated(out_id)->as()) + .iter_type(iter_type) + .build(); registerConcretization(out_id, concretized_out_id); } - // The expr itself needs to be mutated as well in case the outputs are - // mutated, which can be done by the mutate method + // expr must be mutated in order to set it as the definition for the + // concretized outputs. OptOutMutator::mutate(expr); } } @@ -814,7 +827,9 @@ bool DynamicTransformConcretizer::propagateFromProducerToConsumer( consumer->toString()); auto concretized_id = - IterDomainBuilder(root_id).iter_type(*id_type).build(); + IterDomainBuilder(maybeMutated(root_id)->as()) + .iter_type(*id_type) + .build(); registerConcretization(root_id, concretized_id); is_concretized = true; From 7593ea4509aca9c18dcf6acd4e73d62851f3371d Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 15:04:44 -0400 Subject: [PATCH 51/63] Comment out non-working test case in DynamicPadShmoo --- test/test_dynamic_transform.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/test_dynamic_transform.cpp b/test/test_dynamic_transform.cpp index b815b8f9437..2a651a112ce 100644 --- a/test/test_dynamic_transform.cpp +++ b/test/test_dynamic_transform.cpp @@ -951,7 +951,10 @@ TEST_F(NVFuserTest, DynamicPadShmoo_CUDA) { //{{3, 5}, {-3, -2}, false}, // output is zero-dimensional // Output has size 1 so is set to broadcast. 
- {{3, 5}, {0, -4}, true}, + // This was previously "working" by concretizing the size-1 pad to + // Iteration, even though it should be Broadcast. When set properly to + // Broadcast, it fails with an error in ConcretizedBroadcastDomains. + //{{3, 5}, {0, -4}, true}, // Test full negative shifts, so output doesn't overlap input {{3, 5}, {-5, 2}, false}, From aa5d75cf1b1bc99aa2e4878b96d5de334bce7f4c Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 15:07:49 -0400 Subject: [PATCH 52/63] Clean up stale code --- csrc/dynamic_transform.cpp | 73 -------------------------------------- 1 file changed, 73 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 68054c33504..4a18401c0d5 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -8,19 +8,15 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include #include namespace nvfuser { @@ -418,75 +414,6 @@ class DynamicTransformConcretizer : public OptOutMutator { private: void concretize(); - //! removeEmptyBranches sets definitions of empty tensors to full(), and - //! replaces uses like reductions over empty axes with full calls. - //! - //! Consider the following Fusion with input T0, T1 and output T3: - //! - //! T0 - //! | - //! sum - //! | - //! T2 T1 - //! \ / - //! mul - //! | - //! T3 - //! - //! If T1 has any size-zero dimensions, then we know that T3 is also empty, - //! and T2 may be empty as well (unless it's broadcasting in all the empty - //! dimensions of T1). In this case, we can replace the entire Fusion with a - //! single call to full(): - //! - //! T0 T1 - //! - //! full - //! | - //! T3 - //! - //! Notice that the graph is now disconnected since T0 and T1 remain as Fusion - //! inputs. - //! - //! If instead, T1 is not empty, but T0 is, then there are two possibilities: - //! 
a) If any empty axes of T0 are not reduced, then T2 shares those empty - //! axes, in which case T3 must also be empty, so we can rewrite the Fusion - //! the same way as above, by redefining T3 = full(shape) - //! - //! b) If instead the empty axes of T0 are all being reduced in the sum, - //! then T2 is not empty. In this case, since T0 is an input, rewriting it - //! as a full() output is not helpful. However, we know that any use of an - //! empty tensor does not require computation over T0, so we can rewrite it. - //! In this case, we can rewrite the sum as a full(shape, 0) since the sum - //! over an empty tensor is 0 (more generally, the initial value of the - //! reduction). This leads to the following rewritten Fusion: - //! - //! T0 - //! - //! full - //! | - //! T2 T1 - //! \ / - //! mul - //! | - //! T3 - //! - //! After this call, the Fusion will only contain empty tensors if they are - //! Fusion inputs or outputs. Furthermore, output tensors will have constant - //! zeros for the extents of empty axes. - //! - //! Instead of sum, we may encounter pad or cat ops. These are handled as - //! follows: - //! - //! Pads of empty tensors are replaced with full() using a fill value equal - //! to the pad value. - //! - //! Cat of tensors including some that are empty in the cat dimension are - //! simply replaced with a call to cat() that excludes the empty tensors. - //! Note that if any non-cat dimensions are empty, then the output will be - //! empty as well and the cat becomes dead code, as in the second example - //! with empty T0 from above. 
- // void removeEmptyBranches(); - void concretizeReshape(); void concretizeResize(); From ae8b5b5062c400f07df32e563530fa518c47e556 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 15:12:34 -0400 Subject: [PATCH 53/63] Remove unused Fusion::replaceInput --- csrc/fusion.cpp | 29 ----------------------------- csrc/fusion.h | 3 --- 2 files changed, 32 deletions(-) diff --git a/csrc/fusion.cpp b/csrc/fusion.cpp index c510b357173..1093687e812 100644 --- a/csrc/fusion.cpp +++ b/csrc/fusion.cpp @@ -286,35 +286,6 @@ void Fusion::removeOutput(Val* output) { all_tv_uses_valid_ = false; } -void Fusion::replaceInput(Val* input, Val* replacement) { - auto find_input = std::find(inputs_.begin(), inputs_.end(), input); - TORCH_CHECK(find_input != inputs_.end(), "Unable to find input in Fusion"); - - std::replace_if( - inputs_.begin(), - inputs_.end(), - [&input](Val* v) { return v == input; }, - replacement); - - if (replacement->getValType().value() == ValType::TensorView) { - replacement->setIsFusionInput(true); - replacement->as()->setMemoryType(MemoryType::Global); - } - if (input->getValType().value() == ValType::TensorView) { - input->setIsFusionInput(false); - input->as()->setMemoryType(MemoryType::Local); - } - // Mark uses invalid so that they will be reset next time uses() is called - invalidateTvUses(); - - // Maintain aliased inputs - for (auto [aliased_output, aliased_input] : io_alias_) { - if (aliased_input == input) { - io_alias_[aliased_output] = replacement; - } - } -} - void Fusion::replaceOutput(Val* output, Val* replacement) { auto find_output = std::find(outputs_.begin(), outputs_.end(), output); TORCH_CHECK(find_output != outputs_.end(), "Unable to find output in Fusion"); diff --git a/csrc/fusion.h b/csrc/fusion.h index 83fd9a0c503..211a8a1abe0 100644 --- a/csrc/fusion.h +++ b/csrc/fusion.h @@ -127,9 +127,6 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer { //! 
Deregister output as an output of the fusion void removeOutput(Val* output); - //! Replace input with another value - void replaceInput(Val* input, Val* replacement); - //! Replace output with another value void replaceOutput(Val* output, Val* replacement); From da70cc17d5ffe95c32c6517e17fbeb18c2bace3e Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 15:12:45 -0400 Subject: [PATCH 54/63] Minor comment cleanup --- csrc/dynamic_transform.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 4a18401c0d5..4df74cb6c80 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -161,13 +161,13 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { } } - //! Convert the maybe_zero_extents_set_ to a vector so that we can index it. + //! Convert maybe_zero_extents_set_ to a vector so we can index it reliably void finalizeMaybeEmptyExtents() { info_.maybe_zero_extents_.reserve(info_.maybe_zero_extents_set_.size()); for (auto val : info_.maybe_zero_extents_set_) { info_.maybe_zero_extents_.push_back(val); } - // Clear the corresponding set to free memory and avoid speed up cloning + // Clear the corresponding set to free memory and speed up cloning info_.maybe_zero_extents_set_.clear(); } From 899ee217270a489ed3dbbd8ef41c253584c1e215 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 15:21:59 -0400 Subject: [PATCH 55/63] Update comment in FusionResizeMultiSliceEmpty_CUDA --- test/test_resize.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 147856ede75..f2b327df4b3 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2148,13 +2148,10 @@ TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) { auto tv0 = makeConcreteTensor(shape); fusion->addInput(tv0); - // Perform a size-1 slice and a size-0 slice on tv0. 
The size-1 slice - // could be size >1 with no change in the error. The order does not - // matter. Performing only one of these slices does not trigger the - // error and the output is correct in that case. If there are - // multiple size-0 slices the error is not triggered. It only seems - // to appear when there are both size-0 and size non-zero slices of - // the same tensor. + // In issue #365, this triggered an error in vectorization when there were + // multiple slices, and one of them was empty. If this is properly handled in + // the pre-segmentation RemoveEmptyPass as it should be, then the size-zero + // slices will be replaced with full(), and vectorization can work properly. auto tv1 = slice( tv0, {{IrBuilder::create(0), From 69356eb38e2297fc04d5b2d635b893edf642e5ce Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 15:29:56 -0400 Subject: [PATCH 56/63] Verify that tv2 is replaced by full in #365 repro test --- test/test_resize.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index f2b327df4b3..939bf127a48 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2179,6 +2179,14 @@ TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) { TORCH_CHECK(ref0.equal(cg_outputs[0])); TORCH_CHECK(ref1.equal(cg_outputs[1])); + + // Check that tv2 is replaced by a FullOp + const auto runtime = executor_cache.getMostRecentKernelRuntime(); + const auto preseg_fusion = runtime->fusionSegments()->completeFusion(); + EXPECT_EQ(preseg_fusion->outputs().size(), 2); + EXPECT_NE(preseg_fusion->outputs().at(1), tv1); + EXPECT_NE(preseg_fusion->outputs().at(1)->definition(), nullptr); + EXPECT_TRUE(preseg_fusion->outputs().at(1)->definition()->isA()); } TEST_F(NVFuserTest, SliceVectorization) { From 4dcff1e282b507396e7a7ba64cf9dc999512d178 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 15:58:13 -0400 Subject: [PATCH 57/63] Fix segfault in segmentation --- 
csrc/fusion_segmenter.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index 377af6898da..21410df8085 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3399,13 +3399,24 @@ void SegmentCandidateFinder::forwardInputs() { continue; } - if (expr->output(0)->uses().size() > 1) { + // expr is a unary op so there is a single output. Here we look at that + // output's further uses + auto output_uses = expr->output(0)->uses(); + if (output_uses.size() == 0) { + // Unused outputs terminate here + continue; + } + + // If the output of expr has multiple uses, exclude it and move on. + if (output_uses.size() > 1) { excluded_inp_unary_exprs_.pushBack(expr); forwarded_inputs.pushBack(expr->output(0)); continue; } - to_visit.emplace_back(expr->output(0)->uses()[0]); + // If there is a single use, visit it to try and extend the chain of + // unaryOps + to_visit.emplace_back(output_uses[0]); } } From aac04e8147d0920951399b0445293a7b452d646e Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 16:10:59 -0400 Subject: [PATCH 58/63] Fix clang-tidy warning about .empty() --- csrc/fusion_segmenter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index c8ca12ca6e0..5bb3d8999b0 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -3402,7 +3402,7 @@ void SegmentCandidateFinder::forwardInputs() { // expr is a unary op so there is a single output. 
Here we look at that // output's further uses auto output_uses = expr->output(0)->uses(); - if (output_uses.size() == 0) { + if (output_uses.empty()) { // Unused outputs terminate here continue; } From a1c923812027e7ca1b8ac7b3d66412145f4ff206 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 12 Jul 2023 16:16:12 -0400 Subject: [PATCH 59/63] Update test to accommodate recent Scalar refactor --- test/test_resize.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_resize.cpp index ba7c35a5c03..5eb27f9b001 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2167,15 +2167,15 @@ TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) { // slices will be replaced with full(), and vectorization can work properly. auto tv1 = slice( tv0, - {{IrBuilder::create(0), - IrBuilder::create(1), - IrBuilder::create(1)}}); + {{IrBuilder::create(0), + IrBuilder::create(1), + IrBuilder::create(1)}}); fusion->addOutput(tv1); auto tv2 = slice( tv0, - {{IrBuilder::create(0), - IrBuilder::create(0), - IrBuilder::create(1)}}); + {{IrBuilder::create(0), + IrBuilder::create(0), + IrBuilder::create(1)}}); fusion->addOutput(tv2); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); From 16a9418fa107edffed56d6d7a31753a8edc326e9 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com> Date: Thu, 13 Jul 2023 07:29:08 -0400 Subject: [PATCH 60/63] Initialize maybe_zero_extents_ more efficiently Co-authored-by: Naoya Maruyama --- csrc/dynamic_transform.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/csrc/dynamic_transform.cpp index 4df74cb6c80..f69eefb4822 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -163,10 +163,7 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { //!
Convert maybe_zero_extents_set_ to a vector so we can index it reliably void finalizeMaybeEmptyExtents() { - info_.maybe_zero_extents_.reserve(info_.maybe_zero_extents_set_.size()); - for (auto val : info_.maybe_zero_extents_set_) { - info_.maybe_zero_extents_.push_back(val); - } + info_.maybe_zero_extents_ = std::vector(info_.maybe_zero_extents_set_.begin(), info_.maybe_zero_extents_set_.end()); // Clear the corresponding set to free memory and speed up cloning info_.maybe_zero_extents_set_.clear(); } From 91e3c0484b81dc0d4a7e3f8d4c67c611f7a19a90 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 13 Jul 2023 07:41:17 -0400 Subject: [PATCH 61/63] Minor type and linter fix --- csrc/dynamic_transform.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index f69eefb4822..c443e863808 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -163,7 +163,9 @@ class DynamicTransformInitialInfoBuilder : public IterVisitor { //! 
Convert maybe_zero_extents_set_ to a vector so we can index it reliably void finalizeMaybeEmptyExtents() { - info_.maybe_zero_extents_ = std::vector(info_.maybe_zero_extents_set_.begin(), info_.maybe_zero_extents_set_.end()); + info_.maybe_zero_extents_ = std::vector( + info_.maybe_zero_extents_set_.begin(), + info_.maybe_zero_extents_set_.end()); // Clear the corresponding set to free memory and speed up cloning info_.maybe_zero_extents_set_.clear(); } From 14468a6f8855e27818fa2e39105a7b423943084a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 13 Jul 2023 15:31:00 -0400 Subject: [PATCH 62/63] Expand comment about why we concretize ext->zero --- csrc/dynamic_transform.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/csrc/dynamic_transform.cpp index c443e863808..27f91a66c68 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -473,6 +473,12 @@ void DynamicTransformConcretizer::concretizeEmptyExtents() { } // Register the concretization of this scalar, which allows us to replace it // whenever it is used as an extent member of an IterDomain. + // + // When we replace ext with zero in all uses above, it affects downstream expressions. For + // example we might replace i0 with 0 in (i0 + i1) + i2 to form (0 + i1) + + // i2. However, i0 itself might be used as the extent, start, or stop values + // in an IterDomain, so we register the concretization here so that we can + // replace these values whenever we encounter them.
registerConcretization(ext, zero); } } From 6f2db3bcbaac5d620c56315f265246da3ecc38bd Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 13 Jul 2023 15:33:04 -0400 Subject: [PATCH 63/63] Update comment about cat in EmptyTensorRemover --- csrc/optimization/remove_empty.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/csrc/optimization/remove_empty.cpp b/csrc/optimization/remove_empty.cpp index 450fe45a1ab..a582984f5de 100644 --- a/csrc/optimization/remove_empty.cpp +++ b/csrc/optimization/remove_empty.cpp @@ -262,10 +262,12 @@ class EmptyTensorRemover : public DeadCodeRemover { auto old_tv = cop->outputs()[0]->as(); // NOTE: cat() will translate to set() if non_empty_inputs.size() == 1. // Also note that unless we're careful this call to cat() might result in - // symbolic axis, since the inputs have potentially symbolic extents in - // the cat dimension. However, since we have already undergone - // concretization at this point, we can trust that the original IterType, - // so we pass it here to avoid creating new Symbolic axes. + // symbolic axis, since the inputs may have unknown extents in the cat + // dimension. By default, cat() will make the conservative choice in such + // a situation and set the output IterType to Symbolic. However, since we + // have already undergone concretization at this point, we can trust that + // the original IterType is correct, so we pass it here to avoid creating + // new Symbolic axes. auto iter_type = old_tv->getMaybeRFactorDomain() .at(cop->concatenatedDim()) ->getIterType();