From ea50620f88f45797b8d68f25ed387c00ed046f31 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 13 Sep 2023 10:14:36 -0400 Subject: [PATCH 01/12] Add test --- python_tests/test_python_frontend.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/python_tests/test_python_frontend.py b/python_tests/test_python_frontend.py index 821cf90df25..21bd8ae0619 100644 --- a/python_tests/test_python_frontend.py +++ b/python_tests/test_python_frontend.py @@ -2462,6 +2462,27 @@ def fusion_func(fd: FusionDefinition) -> None: self.assertEqual(nvf_out[0].shape, (0, 0)) self.assertEqual(nvf_out[1].shape, (0, 0)) + # Test that a pad of an expanded empty tensor works properly + # See https://github.com/NVIDIA/Fuser/issues/596#issuecomment-1714465618 + def test_pad_expanded_empty(self): + inputs = [ + torch.randn((0,), dtype=torch.float64, device="cuda:0").as_strided( + (2, 0, 3), (0, 0, 0) + ), + ] + + def fusion_func(fd: FusionDefinition) -> None: + T0 = fd.from_pytorch(inputs[0]) + S1 = fd.define_scalar(-3.70753, dtype=DataType.Double) + T2 = fd.ops.pad(T0, [0, 0, 1, 1, 1, 0], S1) + fd.add_output(T2) + + nvf_out, _ = self.exec_nvfuser(fusion_func, inputs) + + torch_ref = F.pad(inputs[0], (0, 0, 1, 1, 1, 0), "constant", -3.70753) + + self.assertEqual(nvf_out[0], torch_ref) + if __name__ == "__main__": run_tests() From ed8f5b1ab29cb5aa9ae9ed28791a74c436f1e13e Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 13 Sep 2023 10:14:54 -0400 Subject: [PATCH 02/12] Use expanded extent and mutated out_id --- csrc/dynamic_transform.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index b4f205ca487..7311f7cf3b3 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -305,7 +305,7 @@ void DynamicTransformConcretizationInfo::analyzeResizes( "Found non-dynamic Resize in initial concretization info: ", op->toString()); - auto extent_val = expr_eval->evaluate(out_id->extent()); + auto extent_val = expr_eval->evaluate(out_id->getMaybeExpandedExtent()); NVF_ERROR( extent_val.hasValue(), "Cannot evaluate the extent of a resized domain: ", @@ -678,7 +678,8 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { } // Update the IterType of each output for (auto out_id : ir_utils::filterByType(expr->outputs())) { - if (!out_id->isSymbolic()) { + auto mut_id = maybeMutated(out_id)->as(); + if (!mut_id->isSymbolic()) { continue; } @@ -690,9 +691,7 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { expr->toString()); auto concretized_out_id = - IterDomainBuilder(maybeMutated(out_id)->as()) - .iter_type(iter_type) - .build(); + IterDomainBuilder(mut_id).iter_type(iter_type).build(); registerConcretization(out_id, concretized_out_id); } From 2ae5bf9eeffbd5bd53249856257e3c1921221ac1 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 13 Sep 2023 11:47:28 -0400 Subject: [PATCH 03/12] Bind tensorviews in PrecomputedValues::bindInputs --- csrc/evaluator_common.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/evaluator_common.cpp b/csrc/evaluator_common.cpp index e19631a59aa..2757c1cbf96 100644 --- a/csrc/evaluator_common.cpp +++ b/csrc/evaluator_common.cpp @@ -170,9 +170,8 @@ void PrecomputedValues::bindInputs(const KernelArgumentHolder& args) { if (!tensor.is_cpu()) { bindTensorMetaData(tensor_input, tensor); } - } else { - bindValue(input->evaluatorIndex(), *args[i]); } + bindValue(input->evaluatorIndex(), *args[i]); } } From 26a1f4458961ffa2bb88dddaa927e85739f0e6b2 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 19 Sep 2023 11:30:28 -0400 Subject: [PATCH 04/12] Bind extents and TensorMetaData instead of tensor itself --- csrc/executor_utils.cpp | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 5c6a70148b4..10e321f7fda 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -721,7 +721,35 @@ ExpressionEvaluator bindInputs( ExpressionEvaluator expr_eval; const auto& inputs = kernel->inputs(); for (const auto i : c10::irange(inputs.size())) { - expr_eval.bind(inputs[i], *args[i], true); + if (auto tv = dynamic_cast(inputs[i])) { + const auto& input = args[i]->as(); + + // bind all extents + const auto& root_dom = + TensorDomain::noReductions(tv->getMaybeRFactorDomain()); + NVF_ERROR(root_dom.size() == input.sizes().size()); + for (const auto j : c10::irange(input.sizes().size())) { + expr_eval.bind( + root_dom.at(j)->getMaybeExpandedExtent(), + input.sizes().at(j), + true); + } + + // bind TensorMetaData so that GetMetaData expressions can be evaluated + std::shared_ptr struct_ = std::make_shared(); + TensorMetaData* metadata = (TensorMetaData*)struct_.get(); + metadata->dtype = + std::get(aten_to_data_type(input.scalar_type()).type); + metadata->data = input.data_ptr(); + metadata->logical_size = input.sizes(); + metadata->logical_stride = input.strides(); + metadata->alloc_size = input.sizes(); + metadata->alloc_stride = input.strides(); + PolymorphicValue pv(std::move(struct_)); + expr_eval.bind(IrBuilder::metadataExpr(tv), pv, true); + } else { + expr_eval.bind(inputs[i], *args[i], true); + } } return expr_eval; From a21ae8c6261ed54393fd761fdeffacf649b3a39f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 19 Sep 2023 14:44:49 -0400 Subject: [PATCH 05/12] Revert "Bind extents and TensorMetaData instead of tensor itself" This reverts commit 26a1f4458961ffa2bb88dddaa927e85739f0e6b2. --- csrc/executor_utils.cpp | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 10e321f7fda..5c6a70148b4 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -721,35 +721,7 @@ ExpressionEvaluator bindInputs( ExpressionEvaluator expr_eval; const auto& inputs = kernel->inputs(); for (const auto i : c10::irange(inputs.size())) { - if (auto tv = dynamic_cast(inputs[i])) { - const auto& input = args[i]->as(); - - // bind all extents - const auto& root_dom = - TensorDomain::noReductions(tv->getMaybeRFactorDomain()); - NVF_ERROR(root_dom.size() == input.sizes().size()); - for (const auto j : c10::irange(input.sizes().size())) { - expr_eval.bind( - root_dom.at(j)->getMaybeExpandedExtent(), - input.sizes().at(j), - true); - } - - // bind TensorMetaData so that GetMetaData expressions can be evaluated - std::shared_ptr struct_ = std::make_shared(); - TensorMetaData* metadata = (TensorMetaData*)struct_.get(); - metadata->dtype = - std::get(aten_to_data_type(input.scalar_type()).type); - metadata->data = input.data_ptr(); - metadata->logical_size = input.sizes(); - metadata->logical_stride = input.strides(); - metadata->alloc_size = input.sizes(); - metadata->alloc_stride = input.strides(); - PolymorphicValue pv(std::move(struct_)); - expr_eval.bind(IrBuilder::metadataExpr(tv), pv, true); - } else { - expr_eval.bind(inputs[i], *args[i], true); - } + expr_eval.bind(inputs[i], *args[i], true); } return expr_eval; From ef85ea154d36efce4fd4fd1944bfcea32c62e952 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 19 Sep 2023 14:48:34 -0400 Subject: [PATCH 06/12] Add comment about binding tensors to ExpressionEvaluator --- csrc/executor_utils.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/csrc/executor_utils.cpp b/csrc/executor_utils.cpp index 5c6a70148b4..14bcf3f7d4b 100644 --- a/csrc/executor_utils.cpp +++ b/csrc/executor_utils.cpp @@ -721,6 +721,9 @@ ExpressionEvaluator bindInputs( ExpressionEvaluator expr_eval; const auto& inputs = kernel->inputs(); for (const auto i : c10::irange(inputs.size())) { + // NOTE: we bind all inputs here, including at::Tensors. This means that + // expr_eval will create a PolymorphicValue containing *args[i], which means + // that at::Tensor's lifetime will be at least as long as that of expr_eval. expr_eval.bind(inputs[i], *args[i], true); } From 3f7e19fde2bb4d9c7c16e08f443a254e9ca99afe Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 21 Sep 2023 08:10:59 -0400 Subject: [PATCH 07/12] Only bind tv metadata in PrecomputedValues --- csrc/evaluator_common.cpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/csrc/evaluator_common.cpp b/csrc/evaluator_common.cpp index 2757c1cbf96..a2bc919b640 100644 --- a/csrc/evaluator_common.cpp +++ b/csrc/evaluator_common.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -170,8 +171,9 @@ void PrecomputedValues::bindInputs(const KernelArgumentHolder& args) { if (!tensor.is_cpu()) { bindTensorMetaData(tensor_input, tensor); } + } else { + bindValue(input->evaluatorIndex(), *args[i]); } - bindValue(input->evaluatorIndex(), *args[i]); } } @@ -341,6 +343,23 @@ void PrecomputedValues::bindTensorMetaData( bindValue(extent->evaluatorIndex(), value); } } + + // Here we bind TensorMetaData so that GetMetaData expressions can be + // evaluated. Note that we do not bind the at::Tensor itself here since that + // would mean PrecomputedValues will own the tensor. Unlike + // ExpressionEvaluator, PrecomputedValues objects are typically long-lived, so + // we do not want them to own large objects. + std::shared_ptr struct_ = std::make_shared(); + TensorMetaData* metadata = (TensorMetaData*)struct_.get(); + metadata->dtype = + std::get(aten_to_data_type(tensor.scalar_type()).type); + metadata->data = tensor.data_ptr(); + metadata->logical_size = tensor.sizes(); + metadata->logical_stride = tensor.strides(); + metadata->alloc_size = tensor.sizes(); + metadata->alloc_stride = tensor.strides(); + PolymorphicValue pv(std::move(struct_)); + bindValue(IrBuilder::metadataExpr(tv)->evaluatorIndex(), pv); } NaiveValueMachine::NaiveValueMachine(PrecomputedValues& precomputed_values) From 41d6b2db0989bf6df0ab16ba67d5598fd161156b Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 21 Sep 2023 10:21:16 -0400 Subject: [PATCH 08/12] Add C++ test --- test/test_resize.cpp | 47 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/test/test_resize.cpp b/test/test_resize.cpp index 1fec7dbcb5b..76dc8ca6bcd 100644 --- a/test/test_resize.cpp +++ b/test/test_resize.cpp @@ -2828,4 +2828,51 @@ TEST_F(ResizeTest, CatOfExpandedBroadcast) { NVF_CHECK(ref.equal(cg_outputs[0])); } +// Test that an empty input which is expanded in some non-zero directions can be +// padded in the empty dim as well as the expanded dims. +// This should match test_python_frontend.py::test_pad_expanded_empty +// See https://github.com/NVIDIA/Fuser/issues/870 +TEST_F(ResizeTest, PadExpandedEmpty) { + auto fusion_ptr = std::make_unique(); + auto& fusion = *fusion_ptr; + FusionGuard fg(&fusion); + + auto i0 = IrBuilder::create(DataType::Index); + auto i1 = IrBuilder::create(DataType::Index); + auto i2 = IrBuilder::create(DataType::Index); + + auto tv0 = TensorViewBuilder() + .shape({i0, i1, i2}) + .expanded({true, false, true}) + .dtype(DataType::Double) + .build(); + fusion.addInput(tv0); + + auto s0 = IrBuilder::create(-3.70753); + + std::vector pad_widths( + {fusion.zeroVal(DataType::Index), + fusion.zeroVal(DataType::Index), + fusion.oneVal(DataType::Index), + fusion.oneVal(DataType::Index), + fusion.oneVal(DataType::Index), + fusion.zeroVal(DataType::Index)}); + auto tv1 = pad(tv0, pad_widths, s0); + fusion.addOutput(tv1); + + auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); + + auto t0 = at::randn({0}, options).as_strided({2, 0, 3}, {0, 0, 0}); + std::vector aten_inputs({t0}); + + FusionExecutorCache executor_cache(std::move(fusion_ptr)); + auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs); + + std::cout << t0 << std::endl; + std::cout << t0.strides() << std::endl; + + testValidate( + executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__); +} + } // namespace nvfuser From 84ed3b4254bae3413b54831e2cb4d4469753691a Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 21 Sep 2023 10:21:24 -0400 Subject: [PATCH 09/12] Add StructHandle::operator== --- csrc/polymorphic_value.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/csrc/polymorphic_value.h b/csrc/polymorphic_value.h index 5eb3b0c5244..86016e3e295 100644 --- a/csrc/polymorphic_value.h +++ b/csrc/polymorphic_value.h @@ -181,6 +181,12 @@ class StructHandle { StructHandle& operator=(const StructHandle& other) = default; StructHandle& operator=(StructHandle&& other) = default; + //! This is a shallow comparison operator that just checks whether we point to + //! the same exact Struct + bool operator==(const StructHandle& other) const { + return struct_ptr_ == other.struct_ptr_; + } + template bool is() const { return std::dynamic_pointer_cast(struct_ptr_) != nullptr; From c9f174af628ef3600c7ec5ae011af7cc1709df17 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 21 Sep 2023 10:38:28 -0400 Subject: [PATCH 10/12] Silence lint error about TORCH_CUDA_CU_API --- csrc/scheduler/normalization_inner.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/scheduler/normalization_inner.h b/csrc/scheduler/normalization_inner.h index 8fd0ed8ef6d..d05d2ea724f 100644 --- a/csrc/scheduler/normalization_inner.h +++ b/csrc/scheduler/normalization_inner.h @@ -65,17 +65,17 @@ class InnerPersistentKernelScheduler : public SchedulerEntry { const scheduler_utils::ReductionTvProperties& properties); }; -TORCH_CUDA_CU_API std::shared_ptr getInnerPersistentHeuristics( +std::shared_ptr getInnerPersistentHeuristics( Fusion* fusion, const at::ArrayRef& runtime_inputs, HeuristicSummary* data_cache = nullptr); -TORCH_CUDA_CU_API std::shared_ptr getInnerPersistentHeuristics( +std::shared_ptr getInnerPersistentHeuristics( Fusion* fusion, SchedulerRuntimeInfo& runtime_info, HeuristicSummary* data_cache = nullptr); -TORCH_CUDA_CU_API void scheduleInnerPersistentKernel( +void scheduleInnerPersistentKernel( Fusion* fusion, const ReductionParams& rparams); From 70c06ba51825e461c90a9400cb1351d7b3e8c5a3 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 21 Sep 2023 15:55:15 -0400 Subject: [PATCH 11/12] Use ExpressionEvaluator to build metadata for PV --- csrc/evaluator_common.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/csrc/evaluator_common.cpp b/csrc/evaluator_common.cpp index b3c3341c83d..5864d2e744d 100644 --- a/csrc/evaluator_common.cpp +++ b/csrc/evaluator_common.cpp @@ -349,17 +349,19 @@ void PrecomputedValues::bindTensorMetaData( // would mean PrecomputedValues will own the tensor. Unlike // ExpressionEvaluator, PrecomputedValues objects are typically long-lived, so // we do not want them to own large objects. - std::shared_ptr struct_ = std::make_shared(); - TensorMetaData* metadata = (TensorMetaData*)struct_.get(); - metadata->dtype = - std::get(aten_to_data_type(tensor.scalar_type()).type); - metadata->data = tensor.data_ptr(); - metadata->logical_size = tensor.sizes(); - metadata->logical_stride = tensor.strides(); - metadata->alloc_size = tensor.sizes(); - metadata->alloc_stride = tensor.strides(); - PolymorphicValue pv(std::move(struct_)); - bindValue(IrBuilder::metadataExpr(tv)->evaluatorIndex(), pv); + // To do this we create a temporary ExpressionEvaluator so that we can compute + // the metadata once, then save it + ExpressionEvaluator ee; + ee.bind(tv, tensor); + auto metadata_val = IrBuilder::metadataExpr(tv); + auto metadata = ee.evaluate(metadata_val); + NVF_ERROR( + metadata.hasValue(), + "Could not evaluate metadata expression for ", + tv->toString(), + " with input tensor ", + tensor); + bindValue(metadata_val->evaluatorIndex(), metadata); } NaiveValueMachine::NaiveValueMachine(PrecomputedValues& precomputed_values) From 139580f9f189d6902daf6f0ad7f1d29709014149 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Fri, 22 Sep 2023 12:57:14 -0400 Subject: [PATCH 12/12] Bind this and add note about dangers --- csrc/evaluator_common.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/csrc/evaluator_common.cpp b/csrc/evaluator_common.cpp index 5864d2e744d..0298fe5d329 100644 --- a/csrc/evaluator_common.cpp +++ b/csrc/evaluator_common.cpp @@ -352,9 +352,16 @@ void PrecomputedValues::bindTensorMetaData( // To do this we create a temporary ExpressionEvaluator so that we can compute // the metadata once, then save it ExpressionEvaluator ee; + ee.bindPrecomputedValues(this); ee.bind(tv, tensor); auto metadata_val = IrBuilder::metadataExpr(tv); auto metadata = ee.evaluate(metadata_val); + // NOTE: In some cases we may not be able to evaluate metadata. For example, + // if there exists a split expression between the root and rfactor domains + // of tv whose split factor is not able to be evaluated. For that reason, + // calling code should ensure that all inputs required to propagate strides + // from root to allocation domains are already bound to "this" before binding + // a TensorView's metadata. NVF_ERROR( metadata.hasValue(), "Could not evaluate metadata expression for ",