From c1761a5e6223cf4f161c8e2318dd80c2aa71ff7e Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 18 May 2023 10:39:48 -0400
Subject: [PATCH 1/6] Add C++ and python tests for multiple slices, some empty

---
 python_tests/test_python_frontend.py | 58 ++++++++++++++++++++++++++++
 test/test_resize.cpp                 | 48 +++++++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/python_tests/test_python_frontend.py b/python_tests/test_python_frontend.py
index 0d50d6631f9..4a548a55e02 100644
--- a/python_tests/test_python_frontend.py
+++ b/python_tests/test_python_frontend.py
@@ -2182,6 +2182,64 @@ def fusion_func(fd: FusionDefinition, inps, matmul_fn) -> None:
         fc = FusionCache.get()
         fc.reset()
 
+    def test_multiple_empty_slices(self):
+        inputs = [
+            torch.testing.make_tensor((4,), dtype=torch.float32, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            T2 = fd.define_tensor(
+                symbolic_sizes=[-1],
+                contiguity=[True],
+                dtype=DataType.Float,
+                is_cpu=False,
+            )
+            # Perform a size-1 slice and a size-0 slice on T2. The size-1 slice
+            # could be size >1 with no change in the error. The order does not
+            # matter. Performing only one of these slices does not trigger the
+            # error and the output is correct in that case. If there are
+            # multiple size-0 slices the error is not triggered. It only seems
+            # to appear when there are both size-0 and size non-zero slices of
+            # the same tensor.
+            T3 = fd.ops.slice(T2, start_indices=[2], end_indices=[3], strides=[1])
+            T4 = fd.ops.slice(T2, start_indices=[1], end_indices=[1], strides=[1])
+            fd.add_output(T3)
+            fd.add_output(T4)
+
+        nvf_out, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        self.assertEqual(inputs[0][2:3], nvf_out[0])
+        self.assertEqual(inputs[0][1:1], nvf_out[1])
+
+    def test_thunder420(self):
+        inputs = [
+            torch.testing.make_tensor((4, 6, 7), dtype=torch.float32, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            T2 = fd.define_tensor(
+                symbolic_sizes=[-1],
+                contiguity=[True],
+                dtype=DataType.Float,
+                is_cpu=False,
+            )
+            # Slice T2 into a non-empty slice [0, 4) and an empty slice [1, 1).
+            # Like test_multiple_empty_slices above, this exercises a size-0
+            # slice and a non-empty slice of the same tensor.
+            # NOTE(review): T2 is declared 1-D (symbolic_sizes=[-1]) but the
+            # test input is 3-D (4, 6, 7), and the checks below compare
+            # nvf_out[0] against inputs[0][2:3] rather than inputs[0][0:4];
+            # confirm the intended repro before merging.
+            T3 = fd.ops.slice(T2, start_indices=[0], end_indices=[4], strides=[1])
+            T4 = fd.ops.slice(T2, start_indices=[1], end_indices=[1], strides=[1])
+            fd.add_output(T3)
+            fd.add_output(T4)
+
+        nvf_out, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        self.assertEqual(inputs[0][2:3], nvf_out[0])
+        self.assertEqual(inputs[0][1:1], nvf_out[1])
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_resize.cpp b/test/test_resize.cpp
index d8fe4cffa76..7ba1d16c368 100644
--- a/test/test_resize.cpp
+++ b/test/test_resize.cpp
@@ -2053,4 +2053,52 @@ TEST_F(NVFuserTest, ResizePermuteAndSlice_CUDA) {
       __FILE__);
 }
 
+// Test issue with multiple slices, one of which has size 0
+TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  std::vector<int64_t> shape({9});
+
+  // concrete shapes to avoid dynamic Fusion
+  auto tv0 = makeConcreteTensor(shape);
+  fusion->addInput(tv0);
+
+  // Perform a size-1 slice and a size-0 slice on tv0. The size-1 slice
+  // could be size >1 with no change in the error. The order does not
+  // matter. Performing only one of these slices does not trigger the
+  // error and the output is correct in that case. If there are
+  // multiple size-0 slices the error is not triggered. It only seems
+  // to appear when there are both size-0 and size non-zero slices of
+  // the same tensor.
+  auto tv1 = slice(
+      tv0,
+      {{IrBuilder::create<Int>(0),
+        IrBuilder::create<Int>(1),
+        IrBuilder::create<Int>(1)}});
+  fusion->addOutput(tv1);
+  auto tv2 = slice(
+      tv0,
+      {{IrBuilder::create<Int>(0),
+        IrBuilder::create<Int>(0),
+        IrBuilder::create<Int>(1)}});
+  fusion->addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+  FusionExecutor fe;
+
+  auto ref0 = t0.index({at::indexing::Slice(0, 1)});
+  auto ref1 = t0.index({at::indexing::Slice(0, 0)});
+
+  TORCH_CHECK(ref0.equal(cg_outputs[0]));
+  TORCH_CHECK(ref1.equal(cg_outputs[1]));
+}
+
 } // namespace nvfuser

From 76c80113d31130a5d93b2efa8709985c2f0cde03 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 18 May 2023 10:56:23 -0400
Subject: [PATCH 2/6] Remove numerator!=0 check.

Note that this is not really safe as it will set zero_ = false even if
it's still zero because of zero input numerator.
---
 csrc/scheduler/vectorize_helper.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/csrc/scheduler/vectorize_helper.h b/csrc/scheduler/vectorize_helper.h
index afd8e601a1d..cb653f883f6 100644
--- a/csrc/scheduler/vectorize_helper.h
+++ b/csrc/scheduler/vectorize_helper.h
@@ -71,12 +71,6 @@ class TORCH_CUDA_CU_API ProjectedExtent {
   // Multiply numerator by provided value, or if currently zero set numerator to
   // provided value.
   void multiplyNumeratorValue(Val* new_numerator_val) {
-    TORCH_INTERNAL_ASSERT(
-        !new_numerator_val->isZeroInt() &&
-            (!new_numerator_val->isConstInt() ||
-                new_numerator_val->evaluateInt() > 0),
-        "Adding numerator value of zero not supported in ProjectedExtent.");
-
     zero_ = false;
     if (new_numerator_val->isConstInt()) {
       const_numerator_vals_.push_back(new_numerator_val->evaluateInt());

From 86da47c6644400db4a705adc54baca5e131aa6b3 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 18 May 2023 10:57:19 -0400
Subject: [PATCH 3/6] Remove split/merge size-0 checks. See #367.

---
 csrc/ir_nodes.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/csrc/ir_nodes.cpp b/csrc/ir_nodes.cpp
index 4c4634841a4..e42998b00d9 100644
--- a/csrc/ir_nodes.cpp
+++ b/csrc/ir_nodes.cpp
@@ -2250,9 +2250,6 @@ std::vector<IterDomain*> IterDomain::clone(
 // domains have valid start and stop, it's not possible to contiguous
 // predication.
 IterDomain* IterDomain::merge(IterDomain* outer, IterDomain* inner) {
-  TORCH_CHECK(
-      !outer->extent()->isZeroInt() && !inner->extent()->isZeroInt(),
-      "Merging IterDomains with ending values that are 0 is not supported at this time.");
   TORCH_CHECK(
       outer->isReduction() == inner->isReduction(),
       "Merging IterDomains requires that their iteration types match. ",
@@ -2330,10 +2327,6 @@ std::pair<IterDomain*, IterDomain*> IterDomain::split(
     bool inner_split,
     Val* start_offset,
     Val* stop_offset) {
-  TORCH_CHECK(
-      !in->extent()->isZeroInt(),
-      "Splitting IterDomains with ending values that are 0 is not supported at this time.");
-
   TORCH_CHECK(
       factor->isIntegralScalar(), "Cannot split by non-integer value ", factor);
 

From 8f18ffece71ffa2cb5b4d2d45e5b586accdd41c4 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 18 May 2023 12:09:25 -0400
Subject: [PATCH 4/6] Continue if constant zero in multiplyNumeratorValue

---
 csrc/scheduler/vectorize_helper.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/csrc/scheduler/vectorize_helper.h b/csrc/scheduler/vectorize_helper.h
index cb653f883f6..4fcfdcbd9e0 100644
--- a/csrc/scheduler/vectorize_helper.h
+++ b/csrc/scheduler/vectorize_helper.h
@@ -71,6 +71,13 @@ class TORCH_CUDA_CU_API ProjectedExtent {
   // Multiply numerator by provided value, or if currently zero set numerator to
   // provided value.
   void multiplyNumeratorValue(Val* new_numerator_val) {
+    if (new_numerator_val->isZeroInt() ||
+        (new_numerator_val->isConstInt() &&
+         new_numerator_val->evaluateInt() == 0)) {
+      // Multiplying by zero: return early so that zero_ is not cleared below
+      return;
+    }
+
     zero_ = false;
     if (new_numerator_val->isConstInt()) {
       const_numerator_vals_.push_back(new_numerator_val->evaluateInt());

From 9b77b776cc29c7c86c37fbb0dbb3a8b6fb11bf69 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Fri, 19 May 2023 14:43:00 -0400
Subject: [PATCH 5/6] Use FusionExecutor instead of FEC for easier debugging

---
 test/test_resize.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/test_resize.cpp b/test/test_resize.cpp
index fe269bfb53c..b27180b7fb4 100644
--- a/test/test_resize.cpp
+++ b/test/test_resize.cpp
@@ -2205,9 +2205,11 @@ TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) {
   auto t0 = at::randn(shape, options);
   std::vector<c10::IValue> aten_inputs({t0});
 
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+  auto params = *getPointwiseHeuristics(fusion.get(), aten_inputs);
+  schedulePointwise(fusion.get(), params);
   FusionExecutor fe;
+  fe.compileFusion(fusion.get(), aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
 
   auto ref0 = t0.index({at::indexing::Slice(0, 1)});
   auto ref1 = t0.index({at::indexing::Slice(0, 0)});
@@ -2216,5 +2218,4 @@ TEST_F(NVFuserTest, FusionResizeMultiSliceEmpty_CUDA) {
   TORCH_CHECK(ref0.equal(cg_outputs[0]));
   TORCH_CHECK(ref1.equal(cg_outputs[1]));
 }
-
 } // namespace nvfuser

From 5565688f04189e3943e34b7da0f251fef87a9856 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Tue, 23 May 2023 09:58:56 -0400
Subject: [PATCH 6/6] Guard against multiplying by zero extents in
 ContiguousInnerDimensionsMapper

---
 csrc/scheduler/vectorize_helper.cpp | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp
index ca2fcf27b86..62753c1f6c8 100644
--- a/csrc/scheduler/vectorize_helper.cpp
+++ b/csrc/scheduler/vectorize_helper.cpp
@@ -218,9 +218,16 @@ ContiguousInnerDimensionsMapper::ContiguousInnerDimensionsMapper(
     if (std::find(reference_ids.begin(), reference_ids.end(), id) !=
         reference_ids.end()) {
       reordered_rfactor.push_back(id);
-      // Initiailze the extent for the mapped iter domain
+      // Initialize the extent for the mapped iter domain
       ProjectedExtent pe;
-      pe.multiplyNumeratorValue(commonOrConstExtent(ca_map_, id));
+      auto ext = commonOrConstExtent(ca_map_, id);
+      if (ext->isConstInt() && ext->evaluateInt() == 0) {
+        // A size-zero extent ID will be predicated out always, so it should
+        // not affect the projected extent calculation
+        continue;
+      } else {
+        pe.multiplyNumeratorValue(ext);
+      }
       addProjectedExtent(id, pe);
     } else if (!id->isBroadcast()) {
       // Ignore broadcasts in the reference. Otherwise, remove non-contiguous
@@ -239,9 +246,16 @@ ContiguousInnerDimensionsMapper::ContiguousInnerDimensionsMapper(
     if (std::find(reference_ids.begin(), reference_ids.end(), id) !=
         reference_ids.end()) {
       reordered_root.push_back(id);
-      // Initiailze the extent for the mapped iter domain
+      // Initialize the extent for the mapped iter domain
       ProjectedExtent pe;
-      pe.multiplyNumeratorValue(commonOrConstExtent(ca_map_, id));
+      auto ext = commonOrConstExtent(ca_map_, id);
+      if (ext->isConstInt() && ext->evaluateInt() == 0) {
+        // A size-zero extent ID will be predicated out always, so it should
+        // not affect the projected extent calculation
+        continue;
+      } else {
+        pe.multiplyNumeratorValue(ext);
+      }
       addProjectedExtent(id, pe);
     } else if (!id->isBroadcast()) {
       // Ignore broadcasts in the reference. Otherwise, remove non-contiguous