diff --git a/csrc/options.cpp b/csrc/options.cpp index e79de41c1c1..62693cf76fd 100644 --- a/csrc/options.cpp +++ b/csrc/options.cpp @@ -163,7 +163,6 @@ const std::unordered_map& getEnableOptions() { {"kernel_profile", EnableOption::KernelProfile}, {"memory_promotion", EnableOption::MemoryPromotion}, {"reuse_zeroed_memory", EnableOption::ReuseZeroedMemory}, - {"resize_scheduler", EnableOption::ResizeScheduler}, {"static_fusion_count", EnableOption::StaticFusionCount}, {"wait_debugger", EnableOption::WaitDebugger}, {"warn_register_spill", EnableOption::WarnRegisterSpill}, @@ -211,6 +210,7 @@ const std::unordered_map& getDisableOptions() { {"kernel_reuse", DisableOption::KernelReuse}, {"var_name_remapping", DisableOption::VarNameRemapping}, {"welford_vectorization", DisableOption::WelfordVectorization}, + {"resize_scheduler", DisableOption::ResizeScheduler}, {"reuse_mismatched_type_registers", DisableOption::ReuseMismatchedTypeRegisters}, {"multidevice", DisableOption::Multidevice}}; diff --git a/csrc/options.h b/csrc/options.h index 3d0cdd888cb..e5538dc3d7a 100644 --- a/csrc/options.h +++ b/csrc/options.h @@ -104,7 +104,6 @@ enum class EnableOption { KernelProfile, //! Enable intra-kernel performance profiling MemoryPromotion, //! Enable promotion of memory types for non-pointwise ops ReuseZeroedMemory, //! Re-use zeroed memory used for grid synchronization - ResizeScheduler, //! Enable the resize scheduler StaticFusionCount, //! Enable using single static count in kernel name WaitDebugger, // Used for debugging multi-GPU. The rank given in the argument // will wait for `gdb attach` at the start. @@ -148,6 +147,7 @@ enum class DisableOption { //! need this in particular to investigate possible conflicts //! between nvFuser communicator and the framework also setting //! up `c10d::ProcessGroup` + ResizeScheduler, //! Disable the resize scheduler EndOfOption //! 
Placeholder for counting the number of elements }; diff --git a/csrc/preseg_passes/pre_segmenter.cpp b/csrc/preseg_passes/pre_segmenter.cpp index 8017e116f6e..042f03191f7 100644 --- a/csrc/preseg_passes/pre_segmenter.cpp +++ b/csrc/preseg_passes/pre_segmenter.cpp @@ -69,7 +69,7 @@ namespace nvfuser::preseg_passes { // currently only limited to pointwise patterns and does not // support, for example, reductions, etc, so this preseg pass still // may be preferable in some cases. - if (!isOptionEnabled(EnableOption::ResizeScheduler)) { + if (isOptionDisabled(DisableOption::ResizeScheduler)) { OptimizationPass::runPass(fusion); } // NOTE vvv this doesn't really work, since our type promotion to higher diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp index 171ef740e40..87702d2069a 100644 --- a/csrc/scheduler/resize.cpp +++ b/csrc/scheduler/resize.cpp @@ -62,9 +62,8 @@ std::pair getLargestTensor( } // namespace bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) { - if (!isOptionEnabled(EnableOption::ResizeScheduler)) { - scheduler_debug_utils::canScheduleRejectReason( - schedulerType(), "Not enabled"); + if (isOptionDisabled(DisableOption::ResizeScheduler)) { + scheduler_debug_utils::canScheduleRejectReason(schedulerType(), "Disabled"); return false; } @@ -472,9 +471,8 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) { } if (vec_factor > 1) { - auto vec_ref_tv = largest_input != nullptr ? 
largest_input : ref_tv; const auto tvs_to_vectorize = - scheduler_utils::getInputsOutputsWithInnerDim(vec_ref_tv, true, true); + scheduler_utils::getInputsOutputsWithInnerDim(ref_tv, true, true); for (auto tv_to_vectorize : tvs_to_vectorize) { if (tv_to_vectorize->isFusionInput()) { for (auto consumer_tv : ir_utils::consumerTvsOf(tv_to_vectorize)) { diff --git a/csrc/scheduler/tools/loop_domain_scheduler.cpp b/csrc/scheduler/tools/loop_domain_scheduler.cpp index 3f11d7b86cb..ffa7aaad252 100644 --- a/csrc/scheduler/tools/loop_domain_scheduler.cpp +++ b/csrc/scheduler/tools/loop_domain_scheduler.cpp @@ -528,18 +528,15 @@ void scheduleLoopDomainsBy( } } - // It should be either: all of the inputs found and none of the - // outputs found, or none of the inputs found and all of the - // outputs found. + // If all of the inputs are found, the transform expr is replayed as + // a forward op. Direction replay_dir_tv = Direction::Undefined; if (replay_dir != Direction::Backward && input_ids.size() == transform->inputs().size()) { - NVF_ERROR(output_ids.empty()); replay_dir_tv = Direction::Forward; } else if ( replay_dir != Direction::Forward && output_ids.size() == transform->outputs().size()) { - NVF_ERROR(input_ids.empty()); replay_dir_tv = Direction::Backward; } else { // Replay not possible since none of inputs nor outputs are connected with diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp index 4abd4b3a856..58f2d3bc825 100644 --- a/csrc/scheduler/tools/resize_utils.cpp +++ b/csrc/scheduler/tools/resize_utils.cpp @@ -76,7 +76,8 @@ void propagateResizeToInputs(Expr* resize_tensor_op) { continue; } - scheduler_tools::scheduleLoopDomainsBy(tvs_to_schedule, resize); + scheduler_tools::scheduleLoopDomainsBy( tvs_to_schedule, resize, Direction::Forward); } } diff --git a/tests/cpp/test_move_pad.cpp b/tests/cpp/test_move_pad.cpp index e99dc563024..006881b550e 100644 --- a/tests/cpp/test_move_pad.cpp +++ b/tests/cpp/test_move_pad.cpp 
@@ -21,7 +21,13 @@ using testing::IsTrue; using testing::Property; using testing::UnorderedElementsAre; -using MovePadTest = NVFuserTest; +class MovePadTest : public NVFuserTest { + protected: + void SetUp() override { + DisableOptionsGuard::getCurOptions().set(DisableOption::ResizeScheduler); + NVFuserTest::SetUp(); + } +}; TEST_F(MovePadTest, UnaryCat) { auto fusion = std::make_unique(); diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index 906b1902a73..2b44f5b1c20 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -55,21 +55,9 @@ void checkLoopDomainEquivalence( } // namespace -class ResizeTest : public NVFuserTest { - protected: - void SetUp() override { - EnableOptionsGuard::getCurOptions().set(EnableOption::ResizeScheduler); - NVFuserTest::SetUp(); - } -}; +using ResizeTest = NVFuserTest; -class ResizeSchedulerTest : public NVFuserFixtureParamTest { - protected: - void SetUp() override { - EnableOptionsGuard::getCurOptions().set(EnableOption::ResizeScheduler); - NVFuserFixtureParamTest::SetUp(); - } -}; +using ResizeSchedulerTest = NVFuserFixtureParamTest; using testing::Each; using testing::HasSubstr; @@ -5626,7 +5614,7 @@ TEST_F(ResizeTest, TraversalForInliningPosition) { // Disable the resize schedule because the original issue happened // with the pointwise scheduler - EnableOptionsGuard::getCurOptions().unset(EnableOption::ResizeScheduler); + DisableOptionsGuard::getCurOptions().set(DisableOption::ResizeScheduler); auto tv0 = makeContigConcreteTensor({16}); fusion.addInput(tv0); @@ -5727,7 +5715,7 @@ TEST_F(ResizeTest, Repro3801) { // Disable the resize schedule because the original issue happened // with the pointwise scheduler - EnableOptionsGuard::getCurOptions().unset(EnableOption::ResizeScheduler); + DisableOptionsGuard::getCurOptions().set(DisableOption::ResizeScheduler); auto T13 = makeContigConcreteTensor({1, 16}); fusion.addInput(T13); diff --git a/tests/cpp/test_rope.cpp b/tests/cpp/test_rope.cpp index 
9eaaa417fb7..d4449c38560 100644 --- a/tests/cpp/test_rope.cpp +++ b/tests/cpp/test_rope.cpp @@ -46,13 +46,7 @@ struct RopeConfig { } }; -class RopeTest : public NVFuserFixtureParamTest { - protected: - void SetUp() override { - EnableOptionsGuard::getCurOptions().set(EnableOption::ResizeScheduler); - NVFuserTest::SetUp(); - } -}; +using RopeTest = NVFuserFixtureParamTest; using MistralRopeTest = RopeTest; diff --git a/tests/python/test_python_frontend.py b/tests/python/test_python_frontend.py index 3b926dbd6dd..ac6a4767caa 100644 --- a/tests/python/test_python_frontend.py +++ b/tests/python/test_python_frontend.py @@ -3285,6 +3285,16 @@ def fusion_func(fd: FusionDefinition) -> None: nvf_out, _ = self.exec_nvfuser(fusion_func, inputs, supports_segmentation=False) # self.assertEqual(nvf_out[0], t24) + # This fusion takes a long time to segment and schedule + # because of the resized extents, which seem to stress the + # expression simplifier a lot. Serializing this fusion would + # significantly increase the test time as it would be + # deserialized every time, which includes segmentation and + # scheduling. Ideally, we should optimize the expression + # simplifier, but for now resetting the cache should avoid the + # issue. + FusionCache.reset() + # Test that symbolic IterDomains can be concatenated # https://github.com/NVIDIA/Fuser/issues/1554 def test_cat_symbolic(self):