Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
404d6b6
repro
naoyam Feb 4, 2025
1b8fc0f
renable resize sched
naoyam Feb 5, 2025
861c187
cleanup
naoyam Feb 5, 2025
12d0e85
debug print
naoyam Feb 7, 2025
643779d
enable resize scheduler
naoyam Feb 7, 2025
4f28c44
fix
naoyam Feb 7, 2025
5f851ea
fix
naoyam Feb 8, 2025
d7c9af1
cleanup
naoyam Feb 8, 2025
4cb7fbd
fix
naoyam Feb 10, 2025
e1b1790
fix
naoyam Feb 13, 2025
c79e234
cleanup
naoyam Feb 13, 2025
c5cc1ba
WIP: extent simplification
naoyam Feb 14, 2025
cc4f1c2
further simplification
naoyam Feb 14, 2025
4994d84
Do some more simplifications specific to extents
naoyam Feb 14, 2025
cd59f29
test fix
naoyam Feb 14, 2025
5dc95f4
cleanup
naoyam Feb 14, 2025
b979fc5
cleanup
naoyam Feb 14, 2025
339aac3
WIP: fix
naoyam Feb 14, 2025
a08ce8a
debug
naoyam Feb 15, 2025
1d50e73
fix
naoyam Feb 15, 2025
10d37d5
Merge branch 'main' into simplify_resize_extents
naoyam Feb 16, 2025
d72668e
clang-tidy
naoyam Feb 16, 2025
4f17092
Merge branch 'main' into enable_resize_scheduler_by_default
naoyam Feb 16, 2025
6499911
Merge branch 'simplify_resize_extents' into enable_resize_scheduler_b…
naoyam Feb 16, 2025
d9fa74e
remove debug print
naoyam Feb 17, 2025
9f6a79f
python test WAR
naoyam Feb 20, 2025
5dfb2ba
Merge branch 'main' into enable_resize_scheduler_by_default
naoyam Feb 20, 2025
8ee1966
Merge branch 'main' into enable_resize_scheduler_by_default
naoyam Feb 20, 2025
2a56bbd
cleanup
naoyam Feb 21, 2025
90e13e3
cleanup
naoyam Feb 21, 2025
0c0d9b8
cleanup
naoyam Feb 24, 2025
18f3265
Merge remote-tracking branch 'origin/main' into enable_resize_schedul…
naoyam Feb 24, 2025
4cca2f4
Merge remote-tracking branch 'origin/main' into enable_resize_schedul…
naoyam Feb 26, 2025
89022b0
Merge remote-tracking branch 'origin/main' into enable_resize_schedul…
naoyam Feb 28, 2025
2315b48
cleanup
naoyam Feb 28, 2025
0e9bc84
Merge remote-tracking branch 'origin/main' into enable_resize_schedul…
naoyam Feb 28, 2025
015c730
update
naoyam Feb 28, 2025
e0b3955
Merge remote-tracking branch 'origin/main' into enable_resize_schedul…
naoyam Mar 3, 2025
ea5bc58
cleanup
naoyam Mar 3, 2025
c1c33b0
temporarily move back assertions
naoyam Mar 3, 2025
0c5c6f8
Remove assertions
naoyam Mar 4, 2025
501beb3
Merge remote-tracking branch 'origin/main' into enable_resize_schedul…
naoyam Mar 4, 2025
4e8b9ce
Merge remote-tracking branch 'origin/main' into enable_resize_schedul…
naoyam Mar 5, 2025
875a92d
update
naoyam Mar 5, 2025
6336c7c
Merge branch 'main' into enable_resize_scheduler_by_default
naoyam Mar 15, 2025
7b972c5
Merge branch 'main' into enable_resize_scheduler_by_default
naoyam Mar 17, 2025
f473ecb
Merge branch 'main' into enable_resize_scheduler_by_default
naoyam Mar 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion csrc/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ const std::unordered_map<std::string, EnableOption>& getEnableOptions() {
{"kernel_profile", EnableOption::KernelProfile},
{"memory_promotion", EnableOption::MemoryPromotion},
{"reuse_zeroed_memory", EnableOption::ReuseZeroedMemory},
{"resize_scheduler", EnableOption::ResizeScheduler},
{"static_fusion_count", EnableOption::StaticFusionCount},
{"wait_debugger", EnableOption::WaitDebugger},
{"warn_register_spill", EnableOption::WarnRegisterSpill},
Expand Down Expand Up @@ -211,6 +210,7 @@ const std::unordered_map<std::string, DisableOption>& getDisableOptions() {
{"kernel_reuse", DisableOption::KernelReuse},
{"var_name_remapping", DisableOption::VarNameRemapping},
{"welford_vectorization", DisableOption::WelfordVectorization},
{"resize_scheduler", DisableOption::ResizeScheduler},
{"reuse_mismatched_type_registers",
DisableOption::ReuseMismatchedTypeRegisters},
{"multidevice", DisableOption::Multidevice}};
Expand Down
2 changes: 1 addition & 1 deletion csrc/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ enum class EnableOption {
KernelProfile, //! Enable intra-kernel performance profiling
MemoryPromotion, //! Enable promotion of memory types for non-pointwise ops
ReuseZeroedMemory, //! Re-use zeroed memory used for grid synchronization
ResizeScheduler, //! Enable the resize scheduler
StaticFusionCount, //! Enable using single static count in kernel name
WaitDebugger, // Used for debugging multi-GPU. The rank given in the argument
// will wait for `gdb attach` at the start.
Expand Down Expand Up @@ -148,6 +147,7 @@ enum class DisableOption {
//! need this in particular to investigate possible conflicts
//! between nvFuser communicator and the framework also setting
//! up `c10d::ProcessGroup`
ResizeScheduler, //! Disable the resize scheduler
EndOfOption //! Placeholder for counting the number of elements
};

Expand Down
2 changes: 1 addition & 1 deletion csrc/preseg_passes/pre_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ namespace nvfuser::preseg_passes {
// currently only limited to pointwise patterns and does not
// support, for example, reductions, etc, so this preseg pass still
// may be preferable in some cases.
if (!isOptionEnabled(EnableOption::ResizeScheduler)) {
if (isOptionDisabled(DisableOption::ResizeScheduler)) {
OptimizationPass<MovePadPass>::runPass(fusion);
}
// NOTE vvv this doesn't really work, since our type promotion to higher
Expand Down
8 changes: 3 additions & 5 deletions csrc/scheduler/resize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,8 @@ std::pair<TensorView*, int64_t> getLargestTensor(
} // namespace

bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
if (!isOptionEnabled(EnableOption::ResizeScheduler)) {
scheduler_debug_utils::canScheduleRejectReason(
schedulerType(), "Not enabled");
if (isOptionDisabled(DisableOption::ResizeScheduler)) {
scheduler_debug_utils::canScheduleRejectReason(schedulerType(), "Disabled");
return false;
}

Expand Down Expand Up @@ -472,9 +471,8 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
}

if (vec_factor > 1) {
auto vec_ref_tv = largest_input != nullptr ? largest_input : ref_tv;
const auto tvs_to_vectorize =
scheduler_utils::getInputsOutputsWithInnerDim(vec_ref_tv, true, true);
scheduler_utils::getInputsOutputsWithInnerDim(ref_tv, true, true);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remind me why we were using largest_input as vectorization reference in the first place?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Forgot to mention this, but this was actually a bug. It should have been changed in #3955.

for (auto tv_to_vectorize : tvs_to_vectorize) {
if (tv_to_vectorize->isFusionInput()) {
for (auto consumer_tv : ir_utils::consumerTvsOf(tv_to_vectorize)) {
Expand Down
7 changes: 2 additions & 5 deletions csrc/scheduler/tools/loop_domain_scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,18 +528,15 @@ void scheduleLoopDomainsBy(
}
}

// It should be either: all of the inputs found and none of the
// outputs found, or none of the inputs found and all of the
// outputs found.
// If all of the inputs are found, the transform expr is replayed as
// a forward op.
Direction replay_dir_tv = Direction::Undefined;
if (replay_dir != Direction::Backward &&
input_ids.size() == transform->inputs().size()) {
NVF_ERROR(output_ids.empty());
Copy link
Collaborator Author

@naoyam naoyam Mar 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I originally thought this should be always the case but that isn't actually case. In particular, there can be a resize that produces an output that is mapped with the input. In that case, output_ids won't be empty, but as long as all the mapped inputs are found, that should not be a problem.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: we should remove the comment above on line 525-527 then.

replay_dir_tv = Direction::Forward;
} else if (
replay_dir != Direction::Forward &&
output_ids.size() == transform->outputs().size()) {
NVF_ERROR(input_ids.empty());
replay_dir_tv = Direction::Backward;
} else {
// Replay not possible since none of inputs nor outputs are connected with
Expand Down
3 changes: 2 additions & 1 deletion csrc/scheduler/tools/resize_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
continue;
}

scheduler_tools::scheduleLoopDomainsBy(tvs_to_schedule, resize);
scheduler_tools::scheduleLoopDomainsBy(
tvs_to_schedule, resize, Direction::Forward);
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added the direction option to make it more explicit as it's always forward transformations.

}
}

Expand Down
8 changes: 7 additions & 1 deletion tests/cpp/test_move_pad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@ using testing::IsTrue;
using testing::Property;
using testing::UnorderedElementsAre;

using MovePadTest = NVFuserTest;
class MovePadTest : public NVFuserTest {
protected:
void SetUp() override {
DisableOptionsGuard::getCurOptions().set(DisableOption::ResizeScheduler);
NVFuserTest::SetUp();
}
};

TEST_F(MovePadTest, UnaryCat) {
auto fusion = std::make_unique<Fusion>();
Expand Down
20 changes: 4 additions & 16 deletions tests/cpp/test_resize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,21 +55,9 @@ void checkLoopDomainEquivalence(

} // namespace

class ResizeTest : public NVFuserTest {
protected:
void SetUp() override {
EnableOptionsGuard::getCurOptions().set(EnableOption::ResizeScheduler);
NVFuserTest::SetUp();
}
};
using ResizeTest = NVFuserTest;

class ResizeSchedulerTest : public NVFuserFixtureParamTest<bool> {
protected:
void SetUp() override {
EnableOptionsGuard::getCurOptions().set(EnableOption::ResizeScheduler);
NVFuserFixtureParamTest<bool>::SetUp();
}
};
using ResizeSchedulerTest = NVFuserFixtureParamTest<bool>;

using testing::Each;
using testing::HasSubstr;
Expand Down Expand Up @@ -5626,7 +5614,7 @@ TEST_F(ResizeTest, TraversalForInliningPosition) {

// Disable the resize scheduler because the original issue happened
// with the pointwise scheduler
EnableOptionsGuard::getCurOptions().unset(EnableOption::ResizeScheduler);
DisableOptionsGuard::getCurOptions().set(DisableOption::ResizeScheduler);

auto tv0 = makeContigConcreteTensor({16});
fusion.addInput(tv0);
Expand Down Expand Up @@ -5727,7 +5715,7 @@ TEST_F(ResizeTest, Repro3801) {

// Disable the resize scheduler because the original issue happened
// with the pointwise scheduler
EnableOptionsGuard::getCurOptions().unset(EnableOption::ResizeScheduler);
DisableOptionsGuard::getCurOptions().set(DisableOption::ResizeScheduler);

auto T13 = makeContigConcreteTensor({1, 16});
fusion.addInput(T13);
Expand Down
8 changes: 1 addition & 7 deletions tests/cpp/test_rope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,7 @@ struct RopeConfig {
}
};

class RopeTest : public NVFuserFixtureParamTest<RopeConfig> {
protected:
void SetUp() override {
EnableOptionsGuard::getCurOptions().set(EnableOption::ResizeScheduler);
NVFuserTest::SetUp();
}
};
using RopeTest = NVFuserFixtureParamTest<RopeConfig>;

using MistralRopeTest = RopeTest;

Expand Down
10 changes: 10 additions & 0 deletions tests/python/test_python_frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3285,6 +3285,16 @@ def fusion_func(fd: FusionDefinition) -> None:
nvf_out, _ = self.exec_nvfuser(fusion_func, inputs, supports_segmentation=False)
# self.assertEqual(nvf_out[0], t24)

# This fusion takes a long time to segment and schedule
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without this, the python tests took about 2 hours to complete. This is a WAR @rdspring1 suggested.

# because of the resized extents, which seem to stress the
# expression simplifier a lot. Serializing this fusion would
# significantly increase the test time as it would be
# deserialized every time, which includes segmentation and
# scheduling. Ideally, we should optimize the expression
# simplifier, but for now resetting the cache should avoid the
# issue.
FusionCache.reset()

# Test that symbolic IterDomains can be concatenated
# https://github.com/NVIDIA/Fuser/issues/1554
def test_cat_symbolic(self):
Expand Down