Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions csrc/fusion_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3399,13 +3399,21 @@ void SegmentCandidateFinder::forwardInputs() {
continue;
}

if (expr->output(0)->uses().size() > 1) {
// expr is a unary op so there is a single output. Here we look at that
// output's further uses
const auto& output_uses = expr->output(0)->uses();

if (output_uses.size() == 1) {
// If there is a single use, visit it to try to extend the chain of
// UnaryOps
to_visit.emplace_back(output_uses.at(0));
} else {
// If there are either no more uses, or more than one use, we cannot
// extend the chain of UnaryOps. In either case, finalize this chain by
// saving the expr and its output.
excluded_inp_unary_exprs_.pushBack(expr);
forwarded_inputs.pushBack(expr->output(0));
continue;
}

to_visit.emplace_back(expr->output(0)->uses()[0]);
}
}

Expand Down
36 changes: 36 additions & 0 deletions test/test_gpu3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9215,6 +9215,42 @@ TEST_F(NVFuserTest, FusionDebugStreamGuard_CUDA) {
ASSERT_EQ(ss.str(), text);
}

// Regression test for https://github.com/NVIDIA/Fuser/issues/585
//
// Builds a fusion that must be segmented and that also contains a scalar
// input whose chain of unary ops ends in a value nobody consumes, then checks
// that the fusion runs and produces the expected all-zero output.
TEST_F(NVFuserTest, FusionDanglingUnaryOp_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  FusionGuard guard(fusion_ptr.get());

  // Part 1: force segmentation. segment_set guarantees the Fusion cannot be
  // scheduled as a whole, which in turn means forwardInputs() gets called.
  // Nothing else about this sub-graph matters; it only has to require
  // segmentation.
  auto extent = IrBuilder::create<Scalar>(5);
  auto zeros_tv = full({extent}, fusion_ptr->zeroVal(), DataType::Int);
  auto segmented_tv = segment_set(zeros_tv);
  fusion_ptr->addOutput(segmented_tv);

  // Part 2: register a scalar input and hang a chain of UnaryOps off of it
  // (cast -> neg) whose final Val has no uses at all. This dangling chain is
  // the pattern that triggers the segfault in forwardInputs() described in the
  // issue referenced above.
  Val* scalar_in = IrBuilder::create<Scalar>(DataType::Int);
  fusion_ptr->addInput(scalar_in);
  auto casted = castOp(DataType::Float, scalar_in);
  neg(casted); // result intentionally discarded

  FusionExecutorCache executor_cache(std::move(fusion_ptr));

  auto outputs = executor_cache.runFusionWithInputs({11});

  // Reference result: a length-5 int tensor of zeros on CUDA device 0.
  auto expected = at::zeros(
      {5}, at::TensorOptions().dtype(at::kInt).device(at::kCUDA, 0));

  testValidate(
      executor_cache.fusion(), outputs, {11}, {expected}, __LINE__, __FILE__);
}

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser