Fix a crash in SegmentCandidateFinder::forwardInputs() when a chain of unary
ops on a fusion input ends in a value that has no uses (NVIDIA/Fuser issue
585), and add a regression test for it.

NOTE(review): this patch text was recovered from a copy whose line breaks were
collapsed and whose template argument lists were missing. The indentation and
the arguments in std::make_unique<Fusion>(), IrBuilder::create<Val>(5) and
IrBuilder::create<Val>(DataType::Int) are reconstructed; confirm them against
the upstream commit before applying.

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 5a02962fb03..3b4458ea8f9 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -3399,13 +3399,21 @@ void SegmentCandidateFinder::forwardInputs() {
         continue;
       }
 
-      if (expr->output(0)->uses().size() > 1) {
+      // expr is a unary op so there is a single output. Here we look at that
+      // output's further uses
+      const auto& output_uses = expr->output(0)->uses();
+
+      if (output_uses.size() == 1) {
+        // If there is a single use, visit it to try and extend the chain of
+        // unaryOps
+        to_visit.emplace_back(output_uses.at(0));
+      } else {
+        // If there are either no more uses, or more than one use, we cannot
+        // extend the chain of unary Ops. In either case, finalize this chain by
+        // saving the expr and its output.
         excluded_inp_unary_exprs_.pushBack(expr);
         forwarded_inputs.pushBack(expr->output(0));
-        continue;
       }
-
-      to_visit.emplace_back(expr->output(0)->uses()[0]);
     }
   }
 
diff --git a/test/test_gpu3.cpp b/test/test_gpu3.cpp
index cd9ee4f7392..54fb4da2af4 100644
--- a/test/test_gpu3.cpp
+++ b/test/test_gpu3.cpp
@@ -9215,6 +9215,42 @@ TEST_F(NVFuserTest, FusionDebugStreamGuard_CUDA) {
   ASSERT_EQ(ss.str(), text);
 }
 
+// Repro of https://github.com/NVIDIA/Fuser/issues/585
+TEST_F(NVFuserTest, FusionDanglingUnaryOp_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  // Create a segmented Fusion. We call segment_set here to ensure the whole
+  // Fusion cannot be scheduled. This triggers segmentation, so that
+  // forwardInputs() is called. The structure of this Fusion is not important;
+  // it is only important that it must be segmented.
+  auto size = IrBuilder::create<Val>(5);
+  auto tv0 = full({size}, fusion->zeroVal(), DataType::Int);
+  auto tv1 = segment_set(tv0);
+  fusion->addOutput(tv1);
+
+  // Now take in an input that has a chain of UnaryOp uses that terminates in a
+  // Val with no uses. This triggers a segfault in forwardInputs().
+  Val* alpha = IrBuilder::create<Val>(DataType::Int);
+  fusion->addInput(alpha);
+  neg(castOp(DataType::Float, alpha));
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto cg_outputs = executor_cache.runFusionWithInputs({11});
+
+  auto options = at::TensorOptions().dtype(at::kInt).device(at::kCUDA, 0);
+  auto aten_out = at::zeros({5}, options);
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      {11},
+      {aten_out},
+      __LINE__,
+      __FILE__);
+}
+
 // Test file size should be up to 10K LoC. Create a new file for more tests.
 
 } // namespace nvfuser