Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f9def57
Add clipping to slice output extent expressions
jacobhinkle Jun 6, 2023
15587bd
Merge branch 'main' into slice_clip
jacobhinkle Jun 6, 2023
2c4af30
Add where to ExpressionEvaluator, handle negative in slice
jacobhinkle Jun 7, 2023
2b4ef9a
Merge remote-tracking branch 'origin/main' into slice_clip
jacobhinkle Sep 11, 2023
5dcf2c8
Support Set, Where, bool ops in NaiveValueMachine
jacobhinkle Sep 11, 2023
7432adb
Silence clang-tidy in test_resize.cpp
jacobhinkle Sep 11, 2023
d96ee2e
Merge remote-tracking branch 'origin/main' into slice_clip
jacobhinkle Sep 14, 2023
9856d93
Add simplifying comparison operators
jacobhinkle Sep 14, 2023
8544650
Clean up normalized slice start/stop expressions
jacobhinkle Sep 14, 2023
eee9ff2
Fix wrong ostream in preseg ir dump
jacobhinkle Sep 14, 2023
efcb203
Remove debug print
jacobhinkle Sep 14, 2023
bf0a4b6
Handle NE in runBinaryOp
jacobhinkle Sep 14, 2023
121a3b9
Fix simplified comparison ops
jacobhinkle Sep 14, 2023
93f3737
Use maybeCastExpr in resize
jacobhinkle Sep 14, 2023
ec315a1
Update doc comment on slice op
jacobhinkle Sep 14, 2023
98f3519
Add input range test
jacobhinkle Sep 14, 2023
b557e38
Reformat test
jacobhinkle Sep 14, 2023
5b06adc
Merge branch 'main' into slice_clip
jacobhinkle Sep 18, 2023
72493d2
Simplify clipping exprs, clean up op
jacobhinkle Sep 26, 2023
553e63d
Merge remote-tracking branch 'origin/main' into slice_clip
jacobhinkle Sep 26, 2023
a9d9e58
Simplify maybe cast exprs
jacobhinkle Sep 26, 2023
90f9616
Remove unneeded change to nodes.cpp
jacobhinkle Sep 26, 2023
ca3a674
Restore check for trivial slice
jacobhinkle Sep 26, 2023
2fd5c23
Use SimplifyingIrBuilder
jacobhinkle Sep 26, 2023
2bc4748
Cast extent first
jacobhinkle Sep 26, 2023
2f714a8
Merge branch 'main' into slice_clip
jacobhinkle Sep 26, 2023
2331989
Adding a reshape example (#944)
naoyam Sep 26, 2023
74eb074
Remove manual refs and add FEC test
jacobhinkle Sep 26, 2023
d7a4b56
Change check for invalid extents to be >= 0
jacobhinkle Sep 26, 2023
87dae14
Use same set of slice cases for all three tests
jacobhinkle Sep 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion csrc/dynamic_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ void DynamicTransformConcretizationInfo::analyzeResizes(
out_id->toString());
auto extent_int = extent_val.as<int64_t>();
NVF_ERROR(
extent_int > 0,
extent_int >= 0,
"Invalid resized domain extent ",
extent_int,
" for domain ",
Expand Down
4 changes: 2 additions & 2 deletions csrc/kernel_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -897,8 +897,8 @@ FusionKernelRuntime::FusionKernelRuntime(
fusion.get());

if (isDebugDumpEnabled(DebugDumpOption::FusionIrPreseg)) {
std::cout << "Fusion IR after pre-segmenter optimization passes:"
<< std::endl;
debug() << "Fusion IR after pre-segmenter optimization passes:"
<< std::endl;
Comment on lines -900 to +901
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated to this PR. Just found wrong ostream in this debug dump.

fusion->printMath();
}

Expand Down
54 changes: 37 additions & 17 deletions csrc/ops/alias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -690,9 +690,6 @@ TensorView* cat(
return out;
}

// Currently there's no error check about the actual values of the
// Slice parameters. For example, the start parameter of a range of a
// domain is assumed to be >= 0 and < the extent of the domain.
TensorView* slice(TensorView* inp, const std::vector<Slice>& ranges) {
const auto inp_dom = TensorDomain::noReductions(inp->getMaybeRFactorDomain());
const int ndims = static_cast<int>(inp_dom.size());
Expand All @@ -704,36 +701,58 @@ TensorView* slice(TensorView* inp, const std::vector<Slice>& ranges) {
", Expected: ",
ndims);

auto normalize_slice_range = [](Slice range, Val* extent) -> Slice {
const auto normalize_slice_range = [](Slice range, Val* extent) -> Slice {
auto cast_extent =
SimplifyingIrBuilder::maybeCastExpr(DataType::Index, extent);

auto zero = FusionGuard::getCurFusion()->zeroVal(DataType::Index);

// norm_start = max(0, start < 0 ? start + extent : start)
if (range.start == nullptr) {
range.start = FusionGuard::getCurFusion()->zeroVal();
}
if (range.stop == nullptr) {
range.stop = extent;
}
if (range.step == nullptr) {
range.step = FusionGuard::getCurFusion()->oneVal();
}
if (range.start->dtype() != DataType::Index) {
range.start = zero;
} else if (!range.start->isZeroInt()) {
range.start =
SimplifyingIrBuilder::maybeCastExpr(DataType::Index, range.start);
range.start = SimplifyingIrBuilder::maxExpr(
zero,
SimplifyingIrBuilder::whereExpr(
SimplifyingIrBuilder::ltExpr(range.start, zero),
SimplifyingIrBuilder::addExpr(range.start, cast_extent),
range.start));
}
if (range.stop->dtype() != DataType::Index) {

// norm_stop = max(norm_start, min(extent, stop < 0 ? stop + extent : stop))
if (range.stop == nullptr) {
range.stop = cast_extent;
} else if (!range.stop->sameAs(extent)) {
range.stop =
SimplifyingIrBuilder::maybeCastExpr(DataType::Index, range.stop);
range.stop = SimplifyingIrBuilder::maxExpr(
range.start,
SimplifyingIrBuilder::minExpr(
cast_extent,
SimplifyingIrBuilder::whereExpr(
SimplifyingIrBuilder::ltExpr(range.stop, zero),
SimplifyingIrBuilder::addExpr(range.stop, cast_extent),
range.stop)));
}
if (range.step->dtype() != DataType::Index) {

// Ensure step is of type Index
if (range.step == nullptr) {
range.step = FusionGuard::getCurFusion()->oneVal(DataType::Index);
} else {
range.step =
SimplifyingIrBuilder::maybeCastExpr(DataType::Index, range.step);
}

return range;
};

for (auto& range : ranges) {
// Step not supported yet
NVF_CHECK(
range.step == nullptr || range.step->isOneInt(),
"Unsupported step: ",
"Unsupported step (must be 1 or null): ",
range.step->toString());
}

Expand All @@ -754,12 +773,13 @@ TensorView* slice(TensorView* inp, const std::vector<Slice>& ranges) {
out_root_id = inp_root_id->cloneWithoutRFactor();
out_rf_id = out_root_id;
} else {
// Clip the start and stop values to the extent of the input
out_root_id =
IterDomainBuilder(inp_root_id).is_rfactor_domain(true).build();
out_rf_id = IterDomain::resize(
out_root_id,
SimplifyingIrBuilder::negExpr(range.start),
sub(range.stop, inp_root_id->extent()),
SimplifyingIrBuilder::subExpr(range.stop, inp_root_id->extent()),
true);
needs_real_slicing = true;
}
Expand Down
4 changes: 3 additions & 1 deletion csrc/ops/alias.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ TensorView* cat(
std::optional<IterType> iter_type_opt = std::nullopt);

//! Return a tensor where each dimension is sliced as specified by the
//! ranges parameter. Stepping must be one at this moment.
//! ranges parameter. Stepping must be one at this moment. The semantics of
//! slicing with negative values and values >= extent follow those of numpy and
//! PyTorch.
TensorView* slice(TensorView* inp, const std::vector<Slice>& ranges);

} // namespace nvfuser
136 changes: 131 additions & 5 deletions test/test_resize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,132 @@ TEST_F(ResizeTest, FusionResizeSlice5) {
testValidate(&fusion, cg_outputs, aten_inputs, {t2, t4}, __LINE__, __FILE__);
}

// Constant (start, stop) slice ranges shared by the slice "shmoo" tests below.
// Covers in-range slices, stops past the extent (clipped), negative indices
// (wrapped), and ranges that yield empty outputs, matching the numpy/PyTorch
// slicing semantics that slice() implements.
// Made file-local and immutable: the tests only read it (clang-tidy
// cppcoreguidelines-avoid-non-const-global-variables).
static const std::vector<std::pair<int64_t, int64_t>> slice_cases(
    {{0, 5},
     {3, 9},
     {3, 4},
     {7, 5},
     {0, 11},
     {11, 13},
     {-3, 8},
     {-3, -1},
     {-3, -5},
     {13, -1},
     {-11, 9},
     {-11, 0},
     {-13, -11}});

// Exercise slice() over every constant (start, stop) pair in slice_cases,
// building a fresh single-axis fusion per case and validating against ATen.
TEST_F(NVFuserTest, FusionResizeSliceConstantShmoo_CUDA) {
  for (auto [slice_start, slice_stop] : slice_cases) {
    Fusion fusion;
    FusionGuard fg(&fusion);

    const std::vector<int64_t> input_shape({9});

    // A concrete input shape keeps the Fusion static (no dynamic transforms).
    auto in_tv = makeConcreteTensor(input_shape);
    fusion.addInput(in_tv);

    auto out_tv = slice(
        in_tv,
        {{IrBuilder::create<Val>(slice_start),
          IrBuilder::create<Val>(slice_stop)}});
    fusion.addOutput(out_tv);

    const auto tensor_opts =
        at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

    auto at_input = at::randn(input_shape, tensor_opts);
    std::vector<c10::IValue> aten_inputs({at_input});

    FusionExecutor executor;
    executor.compileFusion(&fusion, aten_inputs);
    auto outputs = executor.runFusion(aten_inputs);

    testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
  }
}

// Exercise slice() with scalar-input (non-constant) start/stop values: compile
// one fusion, then run it over every (start, stop) pair in slice_cases.
TEST_F(NVFuserTest, FusionResizeSliceInputShmoo_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int64_t> input_shape({9});

  // A concrete input shape keeps the tensor itself static; only the slice
  // bounds are runtime inputs.
  auto in_tv = makeConcreteTensor(input_shape);
  auto start_val = IrBuilder::create<Val>(DataType::Index);
  auto stop_val = IrBuilder::create<Val>(DataType::Index);
  fusion.addInput(in_tv);
  fusion.addInput(start_val);
  fusion.addInput(stop_val);

  auto out_tv = slice(in_tv, {{start_val, stop_val}});
  fusion.addOutput(out_tv);

  auto tensor_opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  {
    // Concretize so that we set output IterType as Iteration. We should now
    // have expressions that work with any input range.
    ExpressionEvaluator evaluator;

    evaluator.bind(in_tv->axis(0)->extent(), 9);
    evaluator.bind(start_val, 0);
    evaluator.bind(stop_val, 9);

    auto init_info = DynamicTransform::getInitialInfo(&fusion);
    auto conc_info = DynamicTransformConcretizationInfo(&init_info, &evaluator);

    DynamicTransform::concretizeFusion(&fusion, &conc_info);
    NVF_CHECK(
        !fusion.hasDynamicTransform(), "Expected to have no dynamic transform");
  }

  FusionExecutor executor;
  executor.compileFusion(&fusion);

  auto at_input = at::randn(input_shape, tensor_opts);
  for (auto [slice_start, slice_stop] : slice_cases) {
    std::vector<c10::IValue> aten_inputs({at_input, slice_start, slice_stop});
    auto outputs = executor.runFusion(aten_inputs);

    testValidate(&fusion, outputs, aten_inputs, __LINE__, __FILE__);
  }
}

// Same as FusionResizeSliceInputShmoo_CUDA but use FusionExecutorCache, which
// might re-concretize when output sizes change
TEST_F(NVFuserTest, FusionResizeSliceInputShmooFusionExecutorCache_CUDA) {
  auto fusion_ptr = std::make_unique<Fusion>();
  auto fusion = fusion_ptr.get();
  FusionGuard fg(fusion);

  const std::vector<int64_t> input_shape({9});

  // A concrete input shape keeps the tensor static; the slice bounds are
  // runtime scalar inputs.
  auto in_tv = makeConcreteTensor(input_shape);
  auto start_val = IrBuilder::create<Val>(DataType::Index);
  auto stop_val = IrBuilder::create<Val>(DataType::Index);
  fusion->addInput(in_tv);
  fusion->addInput(start_val);
  fusion->addInput(stop_val);

  auto out_tv = slice(in_tv, {{start_val, stop_val}});
  fusion->addOutput(out_tv);

  auto tensor_opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);

  FusionExecutorCache fec(std::move(fusion_ptr));

  auto at_input = at::randn(input_shape, tensor_opts);
  for (auto [slice_start, slice_stop] : slice_cases) {
    std::vector<c10::IValue> aten_inputs({at_input, slice_start, slice_stop});
    auto outputs = fec.runFusionWithInputs(aten_inputs);

    testValidate(fec.fusion(), outputs, aten_inputs, __LINE__, __FILE__);
  }
}

// Auto scheduled version of Slice1
TEST_F(ResizeTest, FusionResizeSliceScheduler1) {
auto fusion_ptr = std::make_unique<Fusion>();
Expand Down Expand Up @@ -2319,7 +2445,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) {
FusionGuard fg(fusion_ptr.get());

const int64_t slice_offset = 4;
const std::vector<int64_t> shape({1024 * 1024});
const std::vector<int64_t> shape({1024L * 1024L});
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Silencing clang-tidy


// Using a concrete tensor to avoid dynamic reshape
auto tv0 = makeContigConcreteTensor(shape);
Expand Down Expand Up @@ -2358,7 +2484,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual2) {
FusionGuard fg(fusion_ptr.get());

const int64_t slice_offset = 4;
const std::vector<int64_t> shape({1024 * 1024});
const std::vector<int64_t> shape({1024L * 1024L});

auto tv0 = makeContigConcreteTensor(shape);
fusion.addInput(tv0);
Expand Down Expand Up @@ -2414,7 +2540,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual3) {
FusionGuard fg(fusion_ptr.get());

const int64_t slice_offset = 4;
const std::vector<int64_t> shape({1024 * 1024});
const std::vector<int64_t> shape({1024L * 1024L});

auto tv0 = makeContigConcreteTensor(shape);
fusion.addInput(tv0);
Expand Down Expand Up @@ -2463,7 +2589,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) {
auto& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

const std::vector<int64_t> shape({1024 * 1024});
const std::vector<int64_t> shape({1024L * 1024L});

auto tv0 = makeContigConcreteTensor({shape[0] - 4});
fusion.addInput(tv0);
Expand Down Expand Up @@ -2505,7 +2631,7 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) {
// The extent of the innermost domain is just 2, and the outer
// domain is sliced. This slicing should be vectorizable by a
// factor of 4 as the two domains can be merged and vectorized.
const std::vector<int64_t> shape({1024 * 1024, 2});
const std::vector<int64_t> shape({1024L * 1024L, 2});

auto tv0 = makeContigConcreteTensor(shape);
fusion.addInput(tv0);
Expand Down
Loading