Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions csrc/dynamic_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,6 @@ void DynamicTransformConcretizationInfo::analyzeReshapes(
// Determine output shape using expr evaluator. Note there may be
// one domain of extent -1
std::vector<int64_t> out_shape(out_dom.size(), 0);
bool extent_m1_found = false;
for (const auto i : c10::irange(out_dom.size())) {
auto out_id = out_dom.at(i);
auto extent_val = expr_eval->evaluate(out_id->extent());
Expand All @@ -277,16 +276,12 @@ void DynamicTransformConcretizationInfo::analyzeReshapes(
extent_val.is<int64_t>(),
"Invalid evaluated value of domain extent: ",
out_id->toString());
const auto extent_int = extent_val.as<int64_t>();
auto extent_int = extent_val.as<int64_t>();
if (extent_int == -1) {
TORCH_INTERNAL_ASSERT(
!extent_m1_found,
"Multiple output domains of size -1 not allowed",
out_tv->toString());
extent_m1_found = true;
} else {
TORCH_INTERNAL_ASSERT(
extent_int > 0, "Invalid output domain extent: ", extent_int);
// For non-constant Scalar sizes, check that we have not passed -1.
TORCH_CHECK(
out_id->extent()->isConst(),
"Values of -1 passed to reshape must be constant at definition.")
}
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These checks are handled by reshape() and analyzeView() and are not needed here.

out_shape.at(i) = extent_int;
}
Expand Down
37 changes: 32 additions & 5 deletions csrc/ops/alias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#include <expr_simplifier.h>
#include <ir/builder.h>
#include <ir/utils.h>
#include <ops/alias.h>
Expand Down Expand Up @@ -121,12 +122,38 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
// Create placeholder rfactor domain. Note it's not connected with the root
// domain.
std::vector<IterDomain*> rfactor_domain(new_sizes.size(), nullptr);
bool found_neg_one = false;
for (const auto i : c10::irange(new_sizes.size())) {
auto rf_id = IterDomainBuilder(
FusionGuard::getCurFusion()->zeroVal(), new_sizes.at(i))
.iter_type(IterType::Symbolic)
.is_rfactor_domain(true)
.build();
auto new_size = new_sizes.at(i);
if (new_size->isConstScalar() && new_size->evaluateInt() == -1) {
// It is usually safe to use the provided scalars as the output shapes.
// However, if -1 is provided for some position, it will not correspond to
// the actual extent in that position.

TORCH_CHECK(
!found_neg_one,
"A maximum of one value of -1 can be provided to reshape.");
found_neg_one = true;

Val* numel = FusionGuard::getCurFusion()->oneVal();
Val* other_new_numel = FusionGuard::getCurFusion()->oneVal();
for (const auto j : c10::irange(inp_dom.size())) {
numel = mul(numel, inp_dom.at(j)->extent());
}
Comment on lines +140 to +142
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we put this in a new method Scalar TensorView::numel() const instead?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe in lower_utils? I'd prefer minimalistic designs for the IR nodes themselves.

for (const auto j : c10::irange(new_sizes.size())) {
if (i == j) {
continue;
}
other_new_numel = mul(other_new_numel, new_sizes.at(j));
}
new_size = div(numel, other_new_numel);
new_size = simplifyExpr(new_size);
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that SimplifyingIrBuilder evaluates constant subexpressions but does not simplify (x * y) / y, so I used simplifyExpr here.

}
auto rf_id =
IterDomainBuilder(FusionGuard::getCurFusion()->zeroVal(), new_size)
.iter_type(IterType::Symbolic)
.is_rfactor_domain(true)
.build();
rfactor_domain.at(i) = rf_id;
}

Expand Down
81 changes: 81 additions & 0 deletions test/test_dynamic_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1070,4 +1070,85 @@ TEST_F(NVFuserTest, FusionDynamicEmptyCat2_CUDA) {
EXPECT_EQ(output_def->input(0), seg_fusion->inputs()[0]);
}

TEST_F(NVFuserTest, Issue249_CUDA) {
  // Builds a fusion whose reshape has a constant -1 extent in the last
  // position; the actual extent must be inferred from the other sizes.
  auto fusion_owner = std::make_unique<Fusion>();
  FusionGuard fg(fusion_owner.get());

  TensorView* in_tv = makeSymbolicTensor(4);
  fusion_owner->addInput(in_tv);

  auto doubled = add(in_tv, in_tv);
  // Reshape {d0, d1, d2, d3} -> {d0, d2, -1}. The -1 here is a constant
  // Scalar, so it can be resolved when the reshape is defined.
  auto reshaped = reshape(
      doubled,
      {doubled->axis(0)->extent(),
       doubled->axis(2)->extent(),
       IrBuilder::create<Scalar>(-1)});
  auto out_tv = add(reshaped, reshaped);
  fusion_owner->addOutput(out_tv);

  FusionExecutorCache executor_cache(std::move(fusion_owner));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn({2, 3, 4, 5}, options);
  // Reference result computed with ATen's own -1 reshape inference.
  auto at_y = (at_x + at_x).reshape({2, 4, -1});
  auto at_z = at_y + at_y;

  auto outputs = executor_cache.runFusionWithInputs({at_x});

  testValidate(
      executor_cache.fusion(), outputs, {at_x}, {at_z}, __LINE__, __FILE__);
}

// This is just like the test above, but uses an input scalar with value -1.
//
// Input scalars of -1 are not supported for dynamic reshape currently.
// Supporting this is not impossible, but would increase the complexity quite
// a bit: concretization replaces all uses of concretized scalars, so we would
// risk erroneously replacing an intended value of -1 with a positive integer.
// Instead, values of -1 are required to be constant at the reshape's
// definition, and passing -1 at runtime must raise an error.
TEST_F(NVFuserTest, Issue249InputNegative1_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(4);
  fusion.addInput(tv0);

  // The output extents are symbolic (non-constant) scalar fusion inputs.
  auto s0 = IrBuilder::create<Scalar>(DataType::Int);
  auto s1 = IrBuilder::create<Scalar>(DataType::Int);
  auto s2 = IrBuilder::create<Scalar>(DataType::Int);
  fusion.addInput(s0);
  fusion.addInput(s1);
  fusion.addInput(s2);

  auto tv1 = add(tv0, tv0);
  auto tv2 = reshape(tv1, {s0, s1, s2});
  auto tv3 = add(tv2, tv2);
  fusion.addOutput(tv3);

  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn({2, 3, 4, 5}, options);
  auto at_y = (at_x + at_x).reshape({2, 4, -1});
  auto at_z = at_y + at_y;

  // Dynamic reshape sizes that are not constant at definition must be
  // explicit: no -1 allowed.
  EXPECT_THROW(
      fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, -1}),
      std::exception);

  // Passing explicit sizes works fine.
  auto outputs = fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, 15});

  testValidate(
      fusion_executor_cache.fusion(),
      outputs,
      {at_x, 2, 4, 15},
      {at_z},
      __LINE__,
      __FILE__);
}

} // namespace nvfuser