From 2705785442a1e9ecf93f91902da548136fba8336 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Mon, 19 May 2025 14:03:56 -0400
Subject: [PATCH 1/6] Use proper extent expression for dynamic reshape extents

Fixes #4476
---
 csrc/ops/alias.cpp           | 40 +++++++++++++++++++++++-------------
 tests/cpp/test_evaluator.cpp | 24 ++++++++++++++++++++++
 2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp
index cdc80329206..46dbf271615 100644
--- a/csrc/ops/alias.cpp
+++ b/csrc/ops/alias.cpp
@@ -150,6 +150,25 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
   // domain.
   std::vector<IterDomain*> logical_domain(new_sizes.size(), nullptr);
   bool found_neg_one = false;
+
+  Val* numel = FusionGuard::getCurFusion()->oneVal();
+  for (const auto j : arange(inp_dom.size())) {
+    numel = SimplifyingIrBuilder::mulExpr(numel, inp_dom.at(j)->extent());
+  }
+
+  const auto neg_one_size = [&numel, &new_sizes](size_t pos) {
+    Val* other_new_numel = FusionGuard::getCurFusion()->oneVal();
+    for (const auto j : arange(new_sizes.size())) {
+      if (pos == j) {
+        continue;
+      }
+      other_new_numel =
+          SimplifyingIrBuilder::mulExpr(other_new_numel, new_sizes.at(j));
+    }
+    Val* new_size = SimplifyingIrBuilder::divExpr(numel, other_new_numel);
+    return simplifyExpr(new_size);
+  };
+
   for (const auto i : arange(new_sizes.size())) {
     auto new_size = new_sizes.at(i);
     if (new_size->isConstScalar() && new_size->evaluate().as<int64_t>() == -1) {
@@ -162,20 +181,13 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
           "A maximum of one value of -1 can be provided to reshape.");
       found_neg_one = true;
 
-      Val* numel = FusionGuard::getCurFusion()->oneVal();
-      Val* other_new_numel = FusionGuard::getCurFusion()->oneVal();
-      for (const auto j : arange(inp_dom.size())) {
-        numel = SimplifyingIrBuilder::mulExpr(numel, inp_dom.at(j)->extent());
-      }
-      for (const auto j : arange(new_sizes.size())) {
-        if (i == j) {
-          continue;
-        }
-        other_new_numel =
-            SimplifyingIrBuilder::mulExpr(other_new_numel, new_sizes.at(j));
-      }
-      new_size = SimplifyingIrBuilder::divExpr(numel, other_new_numel);
-      new_size = simplifyExpr(new_size);
+      new_size = neg_one_size(i);
+    } else if (!new_size->isConstScalar()) {
+      // Dynamic size could be -1. Here we build a correct shape expression
+      new_size = where(
+          eq(new_size, IrBuilder::create<Val>(-1L, DataType::Index)),
+          neg_one_size(i),
+          new_size);
     }
     new_size = SimplifyingIrBuilder::maybeCastExpr(DataType::Index, new_size);
     auto rf_id =
diff --git a/tests/cpp/test_evaluator.cpp b/tests/cpp/test_evaluator.cpp
index a29d22a3e19..75c4e1b005e 100644
--- a/tests/cpp/test_evaluator.cpp
+++ b/tests/cpp/test_evaluator.cpp
@@ -825,4 +825,28 @@ TEST_F(ExprEvalTest, NamedScalar) {
   EXPECT_EQ(cache_id_pvalue.as<int64_t>(), kCacheIdValue);
 }
 
+TEST_F(ExprEvalTest, View_FlattenToMinusOneAsInput) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* in = makeSymbolicTensor(2);
+  auto* out_dim_size = IrBuilder::create<Val>(DataType::Int);
+  TensorView* out = reshape(in, {out_dim_size});
+  fusion.addInput(in);
+  fusion.addInput(out_dim_size);
+  fusion.addOutput(out);
+
+  fusion.printMath();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
+  at::Tensor in_tensor = at::randn({2, 3}, options);
+
+  ExpressionEvaluator evaluator;
+  evaluator.bind(in, in_tensor);
+  evaluator.bind(out_dim_size, -1);
+  auto out_tensor = evaluator.evaluate(out).as<at::Tensor>();
+
+  EXPECT_TRUE(at::allclose(out_tensor, in_tensor.view({-1})));
+}
+
 } // namespace nvfuser

From 163d1cb0a712730b80fdaf2c13e470f39d08d7c8 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 22 May 2025 12:46:06 -0400
Subject: [PATCH 2/6] Compute numel only when necessary

---
 csrc/ops/alias.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp
index 46dbf271615..1dba5ca1719 100644
--- a/csrc/ops/alias.cpp
+++ b/csrc/ops/alias.cpp
@@ -151,12 +151,17 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
   std::vector<IterDomain*> logical_domain(new_sizes.size(), nullptr);
   bool found_neg_one = false;
 
-  Val* numel = FusionGuard::getCurFusion()->oneVal();
-  for (const auto j : arange(inp_dom.size())) {
-    numel = SimplifyingIrBuilder::mulExpr(numel, inp_dom.at(j)->extent());
-  }
+  // We don't compute numel unless we need to, and then we compute it only once
+  // to encourage hoisting
+  Val* numel = nullptr;
+  const auto neg_one_size = [&numel, &inp_dom, &new_sizes](size_t pos) {
+    if (numel == nullptr) {
+      numel = FusionGuard::getCurFusion()->oneVal();
+      for (const auto j : arange(inp_dom.size())) {
+        numel = SimplifyingIrBuilder::mulExpr(numel, inp_dom.at(j)->extent());
+      }
+    }
 
-  const auto neg_one_size = [&numel, &new_sizes](size_t pos) {
     Val* other_new_numel = FusionGuard::getCurFusion()->oneVal();
     for (const auto j : arange(new_sizes.size())) {
       if (pos == j) {

From 3ad7cb9dfe13200d5637dbf4c9d4ab47d13ff45b Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 22 May 2025 12:47:17 -0400
Subject: [PATCH 3/6] Fix a couple tests

---
 tests/cpp/test_dynamic_transform.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/cpp/test_dynamic_transform.cpp b/tests/cpp/test_dynamic_transform.cpp
index 090ac43af23..109dc2aab83 100644
--- a/tests/cpp/test_dynamic_transform.cpp
+++ b/tests/cpp/test_dynamic_transform.cpp
@@ -88,17 +88,19 @@ TEST_F(DynamicTransformTest, DynamicTransform1) {
     expr_eval.bind(tv0->axis(1)->extent(), 3L);
     expr_eval.bind(reshape_shape0, 3L);
     expr_eval.bind(reshape_shape1, -1L);
+    // We cannot infer the shape of tv1 from the above bound values, since
+    // either axis of tv2 might be broadcast against one from tv1.
+    expr_eval.bind(tv1->axis(0)->extent(), 3L);
+    expr_eval.bind(tv1->axis(1)->extent(), 4L);
 
     // This should throw an exception since any reshape size of -1 must be
     // specified as a definition-time constant, as opposed to an input scalar.
-    EXPECT_THAT(
-        [&]() {
-          auto initial_info = DynamicTransform::getInitialInfo(&fusion);
-          auto info =
-              DynamicTransformConcretizationInfo(&initial_info, &expr_eval);
-        },
-        ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
-            "Values of -1 passed to reshape must be constant at definition")));
+    auto initial_info = DynamicTransform::getInitialInfo(&fusion);
+    auto info = DynamicTransformConcretizationInfo(&initial_info, &expr_eval);
+    NVF_CHECK(
+        info.getReshapeTransforms().size() == 1,
+        "Expected to have one reshape transform: ",
+        info.toString());
   }
 
   {
@@ -1160,10 +1162,8 @@ TEST_F(DynamicTransformTest, Issue249InputNegative1) {
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor at_x = at::randn({2, 3, 4, 5}, options);
 
-  // Dynamic reshape sizes that are not constant at definition must be explicit:
-  // no -1 allowed
-  EXPECT_THROW(
-      executor_cache.runFusionWithInputs({at_x, 2, 4, -1}), std::exception);
+  // Test that running with dynamic -1 works as expected
+  executor_cache.runFusionWithInputs({at_x, 2, 4, -1});
 
   // Passing explicit sizes works fine
   auto outputs = executor_cache.runFusionWithInputs({at_x, 2, 4, 15});

From b2c773aa0d634d52e2cbc3aa56c956a81191e871 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 22 May 2025 13:08:05 -0400
Subject: [PATCH 4/6] Fix division by zero

---
 csrc/ops/alias.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp
index 1dba5ca1719..b9fe2b69d1f 100644
--- a/csrc/ops/alias.cpp
+++ b/csrc/ops/alias.cpp
@@ -170,6 +170,14 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
       other_new_numel =
           SimplifyingIrBuilder::mulExpr(other_new_numel, new_sizes.at(j));
     }
+    // In case numel is 0, other_new_numel might also be 0 and we would hit a
+    // division by zero. In such cases, using 1 as the denominator will cause
+    // us to properly compute 0 for this extent.
+    other_new_numel = SimplifyingIrBuilder::whereExpr(
+        eq(other_new_numel, FusionGuard::getCurFusion()->zeroVal()),
+        FusionGuard::getCurFusion()->oneVal(),
+        other_new_numel);
+
     Val* new_size = SimplifyingIrBuilder::divExpr(numel, other_new_numel);
     return simplifyExpr(new_size);
   };

From ee8335c1466f679249b64c21b98b6c16908a86f0 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 22 May 2025 13:08:17 -0400
Subject: [PATCH 5/6] Fix FusionDynamicReshapeReductionShmoo

---
 tests/cpp/test_dynamic_transform.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/cpp/test_dynamic_transform.cpp b/tests/cpp/test_dynamic_transform.cpp
index 109dc2aab83..81d37b8e57c 100644
--- a/tests/cpp/test_dynamic_transform.cpp
+++ b/tests/cpp/test_dynamic_transform.cpp
@@ -860,17 +860,14 @@ TEST_F(DynamicTransformTest, FusionDynamicReshapeReductionShmoo) {
       {{8, 3 * 5, 7, 9}, {8, 3, 5 * 7, 9}, false}, // merge(1) osplit(1, 3)
       // test passing -1 dynamically for dimension size
-      // This is unsupported. See https://github.com/NVIDIA/Fuser/issues/249
-      // Values of -1 must be passed as constants instead of input-dependent
-      // scalars.
-      //{{8, 3 * 5, 7, 9}, {8, 3, -1, 9}, false} // merge(1) osplit(1, 3)
+      {{8, 3 * 5, 7, 9}, {8, 3, -1, 9}, false}, // merge(1) osplit(1, 3)
       // Empty reshapes should translate to FullOp
       {{8, 0, 7, 9}, {7, 8, 0, 9}, true}, // symbolic_sizes = [ -1, -1, 0, -1 ]
-      // In the case below there's now a separate Val introduced for the output
-      // extent, which is zero. This is represented in
-      // DynamicTransformConcretizationInfo causing cache miss
-      {{8, 0, 7, 9}, {7, 8, -1, 9}, true}, // symbolic_sizes = [ -1, -1, 0, -1 ]
+      // This hits the same conc info as {8, 3 * 5, 7, 9}, {8, 3, 5 * 7, 9}
+      {{8, 0, 7, 9},
+       {7, 8, -1, 9},
+       false}, // symbolic_sizes = [ -1, -1, 0, -1 ]
       {{8, 0, 7, 9}, {7, 8, 0, 0}, true}, // symbolic_sizes = [ -1, -1, 0, 0 ]
       {{8, 0, 7, 9}, {47, 0, 13, 0}, true}, // symbolic_sizes = [ -1, 0, -1, 0 ]
   };

From 88166f876675ea9d1d2b22f90babc6f75752a609 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Fri, 23 May 2025 09:19:38 -0400
Subject: [PATCH 6/6] Cast new sizes to Index explicitly

---
 csrc/ops/alias.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/ops/alias.cpp b/csrc/ops/alias.cpp
index b9fe2b69d1f..6b1ad79a3dd 100644
--- a/csrc/ops/alias.cpp
+++ b/csrc/ops/alias.cpp
@@ -167,8 +167,10 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
       if (pos == j) {
        continue;
       }
+      Val* new_size =
+          SimplifyingIrBuilder::maybeCastExpr(DataType::Index, new_sizes.at(j));
       other_new_numel =
-          SimplifyingIrBuilder::mulExpr(other_new_numel, new_sizes.at(j));
+          SimplifyingIrBuilder::mulExpr(other_new_numel, new_size);
     }
     // In case numel is 0, other_new_numel might also be 0 and we would hit a
     // division by zero. In such cases, using 1 as the denominator will cause
@@ -181,3 +183,4 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
     Val* new_size = SimplifyingIrBuilder::divExpr(numel, other_new_numel);
+    NVF_ERROR(new_size->dtype() == DataType::Index);
     return simplifyExpr(new_size);
   };