Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions csrc/dynamic_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,6 @@ void DynamicTransformConcretizationInfo::analyzeReshapes(
// Determine output shape using expr evaluator. Note there may be
// one domain of extent -1
std::vector<int64_t> out_shape(out_dom.size(), 0);
bool extent_m1_found = false;
for (const auto i : c10::irange(out_dom.size())) {
auto out_id = out_dom.at(i);
auto extent_val = expr_eval->evaluate(out_id->extent());
Expand All @@ -277,16 +276,12 @@ void DynamicTransformConcretizationInfo::analyzeReshapes(
extent_val.is<int64_t>(),
"Invalid evaluated value of domain extent: ",
out_id->toString());
const auto extent_int = extent_val.as<int64_t>();
auto extent_int = extent_val.as<int64_t>();
if (extent_int == -1) {
TORCH_INTERNAL_ASSERT(
!extent_m1_found,
"Multiple output domains of size -1 not allowed",
out_tv->toString());
extent_m1_found = true;
} else {
TORCH_INTERNAL_ASSERT(
extent_int > 0, "Invalid output domain extent: ", extent_int);
// For non-constant Scalar sizes, check that we have not passed -1.
TORCH_CHECK(
out_id->extent()->isConst(),
"Values of -1 passed to reshape must be constant at definition.")
}
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These checks are handled by reshape() and analyzeView() and are not needed here.

out_shape.at(i) = extent_int;
}
Expand Down
37 changes: 32 additions & 5 deletions csrc/ops/alias.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#include <expr_simplifier.h>
#include <ir/builder.h>
#include <ir/utils.h>
#include <ops/alias.h>
Expand Down Expand Up @@ -121,12 +122,38 @@ TensorView* reshape(TensorView* inp_tv, const std::vector<Val*>& new_sizes) {
// Create placeholder rfactor domain. Note it's not connected with the root
// domain.
std::vector<IterDomain*> rfactor_domain(new_sizes.size(), nullptr);
bool found_neg_one = false;
for (const auto i : c10::irange(new_sizes.size())) {
auto rf_id = IterDomainBuilder(
FusionGuard::getCurFusion()->zeroVal(), new_sizes.at(i))
.iter_type(IterType::Symbolic)
.is_rfactor_domain(true)
.build();
auto new_size = new_sizes.at(i);
if (new_size->isConstScalar() && new_size->evaluateInt() == -1) {
// It is usually safe to use the provided scalars as the output shapes.
// However, if -1 is provided for some position, it will not correspond to
// the actual extent in that position.

TORCH_CHECK(
!found_neg_one,
"A maximum of one value of -1 can be provided to reshape.");
found_neg_one = true;

Val* numel = FusionGuard::getCurFusion()->oneVal();
Val* other_new_numel = FusionGuard::getCurFusion()->oneVal();
for (const auto j : c10::irange(inp_dom.size())) {
numel = mul(numel, inp_dom.at(j)->extent());
}
Comment on lines +140 to +142
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we put this in a new method Scalar TensorView::numel() const instead?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe in lower_utils? I'd prefer minimalistic designs for the IR nodes themselves.

for (const auto j : c10::irange(new_sizes.size())) {
if (i == j) {
continue;
}
other_new_numel = mul(other_new_numel, new_sizes.at(j));
}
new_size = div(numel, other_new_numel);
new_size = simplifyExpr(new_size);
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that SimplifyingIrBuilder evaluates constant subexpressions but does not simplify (x * y) / y, so I used simplifyExpr here.

}
auto rf_id =
IterDomainBuilder(FusionGuard::getCurFusion()->zeroVal(), new_size)
.iter_type(IterType::Symbolic)
.is_rfactor_domain(true)
.build();
rfactor_domain.at(i) = rf_id;
}

Expand Down
81 changes: 81 additions & 0 deletions test/test_dynamic_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1070,4 +1070,85 @@ TEST_F(NVFuserTest, FusionDynamicEmptyCat2_CUDA) {
EXPECT_EQ(output_def->input(0), seg_fusion->inputs()[0]);
}

TEST_F(NVFuserTest, Issue249_CUDA) {
  // Builds a fusion whose reshape has a constant -1 extent in the last
  // position; the actual extent must be inferred from the other sizes.
  auto fusion_owner = std::make_unique<Fusion>();
  FusionGuard fg(fusion_owner.get());

  TensorView* in_tv = makeSymbolicTensor(4);
  fusion_owner->addInput(in_tv);

  auto doubled = add(in_tv, in_tv);
  // Reshape {d0, d1, d2, d3} -> {d0, d2, -1}. The -1 here is a constant
  // Scalar, so it can be resolved when the reshape is defined.
  auto reshaped = reshape(
      doubled,
      {doubled->axis(0)->extent(),
       doubled->axis(2)->extent(),
       IrBuilder::create<Scalar>(-1)});
  auto out_tv = add(reshaped, reshaped);
  fusion_owner->addOutput(out_tv);

  FusionExecutorCache executor_cache(std::move(fusion_owner));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn({2, 3, 4, 5}, options);
  // Reference result computed with ATen's own -1 reshape inference.
  auto at_y = (at_x + at_x).reshape({2, 4, -1});
  auto at_z = at_y + at_y;

  auto outputs = executor_cache.runFusionWithInputs({at_x});

  testValidate(
      executor_cache.fusion(), outputs, {at_x}, {at_z}, __LINE__, __FILE__);
}

// This is just like the test above, but uses an input scalar with value -1.
//
// Input scalars of -1 are not supported for dynamic reshape currently.
// Supporting this is not impossible, but would increase the complexity quite
// a bit: concretization replaces all uses of concretized scalars, so we would
// risk erroneously replacing an intended value of -1 with a positive integer.
// Instead, values of -1 are required to be constant at the reshape's
// definition, and passing -1 at runtime must raise an error.
TEST_F(NVFuserTest, Issue249InputNegative1_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(4);
  fusion.addInput(tv0);

  // The output extents are symbolic (non-constant) scalar fusion inputs.
  auto s0 = IrBuilder::create<Scalar>(DataType::Int);
  auto s1 = IrBuilder::create<Scalar>(DataType::Int);
  auto s2 = IrBuilder::create<Scalar>(DataType::Int);
  fusion.addInput(s0);
  fusion.addInput(s1);
  fusion.addInput(s2);

  auto tv1 = add(tv0, tv0);
  auto tv2 = reshape(tv1, {s0, s1, s2});
  auto tv3 = add(tv2, tv2);
  fusion.addOutput(tv3);

  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn({2, 3, 4, 5}, options);
  auto at_y = (at_x + at_x).reshape({2, 4, -1});
  auto at_z = at_y + at_y;

  // Dynamic reshape sizes that are not constant at definition must be
  // explicit: no -1 allowed.
  EXPECT_THROW(
      fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, -1}),
      std::exception);

  // Passing explicit sizes works fine.
  auto outputs = fusion_executor_cache.runFusionWithInputs({at_x, 2, 4, 15});

  testValidate(
      fusion_executor_cache.fusion(),
      outputs,
      {at_x, 2, 4, 15},
      {at_z},
      __LINE__,
      __FILE__);
}

} // namespace nvfuser