More lost scalars during segmentation for dynamic reshape

```c++
// Test concretizing a pad that follows a reshape. This requires the
// ExpressionEvaluator used in concretization to propagate shapes properly
// across symbolic reshapes in order to infer the size of the downstream pad.
//
// Fusion under test: a 2D symbolic tensor input is reshaped to {s2, s3} and
// the result is padded using the scalars s0 and s1, where s0..s3 are all
// integer scalar fusion inputs. The fusion output is validated against
// at::pad(at::reshape(...)) computed with the same concrete values.
TEST_F(NVFuserTest, DynamicReshapePad_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr.get();
  FusionGuard fg(&fusion);

  // 2D tensor input with symbolic extents.
  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // Four integer scalar inputs. They are registered as fusion inputs in the
  // order s0, s1, s2, s3, which appears to correspond to i3, i4, i5, i6 in the
  // printed fusions below (the dumps list the scalar inputs in that order).
  auto s0 = IrBuilder::create<Val>(DataType::Int);
  auto s1 = IrBuilder::create<Val>(DataType::Int);
  auto s2 = IrBuilder::create<Val>(DataType::Int);
  auto s3 = IrBuilder::create<Val>(DataType::Int);
  fusion.addInput(s0);
  fusion.addInput(s1);
  fusion.addInput(s2);
  fusion.addInput(s3);

  // Reshape to the scalar-defined shape {s2, s3}, then pad. Going by the dump
  // below, the output extents are (i5 + i4) and (i6 + i3): s1 is added to the
  // first axis and s0 to the second axis, with zero padding on the other side
  // of each axis.
  auto tv1 = reshape(tv0, {s2, s3});
  auto tv2 = pad(tv1, {fusion.zeroVal(), s0, fusion.zeroVal(), s1});
  fusion.addOutput(tv2);

  // Recorded printouts of the fusion before ("Symbolic Fusion") and after
  // ("Concretized Fusion") concretization, followed by per-tensor domain
  // details. Note that after concretization the reshape output T3 has extents
  // expressed via i0 and i2, while the pad output T2 still has extents
  // expressed via the scalar inputs i5 and i6.
  /*
Symbolic Fusion:
Inputs:
  T0_g[ iS0{i0}, iS1{i2} ], float
  i3, int64_t
  i4, int64_t
  i5, int64_t
  i6, int64_t
Outputs:
  T2_g[ ?S7{( i5 + i4 )}rf, ?S9{( i6 + i3 )}rf ], float

%kernel_math {
T1_l[ ?S4{i5}rf, ?S5{i6}rf ] = view( T0_g[ iS0{i0}, iS1{i2} ] )
T2_g[ ?S7{( i5 + i4 )}rf, ?S9{( i6 + i3 )}rf ]
   = pad( T1_l[ ?S4{i5}rf, ?S5{i6}rf ], {0, i4, 0, i3} )
}

Concretized Fusion:
Inputs:
  T0_g[ iS0{i0}, iS1{i2} ], float
  i3, int64_t
  i4, int64_t
  i5, int64_t
  i6, int64_t
Outputs:
  T2_g[ iS21{( i5 + i4 )}rf, iS22{( i6 + i3 )}rf ], float

%kernel_math {
T3_l[ iS15{3}rf, iS16{( ceilDiv(( i0 * i2 ), 3) )}rf ] = view( T0_g[ iS0{i0}, iS1{i2} ] )
T2_g[ iS21{( i5 + i4 )}rf, iS22{( i6 + i3 )}rf ]
   = pad( T3_l[ iS15{3}rf, iS16{( ceilDiv(( i0 * i2 ), 3) )}rf ], {0, i4, 0, i3} )
}

T0_g[ iS0{i0}, iS1{i2} ]
 root domain : (iS0{i0}, iS1{i2})
 contiguity: f f
 leaf domain : (iS0{i0}, iS1{i2})
T3_l[ iS15{3}rf, iS16{( ceilDiv(( i0 * i2 ), 3) )}rf ]
 root domain : (iS12{i0}rf, iS13{i2}rf)
  Merge: iS12{i0}rf and iS13{i2}rf -> iS14{( i0 * i2 )}rf
  Outer split: iS14{( i0 * i2 )}rf by factor 3 -> iS15{3}rf, iS16{( ceilDiv(( i0 * i2 ), 3) )}rf, start offset: 0, stop offset: 0
 rfactor domain : (iS15{3}rf, iS16{( ceilDiv(( i0 * i2 ), 3) )}rf)
 contiguity: t t
 leaf domain : (iS15{3}rf, iS16{( ceilDiv(( i0 * i2 ), 3) )}rf)
T2_g[ iS21{( i5 + i4 )}rf, iS22{( i6 + i3 )}rf ]
 root domain : (iS19{i5}rf, iS20{i6}rf)
  Resize: iS19{i5}rf by 0 and i4 -> iS21{( i5 + i4 )}rf
  Resize: iS20{i6}rf by 0 and i3 -> iS22{( i6 + i3 )}rf
 rfactor domain : (iS21{( i5 + i4 )}rf, iS22{( i6 + i3 )}rf)
 contiguity: t t
 leaf domain : (iS21{( i5 + i4 )}rf, iS22{( i6 + i3 )}rf)
  */

  FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));

  // Concrete inputs: a 4x3 float tensor plus scalars in fusion-input order
  // (s0, s1, s2, s3) = (1, 1, 3, 4), i.e. reshape to {3, 4} and pad each axis
  // by one element at its end.
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_x = at::randn({4, 3}, options);
  std::vector<c10::IValue> aten_inputs = {at_x, 1, 1, 3, 4};
  // ATen reference result for the same reshape + pad.
  auto at_y = at::pad(at_x.reshape({3, 4}), {0, 1, 0, 1});

  auto outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
  // Failure recorded for this reproduction: the extent needed to set the
  // blockIdx.x launch bound is written in terms of i3..i6 and could not be
  // evaluated.
  /*
terminate called after throwing an instance of 'c10::Error'
  what():  val.hasValue() INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/executor.cpp":1039, please report a bug to
PyTorch. Tried to evaluate the extent, ( ceilDiv(( ( ( ( i5 * i6 ) + ( i5 * i3 ) ) + ( i4 * i6 ) ) + ( i4 * i3 ) ), 128) )
 for the ptype: blockIdx.x to set launch bounds but could not.
  */

  // Compare the fusion output against the ATen reference.
  testValidate(
      fusion_executor_cache.fusion(),
      outputs,
      aten_inputs,
      {at_y},
      __LINE__,
      __FILE__);
}
```
The fusion gets segmented at `T3`, and the second segment cannot evaluate the output extent, just like in #629. The fix in that case (#630) was to propagate the replaced extents during concretization. In this case that approach does not resolve the issue, since the downstream `pad` op builds an extent expression using the fusion inputs `i5` and `i6`. Although we register those values for mutation, the actual rfactor extents in `T2` are the expressions `i5 + i4` and `i6 + i3` rather than `i5` and `i6` themselves, so they are not replaced. It is also not clear that replacing them before segmentation is always advantageous: before segmentation we don't know whether the original expression in terms of fusion inputs, e.g. `i6 + i3`, will be easier to compute than the concretized one, `ceilDiv(( i0 * i2 ), 3) + i3`.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

More lost scalars during segmentation for dynamic reshape #656

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

More lost scalars during segmentation for dynamic reshape #656

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions