-
Notifications
You must be signed in to change notification settings - Fork 79
Description
I am trying to make the following test pass:
// Test dynamic pad followed by broadcast resolution
//
// Builds a fusion that pads a 2-D input by runtime-provided scalar amounts and
// multiplies the padded result with a second 2-D input. The test then runs it
// with a negative pad so that one padded axis has extent 1 and the
// multiplication has to broadcast along it.
TEST_F(NVFuserTest, DynamicPadBroadcast_CUDA) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);
// Two tensor inputs; the printMath() dump below shows both as 2-D with
// symbolic extents (i0, i2) and (i3, i4).
TensorView* tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
TensorView* tv1 = makeSymbolicTensor(2);
fusion.addInput(tv1);
// 2d axis order here is YX
// The pad amounts are scalar fusion inputs, so the padded extents are only
// known once concrete inputs are bound. They are registered in the order
// ypad, xpad (i5 and i6 in the dump below).
auto ypad = IrBuilder::create<Int>();
fusion.addInput(ypad);
auto xpad = IrBuilder::create<Int>();
fusion.addInput(xpad);
// two-way resizes to cut square tv down to broadcastable size in either axis
// NOTE(review): the pad widths appear to be listed innermost axis first (the
// dump below shows i5, i.e. ypad, added to the first axis extent) — confirm
// against pad()'s documentation.
auto tv0_pad = pad(tv0, {fusion.zeroVal(), xpad, fusion.zeroVal(), ypad});
// This will potentially resolve the y or x broadcast
auto p = mul(tv0_pad, tv1);
fusion.addOutput(p);
fusion.printMath();
// Dump recorded by the author for the fusion built above. Note that the padded
// tensor T2's domains are printed with a '?' prefix while T1's are printed
// with an 'i' prefix.
/*
Inputs:
T0_g[ iS0{i0}, iS1{i2} ], float
T1_g[ iS2{i3}, iS3{i4} ], float
i5, int64_t
i6, int64_t
Outputs:
T3_g[ iS8{( i0 + i5 )}, iS9{( i2 + i6 )} ], float
%kernel_math {
T2_l[ ?S5{( i0 + i5 )}rf, ?S7{( i2 + i6 )}rf ]
= pad( T0_g[ iS0{i0}, iS1{i2} ], {0, i5, 0, i6} )
T3_g[ iS8{( i0 + i5 )}, iS9{( i2 + i6 )} ]
= T2_l[ ?S5{( i0 + i5 )}rf, ?S7{( i2 + i6 )}rf ]
* T1_g[ iS2{i3}, iS3{i4} ];
}
*/
FusionExecutorCache fusion_executor_cache(std::move(fusion_ptr));
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn({5, 5}, options);
at::Tensor at_y = at::randn({5, 5}, options);
// Runtime inputs follow the addInput() order: tv0, tv1, ypad, xpad. The two
// pad values start at 0 and are overwritten just below.
std::vector<c10::IValue> aten_inputs({at_x, at_y, 0, 0});
std::vector<at::Tensor> outputs;
// shrink first axis so that the multiplication resolves a broadcast
// ypad = -4 turns the first-axis extent into 5 + (-4) = 1; xpad = 0 leaves
// the second axis at 5.
aten_inputs[2] = -4;
aten_inputs[3] = 0;
outputs = fusion_executor_cache.runFusionWithInputs(aten_inputs);
// Reference result: the first row of at_x, kept 2-D via unsqueeze(0), times
// at_y.
testValidate(fusion_executor_cache.fusion(), outputs, aten_inputs, {at_x[0].unsqueeze(0) * at_y}, __LINE__, __FILE__);
}During concretization, we try and evaluate the extent of dynamic resize. In this case, we want to evaluate extents of ?S5{( i0 + i5 )}rf and ?S7{( i2 + i6 )}rf. To compute these we use an expression evaluator, which actually throws an error:
C++ exception with description "known_size == this_size INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/expr_evaluator.cpp":210, please report a bug to PyTorch. Conflicting sizes: 5, 1
The following initial values are bound correctly based on inputs:
Binding 5 to i0
Binding 5 to i2
Binding 5 to i3
Binding 5 to i4
Binding -4 to i5
Binding 0 to i6
The error is raised in ExpressionEvaluator::propagateBoundValuesThroughExactMaps while evaluating i0 + i5, which evaluates to 1 (5 + (-4)) but already has known_size == 5. I believe this is because, earlier, the multiplication sees a Symbolic domain aligned with an Iteration domain and interprets the pair as an exact map. I think the fix is to adjust the exact-map condition so that an exact mapping is never set to or from a Symbolic domain.