-
Notifications
You must be signed in to change notification settings - Fork 79
Description
This fusion fails at the validation:
TEST_F(NVFuserTest, Issue667Repro_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);
auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = sum(tv0, {0, 1});
fusion.addOutput(tv1);
// [i0, i1]
tv1->split(1, 4);
// [i0, i1/4, 4]
tv1->merge(0);
// [i0*i1/4, 4]
tv1->split(0, 4);
// [i0*i1/4/4, 4, 4]
tv1->split(0, 1);
// [i0*i1/4/4, 1, 4, 4]
tv1->axis(1)->parallelize(ParallelType::Unswitch);
fusion.printKernel();
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({4, 10}, options);
std::vector<c10::IValue> aten_inputs = {t0};
FusionExecutor fe;
fe.compileFusion(&fusion, aten_inputs);
auto outputs = fe.runFusion(aten_inputs);
auto ref = t0.to(at::kDouble).sum();
testValidate(
&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
}
I believe this is a simplified repro of the same issue as #667.
The issue is the unswitch. If it's disabled, the validation passes.
Here's the generated kernel. I disabled index hoisting and predicate elimination.
__global__ void kernel1(Tensor<float, 2, 2> T0, Tensor<float, 0, 0> T1) {
NVFUSER_DEFINE_MAGIC_ZERO;
T1[0] = 0.00000000000000000e+00;
#pragma unroll 1
for(nvfuser_index_t i0 = 0; i0 < (ceilDiv((T0.logical_size[0] * (ceilDiv(T0.logical_size[1], 4))), 4)); ++i0) {
if ((((3 + (4 * i0)) < (T0.logical_size[0] * (ceilDiv(T0.logical_size[1], 4)))) && ((3 + (4 * ((3 + (4 * i0)) % (ceilDiv(T0.logical_size[1], 4))))) < T0.logical_size[1]))) {
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
T1[0]
= T1[0]
+ T0[((((((T0).alloc_stride)[0]) * (((4 * i0) + i1) / (ceilDiv(T0.logical_size[1], 4)))) + ((4 * (((T0).alloc_stride)[1])) * (((4 * i0) + i1) % (ceilDiv(T0.logical_size[1], 4))))) + ((((T0).alloc_stride)[1]) * (i2 + nvfuser_zero)))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
#pragma unroll
for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
if (((((4 * i0) + (i1 + nvfuser_zero)) < (T0.logical_size[0] * (ceilDiv(T0.logical_size[1], 4)))) && (((4 * (((4 * i0) + i1) % (ceilDiv(T0.logical_size[1], 4)))) + (i2 + nvfuser_zero)) < T0.logical_size[1]))) {
T1[0]
= T1[0]
+ T0[((((((T0).alloc_stride)[0]) * (((4 * i0) + i1) / (ceilDiv(T0.logical_size[1], 4)))) + ((4 * (((T0).alloc_stride)[1])) * (((4 * i0) + i1) % (ceilDiv(T0.logical_size[1], 4))))) + ((((T0).alloc_stride)[1]) * (i2 + nvfuser_zero)))];
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
}
The problem is this predicate for the unswitch if statement:
((3 + (4 * ((3 + (4 * i0)) % (ceilDiv(T0.logical_size[1], 4))))) < T0.logical_size[1])))
And in particular, this part: (3 + (4 * i0)) % (ceilDiv(T0.logical_size[1], 4)). Suppose i0 == 0, the LHS of % is just 3. Since the RHS is also 3, this module result is just 0. However, when i1 is 2, this module is (2 + (4 * i0)) % (ceilDiv(T0.logical_size[1], 4)), so it's 2. So, for the i1 loop, we use 3 to generate the predicate for the end of the loop, but that's not the case generating the largest index due to the module operation. So, it seems that the assumption we have here (https://github.com/NVIDIA/Fuser/blob/main/csrc/device_lower/analysis/index_compute.cpp#L367) is not always valid.