Wrong unswitch predicate 

This fusion fails at the validation:

```
TEST_F(NVFuserTest, Issue667Repro_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = sum(tv0, {0, 1});
  fusion.addOutput(tv1);

  // [i0, i1]
  tv1->split(1, 4);
  // [i0, i1/4, 4]
  tv1->merge(0);
  // [i0*i1/4, 4]
  tv1->split(0, 4);
  // [i0*i1/4/4, 4, 4]
  tv1->split(0, 1);
  // [i0*i1/4/4, 1, 4, 4]

  tv1->axis(1)->parallelize(ParallelType::Unswitch);

  fusion.printKernel();

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({4, 10}, options);
  std::vector<c10::IValue> aten_inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);

  auto ref = t0.to(at::kDouble).sum();

  testValidate(
      &fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
}
```

I believe this is a simplified repro of the same issue as #667.

The issue is the unswitch. If it's disabled, the validation passes.

Here's the generated kernel. I disabled index hoisting and predicate elimination.
```
__global__ void kernel1(Tensor<float, 2, 2> T0, Tensor<float, 0, 0> T1) {
  NVFUSER_DEFINE_MAGIC_ZERO;
  T1[0] = 0.00000000000000000e+00;
  #pragma unroll 1
  for(nvfuser_index_t i0 = 0; i0 < (ceilDiv((T0.logical_size[0] * (ceilDiv(T0.logical_size[1], 4))), 4)); ++i0) {
    if ((((3 + (4 * i0)) < (T0.logical_size[0] * (ceilDiv(T0.logical_size[1], 4)))) && ((3 + (4 * ((3 + (4 * i0)) % (ceilDiv(T0.logical_size[1], 4))))) < T0.logical_size[1]))) {
      #pragma unroll
      for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
        #pragma unroll
        for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
          T1[0]
            = T1[0]
            + T0[((((((T0).alloc_stride)[0]) * (((4 * i0) + i1) / (ceilDiv(T0.logical_size[1], 4)))) + ((4 * (((T0).alloc_stride)[1])) * (((4 * i0) + i1) % (ceilDiv(T0.logical_size[1], 4))))) + ((((T0).alloc_stride)[1]) * (i2 + nvfuser_zero)))];
        }
      }
      NVFUSER_UPDATE_MAGIC_ZERO;
    } else {
      #pragma unroll
      for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
        #pragma unroll
        for(nvfuser_index_t i2 = 0; i2 < 4; ++i2) {
          if (((((4 * i0) + (i1 + nvfuser_zero)) < (T0.logical_size[0] * (ceilDiv(T0.logical_size[1], 4)))) && (((4 * (((4 * i0) + i1) % (ceilDiv(T0.logical_size[1], 4)))) + (i2 + nvfuser_zero)) < T0.logical_size[1]))) {
            T1[0]
              = T1[0]
              + T0[((((((T0).alloc_stride)[0]) * (((4 * i0) + i1) / (ceilDiv(T0.logical_size[1], 4)))) + ((4 * (((T0).alloc_stride)[1])) * (((4 * i0) + i1) % (ceilDiv(T0.logical_size[1], 4))))) + ((((T0).alloc_stride)[1]) * (i2 + nvfuser_zero)))];
          }
        }
      }
      NVFUSER_UPDATE_MAGIC_ZERO;
    }
  }
}
```

The problem is this predicate for the unswitch if statement:

```
((3 + (4 * ((3 + (4 * i0)) % (ceilDiv(T0.logical_size[1], 4))))) < T0.logical_size[1]))) 
```

And in particular, this part: `(3 + (4 * i0)) % (ceilDiv(T0.logical_size[1], 4))`. Suppose `i0 == 0`, the LHS of `%` is just 3. Since the RHS is also `3`, this module result is just `0`. However, when `i1` is `2`, this module is `(2 + (4 * i0)) % (ceilDiv(T0.logical_size[1], 4))`, so it's `2`. So, for the `i1` loop, we use `3` to generate the predicate for the end of the loop, but that's not the case generating the largest index due to the module operation. So, it seems that the assumption we have here (https://github.com/NVIDIA/Fuser/blob/main/csrc/device_lower/analysis/index_compute.cpp#L367) is not always valid.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Wrong unswitch predicate #681

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Wrong unswitch predicate #681

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions