-
Notifications
You must be signed in to change notification settings - Fork 79
Description
I am seeing that the test EpilogueAlphaBetaBias_CUDA does not produce the exact same CUDA code every time. This probably applies to other tests as well, and I do not believe it's an issue with this test in particular.
To reproduce, using TOT (63d1b12), I ran the following:
for i in $(seq 1 10); do NVFUSER_DUMP=scheduler_params,scheduler_verbose,fusion_ir_preseg,segmenter_logging,cuda_kernel build/nvfuser_tests --gtest_filter='*MatmulSchedulerTest.EpilogueAlphaBetaBias_CUDA*' > dump-$i;done
After that, running `for i in $(seq 1 10); do diff -U3 dump-1 dump-$i | diff-highlight; done | less -FXR`, I see that about half the time we have diffs like this:
--- dump-61d-1 2023-08-28 14:39:33.881921498 -0400
+++ dump-61d-5 2023-08-28 14:39:40.265840614 -0400
@@ -1581,28 +1581,28 @@
i149 = i147 % 32;
nvfuser_index_t i150;
i150 = i149 / 8;
- __half T19[1];
+ __half T20[1];
- T19[0] = 0;
+ T20[0] = 0;
if (b145) {
- T19[0]
+ T20[0]
= T2[((i140 + i149) + (T1.logical_size[1] * i148))];
}
float T11[1];
T11[0]
- = __half2float(T19[0]);
+ = __half2float(T20[0]);
float T12[1];
T12[0]
= (float) d1
* T11[0];
- float T20[1];
+ float T19[1];
- T20[0] = 0;
+ T19[0] = 0;
if (b144) {
- T20[0]
+ T19[0]
= T3[(i142 + i148)];
}
float T8[1];
T8[0]
- = T20[0];
+ = T19[0];
float T9[1];
T9[0]
      = T15[((((i143 + (32 * i148)) + (32 * (i150 / 4))) + (8 * ((i148 % 4) ^ (i150 % 4)))) + (i149 % 8))]

Note the dump parameters. fusion_ir_preseg shows the fusion just before segmentation and lowering, and segmenter_logging shows that segmentation proceeds the same way in each case. However, adding fusion_ir reflects the difference above:
+++ dump-61d-3 2023-08-28 15:02:10.132620300 -0400
@@ -104,20 +104,20 @@
T6_l[ bS53{( ceilDiv(1, 256) )}, iS55{( ceilDiv(i6, 32) )}, iS57{( ceilDiv(i5, 32) )}, iS146{( ceilDiv(32, 16) )}, bthreadIdx.z151{( ceilDiv(( ( ceilDiv(256, 4) ) * 4 ), 64) )}, ithreadIdx.y159{( ceilDiv(( ( ( ceilDiv(( ceilDiv(32, 8) ), 4) ) * 4 ) * 8 ), 32) )}, bS161{( ceilDiv(64, 16) )}, iS163{( ceilDiv(32, 8) )}, bS162{16}, ithreadIdx.x259{( ( ceilDiv(8, 4) ) * 16 )}, iS258{4} ] ca_pos( 6 ) produce_pos( 6 ))
T15_s[ iblockIdx.x89{( ceilDiv(i2, 256) )}, iblockIdx.y91{( ceilDiv(i6, 32) )}, ithreadIdx.z285{( ceilDiv(( ( ceilDiv(256, 4) ) * 4 ), 64) )}, ithreadIdx.y287{( ceilDiv(( ( ( ceilDiv(( ceilDiv(32, 8) ), 4) ) * 4 ) * 8 ), 32) )}, iS289{( ceilDiv(64, 16) )}, iS291{( ceilDiv(32, 8) )}, iS293{( ceilDiv(16, 8) )}, ithreadIdx.x297{( 8 * ( ceilDiv(8, 2) ) )}, iV296{2} ] ca_pos( 2 ) produce_pos( 2 )
= Set( T7_l[ iblockIdx.x47{( ceilDiv(i2, 256) )}, iblockIdx.y49{( ceilDiv(i6, 32) )}, rS51{( ceilDiv(i4, 32) )}, rS144{( ceilDiv(32, 16) )}, ithreadIdx.z136{( ceilDiv(( ( ceilDiv(256, 4) ) * 4 ), 64) )}, ithreadIdx.y138{( ceilDiv(( ( ( ceilDiv(( ceilDiv(32, 8) ), 4) ) * 4 ) * 8 ), 32) )}, iS140{( ceilDiv(64, 16) )}, iS142{( ceilDiv(32, 8) )}, iMMA269{( ceilDiv(16, 8) )}, ithreadIdx.x273{( 8 * ( ceilDiv(8, 2) ) )}, iMMA272{2}, rMMA145{16} ] ca_pos( 2 ) produce_pos( 6 ) )
-T20_l[ iblockIdx.x341{( ceilDiv(i9, 256) )}, iS385{( ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(256, 4) ), 32) ), 1) ), 4) )}, ithreadIdx.z386{4}, ithreadIdx.y384{1}, ithreadIdx.x382{32}, iS380{4} ] ca_pos( 6 )
+T19_l[ iblockIdx.x335{( ceilDiv(i9, 256) )}, iS385{( ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(256, 4) ), 32) ), 1) ), 4) )}, ithreadIdx.z386{4}, ithreadIdx.y384{1}, ithreadIdx.x382{32}, iS380{4} ] ca_pos( 6 )
= Set( T3_g[ iS125{( ceilDiv(i9, 256) )}, iS126{256} ] )
T8_l[ iS97{( ceilDiv(i9, 256) )}, bS99{( ceilDiv(1, 32) )}, iS377{( ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(( 256 * 32 ), 4) ), 32) ), 1) ), 4) )}, iS378{4}, iS376{1}, iS374{32}, iS372{4} ] ca_pos( 7 ) produce_pos( 7 )

Inserting printMath just before and after scheduling shows this difference is coming from the scheduler, which in this case is the matmul scheduler.
See #766 (comment).