-
Notifications
You must be signed in to change notification settings - Fork 79
Description
I am seeing that the test EpilogueAlphaBetaBias_CUDA does not produce the exact same CUDA code every time. This probably applies to other tests as well, and I do not believe it's an issue with this test in particular.
To reproduce, using TOT (63d1b12), I ran the following:
for i in $(seq 1 10); do NVFUSER_DUMP=scheduler_params,scheduler_verbose,fusion_ir_preseg,segmenter_logging,cuda_kernel build/nvfuser_tests --gtest_filter='*MatmulSchedulerTest.EpilogueAlphaBetaBias_CUDA*' > dump-$i;done
After that, running `for i in $(seq 1 10); do diff -U3 dump-1 dump-$i | diff-highlight; done | less -FXR`, I see that about half the time we have diffs like this:
--- dump-61d-1 2023-08-28 14:39:33.881921498 -0400
+++ dump-61d-5 2023-08-28 14:39:40.265840614 -0400
@@ -1581,28 +1581,28 @@
i149 = i147 % 32;
nvfuser_index_t i150;
i150 = i149 / 8;
- __half T19[1];
+ __half T20[1];
- T19[0] = 0;
+ T20[0] = 0;
if (b145) {
- T19[0]
+ T20[0]
= T2[((i140 + i149) + (T1.logical_size[1] * i148))];
}
float T11[1];
T11[0]
- = __half2float(T19[0]);
+ = __half2float(T20[0]);
float T12[1];
T12[0]
= (float) d1
* T11[0];
- float T20[1];
+ float T19[1];
- T20[0] = 0;
+ T19[0] = 0;
if (b144) {
- T20[0]
+ T19[0]
= T3[(i142 + i148)];
}
float T8[1];
T8[0]
- = T20[0];
+ = T19[0];
float T9[1];
T9[0]
      = T15[((((i143 + (32 * i148)) + (32 * (i150 / 4))) + (8 * ((i148 % 4) ^ (i150 % 4)))) + (i149 % 8))]

Note the dump parameters. fusion_ir_preseg shows the fusion just before segmentation and lowering, and segmenter_logging shows that segmentation proceeds the same way in each case. However, adding fusion_ir reflects the difference above:
+++ dump-61d-3 2023-08-28 15:02:10.132620300 -0400
@@ -104,20 +104,20 @@
T6_l[ bS53{( ceilDiv(1, 256) )}, iS55{( ceilDiv(i6, 32) )}, iS57{( ceilDiv(i5, 32) )}, iS146{( ceilDiv(32, 16) )}, bthreadIdx.z151{( ceilDiv(( ( ceilDiv(256, 4) ) * 4 ), 64) )}, ithreadIdx.y159{( ceilDiv(( ( ( ceilDiv(( ceilDiv(32, 8) ), 4) ) * 4 ) * 8 ), 32) )}, bS161{( ceilDiv(64, 16) )}, iS163{( ceilDiv(32, 8) )}, bS162{16}, ithreadIdx.x259{( ( ceilDiv(8, 4) ) * 16 )}, iS258{4} ] ca_pos( 6 ) produce_pos( 6 ))
T15_s[ iblockIdx.x89{( ceilDiv(i2, 256) )}, iblockIdx.y91{( ceilDiv(i6, 32) )}, ithreadIdx.z285{( ceilDiv(( ( ceilDiv(256, 4) ) * 4 ), 64) )}, ithreadIdx.y287{( ceilDiv(( ( ( ceilDiv(( ceilDiv(32, 8) ), 4) ) * 4 ) * 8 ), 32) )}, iS289{( ceilDiv(64, 16) )}, iS291{( ceilDiv(32, 8) )}, iS293{( ceilDiv(16, 8) )}, ithreadIdx.x297{( 8 * ( ceilDiv(8, 2) ) )}, iV296{2} ] ca_pos( 2 ) produce_pos( 2 )
= Set( T7_l[ iblockIdx.x47{( ceilDiv(i2, 256) )}, iblockIdx.y49{( ceilDiv(i6, 32) )}, rS51{( ceilDiv(i4, 32) )}, rS144{( ceilDiv(32, 16) )}, ithreadIdx.z136{( ceilDiv(( ( ceilDiv(256, 4) ) * 4 ), 64) )}, ithreadIdx.y138{( ceilDiv(( ( ( ceilDiv(( ceilDiv(32, 8) ), 4) ) * 4 ) * 8 ), 32) )}, iS140{( ceilDiv(64, 16) )}, iS142{( ceilDiv(32, 8) )}, iMMA269{( ceilDiv(16, 8) )}, ithreadIdx.x273{( 8 * ( ceilDiv(8, 2) ) )}, iMMA272{2}, rMMA145{16} ] ca_pos( 2 ) produce_pos( 6 ) )
-T20_l[ iblockIdx.x341{( ceilDiv(i9, 256) )}, iS385{( ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(256, 4) ), 32) ), 1) ), 4) )}, ithreadIdx.z386{4}, ithreadIdx.y384{1}, ithreadIdx.x382{32}, iS380{4} ] ca_pos( 6 )
+T19_l[ iblockIdx.x335{( ceilDiv(i9, 256) )}, iS385{( ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(256, 4) ), 32) ), 1) ), 4) )}, ithreadIdx.z386{4}, ithreadIdx.y384{1}, ithreadIdx.x382{32}, iS380{4} ] ca_pos( 6 )
= Set( T3_g[ iS125{( ceilDiv(i9, 256) )}, iS126{256} ] )
T8_l[ iS97{( ceilDiv(i9, 256) )}, bS99{( ceilDiv(1, 32) )}, iS377{( ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(( 256 * 32 ), 4) ), 32) ), 1) ), 4) )}, iS378{4}, iS376{1}, iS374{32}, iS372{4} ] ca_pos( 7 ) produce_pos( 7 )

Inserting printMath just before and after scheduling shows this difference is coming from the scheduler, which in this case is the matmul scheduler.
See #766 (comment).