Skip to content

Grid reduction with serialized blocks#1405

Merged
jacobhinkle merged 36 commits into main from serial_grid_reduce
Dec 8, 2023
Merged

Grid reduction with serialized blocks#1405
jacobhinkle merged 36 commits into main from serial_grid_reduce

Conversation

@jacobhinkle
Copy link
Collaborator

@jacobhinkle jacobhinkle commented Nov 29, 2023

This PR introduces kernel IR nodes for the syncs that need to occur before and after a loop containing a serial grid reduction. That grid reduction can be inlined with other computation in a loop nest, and the sync nodes will be placed around the outer loop in the generated kernel. The kir::GridReduction node itself is modified to have an attribute available via bool kir::GridReduction::isSerial() const indicating whether this is a serial grid reduction. This PR tests that codegen is correct.

Default CUDA kernel for the included test:

// Default (non-serialized) grid-reduction kernel generated for the included
// test. T0 is the 2-D input, T1 the reduction output; T4 and T5 are global
// buffers handed to reduction::gridReduce (T4 as the float work buffer, T5
// presumably as the int64 sync-flag buffer -- confirm against the gridReduce
// signature in the runtime headers).
__global__ void nvfuser_none_f0_c0_r0_g0(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T1, Tensor<float, 1, 1> T4, Tensor<int64_t, 1, 1> T5) {
  // Dynamic shared memory, passed to gridReduce as scratch space below.
  alignas(16) extern __shared__ char array[];
  void* shared_mem = array;
  NVFUSER_DEFINE_MAGIC_ZERO;
  // Hoisted per-thread index arithmetic: base offsets into the flattened
  // tensors, built from threadIdx/blockIdx and constant strides.
  nvfuser_index_t i0;
  i0 = 32LL * ((nvfuser_index_t)threadIdx.y);
  nvfuser_index_t i1;
  i1 = 32768LL * ((nvfuser_index_t)blockIdx.y);
  nvfuser_index_t i2;
  i2 = 262144LL * ((nvfuser_index_t)blockIdx.x);
  nvfuser_index_t i3;
  i3 = ((((2097152LL * ((nvfuser_index_t)blockIdx.z)) + ((nvfuser_index_t)threadIdx.x)) + i0) + i1) + i2;
  nvfuser_index_t i4;
  i4 = ((((nvfuser_index_t)threadIdx.x) + i0) + i1) + i2;
  nvfuser_index_t i5;
  i5 = (((-2097152LL + ((nvfuser_index_t)threadIdx.x)) + i0) + i1) + i2;
  // b6: true only in the last block along the z grid dimension; it gates the
  // final write of the reduced value into T1.
  bool b6;
  b6 = ((nvfuser_index_t)blockIdx.z) == (((nvfuser_index_t)gridDim.z) + -1LL);
  // Allocate global tensor T4
  // Allocate global tensor T5
  // Per-thread local buffer of partial values, zero-initialized before loads.
  float T2[128LL];
  #pragma unroll
  for(nvfuser_index_t i7 = 0; i7 < 32LL; ++i7) {
    #pragma unroll
    for(nvfuser_index_t i8 = 0; i8 < 4LL; ++i8) {
      T2[(i7 + (32LL * i8))] = 0.000000000e+00f;
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO;
  // Predicated load of this thread's input tile from T0 into T2.
  #pragma unroll
  for(nvfuser_index_t i7 = 0; i7 < 32LL; ++i7) {
    nvfuser_index_t i9;
    i9 = 256LL * i7;
    nvfuser_index_t i10;
    i10 = i3 + i9;
    nvfuser_index_t i11;
    i11 = -i9;
    // NOTE(review): "#pragma unroll" had been fused onto the end of the
    // previous statement line in the pasted output; a preprocessor directive
    // must start on its own line (compare the serial kernel below).
    #pragma unroll
    for(nvfuser_index_t i8 = 0; i8 < 4LL; ++i8) {
      nvfuser_index_t i12;
      i12 = 8192LL * (i8 + nvfuser_zero);
      if ((i5 < (i11 - i12))) {
        T2[(i7 + (32LL * i8))]
           = T0[(i10 + i12)];
      }
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO;
  // Grid reduction: each iteration reduces one element across the grid via
  // reduction::gridReduce into T3; only the last z-block (b6), and only
  // in-bounds lanes, write the final value out to T1.
  #pragma unroll
  for(nvfuser_index_t i13 = 0; i13 < 4LL; ++i13) {
    nvfuser_index_t i14;
    i14 = 32LL * i13;
    nvfuser_index_t i15;
    i15 = 8192LL * i13;
    nvfuser_index_t i16;
    i16 = i4 + i15;
    nvfuser_index_t i17;
    i17 = -i15;
    #pragma unroll
    for(nvfuser_index_t i18 = 0; i18 < 32LL; ++i18) {
      nvfuser_index_t i19;
      i19 = 256LL * (i18 + nvfuser_zero);
      float T3[1LL];
      T3[0LL] = 0.000000000e+00f;
      reduction::gridReduce<false, false, true, false, false, false, false, true>(
        T3[0LL],
        T2[(i14 + i18)],
        [](float &a, float b) { a = a + b; },
        &T4[0],
        &T5[0],
        static_cast<float*>(shared_mem),
        true,
        true,
        float(0.000000000e+00f),
        ((i13 * 32LL) + i18),
        128LL);
      if ((b6 && (i5 < (i17 - i19)))) {
        T1[(i16 + i19)]
           = T3[0LL];
      }
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO;
}

The serial reduction kernel looks like this:

// Serialized grid-reduction kernel for the same fusion. Compared with the
// default kernel above, the call to reduction::gridReduce is replaced by
// reduction::serialReductionStep accumulating into the global work buffer T6,
// and the whole reduction loop nest is bracketed by
// grid_sync::blockSerializeWait / blockSerializeRelease on the sync buffer T5
// so blocks take turns executing it (presumably serialized along the z
// dimension, matching the <false, false, true, ...> template flags --
// confirm against grid_sync in the runtime headers).
__global__ void nvfuser_none_f0_c0_r0_g0(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T1, Tensor<float, 1, 1> T6, Tensor<int64_t, 1, 1> T5) {
  // Dynamic shared memory (unused by serialReductionStep, kept by codegen).
  alignas(16) extern __shared__ char array[];
  void* shared_mem = array;
  NVFUSER_DEFINE_MAGIC_ZERO;
  // Hoisted per-thread index arithmetic, identical to the default kernel.
  nvfuser_index_t i0;
  i0 = 32LL * ((nvfuser_index_t)threadIdx.y);
  nvfuser_index_t i1;
  i1 = 32768LL * ((nvfuser_index_t)blockIdx.y);
  nvfuser_index_t i2;
  i2 = 262144LL * ((nvfuser_index_t)blockIdx.x);
  nvfuser_index_t i3;
  i3 = ((((2097152LL * ((nvfuser_index_t)blockIdx.z)) + ((nvfuser_index_t)threadIdx.x)) + i0) + i1) + i2;
  nvfuser_index_t i4;
  i4 = ((((nvfuser_index_t)threadIdx.x) + i0) + i1) + i2;
  nvfuser_index_t i5;
  i5 = (((-2097152LL + ((nvfuser_index_t)threadIdx.x)) + i0) + i1) + i2;
  // b6: true only in the last block along the z grid dimension; gates the
  // final write of the reduced value into T1.
  bool b6;
  b6 = ((nvfuser_index_t)blockIdx.z) == (((nvfuser_index_t)gridDim.z) + -1LL);
  // Allocate global tensor T6
  // Allocate global tensor T5
  // Per-thread local buffer of partial values, zero-initialized before loads.
  float T2[128LL];
  #pragma unroll
  for(nvfuser_index_t i7 = 0; i7 < 32LL; ++i7) {
    #pragma unroll
    for(nvfuser_index_t i8 = 0; i8 < 4LL; ++i8) {
      T2[(i7 + (32LL * i8))] = 0.000000000e+00f;
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO;
  // Predicated load of this thread's input tile from T0 into T2.
  #pragma unroll
  for(nvfuser_index_t i7 = 0; i7 < 32LL; ++i7) {
    nvfuser_index_t i9;
    i9 = 256LL * i7;
    nvfuser_index_t i10;
    i10 = i3 + i9;
    nvfuser_index_t i11;
    i11 = -i9;
    #pragma unroll
    for(nvfuser_index_t i8 = 0; i8 < 4LL; ++i8) {
      nvfuser_index_t i12;
      i12 = 8192LL * (i8 + nvfuser_zero);
      if ((i5 < (i11 - i12))) {
        T2[(i7 + (32LL * i8))]
           = T0[(i10 + i12)];
      }
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO;
  // Entry sync: wait on this (x, y) position's flag in T5 before entering the
  // serialized region. This is the kir node placed before the outer loop.
  grid_sync::blockSerializeWait<false, false, true, false>(&T5[index_utils::maskedOffset<true, true, false>(blockIdx, gridDim)]);
  #pragma unroll
  for(nvfuser_index_t i13 = 0; i13 < 4LL; ++i13) {
    nvfuser_index_t i14;
    i14 = 32LL * i13;
    nvfuser_index_t i15;
    i15 = 8192LL * i13;
    nvfuser_index_t i16;
    i16 = i4 + i15;
    nvfuser_index_t i17;
    i17 = -i15;
    #pragma unroll
    for(nvfuser_index_t i18 = 0; i18 < 32LL; ++i18) {
      nvfuser_index_t i19;
      i19 = 256LL * (i18 + nvfuser_zero);
      float T3[1LL];
      T3[0LL] = 0.000000000e+00f;
      // One serial reduction step: fold this block's partial T2 value into
      // the global work buffer T6, leaving the running total in T3. The two
      // maskedOffset/maskedSize comparisons flag the first and last block
      // along the serialized (z) dimension.
      reduction::serialReductionStep(
        T3[0LL],
        T2[(i14 + i18)],
        0.000000000e+00f,
        T6[(i16 + i19)],
        [](float &a, float b) { a = a + b; },
        index_utils::maskedOffset<false, false, true>(blockIdx, gridDim) == 0,
        index_utils::maskedOffset<false, false, true>(blockIdx, gridDim) == index_utils::maskedSize<false, false, true>(gridDim) - 1,
        true,
        true);
      // Only the last z-block (b6), for in-bounds lanes, writes the final
      // reduced value out to T1.
      if ((b6 && (i5 < (i17 - i19)))) {
        T1[(i16 + i19)]
           = T3[0LL];
      }
    }
  }
  // Exit sync: release the flag so the next block in the serial order may
  // enter the region. This is the kir node placed after the outer loop.
  grid_sync::blockSerializeRelease<false, false, true, false>(&T5[index_utils::maskedOffset<true, true, false>(blockIdx, gridDim)]);
  NVFUSER_UPDATE_MAGIC_ZERO;
}

What is not included in this PR

There is no automatic scheduling or lowering to serial reductions in this PR. The included test works via a post-lowering hook in FusionExecutor to simply test that we can codegen the nodes properly once they are manually placed.

There is also no re-use of global buffers currently, so this is not yet an "in-place" reduction. I.e. we must manually allocate a work buffer that is the full size of the grid reduction output at this time. In the future, we can avoid the need for that workspace by aliasing an output buffer.

The work buffer must currently be the same dtype as the reduction element. In the future, we could relax this in order to cast to lower precision in the work buffer. This would enable us to re-use the global memory allocated for TST and HSH matmul output, at the expense of a small loss in precision.

Related to #1316 and #991.

@jacobhinkle jacobhinkle changed the title Serial grid reduce Serial grid reduction Nov 29, 2023
@jacobhinkle jacobhinkle changed the title Serial grid reduction Grid reduction with serialized blocks Nov 29, 2023
@Priya2698 Priya2698 mentioned this pull request Dec 4, 2023
@jacobhinkle jacobhinkle marked this pull request as ready for review December 6, 2023 15:14
I think this will have negligible perf impact by adding a sync in the
last block after the last loop. The upside is that we can keep the sync
buffer clean (i.e. all zeros), which at some point might help us to
re-use sync buffers, removing one memset kernel launch per execution.
@jacobhinkle
Copy link
Collaborator Author

!build

Copy link
Collaborator

@naoyam naoyam left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@jacobhinkle jacobhinkle merged commit 34c7fa4 into main Dec 8, 2023
@jacobhinkle jacobhinkle deleted the serial_grid_reduce branch December 8, 2023 17:40
jacobhinkle added a commit that referenced this pull request Jan 23, 2024
This change enables `ReductionOp`s to be lowered as serial reductions
(see #1405) if requested during scheduling.

1. At scheduling, a `ReductionOp` is modified by calling its
`requestSerialGridReduction()` method. The output tensor can be
scheduled before or after this method call, and should result in the op
having all its reduction axes parallelized as grid dimensions.
2. Early in lowering, we find `ReductionOp`s having
`serialGridReductionRequested() == true`, and we place syncs around
their outer loop. At this point, we also analyze that outer loop to
determine if there are any conflicting expressions, such as conflicting
grid reductions.
3. Later in lowering, during the indexing pass, we convert the
`ReductionOp` to a `GridReduction` that has its serial buffer set. The
serial buffer is a temporary `TensorIndex` indexed like a global memory
version of the reduction output tensor.

The generated kernel looks like this:
```c++
  // Allocate global tensor T4
  grid_sync::blockSerializeWait<false, false, true>(&T4[index_utils::maskedOffset<true, true, false>(blockIdx, gridDim)]);
  #pragma unroll
  for(nvfuser_index_t i13 = 0; i13 < 4LL; ++i13) {
    nvfuser_index_t i14;
    i14 = 8LL * i13;
    nvfuser_index_t i15;
    i15 = 2048LL * i13;
    nvfuser_index_t i16;
    i16 = i4 + i15;
    nvfuser_index_t i17;
    i17 = -i15;
    #pragma unroll
    for(nvfuser_index_t i18 = 0; i18 < 8LL; ++i18) {
      nvfuser_index_t i19;
      i19 = 256LL * (i18 + nvfuser_zero);
      nvfuser_index_t i20;
      i20 = i16 + i19;
      float T3[1LL];
      T3[0LL] = 0.000000000e+00f;
      // Allocate global tensor T5
      reduction::serialReductionStep(
        T3[0LL],
        T2[(i14 + i18)],
        0.000000000e+00f,
        T5[i20],
        [](float &a, float b) { a = a + b; },
        index_utils::maskedOffset<false, false, true>(blockIdx, gridDim) == 0,
        index_utils::maskedOffset<false, false, true>(blockIdx, gridDim) == index_utils::maskedSize<false, false, true>(gridDim) - 1,
        true,
        true);
      if ((b6 && (i5 < (i17 - i19)))) {
        T1[i20]
           = T3[0LL];
      }
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO;
  grid_sync::blockSerializeRelease<false, false, true>(&T4[index_utils::maskedOffset<true, true, false>(blockIdx, gridDim)]);
```
Notice that the index `i20` now matches between the output `T1` and the
intermediate `T5`. In another PR, I will attempt to extend our buffer
reuse machinery to recognize this as a chance to use `T1` in place of
`T5` (i.e. inner aliasing, in-place reduction).

Also notice that I have not yet hoisted the sync flags index, or the
`first_block` and `last_block` predicates.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants