Skip to content

[webgpu] Optimize MatMulNBits for f16 Block32 prefill performance#23908

Merged
guschmue merged 12 commits intomicrosoft:mainfrom
daijh:matmul-f16-block32-prefill
Apr 4, 2025
Merged

[webgpu] Optimize MatMulNBits for f16 Block32 prefill performance#23908
guschmue merged 12 commits intomicrosoft:mainfrom
daijh:matmul-f16-block32-prefill

Conversation

@daijh
Copy link
Contributor

@daijh daijh commented Mar 6, 2025

Description

This commit improves the MatMulNBits f16 Block32 prefill performance by increasing the tiling size and enhancing memory efficiency. It achieves a +2x performance boost on Intel iGPUs for the Phi-3.5-mini f16 model.

Motivation and Context

See above.

@daijh
Copy link
Contributor Author

daijh commented Mar 6, 2025

Tests:

model_benchmark.exe -i Phi-3.5-mini-instruct-onnx-web -l 1000
Prompt-1000 Prefill-default (tps) Prefill-opt (tps)
LNL 14.5829 327.627
MTL 61.4833 160.695
ADL 45.1106 101.871

@daijh
Copy link
Contributor Author

daijh commented Mar 6, 2025

@qjia7 @sushraja-msft @jchen10
Please take a look, thanks.

@daijh
Copy link
Contributor Author

daijh commented Mar 6, 2025

Add shader for easy review.

// Required extensions: half-precision floats plus subgroup built-ins
// (sg_id / sg_size are declared in main below).
enable f16;
enable subgroups_f16;
enable subgroups;
// One workgroup is a flat row of 128 threads; each thread produces one
// output column of the tile (tile_n == workgroup_size_x, see below).
const workgroup_size_x: u32 = 128;
const workgroup_size_y: u32 = 1;
const workgroup_size_z: u32 = 1;
// Activation matrix A, packed 4 f16 elements per storage entry.
@group(0) @binding(0) var<storage, read> input_a: array<vec4<f16>>;
// Quantized weight matrix B: each u32 packs 8 4-bit values (low/high
// nibbles), so one vec4<u32> holds a 32-element block.
@group(0) @binding(1) var<storage, read> input_b: array<vec4<u32>>;
// Per-block dequantization scales for B.
@group(0) @binding(2) var<storage, read> scales: array<f16>;
// Output matrix, one f16 per entry.
@group(0) @binding(3) var<storage, read_write> output: array<f16>;
// Logical shapes and row/plane strides of the bound tensors.
struct Uniforms {
  input_a_shape: vec3<u32>,
  input_a_stride: vec2<u32>,
  input_b_shape: vec3<u32>,
  input_b_stride: vec2<u32>,
  output_shape: vec3<u32>,
  output_stride: vec2<u32>,
  block_size: u32
};
@group(0) @binding(4) var<uniform> uniforms: Uniforms;

// ---- Tensor accessor helpers --------------------------------------------
// Each i2o_* helper flattens a (batch, row, col) index triple into a linear
// offset using the strides supplied in `uniforms`.

alias input_a_value_t = vec4<f16>;
alias input_a_indices_t = vec3<u32>;

// Linear offset into `input_a` for a 3-D index.
fn i2o_input_a(indices : input_a_indices_t)->u32 {
  let base = indices.x * uniforms.input_a_stride.x + indices.y * uniforms.input_a_stride.y;
  return base + indices.z;
}

// Fetch one vec4<f16> of A at the given index.
fn get_input_a_by_indices(indices: input_a_indices_t)->input_a_value_t {
  let offset = i2o_input_a(indices);
  return input_a[offset];
}

alias input_b_value_t = vec4<u32>;
alias input_b_indices_t = vec3<u32>;

// Linear offset into `input_b` for a 3-D index.
fn i2o_input_b(indices : input_b_indices_t)->u32 {
  let base = indices.x * uniforms.input_b_stride.x + indices.y * uniforms.input_b_stride.y;
  return base + indices.z;
}

// Fetch one packed vec4<u32> block of B at the given index.
fn get_input_b_by_indices(indices: input_b_indices_t)->input_b_value_t {
  let offset = i2o_input_b(indices);
  return input_b[offset];
}

alias output_value_t = f16;
alias output_indices_t = vec3<u32>;
alias output_element_t = f16;

// Linear offset into `output` for a 3-D index.
fn i2o_output(indices : output_indices_t)->u32 {
  let base = indices.x * uniforms.output_stride.x + indices.y * uniforms.output_stride.y;
  return base + indices.z;
}

// Store one f16 result at the given index.
fn set_output_by_indices(indices: output_indices_t, value: output_value_t) {
  let offset = i2o_output(indices);
  output[offset] = value;
}

// Bounds-checked load from A; out-of-range reads return zero so the tiled
// main loop never indexes past the tensor edge.
fn mm_read_a(batch : u32, row : u32, col : u32) -> input_a_value_t {
  let out_of_range = row >= uniforms.input_a_shape[1] || col >= uniforms.input_a_shape[2];
  if (out_of_range) {
    return input_a_value_t(0);
  }
  return get_input_a_by_indices(input_a_indices_t(batch, row, col));
}

// Bounds-checked load of one packed block of B; zero outside the tensor.
fn mm_read_b(row : u32, col : u32) -> input_b_value_t {
  let out_of_range = row >= uniforms.input_b_shape[0] || col >= uniforms.input_b_shape[1];
  if (out_of_range) {
    return input_b_value_t(0);
  }
  return get_input_b_by_indices(input_b_indices_t(row, col, 0));
}

// Bounds-checked load of the dequantization scale for block (row, col);
// scales are stored row-major with input_b_shape[1] blocks per row.
fn mm_read_scale(row : u32, col : u32) -> output_value_t {
  let out_of_range = row >= uniforms.input_b_shape[0] || col >= uniforms.input_b_shape[1];
  if (out_of_range) {
    return output_value_t(0);
  }
  let offset = row * uniforms.input_b_shape[1] + col;
  return scales[offset];
}

// Bounds-checked store into the output; writes past the edge are dropped.
fn mm_write_y(batch : u32, row : u32, col : u32, value : output_value_t) {
  let in_bounds = row < uniforms.output_shape[1] && col < uniforms.output_shape[2];
  if (in_bounds) {
    set_output_by_indices(output_indices_t(batch, row, col), value);
  }
}

// Output tile computed per workgroup: tile_m rows x tile_n columns.
const tile_m = 16u;
// tile_n equals workgroup_size_x, so each thread owns one output column.
const tile_n = 128u;

// Shared staging buffer for A: tile_m rows of 8 vec4<f16> entries each
// (8 vec4 = 32 f16 elements = one block32), refilled every k-block step.
var<workgroup> a_data_wg: array<array<input_a_value_t, 8u>, tile_m>;

// Entry point: each workgroup computes a tile_m x tile_n tile of the output
// for one batch. A is staged in workgroup memory one block32 at a time;
// each thread dequantizes one column of B and accumulates tile_m dot
// products in f32 for accuracy.
@compute @workgroup_size(workgroup_size_x, workgroup_size_y, workgroup_size_z)
fn main(@builtin(global_invocation_id) global_id : vec3<u32>,
        @builtin(workgroup_id) workgroup_id : vec3<u32>,
        @builtin(local_invocation_index) local_idx : u32,
        @builtin(local_invocation_id) local_id : vec3<u32>,
        @builtin(subgroup_invocation_id) sg_id : u32,
        @builtin(subgroup_size) sg_size : u32,
        @builtin(num_workgroups) num_workgroups : vec3<u32>) {
  let workgroup_idx = workgroup_id.z * num_workgroups[0] * num_workgroups[1] + workgroup_id.y * num_workgroups[0] + workgroup_id.x;
  let global_idx = workgroup_idx * (workgroup_size_x * workgroup_size_y * workgroup_size_z) + local_idx;

  // Tile origin: (batch, row, col) of the top-left output element.
  let batch = workgroup_id.z;
  let row = workgroup_id.y * tile_m;
  let col = workgroup_id.x * tile_n;

  let a_elements_per_col = uniforms.input_a_shape[2];
  // A block32 contains 8 vec4<f16> entries (32 f16 elements) of `a`.
  let a_blocks_per_col = (a_elements_per_col + 7u) / 8u;

  // f32 accumulator: one partial result per tile row for this thread's column.
  var results : array<f32, tile_m>;
  for (var a_block_idx = 0u; a_block_idx < a_blocks_per_col; a_block_idx++) {
    // Cooperatively load one block of `a` per tile row into workgroup
    // memory (tile_m(16) x 8 entries == 128 entries, one per thread).
    let a_row_idx = local_idx / 8u;
    let a_col_idx = local_idx % 8u;
    // Fix: the second subscript was corrupted ("a Effect_col_idx") and did
    // not compile; it must be the a_col_idx computed above.
    a_data_wg[a_row_idx][a_col_idx] = mm_read_a(batch, row + a_row_idx, a_block_idx * 8u + a_col_idx);
    workgroupBarrier();

    // Each thread handles one column of B for this k-block.
    let b_row = col + local_idx;
    let b_col = a_block_idx;

    let b_data = mm_read_b(b_row, b_col);
    let scale = mm_read_scale(b_row, b_col);
    // 4-bit values are stored unsigned with an implicit zero point of 8.
    let zero_point = output_element_t(8.0);

    for (var b_idx = 0u; b_idx < 4u; b_idx++) {
      // Unpack 8 4-bit weights from one u32: low nibbles then high nibbles.
      let b_value = b_data[b_idx];
      let b_value_lower = unpack4xU8(b_value & 0x0F0F0F0Fu);
      let b_value_upper = unpack4xU8((b_value >> 4u) & 0x0F0F0F0Fu);
      // Interleave low/high nibbles to restore the original element order.
      let b_quantized_values = mat2x4<output_element_t>(
          output_element_t(b_value_lower[0]), output_element_t(b_value_upper[0]),
          output_element_t(b_value_lower[1]), output_element_t(b_value_upper[1]),
          output_element_t(b_value_lower[2]), output_element_t(b_value_upper[2]),
          output_element_t(b_value_lower[3]), output_element_t(b_value_upper[3]));
      // Dequantize: (q - zero_point) * scale.
      let b_dequantized_values =
          (b_quantized_values - mat2x4<output_element_t>(zero_point, zero_point,
                                                        zero_point, zero_point,
                                                        zero_point, zero_point,
                                                        zero_point, zero_point)) * scale;

      // Multiply the 8 dequantized weights against every staged row of A.
      for (var m_idx = 0u; m_idx < tile_m; m_idx++) {
        let a_data0 = a_data_wg[m_idx][b_idx * 2u];
        let a_data1 = a_data_wg[m_idx][b_idx * 2u + 1u];

        results[m_idx] += f32(dot(a_data0, b_dequantized_values[0u])) +
                          f32(dot(a_data1, b_dequantized_values[1u]));
      }
    }

    // Ensure all reads of a_data_wg finish before the next block overwrite.
    workgroupBarrier();
  }

  // Write this thread's column of the output tile (bounds-checked).
  for (var m_idx = 0u; m_idx < tile_m; m_idx++) {
    mm_write_y(batch, row + m_idx, col + local_idx, output_value_t(results[m_idx]));
  }

}

@guschmue guschmue added the ep:WebGPU ort-web webgpu provider label Mar 6, 2025
@guschmue
Copy link
Contributor

guschmue commented Mar 6, 2025

/azp run ONNX Runtime Web CI Pipeline,Windows GPU CI Pipeline,Linux Android Emulator QNN CI Pipeline

@azure-pipelines
Copy link

Azure Pipelines successfully started running 2 pipeline(s).

@guschmue
Copy link
Contributor

guschmue commented Mar 6, 2025

/azp run Linux CPU CI Pipeline,Linux CPU Minimal Build E2E CI Pipeline,Linux GPU CI Pipeline,Linux GPU TensorRT CI Pipeline,Linux OpenVINO CI Pipeline,Linux QNN CI Pipeline,MacOS CI Pipeline,Windows ARM64 QNN CI Pipeline,Windows CPU CI Pipeline

@guschmue
Copy link
Contributor

guschmue commented Mar 6, 2025

/azp run Windows GPU TensorRT CI Pipeline,onnxruntime-binary-size-checks-ci-pipeline,orttraining-linux-ci-pipeline,orttraining-linux-gpu-ci-pipeline,orttraining-ortmodule-distributed,Windows x64 QNN CI Pipeline,Big Models

@guschmue
Copy link
Contributor

guschmue commented Mar 6, 2025

/azp run Windows GPU CUDA CI Pipeline,Windows GPU DML CI Pipeline,Windows GPU Doc Gen CI Pipeline, Win_TRT_Minimal_CUDA_Test_CI

@azure-pipelines
Copy link

Azure Pipelines successfully started running 4 pipeline(s).

@azure-pipelines
Copy link

Azure Pipelines successfully started running 9 pipeline(s).

@azure-pipelines
Copy link

Azure Pipelines successfully started running 4 pipeline(s).

@daijh
Copy link
Contributor Author

daijh commented Mar 7, 2025

From what I can tell yours is a generation mode shader, if you are seeing good perf with this tile size -we should just replace the current generation shader with yours. Even better if we can make these shaders have the tile sizes as tunable.

I'm trying to avoid making too many modifications in a single PR, to keep it easier to review and comparable with the previous shader.
If accepted, I'll subsequently integrate its improvements into the default shader's prefill path (as decode performance is not improved).

What are your thoughts?

@daijh
Copy link
Contributor Author

daijh commented Mar 7, 2025

As to why you are seeing great prefill speed, its because our prefill fp16 shader is not based on co-operative matmul (we havent got around to rewriting that shader that way, if you can pick that up that would be amazing as well). The DP4A matmul shader is using techniques of co-operative matmul, and we are using that for many models by passing accuracy_level 4 with model_builder.py.

Yes, we observed quite good performance at accuracy level 4 using the DP4A shader. I'll investigate similar for f16.

@guschmue
Copy link
Contributor

I can capture some perf numbers as well

@sushraja-msft
Copy link
Contributor

From what I can tell yours is a generation mode shader, if you are seeing good perf with this tile size -we should just replace the current generation shader with yours. Even better if we can make these shaders have the tile sizes as tunable.

I'm trying to avoid making too many modifications in a single PR to keep it easier review, and comparable with previous shader. If accepted, I'll subsequently integrate its improvements into the default shader prefill path (as decode performance is not improved).

What are your thoughts?

that's acceptable — perhaps rename MatMulNBitsBlock32Program to MatMulNBitsBlockWideTileProgram and land this PR, and then work towards making this the default prefill program on all platforms. I'll review the shader

@daijh
Copy link
Contributor Author

daijh commented Mar 12, 2025

From what I can tell yours is a generation mode shader, if you are seeing good perf with this tile size -we should just replace the current generation shader with yours. Even better if we can make these shaders have the tile sizes as tunable.

I'm trying to avoid making too many modifications in a single PR to keep it easier review, and comparable with previous shader. If accepted, I'll subsequently integrate its improvements into the default shader prefill path (as decode performance is not improved).
What are your thoughts?

that's acceptable, perhaps name this MatMulNBitsBlock32Program > MatMulNBitsBlockWideTileProgram and land this PR and then work towards making this the default prefill program on all platforms. Ill review the shader

Sure. Thanks.

@daijh
Copy link
Contributor Author

daijh commented Mar 12, 2025

I can capture some perf numbers as well

@guschmue thanks, please let me know if any issues.

@daijh daijh force-pushed the matmul-f16-block32-prefill branch from 8a250db to 74da290 Compare March 12, 2025 07:54
@guschmue
Copy link
Contributor

/azp run ONNX Runtime Web CI Pipeline,Windows GPU CI Pipeline,Linux Android Emulator QNN CI Pipeline

@guschmue
Copy link
Contributor

/azp run Linux CPU CI Pipeline,Linux CPU Minimal Build E2E CI Pipeline,Linux GPU CI Pipeline,Linux GPU TensorRT CI Pipeline, Linux OpenVINO CI Pipeline,Linux QNN CI Pipeline,MacOS CI Pipeline,Windows ARM64 QNN CI Pipeline,Windows CPU CI Pipeline

@guschmue
Copy link
Contributor

/azp run Windows GPU TensorRT CI Pipeline,onnxruntime-binary-size-checks-ci-pipeline,orttraining-linux-ci-pipeline,orttraining-linux-gpu-ci-pipeline,orttraining-ortmodule-distributed,Windows x64 QNN CI Pipeline,Big Models

@azure-pipelines
Copy link

Azure Pipelines successfully started running 2 pipeline(s).

@guschmue
Copy link
Contributor

/azp run Windows GPU CUDA CI Pipeline,Windows GPU DML CI Pipeline,Windows GPU Doc Gen CI Pipeline, Win_TRT_Minimal_CUDA_Test_CI

@azure-pipelines
Copy link

Azure Pipelines successfully started running 4 pipeline(s).

1 similar comment
@azure-pipelines
Copy link

Azure Pipelines successfully started running 4 pipeline(s).

@azure-pipelines
Copy link

Azure Pipelines successfully started running 9 pipeline(s).

@daijh
Copy link
Contributor Author

daijh commented Mar 26, 2025

Resolved existing comment. Please take another look, thanks.

@sushraja-msft @guschmue @qjia7

@daijh
Copy link
Contributor Author

daijh commented Mar 28, 2025

@guschmue could you have a look as well, and apply this PR?

@guschmue
Copy link
Contributor

guschmue commented Apr 1, 2025

CI pipelines changes - can you merge with main?

@daijh daijh force-pushed the matmul-f16-block32-prefill branch from 4d3801f to ca1710a Compare April 1, 2025 04:04
@daijh
Copy link
Contributor Author

daijh commented Apr 1, 2025

CI pipelines changes - can you merge with main?

Rebase to main. Please help to re-trigger the CI, thanks.

@guschmue
Copy link
Contributor

guschmue commented Apr 1, 2025

lint issue, wants you to run
lintrunner -a
If your local lintrunner doesn't complain it is maybe an older version (I ran into this earlier today)

@daijh
Copy link
Contributor Author

daijh commented Apr 1, 2025

lint issue, wants you to run lintrunner -a If your local lintrunner doesn't complain it is maybe an older version (I ran into this earlier today)

Fixed the lint issues.

@daijh
Copy link
Contributor Author

daijh commented Apr 2, 2025

The CI failure logs show that the failure is likely a result of infrastructure instability and does not appear to be related to these changes.

Copy link
Contributor

@guschmue guschmue left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one more change

@guschmue
Copy link
Contributor

guschmue commented Apr 3, 2025

/azp run Windows x64 QNN CI Pipeline,Linux QNN CI Pipeline,Win_TRT_Minimal_CUDA_Test_CI,Windows ARM64 QNN CI Pipeline,Windows GPU Doc Gen CI Pipeline

@azure-pipelines
Copy link

Azure Pipelines successfully started running 5 pipeline(s).

@guschmue guschmue merged commit 3dfc2ae into microsoft:main Apr 4, 2025
69 checks passed
@daijh daijh deleted the matmul-f16-block32-prefill branch April 4, 2025 03:51
zhaoxul-qti pushed a commit to CodeLinaro/onnxruntime that referenced this pull request Apr 17, 2025
…crosoft#23908)

### Description
This commit improves the MatMulNBits f16 Block32 prefill performance by
increasing the tiling size and enhancing memory efficiency. It achieves a +2x
performance boost on Intel iGPUs for the Phi-3.5-mini f16 model.

### Motivation and Context
See above.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

ep:WebGPU ort-web webgpu provider

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants