13 changes: 3 additions & 10 deletions onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
@@ -19,14 +19,7 @@ namespace contrib {
namespace webgpu {

namespace {

constexpr unsigned int kMinMForTileOptimization = 4;

- template <typename T>
- inline T ceil_div(T numerator, T denominator) {
- return (numerator + denominator - 1) / denominator;
- }

} // namespace

ONNX_OPERATOR_KERNEL_EX(
@@ -246,8 +239,8 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
constexpr uint32_t workgroup_size = 128;
constexpr uint32_t tile_m = workgroup_size / 8;
constexpr uint32_t tile_n = workgroup_size;
- const uint32_t num_N_tile = ceil_div(N, tile_n);
- const uint32_t num_M_tile = ceil_div(M, tile_m);
+ const uint32_t num_N_tile = CeilDiv(N, tile_n);
+ const uint32_t num_M_tile = CeilDiv(M, tile_m);

MatMulNBitsWideTileProgram program{has_zero_points, has_bias, has_weight_idx, tile_m, tile_n, static_cast<uint32_t>(nbits)};
program.SetWorkgroupSize(workgroup_size);
@@ -268,7 +261,7 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
if (has_zero_points) {
program.AddInput({zero_points,
ProgramTensorMetadataDependency::TypeAndRank,
- {ceil_div(zero_points->Shape().Size(), static_cast<int64_t>(4))},
+ {CeilDiv(zero_points->Shape().Size(), static_cast<int64_t>(4))},
4});
}
if (has_bias) {
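Aside, not part of the PR: the tile math above is easy to sanity-check. With workgroup_size = 128 the kernel tiles M in chunks of tile_m = 16 and N in chunks of tile_n = 128, and CeilDiv rounds both tile counts up so a partial tile is still dispatched. A minimal, self-contained sketch with a hypothetical GEMM shape:

#include <cstdint>
#include <cstdio>

// Same helper the PR introduces in webgpu_utils.h.
template <typename T>
inline T CeilDiv(T numerator, T denominator) {
  return (numerator + denominator - 1) / denominator;
}

int main() {
  constexpr uint32_t workgroup_size = 128;
  constexpr uint32_t tile_m = workgroup_size / 8;  // 16
  constexpr uint32_t tile_n = workgroup_size;      // 128
  const uint32_t M = 100;   // hypothetical shape, not from the PR
  const uint32_t N = 4096;  // hypothetical shape, not from the PR
  std::printf("num_M_tile=%u num_N_tile=%u\n",
              CeilDiv(M, tile_m),   // (100 + 15) / 16 = 7, one partial tile
              CeilDiv(N, tile_n));  // (4096 + 127) / 128 = 32, exact
  return 0;
}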
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/webgpu/math/matmul.cc
@@ -8,6 +8,7 @@
#include "core/providers/webgpu/webgpu_supported_types.h"
#include "core/providers/webgpu/nn/fuse_utils.h"
#include "core/providers/webgpu/data_transfer.h"
#include "core/providers/webgpu/webgpu_utils.h"

namespace onnxruntime {
namespace webgpu {
@@ -147,7 +148,7 @@ Status MatMul::ComputeInternal(ComputeContext& context) const {
}
program
.AddOutputs({{output_tensor, ProgramTensorMetadataDependency::None, output_shape_shader, components}})
- .SetDispatchGroupSize((output_size + 63) / 64) // Integer ceiling division
+ .SetDispatchGroupSize(CeilDiv(output_size, 64u))
.AddIndices(outer_dims)
.AddUniformVariables({{output_size}, {m}, {n}, {k}});

22 changes: 7 additions & 15 deletions onnxruntime/core/providers/webgpu/nn/im2col_matmul.cc
@@ -10,15 +10,7 @@

namespace onnxruntime {
namespace webgpu {

namespace {

- // TODO: move to common header.
- template <typename T>
- inline T ceil_div(T numerator, T denominator) {
- return (numerator + denominator - 1) / denominator;
- }

// Chooses the optimal tile size (M, N) for the im2col operation.
// This tile size is performance-tuned and varies depending on the target device.
std::pair<uint32_t, uint32_t> ChooseTileSize(uint32_t im2col_m, uint32_t im2col_n) {
@@ -32,8 +24,8 @@ std::pair<uint32_t, uint32_t> ChooseTileSize(uint32_t im2col_m, uint32_t im2col_
const uint32_t tile_m = tile_pair.first;
const uint32_t tile_n = tile_pair.second;

- const uint32_t dispatch_m = ceil_div(im2col_m, tile_m);
- const uint32_t dispatch_n = ceil_div(im2col_n, tile_n);
+ const uint32_t dispatch_m = CeilDiv(im2col_m, tile_m);
+ const uint32_t dispatch_n = CeilDiv(im2col_n, tile_n);
const uint32_t dispatch = dispatch_m * dispatch_n;

if (dispatch >= 128) {
@@ -115,7 +107,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
OIHW2OHWIProgram transpose_program{};
transpose_program.SetWorkgroupSize(64);

- const uint32_t Ci_tiles = ceil_div(channel_input, 64u);
+ const uint32_t Ci_tiles = CeilDiv(channel_input, 64u);
transpose_program.SetDispatchGroupSize(channel_output, Ci_tiles);

transpose_program.AddInput({weight,
@@ -127,7 +119,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
{kernel_height},
{kernel_width},
{Ci_tiles},
- {ceil_div(kernel_height * kernel_height, 4u)}});
+ {CeilDiv(kernel_height * kernel_height, 4u)}});
ORT_RETURN_IF_ERROR(context.RunProgram(transpose_program));

// im2col-matmul
@@ -156,8 +148,8 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
Im2ColMatMulProgram im2col_mm_program{has_bias, tile_m, tile_n, use_subgroup};
im2col_mm_program.SetWorkgroupSize(workgroup_size);

- const uint32_t M_tiles = ceil_div(im2col_m, tile_m);
- const uint32_t N_tiles = ceil_div(im2col_n, tile_n);
+ const uint32_t M_tiles = CeilDiv(im2col_m, tile_m);
+ const uint32_t N_tiles = CeilDiv(im2col_n, tile_n);
im2col_mm_program.SetDispatchGroupSize(M_tiles, N_tiles, batch);

im2col_mm_program.AddInput({src,
@@ -185,7 +177,7 @@ Status ApplyIm2ColMatMulProgram(ComputeContext& context,
{im2col_n},
{M_tiles},
{N_tiles},
- {ceil_div(ceil_div(im2col_k, 4u), 4u)},
+ {CeilDiv(CeilDiv(im2col_k, 4u), 4u)},
{dilations},
{pads},
{strides}});
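Aside: the tuned candidate list inside ChooseTileSize is collapsed in this view, but the visible tail shows its shape: compute the dispatch count for a candidate (tile_m, tile_n) pair and accept the pair once it yields at least 128 workgroups, preferring large tiles while keeping the GPU occupied. A hedged sketch of that structure, with invented candidate sizes:

#include <cstdint>
#include <utility>
#include <vector>

template <typename T>
inline T CeilDiv(T numerator, T denominator) {
  return (numerator + denominator - 1) / denominator;
}

// Sketch only: these candidate tile sizes are invented for illustration;
// the PR's real, device-tuned candidates are hidden in the collapsed context.
std::pair<uint32_t, uint32_t> ChooseTileSizeSketch(uint32_t im2col_m, uint32_t im2col_n) {
  const std::vector<std::pair<uint32_t, uint32_t>> candidates = {
      {32u, 64u}, {16u, 32u}, {8u, 16u}};  // largest first
  for (const auto& tile_pair : candidates) {
    const uint32_t dispatch =
        CeilDiv(im2col_m, tile_pair.first) * CeilDiv(im2col_n, tile_pair.second);
    if (dispatch >= 128) {  // enough workgroups to saturate the device
      return tile_pair;
    }
  }
  return candidates.back();  // tiny problem: fall back to the smallest tile
}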
24 changes: 7 additions & 17 deletions onnxruntime/core/providers/webgpu/tensor/transpose.cc
@@ -2,23 +2,15 @@
// Licensed under the MIT License.

#include "core/common/inlined_containers.h"
#include "core/providers/webgpu/tensor/transpose.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/webgpu/tensor/transpose.h"
#include "core/providers/webgpu/shader_variable.h"
#include "core/providers/webgpu/shader_helper.h"
#include "core/providers/webgpu/webgpu_supported_types.h"

- namespace {
-
- inline uint32_t ceil_div(int64_t numerator, int32_t denominator) {
- return static_cast<uint32_t>((numerator + denominator - 1) / denominator);
- }
-
- } // namespace
+ #include "core/providers/webgpu/webgpu_utils.h"

namespace onnxruntime {
namespace webgpu {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Transpose,
kOnnxDomain,
@@ -139,25 +131,23 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context,
new_output_shape = TensorShape({new_input_shape[1], new_input_shape[0]});
}

- uint32_t output_size = onnxruntime::narrow<int32_t>(input_shape.Size());
+ uint32_t output_size = onnxruntime::narrow<uint32_t>(input_shape.Size());
TransposeProgram program{permutations, use_shared};

program
.CacheHint(absl::StrJoin(permutations, "-"))
.AddInputs({{&input, ProgramTensorMetadataDependency::TypeAndRank, new_input_shape, 1}})
.AddOutputs({{&output, ProgramTensorMetadataDependency::None, new_output_shape, 1}})
- .AddUniformVariables({
- {static_cast<uint32_t>(output_size)},
- });
+ .AddUniformVariables({{output_size}});

if (use_shared) {
program.SetWorkgroupSize(TILE_SIZE, TILE_SIZE, 1);
program.SetDispatchGroupSize(static_cast<uint32_t>((new_output_shape[1] + TILE_SIZE - 1) / TILE_SIZE),
static_cast<uint32_t>(((new_output_shape[0] + TILE_SIZE - 1) / TILE_SIZE)));
} else {
- program.SetWorkgroupSize(WORKGROUP_SIZE);
+ program.SetWorkgroupSize(64u);
Contributor:

nit: why not use WORKGROUP_SIZE?
Also, in line 134, please correct it to: uint32_t output_size = onnxruntime::narrow<uint32_t>(input_shape.Size());

Contributor (author):

The reasons:

  1. Specifying workgroup_size = 64 explicitly improves readability.
  2. WORKGROUP_SIZE is defined as SafeInt<uint32_t>, and it is not easy to cast it back to uint32_t, which CeilDiv requires.

Contributor:

It does not seem necessary for WORKGROUP_SIZE to be a SafeInt<uint32_t>; it is just a default value. If you are using 64 as the workgroup size, you can simply skip the call to program.SetWorkgroupSize(64u), unless you are using a different value. Currently a lot of ops use the default WORKGROUP_SIZE; if you search, there are many occurrences of SetDispatchGroupSize((XXX + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE). @fs-eire Do you think it is better to change the type of WORKGROUP_SIZE from SafeInt<uint32_t> to uint32_t, or should we explicitly specify the workgroup size everywhere?

Contributor (author):

I personally prefer to specify the workgroup size explicitly. It improves readability and saves time for developers reading the source for the first time, since it eliminates the need to navigate to another file to find the default WORKGROUP_SIZE.

I have no objection if someone wishes to retain the implicit style.
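To make point 2 concrete, here is a minimal sketch, with a hypothetical SafeIntLike standing in for the real SafeInt: template argument deduction does not consider implicit conversions, so passing a SafeInt-typed WORKGROUP_SIZE as the second argument of CeilDiv deduces conflicting types for T and fails to compile unless the caller casts explicitly.

#include <cstdint>

template <typename T>
struct SafeIntLike {  // hypothetical stand-in for SafeInt<T>
  T value;
  operator T() const { return value; }
};

template <typename T>
inline T CeilDiv(T numerator, T denominator) {
  return (numerator + denominator - 1) / denominator;
}

int main() {
  uint32_t output_size = 1000u;
  SafeIntLike<uint32_t> workgroup_size{64u};
  // CeilDiv(output_size, workgroup_size);  // error: T deduced as both
  //                                        // uint32_t and SafeIntLike<uint32_t>
  uint32_t dispatch_x = CeilDiv(output_size, static_cast<uint32_t>(workgroup_size));
  return dispatch_x == 16u ? 0 : 1;  // (1000 + 63) / 64 = 16
}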


- uint32_t dispatch_x = ceil_div(output_size, WORKGROUP_SIZE);
+ uint32_t dispatch_x = CeilDiv(output_size, 64u);
uint32_t dispatch_y = 1;
uint32_t dispatch_z = 1;

@@ -171,7 +161,7 @@ Status Transpose::DoTranspose(onnxruntime::webgpu::ComputeContextBase& context,
uint32_t dispatch_size = dispatch_x;
dispatch_x = 4;
dispatch_y = 8;
- dispatch_z = ceil_div(dispatch_size, dispatch_x * dispatch_y);
+ dispatch_z = CeilDiv(dispatch_size, dispatch_x * dispatch_y);
}
program.SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z);
}
5 changes: 5 additions & 0 deletions onnxruntime/core/providers/webgpu/webgpu_utils.h
@@ -15,6 +15,11 @@ namespace webgpu {

Contributor:

remove empty line here.

Contributor (author):

Thanks for identifying it.

class ShaderVariableHelper;

+ template <typename T>
+ inline T CeilDiv(T numerator, T denominator) {
+ return (numerator + denominator - 1) / denominator;
+ }

/**
* Returns the maximum number of components `N` to be used as `vecN` for the given size.
*/
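A closing usage note on the new helper (an aside, grounded only in the definition above): for positive integers CeilDiv(n, d) equals the mathematical ceiling of n / d, but the intermediate n + d - 1 wraps when n sits near the top of the type's range, so callers should keep operands comfortably below the maximum. A small self-check:

#include <cassert>
#include <cstdint>

template <typename T>
inline T CeilDiv(T numerator, T denominator) {
  return (numerator + denominator - 1) / denominator;
}

int main() {
  assert(CeilDiv(7u, 4u) == 2u);  // ceil(7/4) = 2
  assert(CeilDiv(8u, 4u) == 2u);  // exact division is unchanged
  assert(CeilDiv(9u, 4u) == 3u);  // one extra element forces one extra tile
  // Caveat: the addition wraps modulo 2^32 (typical 32-bit unsigned),
  // giving 0 instead of the true ceiling.
  assert(CeilDiv(UINT32_MAX, 4u) == 0u);
  return 0;
}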