Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions csrc/scheduler/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,29 @@ void scheduleMatmul(
// [... M,N,K]
scheduler_utils::matmul_utils::makeTile(cc, gemm_tile.cta_tile.toVector());

// Applies swizzle factor on C
if (params.grid_swizzle_factor != 1) {
int factor = std::max(1, params.grid_swizzle_factor); // must be >=1
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't consider this type of tiling to be a swizzle. We consider swizzles to be non affine transformations. Can we just get a different word for this optimization? Tile shuffle or something similar might be a good name.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are already using the word swizzle for both affine and non-affine transformations. See note [WarpMmaSwizzler].

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went with CUTLASS's naming. What about "grid scaling"? Or I'm fine with "tile shuffle". Should I update it, @zasdfgbnm @csarofeen?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it's fine then, we should probably have a non-affine vs affine swizzle rename, but doesn't seem to make a difference right now.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's merge this PR as is so it is consistent with tracking-matmul.

if (params.rasterization_order ==
MatmulParam::TileRasterizationOrder::RowMajor) {
cc->split(1, factor);
// [I1, I2/factor, factor]
cc->reorder({{1, 2}});
// [I1, factor, I2/factor]
cc->merge(0);
// [I1*factor, I2/factor]
} else if (
params.rasterization_order ==
MatmulParam::TileRasterizationOrder::ColumnMajor) {
cc->split(0, factor);
// [I1/factor, factor, I2]
cc->reorder({{1, 2}});
// [I1/factor, I2, factor]
cc->merge(1);
// [I1/factor, I2*factor]
}
}

// [Mo, No, Ko, Mi, Ni, Ki]
// Propagate tiling globally
scheduler_utils::transformPropagateToAllFrom(cc, -1);
Expand Down
15 changes: 15 additions & 0 deletions csrc/scheduler/matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,21 @@ class MatmulParam {
RowMajor = 0,
ColumnMajor = 1
} rasterization_order = TileRasterizationOrder::RowMajor;

//! Swizzle factor is used to increase L2 hit rate.
//! It horizontally squeezes the grid so that gridDim.x is larger and
//! gridDim.y is smaller.
//! We rely on the observation that the CTAs are scheduled by the GPU by
//! iterating on gridDim.x first. As a result, as blocks are launched, they
//! will more likely be forming sub-tiles of the C matrix. This will increase
//! L2 hit rate/data reuse of A and B.
//!
//! Eg for grid_swizzle_factor=2:
//! A1 A2 B1 B2 --> A1 A2 A3 A4 B1 B2 B3 B4
//! A3 A4 B3 B4 C1 C2 C3 C4 D1 D2 D3 D4
//! C1 C2 D1 D2
//! C3 C4 D3 D4
int grid_swizzle_factor = 1;
};

//! Prototype auto scheduling function.
Expand Down
95 changes: 95 additions & 0 deletions test/test_gpu_tensorcore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,101 @@ TEST_F(NVFuserTest, FusionAmpereMatmulPipelineGmem_CUDA) {
}
}

// Matmul test for Ampere MMA: checking CTA Swizzles.
// For each rasterization order, runs the same matmul with grid swizzle
// factors 1 (baseline) and 4 and verifies both numerical correctness and
// that the launch grid dimensions were reshaped by the swizzle as expected.
TEST_F(NVFuserTest, FusionAmpereSwizzle_CUDA) {
  // Keep multiples of 8 to keep vectorizable.
  const int dim = 8192;
  int M = dim, N = dim, K = dim;
  const auto all_orders = {
      MatmulParam::TileRasterizationOrder::RowMajor,
      MatmulParam::TileRasterizationOrder::ColumnMajor};

  REQUIRE_DEVICE_SMEM_SIZE(70 << 10, 0);

  // Builds, schedules, compiles and runs one matmul with the given layout,
  // rasterization order and grid swizzle factor. Checks the result against
  // an ATen reference and checks the launch grid shape.
  // Returns the measured kernel runtime in milliseconds.
  auto test = [&](MatmulLayout layout,
                  MatmulParam::TileRasterizationOrder order,
                  int swizzle) -> float {
    Fusion fusion;
    FusionGuard fg(&fusion);
    auto tv0 = makeContigTensor(2, DataType::Half);
    auto tv1 = makeContigTensor(2, DataType::Half);

    fusion.addInput(tv0);
    fusion.addInput(tv1);

    auto tv2 = matmul(tv0, tv1, layout);

    fusion.addOutput(tv2);

    MatMulTileOptions gemm_tile;
    gemm_tile.cta_tile = GemmTile(128, 128, 32);
    gemm_tile.warp_tile = GemmTile(64, 64, 32);
    gemm_tile.instruction_tile = GemmTile(16, 8, 16);

    auto mma_builder =
        MmaBuilder(MmaOptions::MacroType::Ampere_16_8_16, gemm_tile)
            .layout(layout);

    MatmulParam params(mma_builder);
    params.tile_sizes = gemm_tile;
    params.async_gmem_load_operands = true;
    params.double_buffer_options.double_buffer_smem_write = true;
    params.double_buffer_options.double_buffer_smem_read = true;
    params.double_buffer_options.smem_double_buffer_stage = 3;

    // The two parameters under test.
    params.rasterization_order = order;
    params.grid_swizzle_factor = swizzle;

    scheduleMatmul(tv2, tv0, tv1, params);

    // Fixed seed so the random inputs are reproducible across runs.
    at::manual_seed(0);
    auto inputs = fp16MatmulAtInput(M, N, K, layout);

    FusionExecutor fe;
    fe.setMeasureKernelTimeFlag(true);
    NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
        8,
        0,
        fe.compileFusion(
            &fusion,
            {inputs.first, inputs.second},
            LaunchParams(),
            matmul_cparams));
    auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
    auto tref = atMatmul(
        inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
    TORCH_CHECK(cg_outputs[0].allclose(tref, 0.01, 0.01));

    int gdimx = fe.lastLaunchParams().gdimx();
    int gdimy = fe.lastLaunchParams().gdimy();

    // Without swizzling, each grid dimension is ceilDiv(dim, cta_tile=128).
    // The swizzle multiplies one grid dimension by `swizzle` and ceil-divides
    // the other by it, so the total CTA count is preserved (up to rounding).
    int expected_gdim_unswizzled = (dim + 128 - 1) / 128;
    int expected_gdimx = expected_gdim_unswizzled * swizzle;
    int expected_gdimy = (expected_gdim_unswizzled + swizzle - 1) / swizzle;

    TORCH_CHECK(gdimx == expected_gdimx);
    TORCH_CHECK(gdimy == expected_gdimy);

    return fe.kernelTimeMs();
  };

  // Checking only a single layout to keep runtime short (compilation overhead)
  for (auto layout : {MatmulLayout::TT}) {
    for (auto order : all_orders) {
      // Baseline (factor 1) vs swizzled (factor 4).
      [[maybe_unused]] float runtime1 = test(layout, order, 1);
      [[maybe_unused]] float runtime4 = test(layout, order, 4);

      // GRID Swizzle requires further changes to work in main. So for now we
      // don't assert the perf benefit here.
      // TORCH_CHECK(runtime4 < runtime1);
    }
  }
}

TEST_F(NVFuserTest, FusionAmpereMatmulRegDoubleBuffer_CUDA) {
// Keep multiples of 8 to keep vectorizable.
int M = 504, N = 136, K = 248;
Expand Down