Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions csrc/predicate_compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,10 @@ Bool* ParallelizedDomainPredicate::getPredicate(

Val* pred = GpuLower::current()->kernel()->trueVal();

// TODO: this is a temporary hack; it is likely not generally correct.
// Find a way to eliminate it for MMA.
return pred->as<Bool>();

for (auto pt : kParallelTypeThreads) {
auto pred_info_it = pred_map.find(pt);
if (pred_info_it != pred_map.end()) {
Expand Down
23 changes: 23 additions & 0 deletions csrc/scheduler/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,29 @@ void scheduleMatmul(
// [... M,N,K]
scheduler_utils::matmul_utils::makeTile(cc, gemm_tile.cta_tile.toVector());

// Applies the grid swizzle factor on C to improve L2 locality of
// neighboring CTA tiles (see MatmulParam::grid_swizzle_factor).
// Guard with `> 1` rather than `!= 1`: a factor of 1 (or any value <= 1,
// which is invalid) must disable swizzling entirely — the previous
// `std::max(1, ...)` clamp would silently accept factor 0 or negative
// values and still run a degenerate split(·, 1)/reorder/merge chain.
if (params.grid_swizzle_factor > 1) {
  const int factor = params.grid_swizzle_factor;
  if (params.rasterization_order ==
      MatmulParam::TileRasterizationOrder::RowMajor) {
    cc->split(1, factor);
    // [I1, I2/factor, factor]
    cc->reorder({{1, 2}});
    // [I1, factor, I2/factor]
    cc->merge(0);
    // [I1*factor, I2/factor]
  } else if (
      params.rasterization_order ==
      MatmulParam::TileRasterizationOrder::ColumnMajor) {
    cc->split(0, factor);
    // [I1/factor, factor, I2]
    cc->reorder({{1, 2}});
    // [I1/factor, I2, factor]
    cc->merge(1);
    // [I1/factor, I2*factor]
  }
}

// [Mo, No, Ko, Mi, Ni, Ki]
// Propagate tiling globally
scheduler_utils::transformPropagateToAllFrom(cc, -1);
Expand Down
15 changes: 15 additions & 0 deletions csrc/scheduler/matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,21 @@ class MatmulParam {
ColumnMajor = 1
} rasterization_order = TileRasterizationOrder::RowMajor;

//! Swizzle factor is used to increase L2 hit rate.
//! It horizontally squeezes the grid so that gridDim.x is larger and
//! gridDim.y is smaller.
//! We rely on the observation that the CTAs are scheduled by the GPU by
//! iterating on gridDim.x first. As a result, as blocks are launched, they
//! will more likely be forming sub-tiles of the C matrix. This will increase
//! L2 hit rate/data reuse of A and B.
//!
//! Eg for grid_swizzle_factor=2:
//! A1 A2 B1 B2 --> A1 A2 A3 A4 B1 B2 B3 B4
//! A3 A4 B3 B4 C1 C2 C3 C4 D1 D2 D3 D4
//! C1 C2 D1 D2
//! C3 C4 D3 D4
int grid_swizzle_factor = 1;

//! Enables predicate peeling mainloop:
bool peel_main_loop = true;
};
Expand Down
94 changes: 94 additions & 0 deletions test/test_gpu_tensorcore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,100 @@ TEST_F(NVFuserTest, FusionAmpereMatmulPipelineGmem_CUDA) {
}
}

// Matmul test for Ampere MMA: checking CTA Swizzles
TEST_F(NVFuserTest, FusionAmpereSwizzle_CUDA) {
// Keep multiples of 8 to keep vectorizable.
int dim = 8192;
int M = dim, N = dim, K = dim;
const auto all_orders = {
MatmulParam::TileRasterizationOrder::RowMajor,
MatmulParam::TileRasterizationOrder::ColumnMajor};

REQUIRE_DEVICE_SMEM_SIZE(70 << 10, 0);

auto test = [&](MatmulLayout layout,
MatmulParam::TileRasterizationOrder order,
int swizzle,
float& runtime) {
Fusion fusion;
FusionGuard fg(&fusion);
auto tv0 = makeContigTensor(2, DataType::Half);
auto tv1 = makeContigTensor(2, DataType::Half);

fusion.addInput(tv0);
fusion.addInput(tv1);

auto tv2 = matmul(tv0, tv1, layout);

fusion.addOutput(tv2);

MatMulTileOptions gemm_tile;
gemm_tile.cta_tile = GemmTile(128, 128, 32);
gemm_tile.warp_tile = GemmTile(64, 64, 32);
gemm_tile.instruction_tile = GemmTile(16, 8, 16);

auto mma_builder =
MmaBuilder(MmaOptions::MacroType::Ampere_16_8_16, gemm_tile)
.layout(layout);

MatmulParam params(mma_builder);
params.tile_sizes = gemm_tile;
params.async_gmem_load_operands = true;
params.double_buffer_options.double_buffer_smem_write = true;
params.double_buffer_options.double_buffer_smem_read = true;
params.double_buffer_options.smem_double_buffer_stage = 3;

params.rasterization_order = order;
params.grid_swizzle_factor = swizzle;

scheduleMatmul(tv2, tv0, tv1, params);

at::manual_seed(0);
auto inputs = fp16MatmulAtInput(M, N, K, layout);

FusionExecutor fe;
fe.setMeasureKernelTimeFlag(true);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this will add some debug printing when running this test? Could you comment this line out?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No debug prints. The only effect is to create cudaEvents and return the runtime through fe.kernelTimeMs() (otherwise it's just zero).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahhh, I thought it would automatically std::cout << fe.kernelTimeMs(). If no debug prints, then we can keep it.

NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
8,
0,
fe.compileFusion(
&fusion,
{inputs.first, inputs.second},
LaunchParams(),
matmul_cparams));
auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
auto tref = atMatmul(
inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
TORCH_CHECK(cg_outputs[0].allclose(tref, 0.01, 0.01));

int gdimx = fe.lastLaunchParams().gdimx();
int gdimy = fe.lastLaunchParams().gdimy();

int expected_gdim_unswizzled = (dim + 128 - 1) / 128;
int expected_gdimx = expected_gdim_unswizzled * swizzle;
int expected_gdimy = (expected_gdim_unswizzled + swizzle - 1) / swizzle;

TORCH_CHECK(gdimx == expected_gdimx);
TORCH_CHECK(gdimy == expected_gdimy);

runtime = fe.kernelTimeMs();
};

// Gmem pipeline stage

for (auto layout : {MatmulLayout::TT}) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be kAllSupportedMatmulLayout?

Copy link
Collaborator Author

@mmigdal-nv mmigdal-nv Mar 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just removed it to keep a short runtime as the test checks four configs per layout already

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current test takes 15s, would jump to 45s.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, then let's just test one layout

for (auto order : all_orders) {
float runtime1 = 0;
test(layout, order, 1, runtime1);

float runtime4 = 0;
test(layout, order, 4, runtime4);

TORCH_CHECK(runtime4 < runtime1);
}
}
}

TEST_F(NVFuserTest, FusionAmpereMatmulRegDoubleBuffer_CUDA) {
// Keep multiples of 8 to keep vectorizable.
int M = 504, N = 136, K = 248;
Expand Down