-
Notifications
You must be signed in to change notification settings - Fork 79
Description
This is a simple linear fusion with what I believe is a valid config.
// Minimal repro for a lowering failure in matmul+bias fusions: IdModel's
// loop-graph validation reports "Detected loop domains are mapped in the
// loop graph" (two loop domains of the same tensor map to each other; see
// the quoted error below). Per the report: the bias input is required to
// hit it, a 16x16 warp-tile M/N under a 32x32 CTA-tile M/N is required
// (32x16 or 16x32 warp tiles avoid it), and neither the K dimension nor
// circular buffering affects the outcome.
TEST_F(MatmulTest, MatmulBiasSmallTileBug) {
// Ampere (sm_80) guard -- the config below uses the Ampere_16_8_16 MMA macro.
NVFUSER_TEST_CUDA_ARCH_GUARD(8, 0);
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
// Inputs, all fp16: A [M, K], B [N, K] (linear-style weight layout,
// matching the at::linear reference below), bias [N].
auto tv0 = makeContigTensor(2, DataType::Half);
auto tv1 = makeContigTensor(2, DataType::Half);
auto tv2 = makeContigTensor(1, DataType::Half);
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addInput(tv2);
// linear with a bias epilogue; removing the bias argument avoids the error.
auto tv3 = linear(tv0, tv1, tv2);
fusion->addOutput(tv3);
// Hand-written scheduler parameters; per the report, the internal heuristic
// also selects this configuration for some problems.
MatmulParams mparams;
mparams.supported_vec_size = {8, 8, 4};
mparams.mma_macro = MmaMacro::Ampere_16_8_16;
// CTA tile 32x32(x32) over warp tile 16x16(x32): the resulting 32/16 = 2
// split shows up on both threadIdx.z and threadIdx.y in the failing tensor
// (see the error text below), and those two domains get mapped together.
mparams.tile_sizes = {
/*cta_tile=*/{32, 32, 32},
/*warp_tile=*/{16, 16, 32},
/*instruction_tile=*/{16, 8, 16}};
mparams.async_gmem_load_operands = true;
// Circular-buffering settings; the report notes that disabling circular
// buffering altogether does NOT avoid the error.
mparams.circular_buffer_options.circular_buffer_smem_write = true;
mparams.circular_buffer_options.circular_buffer_smem_read = true;
mparams.circular_buffer_options.smem_circular_buffer_stage = 2;
mparams.splitk_factor = 1;
mparams.use_smem_epilogue = true;
mparams.promote_prologue_smem_reuse = false;
// Scheduling itself completes; per the report the INTERNAL ASSERT fires
// later, during lowering (i.e. inside compileFusion below).
SchedulerEntry::makeSchedulerInstance(SchedulerType::Matmul)
->schedule(fusion.get(), &mparams);
// Concrete problem size (a multiple of the CTA tile in every dimension).
int64_t M = 64, N = 64, K = 64;
auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
at::Tensor A = at::randn({M, K}, options);
at::Tensor B = at::randn({N, K}, options);
at::Tensor bias = at::randn({N}, options);
std::vector<c10::IValue> inputs{A, B, bias};
FusionExecutor fe;
// Compilation is where the failure surfaces:
// validateLoopGraphHasNoSelfMappedLeafDomains (csrc/id_model/id_model.cpp:966).
NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
8, 0, fe.compileFusion(fusion.get(), inputs));
// Sanity checks that would run if compilation succeeded: no shared-memory
// bank conflicts, and results match an ATen reference.
ASSERT_TRUE(getBankConflictInfo(fe.kernel()).empty());
auto cg_outputs = fe.runFusion(inputs);
at::Tensor tref = linear(A, B, bias);
testValidate(fusion.get(), cg_outputs, inputs, {tref}, __LINE__, __FILE__);
}

This fails during lowering with the following error:
C++ exception with description " INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/id_model/id_model.cpp":966, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Detected loop domains are mapped in the loop graph. Tensor: T8_l___half[ iS93{( ceilDiv(( (( (( getMetaData(T0) )).logical_size ))[0] ), 32) )}, bS91{( ceilDiv(1, 32) )}, iS89{( ceilDiv(( (( (( getMetaData(T0) )).logical_size ))[1] ), 32) )}, iS103{( ceilDiv(32, 16) )}, ithreadIdx.z95{( ceilDiv(32, 16) )}, bthreadIdx.y97{( ceilDiv(32, 16) )}, iS99{( ceilDiv(16, 16) )}, bS101{( ceilDiv(16, 8) )}, bS102{8}, iS111{( ceilDiv(( ceilDiv(16, 8) ), 2) )}, ithreadIdx.x135{( 8 * 4 )}, iS109{( ceilDiv(( ceilDiv(16, 2) ), 4) )}, iS112{2}, iS108{2} ] ca_pos( 5 ) produce_pos( 5 ). Mapped loop domains: ithreadIdx.z95{( ceilDiv(32, 16) )} and bthreadIdx.y97{( ceilDiv(32, 16) )}
Exception raised from validateLoopGraphHasNoSelfMappedLeafDomains at /opt/pytorch/nvfuser/csrc/id_model/id_model.cpp:966 (most recent call first):
This is encountered in bias fusions only: removing the bias argument in the fusion above avoids the error. Changing the warp tile to 32,16,32 or 16,32,32 also avoids the error. It appears that the warp/cta tile's K dimension does not affect the error.
Disabling circular buffering altogether does not avoid the error.
This config is chosen by our internal heuristic for some problems, leading to this issue.