Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 25 additions & 14 deletions benchmark/gelu_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,21 @@ static std::vector<c10::IValue> setupInputs() {

//------------------------------------------------------------------------------

// Benchmark body for building the GeluBackward fusion definition.
// Every iteration of the benchmark_state loop default-constructs a new Fusion
// and hands its address to setupFusion(); the Fusion goes out of scope at the
// end of the iteration, so nothing is reused between iterations.
// NOTE(review): the two signature lines below look like the pre-rename and
// post-rename forms shown side by side by a diff rendering - confirm which one
// the real file contains.
static void GeluBackward_SetupFusion(benchmark::State& benchmark_state) {
static void NvFuserScheduler_GeluBackward_SetupFusion(
benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
// A fresh Fusion per iteration: construction and setupFusion() both run
// inside the benchmark loop.
Fusion fusion;
setupFusion(&fusion);
}
}

BENCHMARK(GeluBackward_SetupFusion)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_GeluBackward_SetupFusion)
->Unit(benchmark::kMicrosecond);

//------------------------------------------------------------------------------

static void GeluBackward_AutoSchedule(benchmark::State& benchmark_state) {
static void NvFuserScheduler_GeluBackward_AutoSchedule(
benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
// Setup (not included in the measurement)
benchmark_state.PauseTiming();
Expand All @@ -117,11 +120,13 @@ static void GeluBackward_AutoSchedule(benchmark::State& benchmark_state) {
}
}

BENCHMARK(GeluBackward_AutoSchedule)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_GeluBackward_AutoSchedule)
->Unit(benchmark::kMicrosecond);

//------------------------------------------------------------------------------

static void GeluBackward_Lower(benchmark::State& benchmark_state) {
static void NvFuserScheduler_GeluBackward_Lower(
benchmark::State& benchmark_state) {
Fusion fusion;

// setup fusion
Expand All @@ -137,11 +142,12 @@ static void GeluBackward_Lower(benchmark::State& benchmark_state) {
}
}

BENCHMARK(GeluBackward_Lower)->Unit(benchmark::kMillisecond);
BENCHMARK(NvFuserScheduler_GeluBackward_Lower)->Unit(benchmark::kMillisecond);

//------------------------------------------------------------------------------

static void GeluBackward_Compile(benchmark::State& benchmark_state) {
static void NvFuserScheduler_GeluBackward_Compile(
benchmark::State& benchmark_state) {
Fusion fusion;

// setup fusion
Expand All @@ -158,11 +164,12 @@ static void GeluBackward_Compile(benchmark::State& benchmark_state) {
}
}

BENCHMARK(GeluBackward_Compile)->Unit(benchmark::kMillisecond);
BENCHMARK(NvFuserScheduler_GeluBackward_Compile)->Unit(benchmark::kMillisecond);

//------------------------------------------------------------------------------

static void GeluBackward_RunFusion(benchmark::State& benchmark_state) {
static void NvFuserScheduler_GeluBackward_RunFusion(
benchmark::State& benchmark_state) {
Fusion fusion;

// setup fusion
Expand All @@ -188,11 +195,13 @@ static void GeluBackward_RunFusion(benchmark::State& benchmark_state) {
}
}

BENCHMARK(GeluBackward_RunFusion)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion)
->Unit(benchmark::kMicrosecond);

//------------------------------------------------------------------------------

static void GeluBackward_RunFusion_GpuOnly(benchmark::State& benchmark_state) {
static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly(
benchmark::State& benchmark_state) {
Fusion fusion;

// setup fusion
Expand All @@ -209,13 +218,14 @@ static void GeluBackward_RunFusion_GpuOnly(benchmark::State& benchmark_state) {
runBenchmarkIterations(benchmark_state, &executor, inputs, lparams);
}

// Registers the GPU-only GeluBackward run benchmark with results reported in
// microseconds. UseManualTime() tells Google Benchmark to report the time the
// benchmark body supplies itself instead of the wall-clock time it measures;
// presumably runBenchmarkIterations() records that time - TODO confirm.
// NOTE(review): the two BENCHMARK(...) lines below look like the old and new
// names from a diff rendering; only one should head this call chain.
BENCHMARK(GeluBackward_RunFusion_GpuOnly)
BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_GpuOnly)
->Unit(benchmark::kMicrosecond)
->UseManualTime();

//------------------------------------------------------------------------------

static void GeluBackward_RunFusion_CpuOnly(benchmark::State& benchmark_state) {
static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly(
benchmark::State& benchmark_state) {
Fusion fusion;

// setup fusion
Expand All @@ -238,4 +248,5 @@ static void GeluBackward_RunFusion_CpuOnly(benchmark::State& benchmark_state) {
}
}

BENCHMARK(GeluBackward_RunFusion_CpuOnly)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_CpuOnly)
->Unit(benchmark::kMicrosecond);
10 changes: 6 additions & 4 deletions benchmark/heuristic_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ static auto getLayerBackwardNormRuntime(
return fec->getMostRecentKernelRuntime();
}

static void LayerNormBackward_HeuristicLookup(
static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
Expand Down Expand Up @@ -146,7 +146,7 @@ static auto getLayerForwardNormRuntime(
return fec->getMostRecentKernelRuntime();
}

static void LayerNormForward_HeuristicLookup(
static void NvFuserScheduler_LayerNormForward_HeuristicCache(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
Expand All @@ -168,5 +168,7 @@ static void LayerNormForward_HeuristicLookup(
}
}

BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_LayerNormBackward_HeuristicCache)
->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_LayerNormForward_HeuristicCache)
->Unit(benchmark::kMicrosecond);
10 changes: 6 additions & 4 deletions benchmark/heuristic_lookup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ static auto getLayerBackwardNormRuntime(
return fec->getMostRecentKernelRuntime();
}

static void LayerNormBackward_HeuristicLookup(
static void NvFuserScheduler_LayerNormBackward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
Expand Down Expand Up @@ -150,7 +150,7 @@ static auto getLayerForwardNormRuntime(
return fec->getMostRecentKernelRuntime();
}

static void LayerNormForward_HeuristicLookup(
static void NvFuserScheduler_LayerNormForward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
Expand All @@ -176,5 +176,7 @@ static void LayerNormForward_HeuristicLookup(
}
}

BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_LayerNormBackward_HeuristicLookup)
->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_LayerNormForward_HeuristicLookup)
->Unit(benchmark::kMicrosecond);
28 changes: 18 additions & 10 deletions benchmark/indexselect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,18 +62,21 @@ static std::vector<c10::IValue> setupInputs() {

//------------------------------------------------------------------------------

// Benchmark body for building the IndexSelect fusion definition.
// Every iteration of the benchmark_state loop default-constructs a new Fusion
// and hands its address to setupFusion(); the Fusion goes out of scope at the
// end of the iteration, so nothing is reused between iterations.
// NOTE(review): the two signature lines below look like the pre-rename and
// post-rename forms shown side by side by a diff rendering - confirm which one
// the real file contains.
static void IndexSelect_SetupFusion(benchmark::State& benchmark_state) {
static void NvFuserScheduler_IndexSelect_SetupFusion(
benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
// A fresh Fusion per iteration: construction and setupFusion() both run
// inside the benchmark loop.
Fusion fusion;
setupFusion(&fusion);
}
}

BENCHMARK(IndexSelect_SetupFusion)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_IndexSelect_SetupFusion)
->Unit(benchmark::kMicrosecond);

//------------------------------------------------------------------------------

static void IndexSelect_AutoSchedule(benchmark::State& benchmark_state) {
static void NvFuserScheduler_IndexSelect_AutoSchedule(
benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
// Setup (not included in the measurement)
benchmark_state.PauseTiming();
Expand All @@ -87,11 +90,13 @@ static void IndexSelect_AutoSchedule(benchmark::State& benchmark_state) {
}
}

BENCHMARK(IndexSelect_AutoSchedule)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_IndexSelect_AutoSchedule)
->Unit(benchmark::kMicrosecond);

//------------------------------------------------------------------------------

static void IndexSelect_Lower(benchmark::State& benchmark_state) {
static void NvFuserScheduler_IndexSelect_Lower(
benchmark::State& benchmark_state) {
Fusion fusion;

// setup fusion
Expand All @@ -107,11 +112,12 @@ static void IndexSelect_Lower(benchmark::State& benchmark_state) {
}
}

BENCHMARK(IndexSelect_Lower)->Unit(benchmark::kMillisecond);
BENCHMARK(NvFuserScheduler_IndexSelect_Lower)->Unit(benchmark::kMillisecond);

//------------------------------------------------------------------------------

static void IndexSelect_Compile(benchmark::State& benchmark_state) {
static void NvFuserScheduler_IndexSelect_Compile(
benchmark::State& benchmark_state) {
Fusion fusion;

// setup fusion
Expand All @@ -129,11 +135,12 @@ static void IndexSelect_Compile(benchmark::State& benchmark_state) {
}
}

BENCHMARK(IndexSelect_Compile)->Unit(benchmark::kMillisecond);
BENCHMARK(NvFuserScheduler_IndexSelect_Compile)->Unit(benchmark::kMillisecond);

//------------------------------------------------------------------------------

static void IndexSelect_RunFusion(benchmark::State& benchmark_state) {
static void NvFuserScheduler_IndexSelect_RunFusion(
benchmark::State& benchmark_state) {
Fusion fusion;

// setup fusion
Expand All @@ -158,7 +165,8 @@ static void IndexSelect_RunFusion(benchmark::State& benchmark_state) {
}
}

BENCHMARK(IndexSelect_RunFusion)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_IndexSelect_RunFusion)
->Unit(benchmark::kMicrosecond);

static void setupIndexSelectSimple(
Fusion* fusion,
Expand Down
46 changes: 29 additions & 17 deletions benchmark/lstm_cell.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,18 +78,20 @@ static std::vector<c10::IValue> setupInputs(

//------------------------------------------------------------------------------

// Benchmark body for building the LstmCell fusion definition.
// Every iteration of the benchmark_state loop default-constructs a new Fusion
// and hands its address to setupFusion(); the Fusion goes out of scope at the
// end of the iteration, so nothing is reused between iterations.
// NOTE(review): the two signature lines below look like the pre-rename and
// post-rename forms shown side by side by a diff rendering - confirm which one
// the real file contains.
static void LstmCell_SetupFusion(benchmark::State& benchmark_state) {
static void NvFuserScheduler_LstmCell_SetupFusion(
benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
// A fresh Fusion per iteration: construction and setupFusion() both run
// inside the benchmark loop.
Fusion fusion;
setupFusion(&fusion);
}
}

BENCHMARK(LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond);

//------------------------------------------------------------------------------

static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) {
static void NvFuserScheduler_LstmCell_AutoSchedule(
benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;

Expand All @@ -106,11 +108,12 @@ static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) {
}
}

BENCHMARK(LstmCell_AutoSchedule)->Unit(benchmark::kMicrosecond);
BENCHMARK(NvFuserScheduler_LstmCell_AutoSchedule)
->Unit(benchmark::kMicrosecond);

//------------------------------------------------------------------------------

static void LstmCell_Lower(benchmark::State& benchmark_state) {
static void NvFuserScheduler_LstmCell_Lower(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;

Expand All @@ -129,11 +132,12 @@ static void LstmCell_Lower(benchmark::State& benchmark_state) {
}
}

BENCHMARK(LstmCell_Lower)->Unit(benchmark::kMillisecond);
BENCHMARK(NvFuserScheduler_LstmCell_Lower)->Unit(benchmark::kMillisecond);

//------------------------------------------------------------------------------

static void LstmCell_Compile(benchmark::State& benchmark_state) {
static void NvFuserScheduler_LstmCell_Compile(
benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;

Expand All @@ -153,11 +157,11 @@ static void LstmCell_Compile(benchmark::State& benchmark_state) {
}
}

BENCHMARK(LstmCell_Compile)->Unit(benchmark::kMillisecond);
BENCHMARK(NvFuserScheduler_LstmCell_Compile)->Unit(benchmark::kMillisecond);

//------------------------------------------------------------------------------

static void LstmCell_RunFusion(
static void NvFuserScheduler_LstmCell_RunFusion(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Expand Down Expand Up @@ -185,15 +189,15 @@ static void LstmCell_RunFusion(
}
}

BENCHMARK_CAPTURE(LstmCell_RunFusion, Small, 512, 64)
BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion, Small, 512, 64)
->Unit(benchmark::kMicrosecond);

BENCHMARK_CAPTURE(LstmCell_RunFusion, Medium, 1024, 128)
BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond);

//------------------------------------------------------------------------------

static void LstmCell_RunFusion_GpuOnly(
static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Expand All @@ -216,17 +220,21 @@ static void LstmCell_RunFusion_GpuOnly(
runBenchmarkIterations(benchmark_state, &executor, inputs, lparams);
}

BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Small, 512, 64)
BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_GpuOnly, Small, 512, 64)
->Unit(benchmark::kMicrosecond)
->UseManualTime();

BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Medium, 1024, 128)
BENCHMARK_CAPTURE(
NvFuserScheduler_LstmCell_RunFusion_GpuOnly,
Medium,
1024,
128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();

//------------------------------------------------------------------------------

static void LstmCell_RunFusion_CpuOnly(
static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Expand All @@ -252,8 +260,12 @@ static void LstmCell_RunFusion_CpuOnly(
}
}

BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Small, 512, 64)
BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_CpuOnly, Small, 512, 64)
->Unit(benchmark::kMicrosecond);

BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Medium, 1024, 128)
BENCHMARK_CAPTURE(
NvFuserScheduler_LstmCell_RunFusion_CpuOnly,
Medium,
1024,
128)
->Unit(benchmark::kMicrosecond);
Loading