From efce3ac4a7d7f9157b08ec832239423885b7bb9f Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Fri, 22 Sep 2023 10:51:38 -0700 Subject: [PATCH] rename all nvfuser benchmarks to start with NvFuserScheduler_ --- benchmark/gelu_backward.cpp | 39 ++++++++++++------- benchmark/heuristic_cache.cpp | 10 +++-- benchmark/heuristic_lookup.cpp | 10 +++-- benchmark/indexselect.cpp | 28 +++++++++----- benchmark/lstm_cell.cpp | 46 ++++++++++++++--------- benchmark/many_pointwise_ops.cpp | 12 ++++-- benchmark/matmul.cpp | 64 ++++++++++++++++---------------- benchmark/shape_inference.cpp | 19 ++++++---- benchmark/softmax.cpp | 10 +++-- 9 files changed, 141 insertions(+), 97 deletions(-) diff --git a/benchmark/gelu_backward.cpp b/benchmark/gelu_backward.cpp index ea779836183..a7747c8ceba 100644 --- a/benchmark/gelu_backward.cpp +++ b/benchmark/gelu_backward.cpp @@ -92,18 +92,21 @@ static std::vector setupInputs() { //------------------------------------------------------------------------------ -static void GeluBackward_SetupFusion(benchmark::State& benchmark_state) { +static void NvFuserScheduler_GeluBackward_SetupFusion( + benchmark::State& benchmark_state) { for (auto _ : benchmark_state) { Fusion fusion; setupFusion(&fusion); } } -BENCHMARK(GeluBackward_SetupFusion)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_GeluBackward_SetupFusion) + ->Unit(benchmark::kMicrosecond); //------------------------------------------------------------------------------ -static void GeluBackward_AutoSchedule(benchmark::State& benchmark_state) { +static void NvFuserScheduler_GeluBackward_AutoSchedule( + benchmark::State& benchmark_state) { for (auto _ : benchmark_state) { // Setup (not included in the measurement) benchmark_state.PauseTiming(); @@ -117,11 +120,13 @@ static void GeluBackward_AutoSchedule(benchmark::State& benchmark_state) { } } -BENCHMARK(GeluBackward_AutoSchedule)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_GeluBackward_AutoSchedule) +
->Unit(benchmark::kMicrosecond); //------------------------------------------------------------------------------ -static void GeluBackward_Lower(benchmark::State& benchmark_state) { +static void NvFuserScheduler_GeluBackward_Lower( + benchmark::State& benchmark_state) { Fusion fusion; // setup fusion @@ -137,11 +142,12 @@ static void GeluBackward_Lower(benchmark::State& benchmark_state) { } } -BENCHMARK(GeluBackward_Lower)->Unit(benchmark::kMillisecond); +BENCHMARK(NvFuserScheduler_GeluBackward_Lower)->Unit(benchmark::kMillisecond); //------------------------------------------------------------------------------ -static void GeluBackward_Compile(benchmark::State& benchmark_state) { +static void NvFuserScheduler_GeluBackward_Compile( + benchmark::State& benchmark_state) { Fusion fusion; // setup fusion @@ -158,11 +164,12 @@ static void GeluBackward_Compile(benchmark::State& benchmark_state) { } } -BENCHMARK(GeluBackward_Compile)->Unit(benchmark::kMillisecond); +BENCHMARK(NvFuserScheduler_GeluBackward_Compile)->Unit(benchmark::kMillisecond); //------------------------------------------------------------------------------ -static void GeluBackward_RunFusion(benchmark::State& benchmark_state) { +static void NvFuserScheduler_GeluBackward_RunFusion( + benchmark::State& benchmark_state) { Fusion fusion; // setup fusion @@ -188,11 +195,13 @@ static void GeluBackward_RunFusion(benchmark::State& benchmark_state) { } } -BENCHMARK(GeluBackward_RunFusion)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion) + ->Unit(benchmark::kMicrosecond); //------------------------------------------------------------------------------ -static void GeluBackward_RunFusion_GpuOnly(benchmark::State& benchmark_state) { +static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly( + benchmark::State& benchmark_state) { Fusion fusion; // setup fusion @@ -209,13 +218,14 @@ static void GeluBackward_RunFusion_GpuOnly(benchmark::State& benchmark_state) { 
runBenchmarkIterations(benchmark_state, &executor, inputs, lparams); } -BENCHMARK(GeluBackward_RunFusion_GpuOnly) +BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_GpuOnly) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); //------------------------------------------------------------------------------ -static void GeluBackward_RunFusion_CpuOnly(benchmark::State& benchmark_state) { +static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly( + benchmark::State& benchmark_state) { Fusion fusion; // setup fusion @@ -238,4 +248,5 @@ static void GeluBackward_RunFusion_CpuOnly(benchmark::State& benchmark_state) { } } -BENCHMARK(GeluBackward_RunFusion_CpuOnly)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_CpuOnly) + ->Unit(benchmark::kMicrosecond); diff --git a/benchmark/heuristic_cache.cpp b/benchmark/heuristic_cache.cpp index a5e3a6667c0..9f25e8a3d33 100644 --- a/benchmark/heuristic_cache.cpp +++ b/benchmark/heuristic_cache.cpp @@ -94,7 +94,7 @@ static auto getLayerBackwardNormRuntime( return fec->getMostRecentKernelRuntime(); } -static void LayerNormBackward_HeuristicLookup( +static void NvFuserScheduler_LayerNormBackward_HeuristicCache( benchmark::State& benchmark_state) { std::unique_ptr fusion_ptr = std::make_unique(); FusionGuard fg(fusion_ptr.get()); @@ -146,7 +146,7 @@ static auto getLayerForwardNormRuntime( return fec->getMostRecentKernelRuntime(); } -static void LayerNormForward_HeuristicLookup( +static void NvFuserScheduler_LayerNormForward_HeuristicCache( benchmark::State& benchmark_state) { std::unique_ptr fusion_ptr = std::make_unique(); FusionGuard fg(fusion_ptr.get()); @@ -168,5 +168,7 @@ static void LayerNormForward_HeuristicLookup( } } -BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond); -BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_LayerNormBackward_HeuristicCache) + ->Unit(benchmark::kMicrosecond); 
+BENCHMARK(NvFuserScheduler_LayerNormForward_HeuristicCache) + ->Unit(benchmark::kMicrosecond); diff --git a/benchmark/heuristic_lookup.cpp b/benchmark/heuristic_lookup.cpp index b734054ca6b..91f108f1361 100644 --- a/benchmark/heuristic_lookup.cpp +++ b/benchmark/heuristic_lookup.cpp @@ -94,7 +94,7 @@ static auto getLayerBackwardNormRuntime( return fec->getMostRecentKernelRuntime(); } -static void LayerNormBackward_HeuristicLookup( +static void NvFuserScheduler_LayerNormBackward_HeuristicLookup( benchmark::State& benchmark_state) { std::unique_ptr fusion_ptr = std::make_unique(); FusionGuard fg(fusion_ptr.get()); @@ -150,7 +150,7 @@ static auto getLayerForwardNormRuntime( return fec->getMostRecentKernelRuntime(); } -static void LayerNormForward_HeuristicLookup( +static void NvFuserScheduler_LayerNormForward_HeuristicLookup( benchmark::State& benchmark_state) { std::unique_ptr fusion_ptr = std::make_unique(); FusionGuard fg(fusion_ptr.get()); @@ -176,5 +176,7 @@ static void LayerNormForward_HeuristicLookup( } } -BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond); -BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_LayerNormBackward_HeuristicLookup) + ->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_LayerNormForward_HeuristicLookup) + ->Unit(benchmark::kMicrosecond); diff --git a/benchmark/indexselect.cpp b/benchmark/indexselect.cpp index 05323d2806a..d8569e48998 100644 --- a/benchmark/indexselect.cpp +++ b/benchmark/indexselect.cpp @@ -62,18 +62,21 @@ static std::vector setupInputs() { //------------------------------------------------------------------------------ -static void IndexSelect_SetupFusion(benchmark::State& benchmark_state) { +static void NvFuserScheduler_IndexSelect_SetupFusion( + benchmark::State& benchmark_state) { for (auto _ : benchmark_state) { Fusion fusion; setupFusion(&fusion); } } -BENCHMARK(IndexSelect_SetupFusion)->Unit(benchmark::kMicrosecond); 
+BENCHMARK(NvFuserScheduler_IndexSelect_SetupFusion) + ->Unit(benchmark::kMicrosecond); //------------------------------------------------------------------------------ -static void IndexSelect_AutoSchedule(benchmark::State& benchmark_state) { +static void NvFuserScheduler_IndexSelect_AutoSchedule( + benchmark::State& benchmark_state) { for (auto _ : benchmark_state) { // Setup (not included in the measurement) benchmark_state.PauseTiming(); @@ -87,11 +90,13 @@ static void IndexSelect_AutoSchedule(benchmark::State& benchmark_state) { } } -BENCHMARK(IndexSelect_AutoSchedule)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_IndexSelect_AutoSchedule) + ->Unit(benchmark::kMicrosecond); //------------------------------------------------------------------------------ -static void IndexSelect_Lower(benchmark::State& benchmark_state) { +static void NvFuserScheduler_IndexSelect_Lower( + benchmark::State& benchmark_state) { Fusion fusion; // setup fusion @@ -107,11 +112,12 @@ static void IndexSelect_Lower(benchmark::State& benchmark_state) { } } -BENCHMARK(IndexSelect_Lower)->Unit(benchmark::kMillisecond); +BENCHMARK(NvFuserScheduler_IndexSelect_Lower)->Unit(benchmark::kMillisecond); //------------------------------------------------------------------------------ -static void IndexSelect_Compile(benchmark::State& benchmark_state) { +static void NvFuserScheduler_IndexSelect_Compile( + benchmark::State& benchmark_state) { Fusion fusion; // setup fusion @@ -129,11 +135,12 @@ static void IndexSelect_Compile(benchmark::State& benchmark_state) { } } -BENCHMARK(IndexSelect_Compile)->Unit(benchmark::kMillisecond); +BENCHMARK(NvFuserScheduler_IndexSelect_Compile)->Unit(benchmark::kMillisecond); //------------------------------------------------------------------------------ -static void IndexSelect_RunFusion(benchmark::State& benchmark_state) { +static void NvFuserScheduler_IndexSelect_RunFusion( + benchmark::State& benchmark_state) { Fusion fusion; // setup fusion @@ 
-158,7 +165,8 @@ static void IndexSelect_RunFusion(benchmark::State& benchmark_state) { } } -BENCHMARK(IndexSelect_RunFusion)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_IndexSelect_RunFusion) + ->Unit(benchmark::kMicrosecond); static void setupIndexSelectSimple( Fusion* fusion, diff --git a/benchmark/lstm_cell.cpp b/benchmark/lstm_cell.cpp index eed45dd1e99..58b3d3b13da 100644 --- a/benchmark/lstm_cell.cpp +++ b/benchmark/lstm_cell.cpp @@ -78,18 +78,20 @@ static std::vector setupInputs( //------------------------------------------------------------------------------ -static void LstmCell_SetupFusion(benchmark::State& benchmark_state) { +static void NvFuserScheduler_LstmCell_SetupFusion( + benchmark::State& benchmark_state) { for (auto _ : benchmark_state) { Fusion fusion; setupFusion(&fusion); } } -BENCHMARK(LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond); //------------------------------------------------------------------------------ -static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) { +static void NvFuserScheduler_LstmCell_AutoSchedule( + benchmark::State& benchmark_state) { constexpr int kHiddenFeatures = 512; constexpr int kBatchSize = 64; @@ -106,11 +108,12 @@ static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) { } } -BENCHMARK(LstmCell_AutoSchedule)->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_LstmCell_AutoSchedule) + ->Unit(benchmark::kMicrosecond); //------------------------------------------------------------------------------ -static void LstmCell_Lower(benchmark::State& benchmark_state) { +static void NvFuserScheduler_LstmCell_Lower(benchmark::State& benchmark_state) { constexpr int kHiddenFeatures = 512; constexpr int kBatchSize = 64; @@ -129,11 +132,12 @@ static void LstmCell_Lower(benchmark::State& benchmark_state) { } } -BENCHMARK(LstmCell_Lower)->Unit(benchmark::kMillisecond); 
+BENCHMARK(NvFuserScheduler_LstmCell_Lower)->Unit(benchmark::kMillisecond); //------------------------------------------------------------------------------ -static void LstmCell_Compile(benchmark::State& benchmark_state) { +static void NvFuserScheduler_LstmCell_Compile( + benchmark::State& benchmark_state) { constexpr int kHiddenFeatures = 512; constexpr int kBatchSize = 64; @@ -153,11 +157,11 @@ static void LstmCell_Compile(benchmark::State& benchmark_state) { } } -BENCHMARK(LstmCell_Compile)->Unit(benchmark::kMillisecond); +BENCHMARK(NvFuserScheduler_LstmCell_Compile)->Unit(benchmark::kMillisecond); //------------------------------------------------------------------------------ -static void LstmCell_RunFusion( +static void NvFuserScheduler_LstmCell_RunFusion( benchmark::State& benchmark_state, int hidden_features, int batch_size) { @@ -185,15 +189,15 @@ static void LstmCell_RunFusion( } } -BENCHMARK_CAPTURE(LstmCell_RunFusion, Small, 512, 64) +BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion, Small, 512, 64) ->Unit(benchmark::kMicrosecond); -BENCHMARK_CAPTURE(LstmCell_RunFusion, Medium, 1024, 128) +BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion, Medium, 1024, 128) ->Unit(benchmark::kMicrosecond); //------------------------------------------------------------------------------ -static void LstmCell_RunFusion_GpuOnly( +static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly( benchmark::State& benchmark_state, int hidden_features, int batch_size) { @@ -216,17 +220,21 @@ static void LstmCell_RunFusion_GpuOnly( runBenchmarkIterations(benchmark_state, &executor, inputs, lparams); } -BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Small, 512, 64) +BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_GpuOnly, Small, 512, 64) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); -BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Medium, 1024, 128) +BENCHMARK_CAPTURE( + NvFuserScheduler_LstmCell_RunFusion_GpuOnly, + Medium, + 1024, + 128) 
->Unit(benchmark::kMicrosecond) ->UseManualTime(); //------------------------------------------------------------------------------ -static void LstmCell_RunFusion_CpuOnly( +static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly( benchmark::State& benchmark_state, int hidden_features, int batch_size) { @@ -252,8 +260,12 @@ static void LstmCell_RunFusion_CpuOnly( } } -BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Small, 512, 64) +BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_CpuOnly, Small, 512, 64) ->Unit(benchmark::kMicrosecond); -BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Medium, 1024, 128) +BENCHMARK_CAPTURE( + NvFuserScheduler_LstmCell_RunFusion_CpuOnly, + Medium, + 1024, + 128) ->Unit(benchmark::kMicrosecond); diff --git a/benchmark/many_pointwise_ops.cpp b/benchmark/many_pointwise_ops.cpp index 72ab48c7fd2..1617713f193 100644 --- a/benchmark/many_pointwise_ops.cpp +++ b/benchmark/many_pointwise_ops.cpp @@ -18,7 +18,7 @@ using namespace nvfuser; //------------------------------------------------------------------------------ -class ManyPointwiseOpsFixture : public benchmark::Fixture { +class NvFuserScheduler_ManyPointwiseOpsFixture : public benchmark::Fixture { public: void SetUp(const ::benchmark::State& state) override { fusion_ = std::make_unique(); @@ -39,14 +39,16 @@ class ManyPointwiseOpsFixture : public benchmark::Fixture { fusion_.reset(); } - ~ManyPointwiseOpsFixture() override { + ~NvFuserScheduler_ManyPointwiseOpsFixture() override { assert(fusion_ == nullptr); } std::unique_ptr fusion_ = nullptr; }; -BENCHMARK_DEFINE_F(ManyPointwiseOpsFixture, ManyPointwiseOpsCopyTest) +BENCHMARK_DEFINE_F( + NvFuserScheduler_ManyPointwiseOpsFixture, + ManyPointwiseOpsCopyTest) (benchmark::State& state) { for (auto _ : state) { Fusion fcopy = *fusion_; @@ -54,7 +56,9 @@ BENCHMARK_DEFINE_F(ManyPointwiseOpsFixture, ManyPointwiseOpsCopyTest) state.SetComplexityN(state.range(0)); } -BENCHMARK_REGISTER_F(ManyPointwiseOpsFixture, ManyPointwiseOpsCopyTest) 
+BENCHMARK_REGISTER_F( + NvFuserScheduler_ManyPointwiseOpsFixture, + ManyPointwiseOpsCopyTest) ->RangeMultiplier(2) ->Range(1 << 3, 1 << 12) ->Complexity(); diff --git a/benchmark/matmul.cpp b/benchmark/matmul.cpp index 0596bd55c2a..82623b90758 100644 --- a/benchmark/matmul.cpp +++ b/benchmark/matmul.cpp @@ -178,7 +178,7 @@ static void SingleMatmulBase( // TODO: FLOPS calculation } -static void EagerModeMatmul( +static void Baseline_Matmul( benchmark::State& benchmark_state, MatmulLayout layout) { std::vector input_mnk{ @@ -235,7 +235,7 @@ MatmulParams getMatmulParams( return params; } -static void Nvfuser_Matmul_4warp3stage( +static void NvFuserScheduler_Matmul_4warp3stage( benchmark::State& benchmark_state, MatmulLayout layout) { auto cta_tile = GemmTile(128, 128, 32); @@ -250,7 +250,7 @@ static void Nvfuser_Matmul_4warp3stage( SingleMatmulBase(benchmark_state, layout, params); } -static void Nvfuser_Matmul_8warp3stage( +static void NvFuserScheduler_Matmul_8warp3stage( benchmark::State& benchmark_state, MatmulLayout layout) { auto cta_tile = GemmTile(256, 128, 32); @@ -265,7 +265,7 @@ static void Nvfuser_Matmul_8warp3stage( SingleMatmulBase(benchmark_state, layout, params); } -static void Nvfuser_Matmul_4warp4stage( +static void NvFuserScheduler_Matmul_4warp4stage( benchmark::State& benchmark_state, MatmulLayout layout) { auto cta_tile = GemmTile(128, 128, 32); @@ -280,7 +280,7 @@ static void Nvfuser_Matmul_4warp4stage( SingleMatmulBase(benchmark_state, layout, params); } -static void Nvfuser_Matmul_8warp4stage( +static void NvFuserScheduler_Matmul_8warp4stage( benchmark::State& benchmark_state, MatmulLayout layout) { auto cta_tile = GemmTile(256, 128, 32); @@ -339,41 +339,41 @@ static void Nvfuser_Matmul_8warp4stage( run(NN_TIMM, MatmulLayout::NN, TIMMMatmulShapes) // Instantiations: -#define Nvfuser_4warp3stage_test(layout_label, layout, shapes) \ - BENCHMARK_CAPTURE( \ - Nvfuser_Matmul_4warp3stage, \ - no_quant_nvfuser_4warp_##layout_label, \ - layout) \ 
+#define NvFuserScheduler_4warp3stage_test(layout_label, layout, shapes) \ + BENCHMARK_CAPTURE( \ + NvFuserScheduler_Matmul_4warp3stage, \ + no_quant_nvfuser_4warp_##layout_label, \ + layout) \ ->shapes -#define Nvfuser_8warp3stage_test(layout_label, layout, shapes) \ - BENCHMARK_CAPTURE( \ - Nvfuser_Matmul_8warp3stage, \ - no_quant_nvfuser_8warp_##layout_label, \ - layout) \ +#define NvFuserScheduler_8warp3stage_test(layout_label, layout, shapes) \ + BENCHMARK_CAPTURE( \ + NvFuserScheduler_Matmul_8warp3stage, \ + no_quant_nvfuser_8warp_##layout_label, \ + layout) \ ->shapes -#define Nvfuser_4warp4stage_test(layout_label, layout, shapes) \ - BENCHMARK_CAPTURE( \ - Nvfuser_Matmul_4warp4stage, \ - no_quant_nvfuser_4warp_##layout_label, \ - layout) \ +#define NvFuserScheduler_4warp4stage_test(layout_label, layout, shapes) \ + BENCHMARK_CAPTURE( \ + NvFuserScheduler_Matmul_4warp4stage, \ + no_quant_nvfuser_4warp_##layout_label, \ + layout) \ ->shapes -#define Nvfuser_8warp4stage_test(layout_label, layout, shapes) \ - BENCHMARK_CAPTURE( \ - Nvfuser_Matmul_8warp4stage, \ - no_quant_nvfuser_8warp_##layout_label, \ - layout) \ +#define NvFuserScheduler_8warp4stage_test(layout_label, layout, shapes) \ + BENCHMARK_CAPTURE( \ + NvFuserScheduler_Matmul_8warp4stage, \ + no_quant_nvfuser_8warp_##layout_label, \ + layout) \ ->shapes -#define Eagermode_test(layout_label, layout, shapes) \ +#define Baseline_test(layout_label, layout, shapes) \ BENCHMARK_CAPTURE( \ - EagerModeMatmul, no_quant_eagermode_##layout_label, layout) \ + Baseline_Matmul, no_quant_eagermode_##layout_label, layout) \ ->shapes -ForAllLayouts(Nvfuser_4warp3stage_test); -ForAllLayouts(Nvfuser_4warp4stage_test); -ForAllLayouts(Nvfuser_8warp3stage_test); -ForAllLayouts(Nvfuser_8warp4stage_test); -ForAllLayouts(Eagermode_test); +ForAllLayouts(NvFuserScheduler_4warp3stage_test); +ForAllLayouts(NvFuserScheduler_4warp4stage_test); +ForAllLayouts(NvFuserScheduler_8warp3stage_test); 
+ForAllLayouts(NvFuserScheduler_8warp4stage_test); +ForAllLayouts(Baseline_test); diff --git a/benchmark/shape_inference.cpp b/benchmark/shape_inference.cpp index 590353d5a39..0916f3c1cf0 100644 --- a/benchmark/shape_inference.cpp +++ b/benchmark/shape_inference.cpp @@ -128,12 +128,12 @@ void LayerNormBackward_ShapeInference_Base( } } -static void LayerNormBackward_ShapeInference( +static void NvFuserScheduler_LayerNormBackward_ShapeInference( benchmark::State& benchmark_state) { LayerNormBackward_ShapeInference_Base(benchmark_state, true); } -static void LayerNormBackward_NoShapeInferenceCachedBaseline( +static void NvFuserScheduler_LayerNormBackward_NoShapeInferenceCachedBaseline( benchmark::State& benchmark_state) { LayerNormBackward_ShapeInference_Base(benchmark_state, false); } @@ -203,18 +203,21 @@ void LayerNormForward_ShapeInferenceBase( } } -static void LayerNormForward_NoShapeInferenceCachedBaseline( +static void NvFuserScheduler_LayerNormForward_NoShapeInferenceCachedBaseline( benchmark::State& benchmark_state) { LayerNormForward_ShapeInferenceBase(benchmark_state, false); } -static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) { +static void NvFuserScheduler_LayerNormForward_ShapeInference( + benchmark::State& benchmark_state) { LayerNormForward_ShapeInferenceBase(benchmark_state, true); } -BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond); -BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond); -BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline) +BENCHMARK(NvFuserScheduler_LayerNormBackward_ShapeInference) + ->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_LayerNormForward_ShapeInference) + ->Unit(benchmark::kMicrosecond); +BENCHMARK(NvFuserScheduler_LayerNormBackward_NoShapeInferenceCachedBaseline) ->Unit(benchmark::kMicrosecond); -BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline) 
+BENCHMARK(NvFuserScheduler_LayerNormForward_NoShapeInferenceCachedBaseline) ->Unit(benchmark::kMicrosecond); diff --git a/benchmark/softmax.cpp b/benchmark/softmax.cpp index 138713fbb4f..b6489eeb0e4 100644 --- a/benchmark/softmax.cpp +++ b/benchmark/softmax.cpp @@ -78,7 +78,8 @@ static void NvFuserScheduler_Softmax( } // Warp softmax comparison -static void Softmax_WarpReduceReference(benchmark::State& benchmark_state) { +static void NvFuserScheduler_Softmax_WarpReduceReference( + benchmark::State& benchmark_state) { auto dtype = DataType::Float; std::vector input_shape{ benchmark_state.range(0), benchmark_state.range(1)}; @@ -113,7 +114,8 @@ static void Softmax_WarpReduceReference(benchmark::State& benchmark_state) { (2 * aten_input.numel() * int64_t(dataTypeSize(dtype)))); } -static void Softmax_WarpReduce(benchmark::State& benchmark_state) { +static void NvFuserScheduler_Softmax_WarpReduce( + benchmark::State& benchmark_state) { auto dtype = DataType::Float; std::vector input_shape{ benchmark_state.range(0), benchmark_state.range(1)}; @@ -158,13 +160,13 @@ static void Softmax_WarpReduce(benchmark::State& benchmark_state) { (2 * aten_input.numel() * int64_t(dataTypeSize(dtype)))); } -BENCHMARK(Softmax_WarpReduce) +BENCHMARK(NvFuserScheduler_Softmax_WarpReduce) ->RangeMultiplier(2) ->Ranges({{8, 8}, {16 * 197, 16 * 197}}) ->Unit(benchmark::kMicrosecond) ->UseManualTime(); -BENCHMARK(Softmax_WarpReduceReference) +BENCHMARK(NvFuserScheduler_Softmax_WarpReduceReference) ->RangeMultiplier(2) ->Ranges({{8, 8}, {16 * 197, 16 * 197}}) ->Unit(benchmark::kMicrosecond)