From efce3ac4a7d7f9157b08ec832239423885b7bb9f Mon Sep 17 00:00:00 2001
From: Liqiang Lu <liqiangxl@gmail.com>
Date: Fri, 22 Sep 2023 10:51:38 -0700
Subject: [PATCH] Rename all nvfuser benchmarks to start with NvFuserScheduler_

---
 benchmark/gelu_backward.cpp      | 39 ++++++++++++-------
 benchmark/heuristic_cache.cpp    | 10 +++--
 benchmark/heuristic_lookup.cpp   | 10 +++--
 benchmark/indexselect.cpp        | 28 +++++++++-----
 benchmark/lstm_cell.cpp          | 46 ++++++++++++++---------
 benchmark/many_pointwise_ops.cpp | 12 ++++--
 benchmark/matmul.cpp             | 64 ++++++++++++++++----------------
 benchmark/shape_inference.cpp    | 19 ++++++----
 benchmark/softmax.cpp            | 10 +++--
 9 files changed, 141 insertions(+), 97 deletions(-)

diff --git a/benchmark/gelu_backward.cpp b/benchmark/gelu_backward.cpp
index ea779836183..a7747c8ceba 100644
--- a/benchmark/gelu_backward.cpp
+++ b/benchmark/gelu_backward.cpp
@@ -92,18 +92,21 @@ static std::vector<c10::IValue> setupInputs() {
 
 //------------------------------------------------------------------------------
 
-static void GeluBackward_SetupFusion(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_GeluBackward_SetupFusion(
+    benchmark::State& benchmark_state) {
   for (auto _ : benchmark_state) {
     Fusion fusion;
     setupFusion(&fusion);
   }
 }
 
-BENCHMARK(GeluBackward_SetupFusion)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_GeluBackward_SetupFusion)
+    ->Unit(benchmark::kMicrosecond);
 
 //------------------------------------------------------------------------------
 
-static void GeluBackward_AutoSchedule(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_GeluBackward_AutoSchedule(
+    benchmark::State& benchmark_state) {
   for (auto _ : benchmark_state) {
     // Setup (not included in the measurement)
     benchmark_state.PauseTiming();
@@ -117,11 +120,13 @@ static void GeluBackward_AutoSchedule(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(GeluBackward_AutoSchedule)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_GeluBackward_AutoSchedule)
+    ->Unit(benchmark::kMicrosecond);
 
 //------------------------------------------------------------------------------
 
-static void GeluBackward_Lower(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_GeluBackward_Lower(
+    benchmark::State& benchmark_state) {
   Fusion fusion;
 
   // setup fusion
@@ -137,11 +142,12 @@ static void GeluBackward_Lower(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(GeluBackward_Lower)->Unit(benchmark::kMillisecond);
+BENCHMARK(NvFuserScheduler_GeluBackward_Lower)->Unit(benchmark::kMillisecond);
 
 //------------------------------------------------------------------------------
 
-static void GeluBackward_Compile(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_GeluBackward_Compile(
+    benchmark::State& benchmark_state) {
   Fusion fusion;
 
   // setup fusion
@@ -158,11 +164,12 @@ static void GeluBackward_Compile(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(GeluBackward_Compile)->Unit(benchmark::kMillisecond);
+BENCHMARK(NvFuserScheduler_GeluBackward_Compile)->Unit(benchmark::kMillisecond);
 
 //------------------------------------------------------------------------------
 
-static void GeluBackward_RunFusion(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_GeluBackward_RunFusion(
+    benchmark::State& benchmark_state) {
   Fusion fusion;
 
   // setup fusion
@@ -188,11 +195,13 @@ static void GeluBackward_RunFusion(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(GeluBackward_RunFusion)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion)
+    ->Unit(benchmark::kMicrosecond);
 
 //------------------------------------------------------------------------------
 
-static void GeluBackward_RunFusion_GpuOnly(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly(
+    benchmark::State& benchmark_state) {
   Fusion fusion;
 
   // setup fusion
@@ -209,13 +218,14 @@ static void GeluBackward_RunFusion_GpuOnly(benchmark::State& benchmark_state) {
   runBenchmarkIterations(benchmark_state, &executor, inputs, lparams);
 }
 
-BENCHMARK(GeluBackward_RunFusion_GpuOnly)
+BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_GpuOnly)
     ->Unit(benchmark::kMicrosecond)
     ->UseManualTime();
 
 //------------------------------------------------------------------------------
 
-static void GeluBackward_RunFusion_CpuOnly(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly(
+    benchmark::State& benchmark_state) {
   Fusion fusion;
 
   // setup fusion
@@ -238,4 +248,5 @@ static void GeluBackward_RunFusion_CpuOnly(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(GeluBackward_RunFusion_CpuOnly)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_GeluBackward_RunFusion_CpuOnly)
+    ->Unit(benchmark::kMicrosecond);
diff --git a/benchmark/heuristic_cache.cpp b/benchmark/heuristic_cache.cpp
index a5e3a6667c0..9f25e8a3d33 100644
--- a/benchmark/heuristic_cache.cpp
+++ b/benchmark/heuristic_cache.cpp
@@ -94,7 +94,7 @@ static auto getLayerBackwardNormRuntime(
   return fec->getMostRecentKernelRuntime();
 }
 
-static void LayerNormBackward_HeuristicLookup(
+static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
     benchmark::State& benchmark_state) {
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
   FusionGuard fg(fusion_ptr.get());
@@ -146,7 +146,7 @@ static auto getLayerForwardNormRuntime(
   return fec->getMostRecentKernelRuntime();
 }
 
-static void LayerNormForward_HeuristicLookup(
+static void NvFuserScheduler_LayerNormForward_HeuristicCache(
     benchmark::State& benchmark_state) {
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
   FusionGuard fg(fusion_ptr.get());
@@ -168,5 +168,7 @@ static void LayerNormForward_HeuristicLookup(
   }
 }
 
-BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
-BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_LayerNormBackward_HeuristicCache)
+    ->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_LayerNormForward_HeuristicCache)
+    ->Unit(benchmark::kMicrosecond);
diff --git a/benchmark/heuristic_lookup.cpp b/benchmark/heuristic_lookup.cpp
index b734054ca6b..91f108f1361 100644
--- a/benchmark/heuristic_lookup.cpp
+++ b/benchmark/heuristic_lookup.cpp
@@ -94,7 +94,7 @@ static auto getLayerBackwardNormRuntime(
   return fec->getMostRecentKernelRuntime();
 }
 
-static void LayerNormBackward_HeuristicLookup(
+static void NvFuserScheduler_LayerNormBackward_HeuristicLookup(
     benchmark::State& benchmark_state) {
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
   FusionGuard fg(fusion_ptr.get());
@@ -150,7 +150,7 @@ static auto getLayerForwardNormRuntime(
   return fec->getMostRecentKernelRuntime();
 }
 
-static void LayerNormForward_HeuristicLookup(
+static void NvFuserScheduler_LayerNormForward_HeuristicLookup(
     benchmark::State& benchmark_state) {
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
   FusionGuard fg(fusion_ptr.get());
@@ -176,5 +176,7 @@ static void LayerNormForward_HeuristicLookup(
   }
 }
 
-BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
-BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_LayerNormBackward_HeuristicLookup)
+    ->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_LayerNormForward_HeuristicLookup)
+    ->Unit(benchmark::kMicrosecond);
diff --git a/benchmark/indexselect.cpp b/benchmark/indexselect.cpp
index 05323d2806a..d8569e48998 100644
--- a/benchmark/indexselect.cpp
+++ b/benchmark/indexselect.cpp
@@ -62,18 +62,21 @@ static std::vector<c10::IValue> setupInputs() {
 
 //------------------------------------------------------------------------------
 
-static void IndexSelect_SetupFusion(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_IndexSelect_SetupFusion(
+    benchmark::State& benchmark_state) {
   for (auto _ : benchmark_state) {
     Fusion fusion;
     setupFusion(&fusion);
   }
 }
 
-BENCHMARK(IndexSelect_SetupFusion)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_IndexSelect_SetupFusion)
+    ->Unit(benchmark::kMicrosecond);
 
 //------------------------------------------------------------------------------
 
-static void IndexSelect_AutoSchedule(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_IndexSelect_AutoSchedule(
+    benchmark::State& benchmark_state) {
   for (auto _ : benchmark_state) {
     // Setup (not included in the measurement)
     benchmark_state.PauseTiming();
@@ -87,11 +90,13 @@ static void IndexSelect_AutoSchedule(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(IndexSelect_AutoSchedule)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_IndexSelect_AutoSchedule)
+    ->Unit(benchmark::kMicrosecond);
 
 //------------------------------------------------------------------------------
 
-static void IndexSelect_Lower(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_IndexSelect_Lower(
+    benchmark::State& benchmark_state) {
   Fusion fusion;
 
   // setup fusion
@@ -107,11 +112,12 @@ static void IndexSelect_Lower(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(IndexSelect_Lower)->Unit(benchmark::kMillisecond);
+BENCHMARK(NvFuserScheduler_IndexSelect_Lower)->Unit(benchmark::kMillisecond);
 
 //------------------------------------------------------------------------------
 
-static void IndexSelect_Compile(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_IndexSelect_Compile(
+    benchmark::State& benchmark_state) {
   Fusion fusion;
 
   // setup fusion
@@ -129,11 +135,12 @@ static void IndexSelect_Compile(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(IndexSelect_Compile)->Unit(benchmark::kMillisecond);
+BENCHMARK(NvFuserScheduler_IndexSelect_Compile)->Unit(benchmark::kMillisecond);
 
 //------------------------------------------------------------------------------
 
-static void IndexSelect_RunFusion(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_IndexSelect_RunFusion(
+    benchmark::State& benchmark_state) {
   Fusion fusion;
 
   // setup fusion
@@ -158,7 +165,8 @@ static void IndexSelect_RunFusion(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(IndexSelect_RunFusion)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_IndexSelect_RunFusion)
+    ->Unit(benchmark::kMicrosecond);
 
 static void setupIndexSelectSimple(
     Fusion* fusion,
diff --git a/benchmark/lstm_cell.cpp b/benchmark/lstm_cell.cpp
index eed45dd1e99..58b3d3b13da 100644
--- a/benchmark/lstm_cell.cpp
+++ b/benchmark/lstm_cell.cpp
@@ -78,18 +78,20 @@ static std::vector<c10::IValue> setupInputs(
 
 //------------------------------------------------------------------------------
 
-static void LstmCell_SetupFusion(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_LstmCell_SetupFusion(
+    benchmark::State& benchmark_state) {
   for (auto _ : benchmark_state) {
     Fusion fusion;
     setupFusion(&fusion);
   }
 }
 
-BENCHMARK(LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond);
 
 //------------------------------------------------------------------------------
 
-static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_LstmCell_AutoSchedule(
+    benchmark::State& benchmark_state) {
   constexpr int kHiddenFeatures = 512;
   constexpr int kBatchSize = 64;
 
@@ -106,11 +108,12 @@ static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(LstmCell_AutoSchedule)->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_LstmCell_AutoSchedule)
+    ->Unit(benchmark::kMicrosecond);
 
 //------------------------------------------------------------------------------
 
-static void LstmCell_Lower(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_LstmCell_Lower(benchmark::State& benchmark_state) {
   constexpr int kHiddenFeatures = 512;
   constexpr int kBatchSize = 64;
 
@@ -129,11 +132,12 @@ static void LstmCell_Lower(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(LstmCell_Lower)->Unit(benchmark::kMillisecond);
+BENCHMARK(NvFuserScheduler_LstmCell_Lower)->Unit(benchmark::kMillisecond);
 
 //------------------------------------------------------------------------------
 
-static void LstmCell_Compile(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_LstmCell_Compile(
+    benchmark::State& benchmark_state) {
   constexpr int kHiddenFeatures = 512;
   constexpr int kBatchSize = 64;
 
@@ -153,11 +157,11 @@ static void LstmCell_Compile(benchmark::State& benchmark_state) {
   }
 }
 
-BENCHMARK(LstmCell_Compile)->Unit(benchmark::kMillisecond);
+BENCHMARK(NvFuserScheduler_LstmCell_Compile)->Unit(benchmark::kMillisecond);
 
 //------------------------------------------------------------------------------
 
-static void LstmCell_RunFusion(
+static void NvFuserScheduler_LstmCell_RunFusion(
     benchmark::State& benchmark_state,
     int hidden_features,
     int batch_size) {
@@ -185,15 +189,15 @@ static void LstmCell_RunFusion(
   }
 }
 
-BENCHMARK_CAPTURE(LstmCell_RunFusion, Small, 512, 64)
+BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion, Small, 512, 64)
     ->Unit(benchmark::kMicrosecond);
 
-BENCHMARK_CAPTURE(LstmCell_RunFusion, Medium, 1024, 128)
+BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion, Medium, 1024, 128)
     ->Unit(benchmark::kMicrosecond);
 
 //------------------------------------------------------------------------------
 
-static void LstmCell_RunFusion_GpuOnly(
+static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly(
     benchmark::State& benchmark_state,
     int hidden_features,
     int batch_size) {
@@ -216,17 +220,21 @@ static void LstmCell_RunFusion_GpuOnly(
   runBenchmarkIterations(benchmark_state, &executor, inputs, lparams);
 }
 
-BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Small, 512, 64)
+BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_GpuOnly, Small, 512, 64)
     ->Unit(benchmark::kMicrosecond)
     ->UseManualTime();
 
-BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Medium, 1024, 128)
+BENCHMARK_CAPTURE(
+    NvFuserScheduler_LstmCell_RunFusion_GpuOnly,
+    Medium,
+    1024,
+    128)
     ->Unit(benchmark::kMicrosecond)
     ->UseManualTime();
 
 //------------------------------------------------------------------------------
 
-static void LstmCell_RunFusion_CpuOnly(
+static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly(
     benchmark::State& benchmark_state,
     int hidden_features,
     int batch_size) {
@@ -252,8 +260,12 @@ static void LstmCell_RunFusion_CpuOnly(
   }
 }
 
-BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Small, 512, 64)
+BENCHMARK_CAPTURE(NvFuserScheduler_LstmCell_RunFusion_CpuOnly, Small, 512, 64)
     ->Unit(benchmark::kMicrosecond);
 
-BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Medium, 1024, 128)
+BENCHMARK_CAPTURE(
+    NvFuserScheduler_LstmCell_RunFusion_CpuOnly,
+    Medium,
+    1024,
+    128)
     ->Unit(benchmark::kMicrosecond);
diff --git a/benchmark/many_pointwise_ops.cpp b/benchmark/many_pointwise_ops.cpp
index 72ab48c7fd2..1617713f193 100644
--- a/benchmark/many_pointwise_ops.cpp
+++ b/benchmark/many_pointwise_ops.cpp
@@ -18,7 +18,7 @@ using namespace nvfuser;
 
 //------------------------------------------------------------------------------
 
-class ManyPointwiseOpsFixture : public benchmark::Fixture {
+class NvFuserScheduler_ManyPointwiseOpsFixture : public benchmark::Fixture {
  public:
   void SetUp(const ::benchmark::State& state) override {
     fusion_ = std::make_unique<Fusion>();
@@ -39,14 +39,16 @@ class ManyPointwiseOpsFixture : public benchmark::Fixture {
     fusion_.reset();
   }
 
-  ~ManyPointwiseOpsFixture() override {
+  ~NvFuserScheduler_ManyPointwiseOpsFixture() override {
     assert(fusion_ == nullptr);
   }
 
   std::unique_ptr<Fusion> fusion_ = nullptr;
 };
 
-BENCHMARK_DEFINE_F(ManyPointwiseOpsFixture, ManyPointwiseOpsCopyTest)
+BENCHMARK_DEFINE_F(
+    NvFuserScheduler_ManyPointwiseOpsFixture,
+    ManyPointwiseOpsCopyTest)
 (benchmark::State& state) {
   for (auto _ : state) {
     Fusion fcopy = *fusion_;
@@ -54,7 +56,9 @@ BENCHMARK_DEFINE_F(ManyPointwiseOpsFixture, ManyPointwiseOpsCopyTest)
   state.SetComplexityN(state.range(0));
 }
 
-BENCHMARK_REGISTER_F(ManyPointwiseOpsFixture, ManyPointwiseOpsCopyTest)
+BENCHMARK_REGISTER_F(
+    NvFuserScheduler_ManyPointwiseOpsFixture,
+    ManyPointwiseOpsCopyTest)
     ->RangeMultiplier(2)
     ->Range(1 << 3, 1 << 12)
     ->Complexity();
diff --git a/benchmark/matmul.cpp b/benchmark/matmul.cpp
index 0596bd55c2a..82623b90758 100644
--- a/benchmark/matmul.cpp
+++ b/benchmark/matmul.cpp
@@ -178,7 +178,7 @@ static void SingleMatmulBase(
   // TODO: FLOPS calculation
 }
 
-static void EagerModeMatmul(
+static void Baseline_Matmul(
     benchmark::State& benchmark_state,
     MatmulLayout layout) {
   std::vector<int64_t> input_mnk{
@@ -235,7 +235,7 @@ MatmulParams getMatmulParams(
   return params;
 }
 
-static void Nvfuser_Matmul_4warp3stage(
+static void NvFuserScheduler_Matmul_4warp3stage(
     benchmark::State& benchmark_state,
     MatmulLayout layout) {
   auto cta_tile = GemmTile(128, 128, 32);
@@ -250,7 +250,7 @@ static void Nvfuser_Matmul_4warp3stage(
   SingleMatmulBase(benchmark_state, layout, params);
 }
 
-static void Nvfuser_Matmul_8warp3stage(
+static void NvFuserScheduler_Matmul_8warp3stage(
     benchmark::State& benchmark_state,
     MatmulLayout layout) {
   auto cta_tile = GemmTile(256, 128, 32);
@@ -265,7 +265,7 @@ static void Nvfuser_Matmul_8warp3stage(
   SingleMatmulBase(benchmark_state, layout, params);
 }
 
-static void Nvfuser_Matmul_4warp4stage(
+static void NvFuserScheduler_Matmul_4warp4stage(
     benchmark::State& benchmark_state,
     MatmulLayout layout) {
   auto cta_tile = GemmTile(128, 128, 32);
@@ -280,7 +280,7 @@ static void Nvfuser_Matmul_4warp4stage(
   SingleMatmulBase(benchmark_state, layout, params);
 }
 
-static void Nvfuser_Matmul_8warp4stage(
+static void NvFuserScheduler_Matmul_8warp4stage(
     benchmark::State& benchmark_state,
     MatmulLayout layout) {
   auto cta_tile = GemmTile(256, 128, 32);
@@ -339,41 +339,41 @@ static void Nvfuser_Matmul_8warp4stage(
   run(NN_TIMM, MatmulLayout::NN, TIMMMatmulShapes)
 
 // Instantiations:
-#define Nvfuser_4warp3stage_test(layout_label, layout, shapes) \
-  BENCHMARK_CAPTURE(                                           \
-      Nvfuser_Matmul_4warp3stage,                              \
-      no_quant_nvfuser_4warp_##layout_label,                   \
-      layout)                                                  \
+#define NvFuserScheduler_4warp3stage_test(layout_label, layout, shapes) \
+  BENCHMARK_CAPTURE(                                                    \
+      NvFuserScheduler_Matmul_4warp3stage,                              \
+      no_quant_nvfuser_4warp_##layout_label,                            \
+      layout)                                                           \
       ->shapes
 
-#define Nvfuser_8warp3stage_test(layout_label, layout, shapes) \
-  BENCHMARK_CAPTURE(                                           \
-      Nvfuser_Matmul_8warp3stage,                              \
-      no_quant_nvfuser_8warp_##layout_label,                   \
-      layout)                                                  \
+#define NvFuserScheduler_8warp3stage_test(layout_label, layout, shapes) \
+  BENCHMARK_CAPTURE(                                                    \
+      NvFuserScheduler_Matmul_8warp3stage,                              \
+      no_quant_nvfuser_8warp_##layout_label,                            \
+      layout)                                                           \
       ->shapes
 
-#define Nvfuser_4warp4stage_test(layout_label, layout, shapes) \
-  BENCHMARK_CAPTURE(                                           \
-      Nvfuser_Matmul_4warp4stage,                              \
-      no_quant_nvfuser_4warp_##layout_label,                   \
-      layout)                                                  \
+#define NvFuserScheduler_4warp4stage_test(layout_label, layout, shapes) \
+  BENCHMARK_CAPTURE(                                                    \
+      NvFuserScheduler_Matmul_4warp4stage,                              \
+      no_quant_nvfuser_4warp_##layout_label,                            \
+      layout)                                                           \
       ->shapes
 
-#define Nvfuser_8warp4stage_test(layout_label, layout, shapes) \
-  BENCHMARK_CAPTURE(                                           \
-      Nvfuser_Matmul_8warp4stage,                              \
-      no_quant_nvfuser_8warp_##layout_label,                   \
-      layout)                                                  \
+#define NvFuserScheduler_8warp4stage_test(layout_label, layout, shapes) \
+  BENCHMARK_CAPTURE(                                                    \
+      NvFuserScheduler_Matmul_8warp4stage,                              \
+      no_quant_nvfuser_8warp_##layout_label,                            \
+      layout)                                                           \
       ->shapes
 
-#define Eagermode_test(layout_label, layout, shapes)              \
+#define Baseline_test(layout_label, layout, shapes)               \
   BENCHMARK_CAPTURE(                                              \
-      EagerModeMatmul, no_quant_eagermode_##layout_label, layout) \
+      Baseline_Matmul, no_quant_eagermode_##layout_label, layout) \
       ->shapes
 
-ForAllLayouts(Nvfuser_4warp3stage_test);
-ForAllLayouts(Nvfuser_4warp4stage_test);
-ForAllLayouts(Nvfuser_8warp3stage_test);
-ForAllLayouts(Nvfuser_8warp4stage_test);
-ForAllLayouts(Eagermode_test);
+ForAllLayouts(NvFuserScheduler_4warp3stage_test);
+ForAllLayouts(NvFuserScheduler_4warp4stage_test);
+ForAllLayouts(NvFuserScheduler_8warp3stage_test);
+ForAllLayouts(NvFuserScheduler_8warp4stage_test);
+ForAllLayouts(Baseline_test);
diff --git a/benchmark/shape_inference.cpp b/benchmark/shape_inference.cpp
index 590353d5a39..0916f3c1cf0 100644
--- a/benchmark/shape_inference.cpp
+++ b/benchmark/shape_inference.cpp
@@ -128,12 +128,12 @@ void LayerNormBackward_ShapeInference_Base(
   }
 }
 
-static void LayerNormBackward_ShapeInference(
+static void NvFuserScheduler_LayerNormBackward_ShapeInference(
     benchmark::State& benchmark_state) {
   LayerNormBackward_ShapeInference_Base(benchmark_state, true);
 }
 
-static void LayerNormBackward_NoShapeInferenceCachedBaseline(
+static void NvFuserScheduler_LayerNormBackward_NoShapeInferenceCachedBaseline(
     benchmark::State& benchmark_state) {
   LayerNormBackward_ShapeInference_Base(benchmark_state, false);
 }
@@ -203,18 +203,21 @@ void LayerNormForward_ShapeInferenceBase(
   }
 }
 
-static void LayerNormForward_NoShapeInferenceCachedBaseline(
+static void NvFuserScheduler_LayerNormForward_NoShapeInferenceCachedBaseline(
     benchmark::State& benchmark_state) {
   LayerNormForward_ShapeInferenceBase(benchmark_state, false);
 }
 
-static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_LayerNormForward_ShapeInference(
+    benchmark::State& benchmark_state) {
   LayerNormForward_ShapeInferenceBase(benchmark_state, true);
 }
 
-BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
-BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
-BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
+BENCHMARK(NvFuserScheduler_LayerNormBackward_ShapeInference)
+    ->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_LayerNormForward_ShapeInference)
+    ->Unit(benchmark::kMicrosecond);
+BENCHMARK(NvFuserScheduler_LayerNormBackward_NoShapeInferenceCachedBaseline)
     ->Unit(benchmark::kMicrosecond);
-BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
+BENCHMARK(NvFuserScheduler_LayerNormForward_NoShapeInferenceCachedBaseline)
     ->Unit(benchmark::kMicrosecond);
diff --git a/benchmark/softmax.cpp b/benchmark/softmax.cpp
index 138713fbb4f..b6489eeb0e4 100644
--- a/benchmark/softmax.cpp
+++ b/benchmark/softmax.cpp
@@ -78,7 +78,8 @@ static void NvFuserScheduler_Softmax(
 }
 
 // Warp softmax comparison
-static void Softmax_WarpReduceReference(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_Softmax_WarpReduceReference(
+    benchmark::State& benchmark_state) {
   auto dtype = DataType::Float;
   std::vector<int64_t> input_shape{
       benchmark_state.range(0), benchmark_state.range(1)};
@@ -113,7 +114,8 @@ static void Softmax_WarpReduceReference(benchmark::State& benchmark_state) {
       (2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
 }
 
-static void Softmax_WarpReduce(benchmark::State& benchmark_state) {
+static void NvFuserScheduler_Softmax_WarpReduce(
+    benchmark::State& benchmark_state) {
   auto dtype = DataType::Float;
   std::vector<int64_t> input_shape{
       benchmark_state.range(0), benchmark_state.range(1)};
@@ -158,13 +160,13 @@ static void Softmax_WarpReduce(benchmark::State& benchmark_state) {
       (2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
 }
 
-BENCHMARK(Softmax_WarpReduce)
+BENCHMARK(NvFuserScheduler_Softmax_WarpReduce)
     ->RangeMultiplier(2)
     ->Ranges({{8, 8}, {16 * 197, 16 * 197}})
     ->Unit(benchmark::kMicrosecond)
     ->UseManualTime();
 
-BENCHMARK(Softmax_WarpReduceReference)
+BENCHMARK(NvFuserScheduler_Softmax_WarpReduceReference)
     ->RangeMultiplier(2)
     ->Ranges({{8, 8}, {16 * 197, 16 * 197}})
     ->Unit(benchmark::kMicrosecond)