From e65c4a179efcb12de7cb2111da8437652f7fd35c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 16 Oct 2024 20:33:53 -0400
Subject: [PATCH 1/9] Disable cuBLAS half-precision matmul reduction in
 benchmark

This disables reduction in fp16 or bf16, which is enabled by default in
PyTorch. There are two reasons to disable this for our benchmarks:
1. nvFuser does not support split-K in reduced precision (see #1719).
   Since half precision reduction is much faster than single precision,
   this means eager mode will be faster but less precise than
   nvFuser by default. For a fair comparison, both eager mode and
   nvFuser should perform the reduction in single precision.
2. The accuracy of matmuls is degraded for split-K problems (small M&N,
   large K) by default in PyTorch. This can lead to validation errors
   where nvFuser actually performs an accurate computation but our
   baseline is inaccurate.
---
 benchmarks/python/test_matmul.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 4eed491752d..ff5613ca819 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -40,6 +40,11 @@ def test_matmul_nvf_benchmark(
 
     clear_cuda_cache()
 
+    # disable half-precision split-K reduction in cuBLAS since we do not
+    # support this in nvFuser
+    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
+    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
+
     try:
         a = torch.randn(m, k, device="cuda", dtype=dtype)
         b = torch.randn(k, n, device="cuda", dtype=dtype)

From 17382c27d01046dca3c5e88e4fa964bc007a70fa Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 18 Oct 2024 11:03:15 -0400
Subject: [PATCH 2/9] Run eager using torch.matmul with and without half
 reduction

---
 benchmarks/python/test_matmul.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index ff5613ca819..2ae40fa32bb 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -25,14 +25,18 @@ def load_matmul_problems():
         return list((int(m), int(n), int(k), layout) for m, n, k, layout in reader)
 
 
+@pytest.mark.parametrize("eager", [False, True], ids=["nvfuser", "eager"])
+@pytest.mark.parametrize("half_reduction", [False, True])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize(
     "config", load_matmul_problems(), ids=lambda val: "_".join(str(v) for v in val)
 )
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_matmul_nvf_benchmark(
     benchmark,
+    eager: bool,
     config: tuple,
     dtype: torch.dtype,
+    half_reduction: bool,
     disable_validation: bool,
     disable_benchmarking: bool,
 ):
@@ -40,10 +44,12 @@ def test_matmul_nvf_benchmark(
 
     clear_cuda_cache()
 
-    # disable half-precision split-K reduction in cuBLAS since we do not
-    # support this in nvFuser
-    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
-    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
+    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = half_reduction
+    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = half_reduction
+
+    if half_reduction and not eager:
+        # See https://github.com/NVIDIA/Fuser/pull/1719
+        pytest.skip("Reduced precision reduction not implemented in nvFuser")
 
     try:
         a = torch.randn(m, k, device="cuda", dtype=dtype)
@@ -62,7 +68,11 @@ def test_matmul_nvf_benchmark(
             fd.validate([a, b], [eager_output])
 
         if not disable_benchmarking:
-            run_benchmark(benchmark, fd.execute, [a, b])
+            run_benchmark(
+                benchmark,
+                (lambda inps: torch.matmul(*inps)) if eager else fd.execute,
+                [a, b],
+            )
 
     except torch.OutOfMemoryError:
         pytest.skip("Test failed due to OutOfMemoryError")

From 3205d5075df42caf2597b9fdafe6d1824a90e612 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 18 Oct 2024 11:10:06 -0400
Subject: [PATCH 3/9] Fix up ids and reorder parametrization for test

---
 benchmarks/python/test_matmul.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 2ae40fa32bb..27b5949bfa3 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -25,11 +25,11 @@ def load_matmul_problems():
         return list((int(m), int(n), int(k), layout) for m, n, k, layout in reader)
 
 
+@pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
 @pytest.mark.parametrize("eager", [False, True], ids=["nvfuser", "eager"])
-@pytest.mark.parametrize("half_reduction", [False, True])
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
 @pytest.mark.parametrize(
-    "config", load_matmul_problems(), ids=lambda val: "_".join(str(v) for v in val)
+    "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
 )
 def test_matmul_nvf_benchmark(
     benchmark,

From 2c0898ad4c753ac825d3fa0132c70148752a5275 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 18 Oct 2024 15:41:56 -0400
Subject: [PATCH 4/9] Avoid creating FusionDefinition for eager tests

---
 benchmarks/python/test_matmul.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 27b5949bfa3..2bdabdcc396 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -60,19 +60,23 @@ def test_matmul_nvf_benchmark(
         if layout == "TN" or layout == "NN":
             b = b.as_strided(size=[k, n], stride=[1, k])
 
-        with FusionDefinition() as fd:
-            matmul_fusion(fd, [a, b])
-
-        if not disable_validation:
-            eager_output = torch.matmul(a, b)
-            fd.validate([a, b], [eager_output])
-
-        if not disable_benchmarking:
+        if eager:
+            # NOTE: we never need to validate eager, as it is our baseline
             run_benchmark(
                 benchmark,
-                (lambda inps: torch.matmul(*inps)) if eager else fd.execute,
+                lambda ab: torch.matmul(*ab),
                 [a, b],
             )
+        else:
+            with FusionDefinition() as fd:
+                matmul_fusion(fd, [a, b])
+
+            if not disable_validation:
+                eager_output = torch.matmul(a, b)
+                fd.validate([a, b], [eager_output])
+
+            if not disable_benchmarking:
+                run_benchmark(benchmark, fd.execute, [a, b])
 
     except torch.OutOfMemoryError:
         pytest.skip("Test failed due to OutOfMemoryError")

From 9b5bb51e5f8491ee33193318fa0da5be98d92141 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 18 Oct 2024 19:24:49 -0400
Subject: [PATCH 5/9] Fail benchmark whenever no CUDA events are found

---
 benchmarks/python/core.py | 45 ++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py
index 18ff834a25f..12e7d0fbf9a 100644
--- a/benchmarks/python/core.py
+++ b/benchmarks/python/core.py
@@ -129,15 +129,18 @@ def torchprofile_timer(self) -> float:
         """
         try:
             self.prof.stop()
-            prof_averages = self.prof.key_averages()
-            elapsed_cuda_time = self._get_kernel_time(prof_averages)
-            self._increment_global_time(elapsed_cuda_time)
-            # Clear the internal profiler object to avoid accumulating function events and then restart the profiler
-            # See PR: https://github.com/pytorch/pytorch/pull/125510
-            self.prof.profiler = None
-            self.prof.start()
         except AssertionError:
             self.prof.start()
+            return self.current_time
+
+        prof_averages = self.prof.key_averages()
+        elapsed_cuda_time = self._get_kernel_time(prof_averages)
+        self._increment_global_time(elapsed_cuda_time)
+        # Clear the internal profiler object to avoid accumulating function events and then restart the profiler
+        # See PR: https://github.com/pytorch/pytorch/pull/125510
+        self.prof.profiler = None
+        self.prof.start()
+
         return self.current_time
 
     def fusionprofile_timer(self) -> float:
@@ -157,22 +160,20 @@ def _get_kernel_time(
         Returns:
             time_value: Elapsed CUDA time in seconds.
         """
-        elapsed_cuda_time = (
-            sum(
-                [
-                    # Re: torch profiler API changes in https://github.com/pytorch/pytorch/pull/123247
-                    (
-                        event.self_device_time_total
-                        if hasattr(event, "self_device_time_total")
-                        else event.self_cuda_time_total
-                    )
-                    for event in prof_averages
-                    if event.device_type == DeviceType.CUDA
-                ]
+        elapsed_cuda_time = 0
+        has_cuda_event = False
+        for event in prof_averages:
+            if event.device_type != DeviceType.CUDA:
+                continue
+            has_cuda_event = True
+            # Re: torch profiler API changes in https://github.com/pytorch/pytorch/pull/123247
+            elapsed_cuda_time += (
+                event.self_device_time_total
+                if hasattr(event, "self_device_time_total")
+                else event.self_cuda_time_total
+            )
-            / 1e6
-        )
-        return elapsed_cuda_time
+        assert has_cuda_event, "No CUDA events found"
+        return elapsed_cuda_time / 1e6
 
     def _increment_global_time(self, elapsed_time: float) -> None:
         self.current_time += elapsed_time

From e6ac0c3faed2901bd369d3916853a36142a59762 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Sat, 19 Oct 2024 07:35:26 -0400
Subject: [PATCH 6/9] Only set extra_info if there are CUDA events

---
 benchmarks/python/core.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py
index 12e7d0fbf9a..835e39953df 100644
--- a/benchmarks/python/core.py
+++ b/benchmarks/python/core.py
@@ -127,12 +127,12 @@ def torchprofile_timer(self) -> float:
         Returns:
             self.current_time: Global monotonic clock variable
         """
-        try:
-            self.prof.stop()
-        except AssertionError:
+        if self.prof.profiler is None:
             self.prof.start()
             return self.current_time
 
+        self.prof.stop()
+
         prof_averages = self.prof.key_averages()
         elapsed_cuda_time = self._get_kernel_time(prof_averages)
         self._increment_global_time(elapsed_cuda_time)
@@ -161,7 +161,7 @@ def _get_kernel_time(
             time_value: Elapsed CUDA time in seconds.
         """
         elapsed_cuda_time = 0
-        has_cuda_event = False
+        self.has_cuda_event = False
         for event in prof_averages:
             if event.device_type != DeviceType.CUDA:
                 continue
@@ -172,7 +172,6 @@ def _get_kernel_time(
                 if hasattr(event, "self_device_time_total")
                 else event.self_cuda_time_total
             )
-        assert has_cuda_event, "No CUDA events found"
         return elapsed_cuda_time / 1e6
 
     def _increment_global_time(self, elapsed_time: float) -> None:
@@ -223,14 +222,15 @@ def set_metrics(
                     iobytes += out.element_size() * out.numel()
 
         self.benchmark.extra_info["IOBytes"] = iobytes
-        bandwidth_bps = (
-            iobytes * self.benchmark.stats["rounds"]
-        ) / self.benchmark.stats["total"]
-        self.benchmark.extra_info["Bandwidth (Bps)"] = bandwidth_bps
-        self.benchmark.extra_info["Bandwidth (GBps)"] = bandwidth_bps / 1e9
-        self.benchmark.extra_info["% Peak Bandwidth (SOL)"] = (
-            100 * (bandwidth_bps / 1e9) / PEAK_BANDWIDTH_GBPS
-        )
+        if self.has_cuda_event:
+            bandwidth_bps = (
+                iobytes * self.benchmark.stats["rounds"]
+            ) / self.benchmark.stats["total"]
+            self.benchmark.extra_info["Bandwidth (Bps)"] = bandwidth_bps
+            self.benchmark.extra_info["Bandwidth (GBps)"] = bandwidth_bps / 1e9
+            self.benchmark.extra_info["% Peak Bandwidth (SOL)"] = (
+                100 * (bandwidth_bps / 1e9) / PEAK_BANDWIDTH_GBPS
+            )
 
 
 def run_benchmark(

From 6912ebe47ae4e3e746b3b57d84087f1365d9ab23 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 23 Oct 2024 15:35:44 -0400
Subject: [PATCH 7/9] Revert "Only set extra_info if there are CUDA events"

This reverts commit e6ac0c3faed2901bd369d3916853a36142a59762.
---
 benchmarks/python/core.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py
index 835e39953df..12e7d0fbf9a 100644
--- a/benchmarks/python/core.py
+++ b/benchmarks/python/core.py
@@ -127,12 +127,12 @@ def torchprofile_timer(self) -> float:
         Returns:
             self.current_time: Global monotonic clock variable
         """
-        if self.prof.profiler is None:
+        try:
+            self.prof.stop()
+        except AssertionError:
             self.prof.start()
             return self.current_time
 
-        self.prof.stop()
-
         prof_averages = self.prof.key_averages()
         elapsed_cuda_time = self._get_kernel_time(prof_averages)
         self._increment_global_time(elapsed_cuda_time)
@@ -161,7 +161,7 @@ def _get_kernel_time(
             time_value: Elapsed CUDA time in seconds.
         """
         elapsed_cuda_time = 0
-        self.has_cuda_event = False
+        has_cuda_event = False
         for event in prof_averages:
             if event.device_type != DeviceType.CUDA:
                 continue
@@ -172,6 +172,7 @@ def _get_kernel_time(
                 if hasattr(event, "self_device_time_total")
                 else event.self_cuda_time_total
             )
+        assert has_cuda_event, "No CUDA events found"
         return elapsed_cuda_time / 1e6
 
     def _increment_global_time(self, elapsed_time: float) -> None:
@@ -222,15 +223,14 @@ def set_metrics(
                     iobytes += out.element_size() * out.numel()
 
         self.benchmark.extra_info["IOBytes"] = iobytes
-        if self.has_cuda_event:
-            bandwidth_bps = (
-                iobytes * self.benchmark.stats["rounds"]
-            ) / self.benchmark.stats["total"]
-            self.benchmark.extra_info["Bandwidth (Bps)"] = bandwidth_bps
-            self.benchmark.extra_info["Bandwidth (GBps)"] = bandwidth_bps / 1e9
-            self.benchmark.extra_info["% Peak Bandwidth (SOL)"] = (
-                100 * (bandwidth_bps / 1e9) / PEAK_BANDWIDTH_GBPS
-            )
+        bandwidth_bps = (
+            iobytes * self.benchmark.stats["rounds"]
+        ) / self.benchmark.stats["total"]
+        self.benchmark.extra_info["Bandwidth (Bps)"] = bandwidth_bps
+        self.benchmark.extra_info["Bandwidth (GBps)"] = bandwidth_bps / 1e9
+        self.benchmark.extra_info["% Peak Bandwidth (SOL)"] = (
+            100 * (bandwidth_bps / 1e9) / PEAK_BANDWIDTH_GBPS
+        )
 
 
 def run_benchmark(

From 3cab9a7a49a4093268f09dc7c88599e3f9962139 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 24 Oct 2024 19:57:12 -0400
Subject: [PATCH 8/9] Split baseline out into separate test

---
 benchmarks/python/test_matmul.py | 67 ++++++++++++++++++++++++--------
 1 file changed, 50 insertions(+), 17 deletions(-)

diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 2bdabdcc396..6989b978a12 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -26,14 +26,14 @@ def load_matmul_problems():
 
 
 @pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
-@pytest.mark.parametrize("eager", [False, True], ids=["nvfuser", "eager"])
+@pytest.mark.parametrize("compile", [False], ids=["eager"])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
 @pytest.mark.parametrize(
     "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
 )
-def test_matmul_nvf_benchmark(
+def test_matmul_baseline_benchmark(
     benchmark,
-    eager: bool,
+    compile: bool,
     config: tuple,
     dtype: torch.dtype,
     half_reduction: bool,
@@ -47,10 +47,6 @@ def test_matmul_nvf_benchmark(
     torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = half_reduction
     torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = half_reduction
 
-    if half_reduction and not eager:
-        # See https://github.com/NVIDIA/Fuser/pull/1719
-        pytest.skip("Reduced precision reduction not implemented in nvFuser")
-
     try:
         a = torch.randn(m, k, device="cuda", dtype=dtype)
         b = torch.randn(k, n, device="cuda", dtype=dtype)
@@ -60,23 +56,60 @@ def test_matmul_nvf_benchmark(
         if layout == "TN" or layout == "NN":
             b = b.as_strided(size=[k, n], stride=[1, k])
 
-        if eager:
-            # NOTE: we never need to validate eager, as it is our baseline
+        # NOTE: we never need to validate eager, as it is our baseline
+        if not disable_benchmarking:
             run_benchmark(
                 benchmark,
                 lambda ab: torch.matmul(*ab),
                 [a, b],
             )
-        else:
-            with FusionDefinition() as fd:
-                matmul_fusion(fd, [a, b])
 
-            if not disable_validation:
-                eager_output = torch.matmul(a, b)
-                fd.validate([a, b], [eager_output])
+    except torch.OutOfMemoryError:
+        pytest.skip("Test failed due to OutOfMemoryError")
+
+
+@pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+@pytest.mark.parametrize(
+    "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
+)
+def test_matmul_nvf_benchmark(
+    benchmark,
+    config: tuple,
+    dtype: torch.dtype,
+    half_reduction: bool,
+    disable_validation: bool,
+    disable_benchmarking: bool,
+):
+    m, n, k, layout = config
+
+    clear_cuda_cache()
+
+    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = half_reduction
+    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = half_reduction
+
+    if half_reduction:
+        # See https://github.com/NVIDIA/Fuser/pull/1719
+        pytest.skip("Reduced precision reduction not implemented in nvFuser")
+
+    try:
+        a = torch.randn(m, k, device="cuda", dtype=dtype)
+        b = torch.randn(k, n, device="cuda", dtype=dtype)
+
+        if layout == "NT" or layout == "NN":
+            a = a.as_strided(size=[m, k], stride=[1, m])
+        if layout == "TN" or layout == "NN":
+            b = b.as_strided(size=[k, n], stride=[1, k])
+
+        with FusionDefinition() as fd:
+            matmul_fusion(fd, [a, b])
+
+        if not disable_validation:
+            eager_output = torch.matmul(a, b)
+            fd.validate([a, b], [eager_output])
 
-            if not disable_benchmarking:
-                run_benchmark(benchmark, fd.execute, [a, b])
+        if not disable_benchmarking:
+            run_benchmark(benchmark, fd.execute, [a, b])
 
     except torch.OutOfMemoryError:
         pytest.skip("Test failed due to OutOfMemoryError")

From 44c31a3236039fa3bbaad993467cd8bd1c3aae61 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 25 Oct 2024 16:06:18 -0400
Subject: [PATCH 9/9] Remove disable_benchmarking condition

---
 benchmarks/python/test_matmul.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 6989b978a12..9c003343e4d 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -57,12 +57,11 @@ def test_matmul_baseline_benchmark(
             b = b.as_strided(size=[k, n], stride=[1, k])
 
         # NOTE: we never need to validate eager, as it is our baseline
-        if not disable_benchmarking:
-            run_benchmark(
-                benchmark,
-                lambda ab: torch.matmul(*ab),
-                [a, b],
-            )
+        run_benchmark(
+            benchmark,
+            lambda ab: torch.matmul(*ab),
+            [a, b],
+        )
 
     except torch.OutOfMemoryError:
         pytest.skip("Test failed due to OutOfMemoryError")