diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py
index 18ff834a25f..12e7d0fbf9a 100644
--- a/benchmarks/python/core.py
+++ b/benchmarks/python/core.py
@@ -129,15 +129,18 @@ def torchprofile_timer(self) -> float:
         """
         try:
             self.prof.stop()
-            prof_averages = self.prof.key_averages()
-            elapsed_cuda_time = self._get_kernel_time(prof_averages)
-            self._increment_global_time(elapsed_cuda_time)
-            # Clear the internal profiler object to avoid accumulating function events and then restart the profiler
-            # See PR: https://github.com/pytorch/pytorch/pull/125510
-            self.prof.profiler = None
-            self.prof.start()
         except AssertionError:
             self.prof.start()
+            return self.current_time
+
+        prof_averages = self.prof.key_averages()
+        elapsed_cuda_time = self._get_kernel_time(prof_averages)
+        self._increment_global_time(elapsed_cuda_time)
+        # Clear the internal profiler object to avoid accumulating function events and then restart the profiler
+        # See PR: https://github.com/pytorch/pytorch/pull/125510
+        self.prof.profiler = None
+        self.prof.start()
+
         return self.current_time
 
     def fusionprofile_timer(self) -> float:
@@ -157,22 +160,20 @@ def _get_kernel_time(
         Returns:
             time_value: Elapsed CUDA time in seconds.
         """
-        elapsed_cuda_time = (
-            sum(
-                [
-                    # Re: torch profiler API changes in https://github.com/pytorch/pytorch/pull/123247
-                    (
-                        event.self_device_time_total
-                        if hasattr(event, "self_device_time_total")
-                        else event.self_cuda_time_total
-                    )
-                    for event in prof_averages
-                    if event.device_type == DeviceType.CUDA
-                ]
+        elapsed_cuda_time = 0
+        has_cuda_event = False
+        for event in prof_averages:
+            if event.device_type != DeviceType.CUDA:
+                continue
+            has_cuda_event = True
+            # Accumulate whichever field exists; API changed in https://github.com/pytorch/pytorch/pull/123247
+            elapsed_cuda_time += (
+                event.self_device_time_total
+                if hasattr(event, "self_device_time_total")
+                else event.self_cuda_time_total
             )
-            / 1e6
-        )
-        return elapsed_cuda_time
+        assert has_cuda_event, "No CUDA events found"
+        return elapsed_cuda_time / 1e6
 
     def _increment_global_time(self, elapsed_time: float) -> None:
         self.current_time += elapsed_time
diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 4eed491752d..9c003343e4d 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -25,14 +25,58 @@ def load_matmul_problems():
         return list((int(m), int(n), int(k), layout) for m, n, k, layout in reader)
 
 
+@pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
+@pytest.mark.parametrize("compile", [False], ids=["eager"])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
 @pytest.mark.parametrize(
-    "config", load_matmul_problems(), ids=lambda val: "_".join(str(v) for v in val)
+    "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
+)
+def test_matmul_baseline_benchmark(
+    benchmark,
+    compile: bool,
+    config: tuple,
+    dtype: torch.dtype,
+    half_reduction: bool,
+    disable_validation: bool,
+    disable_benchmarking: bool,
+):
+    """Benchmark eager torch.matmul as the baseline for one matmul problem."""
+    m, n, k, layout = config
+
+    clear_cuda_cache()
+
+    # Apply the parametrized reduction mode to both half-precision matmul flags.
+    matmul_backend = torch.backends.cuda.matmul
+    matmul_backend.allow_fp16_reduced_precision_reduction = half_reduction
+    matmul_backend.allow_bf16_reduced_precision_reduction = half_reduction
+
+    try:
+        lhs = torch.randn(m, k, device="cuda", dtype=dtype)
+        rhs = torch.randn(k, n, device="cuda", dtype=dtype)
+
+        # Give the operands chosen by the layout code unit stride along dim 0.
+        if layout in ("NT", "NN"):
+            lhs = lhs.as_strided(size=[m, k], stride=[1, m])
+        if layout in ("TN", "NN"):
+            rhs = rhs.as_strided(size=[k, n], stride=[1, k])
+
+        # Eager is the reference implementation, so its output is never validated.
+        run_benchmark(benchmark, lambda operands: torch.matmul(*operands), [lhs, rhs])
+
+    except torch.OutOfMemoryError:
+        pytest.skip("Test failed due to OutOfMemoryError")
+
+
+@pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+@pytest.mark.parametrize(
+    "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
 )
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_matmul_nvf_benchmark(
     benchmark,
     config: tuple,
     dtype: torch.dtype,
+    half_reduction: bool,
     disable_validation: bool,
     disable_benchmarking: bool,
 ):
@@ -40,6 +84,13 @@ def test_matmul_nvf_benchmark(
 
     clear_cuda_cache()
 
+    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = half_reduction
+    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = half_reduction
+
+    if half_reduction:
+        # See https://github.com/NVIDIA/Fuser/pull/1719
+        pytest.skip("Reduced precision reduction not implemented in nvFuser")
+
     try:
         a = torch.randn(m, k, device="cuda", dtype=dtype)
         b = torch.randn(k, n, device="cuda", dtype=dtype)