diff --git a/benchmarks/python/core.py b/benchmarks/python/core.py
index 18ff834a25f..12e7d0fbf9a 100644
--- a/benchmarks/python/core.py
+++ b/benchmarks/python/core.py
@@ -129,15 +129,18 @@ def torchprofile_timer(self) -> float:
         """
         try:
             self.prof.stop()
-            prof_averages = self.prof.key_averages()
-            elapsed_cuda_time = self._get_kernel_time(prof_averages)
-            self._increment_global_time(elapsed_cuda_time)
-            # Clear the internal profiler object to avoid accumulating function events and then restart the profiler
-            # See PR: https://github.com/pytorch/pytorch/pull/125510
-            self.prof.profiler = None
-            self.prof.start()
         except AssertionError:
             self.prof.start()
+            return self.current_time
+
+        prof_averages = self.prof.key_averages()
+        elapsed_cuda_time = self._get_kernel_time(prof_averages)
+        self._increment_global_time(elapsed_cuda_time)
+        # Clear the internal profiler object to avoid accumulating function events and then restart the profiler
+        # See PR: https://github.com/pytorch/pytorch/pull/125510
+        self.prof.profiler = None
+        self.prof.start()
+
         return self.current_time
 
     def fusionprofile_timer(self) -> float:
@@ -157,22 +160,21 @@ def _get_kernel_time(
         Returns:
             time_value: Elapsed CUDA time in seconds.
         """
-        elapsed_cuda_time = (
-            sum(
-                [
-                    # Re: torch profiler API changes in https://github.com/pytorch/pytorch/pull/123247
-                    (
-                        event.self_device_time_total
-                        if hasattr(event, "self_device_time_total")
-                        else event.self_cuda_time_total
-                    )
-                    for event in prof_averages
-                    if event.device_type == DeviceType.CUDA
-                ]
+        elapsed_cuda_time = 0
+        has_cuda_event = False
+        for event in prof_averages:
+            if event.device_type != DeviceType.CUDA:
+                continue
+            has_cuda_event = True
+            # Re: torch profiler API changes in https://github.com/pytorch/pytorch/pull/123247
+            # Use += on the whole conditional so the fallback attribute is added to the running total, not assigned over it.
+            elapsed_cuda_time += (
+                event.self_device_time_total
+                if hasattr(event, "self_device_time_total")
+                else event.self_cuda_time_total
             )
-            / 1e6
-        )
-        return elapsed_cuda_time
+        assert has_cuda_event, "No CUDA events found"
+        return elapsed_cuda_time / 1e6
 
     def _increment_global_time(self, elapsed_time: float) -> None:
         self.current_time += elapsed_time
diff --git a/benchmarks/python/test_matmul.py b/benchmarks/python/test_matmul.py
index 4eed491752d..9c003343e4d 100644
--- a/benchmarks/python/test_matmul.py
+++ b/benchmarks/python/test_matmul.py
@@ -25,14 +25,58 @@ def load_matmul_problems():
         return list((int(m), int(n), int(k), layout) for m, n, k, layout in reader)
 
 
+@pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
+@pytest.mark.parametrize("compile", [False], ids=["eager"])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
 @pytest.mark.parametrize(
-    "config", load_matmul_problems(), ids=lambda val: "_".join(str(v) for v in val)
+    "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
+)
+def test_matmul_baseline_benchmark(
+    benchmark,
+    compile: bool,
+    config: tuple,
+    dtype: torch.dtype,
+    half_reduction: bool,
+    disable_validation: bool,
+    disable_benchmarking: bool,
+):
+    m, n, k, layout = config
+
+    clear_cuda_cache()
+
+    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = half_reduction
+    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = half_reduction
+
+    try:
+        a = torch.randn(m, k, device="cuda", dtype=dtype)
+        b = torch.randn(k, n, device="cuda", dtype=dtype)
+
+        if layout == "NT" or layout == "NN":
+            a = a.as_strided(size=[m, k], stride=[1, m])
+        if layout == "TN" or layout == "NN":
+            b = b.as_strided(size=[k, n], stride=[1, k])
+
+        # NOTE: we never need to validate eager, as it is our baseline
+        run_benchmark(
+            benchmark,
+            lambda ab: torch.matmul(*ab),
+            [a, b],
+        )
+
+    except torch.OutOfMemoryError:
+        pytest.skip("Test failed due to OutOfMemoryError")
+
+
+@pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
+@pytest.mark.parametrize(
+    "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
 )
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 def test_matmul_nvf_benchmark(
     benchmark,
     config: tuple,
     dtype: torch.dtype,
+    half_reduction: bool,
     disable_validation: bool,
     disable_benchmarking: bool,
 ):
@@ -40,6 +84,13 @@ def test_matmul_nvf_benchmark(
 
     clear_cuda_cache()
 
+    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = half_reduction
+    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = half_reduction
+
+    if half_reduction:
+        # See https://github.com/NVIDIA/Fuser/pull/1719
+        pytest.skip("Reduced precision reduction not implemented in nvFuser")
+
     try:
         a = torch.randn(m, k, device="cuda", dtype=dtype)
         b = torch.randn(k, n, device="cuda", dtype=dtype)