Skip to content
Merged
45 changes: 23 additions & 22 deletions benchmarks/python/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,18 @@ def torchprofile_timer(self) -> float:
"""
try:
self.prof.stop()
prof_averages = self.prof.key_averages()
elapsed_cuda_time = self._get_kernel_time(prof_averages)
self._increment_global_time(elapsed_cuda_time)
# Clear the internal profiler object to avoid accumulating function events and then restart the profiler
# See PR: https://github.com/pytorch/pytorch/pull/125510
self.prof.profiler = None
self.prof.start()
except AssertionError:
self.prof.start()
return self.current_time

prof_averages = self.prof.key_averages()
elapsed_cuda_time = self._get_kernel_time(prof_averages)
self._increment_global_time(elapsed_cuda_time)
# Clear the internal profiler object to avoid accumulating function events and then restart the profiler
# See PR: https://github.com/pytorch/pytorch/pull/125510
self.prof.profiler = None
self.prof.start()

return self.current_time

def fusionprofile_timer(self) -> float:
Expand All @@ -157,22 +160,20 @@ def _get_kernel_time(
Returns:
time_value: Elapsed CUDA time in seconds.
"""
elapsed_cuda_time = (
sum(
[
# Re: torch profiler API changes in https://github.com/pytorch/pytorch/pull/123247
(
event.self_device_time_total
if hasattr(event, "self_device_time_total")
else event.self_cuda_time_total
)
for event in prof_averages
if event.device_type == DeviceType.CUDA
]
elapsed_cuda_time = 0
has_cuda_event = False
for event in prof_averages:
if event.device_type != DeviceType.CUDA:
continue
has_cuda_event = True
# Re: torch profiler API changes in https://github.com/pytorch/pytorch/pull/123247
elapsed_cuda_time = (
elapsed_cuda_time + event.self_device_time_total
if hasattr(event, "self_device_time_total")
else event.self_cuda_time_total
)
/ 1e6
)
return elapsed_cuda_time
assert has_cuda_event, "No CUDA events found"
return elapsed_cuda_time / 1e6

def _increment_global_time(self, elapsed_time: float) -> None:
self.current_time += elapsed_time
Expand Down
55 changes: 53 additions & 2 deletions benchmarks/python/test_matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,72 @@ def load_matmul_problems():
return list((int(m), int(n), int(k), layout) for m, n, k, layout in reader)


@pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
@pytest.mark.parametrize("compile", [False], ids=["eager"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
@pytest.mark.parametrize(
    "config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
)
def test_matmul_baseline_benchmark(
    benchmark,
    compile: bool,
    config: tuple,
    dtype: torch.dtype,
    half_reduction: bool,
    disable_validation: bool,
    disable_benchmarking: bool,
):
    """Benchmark eager-mode ``torch.matmul`` as the baseline.

    Args:
        benchmark: pytest-benchmark fixture used by ``run_benchmark``.
        compile: Unused toggle; only the eager configuration is parametrized.
        config: ``(m, n, k, layout)`` matmul problem loaded from the problems file.
        dtype: Matrix element type (fp16 or bf16).
        half_reduction: Whether to allow reduced-precision accumulation in cuBLAS.
        disable_validation: Flag fixture; no validation is performed here anyway.
        disable_benchmarking: Flag fixture consumed by the benchmark harness.
    """
    m, n, k, layout = config

    clear_cuda_cache()

    # Toggle reduced-precision accumulation for both half dtypes so the
    # baseline matches the requested reduction mode.
    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = half_reduction
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = half_reduction

    try:
        a = torch.randn(m, k, device="cuda", dtype=dtype)
        b = torch.randn(k, n, device="cuda", dtype=dtype)

        # Re-stride operands to realize the requested memory layout:
        # stride [1, dim0] produces a column-major (transposed-layout) view.
        if layout == "NT" or layout == "NN":
            a = a.as_strided(size=[m, k], stride=[1, m])
        if layout == "TN" or layout == "NN":
            b = b.as_strided(size=[k, n], stride=[1, k])

        # NOTE: we never need to validate eager, as it is our baseline
        run_benchmark(
            benchmark,
            lambda ab: torch.matmul(*ab),
            [a, b],
        )

    except torch.OutOfMemoryError:
        # Large problem sizes may not fit on the device; skip rather than fail.
        pytest.skip("Test failed due to OutOfMemoryError")


@pytest.mark.parametrize("half_reduction", [False, True], ids=["fullred", "halfred"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
@pytest.mark.parametrize(
"config", load_matmul_problems(), ids=lambda val: "-".join(str(v) for v in val)
)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_matmul_nvf_benchmark(
benchmark,
config: tuple,
dtype: torch.dtype,
half_reduction: bool,
disable_validation: bool,
disable_benchmarking: bool,
):
m, n, k, layout = config

clear_cuda_cache()

torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = half_reduction
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = half_reduction

if half_reduction:
# See https://github.com/NVIDIA/Fuser/pull/1719
pytest.skip("Reduced precision reduction not implemented in nvFuser")

try:
a = torch.randn(m, k, device="cuda", dtype=dtype)
b = torch.randn(k, n, device="cuda", dtype=dtype)
Expand Down