From f7e85ae83e01110f3fb9f4de7c0feb20161bd092 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 10:44:23 +0530
Subject: [PATCH 1/7] conditionally check if compute capability is met.

---
 src/diffusers/utils/torch_utils.py   | 10 ++++++++++
 tests/models/test_modeling_common.py |  5 ++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index 12eef8899bbb..3c8911773e39 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -149,3 +149,13 @@ def apply_freeu(
         res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s2"])

     return hidden_states, res_hidden_states
+
+
+def get_torch_cuda_device_capability():
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        compute_capability = torch.cuda.get_device_capability(device)
+        compute_capability = f"{compute_capability[0]}.{compute_capability[1]}"
+        return float(compute_capability)
+    else:
+        return None
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index b88b6f16b9fb..e8d43fb7aea3 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -68,6 +68,7 @@
     torch_all_close,
     torch_device,
 )
+from diffusers.utils.torch_utils import get_torch_cuda_device_capability

 from ..others.test_utils import TOKEN, USER, is_staging_test

@@ -1412,10 +1413,12 @@ def get_memory_usage(storage_dtype, compute_dtype):
             torch.float8_e4m3fn, torch.bfloat16
         )

+        compute_capability = get_torch_cuda_device_capability()
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
         # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
         # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
-        self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
+        if compute_capability < 8.9:
+            self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
         # bytes. This only happens for some models, so we allow a small tolerance.
         # For any real model being tested, the order would be fp8_e4m3_bf16 < fp8_e4m3_fp32 < fp32.

From f72b0cb89c0dd8837e18f1eb185a465f6c1e17e5 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 10:58:52 +0530
Subject: [PATCH 2/7] log info.

---
 src/diffusers/utils/torch_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index 3c8911773e39..bca12fa156d8 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -154,8 +154,11 @@ def apply_freeu(
 def get_torch_cuda_device_capability():
     if torch.cuda.is_available():
         device = torch.device("cuda")
+        gpu_name = torch.cuda.get_device_name(device)
         compute_capability = torch.cuda.get_device_capability(device)
         compute_capability = f"{compute_capability[0]}.{compute_capability[1]}"
+        print(f"{gpu_name=}, {compute_capability=}")
         return float(compute_capability)
     else:
         return None
+

From 1d4bc033a436d5a42286150e5ada44c71e7c573c Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:00:42 +0530
Subject: [PATCH 3/7] fix condition.

---
 tests/models/test_modeling_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index e8d43fb7aea3..4ab5b3c5527f 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1417,7 +1417,7 @@ def get_memory_usage(storage_dtype, compute_dtype):
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
         # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
         # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
-        if compute_capability < 8.9:
+        if compute_capability >= 8.9:
             self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
         # bytes. This only happens for some models, so we allow a small tolerance.

From 7725271ba0a3455a37b1323df29c422723e1369c Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:11:08 +0530
Subject: [PATCH 4/7] updates

---
 src/diffusers/utils/torch_utils.py   | 2 --
 tests/models/test_modeling_common.py | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index bca12fa156d8..524ffcb81f58 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -154,10 +154,8 @@ def apply_freeu(
 def get_torch_cuda_device_capability():
     if torch.cuda.is_available():
         device = torch.device("cuda")
-        gpu_name = torch.cuda.get_device_name(device)
         compute_capability = torch.cuda.get_device_capability(device)
         compute_capability = f"{compute_capability[0]}.{compute_capability[1]}"
-        print(f"{gpu_name=}, {compute_capability=}")
         return float(compute_capability)
     else:
         return None
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 4ab5b3c5527f..f390b562fdfb 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1385,6 +1385,7 @@ def test_layerwise_casting(storage_dtype, compute_dtype):
     @require_torch_gpu
     def test_layerwise_casting_memory(self):
         MB_TOLERANCE = 0.2
+        LEAST_COMPUTE_CAPABILITY = 8.0

         def reset_memory_stats():
             gc.collect()
@@ -1417,7 +1418,7 @@ def get_memory_usage(storage_dtype, compute_dtype):
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
         # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
         # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
-        if compute_capability >= 8.9:
+        if compute_capability >= LEAST_COMPUTE_CAPABILITY:
             self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
         # bytes. This only happens for some models, so we allow a small tolerance.

From ebf1db542451ef8ad80264583d0b02d7d4784ed8 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:14:15 +0530
Subject: [PATCH 5/7] updates

---
 tests/models/test_modeling_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index f390b562fdfb..86f89c612e0a 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1418,7 +1418,7 @@ def get_memory_usage(storage_dtype, compute_dtype):
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
         # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
         # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
-        if compute_capability >= LEAST_COMPUTE_CAPABILITY:
+        if compute_capability and compute_capability >= LEAST_COMPUTE_CAPABILITY:
             self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
         # bytes. This only happens for some models, so we allow a small tolerance.

From f162b53f5718cfbb7956303206c79729818e25f5 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:15:44 +0530
Subject: [PATCH 6/7] updates

---
 tests/models/test_modeling_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 86f89c612e0a..c3cb082b0ef1 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1416,8 +1416,8 @@ def get_memory_usage(storage_dtype, compute_dtype):

         compute_capability = get_torch_cuda_device_capability()
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
-        # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
-        # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
+        # NOTE: the following assertion would fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
+        # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes. So, we conditionally check it.
         if compute_capability and compute_capability >= LEAST_COMPUTE_CAPABILITY:
             self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few

From 5ca175630827de000fc216558e18da73914d21bf Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:42:45 +0530
Subject: [PATCH 7/7] updates

---
 src/diffusers/utils/torch_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index 524ffcb81f58..3c8911773e39 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -159,4 +159,3 @@ def get_torch_cuda_device_capability():
         return float(compute_capability)
     else:
         return None
-
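
A minimal sketch (not part of the patch series) of how the helper added in PATCH 1/7 can gate a
device-dependent check, mirroring the final test logic; the 8.0 threshold is the
LEAST_COMPUTE_CAPABILITY introduced in PATCH 4/7, and the snippet is illustrative only.

    from diffusers.utils.torch_utils import get_torch_cuda_device_capability

    # Returns e.g. 7.5 on a Tesla T4, 8.0 on an A100 (Ampere), 8.9 on Ada GPUs; None when CUDA is unavailable.
    compute_capability = get_torch_cuda_device_capability()

    LEAST_COMPUTE_CAPABILITY = 8.0
    if compute_capability and compute_capability >= LEAST_COMPUTE_CAPABILITY:
        # The bf16-sensitive memory assertion only holds on Ampere (8.0) or newer.
        print(f"Compute capability {compute_capability}: running bf16 memory checks.")
    else:
        print(f"Compute capability {compute_capability}: skipping bf16 memory checks.")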