From f7e85ae83e01110f3fb9f4de7c0feb20161bd092 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 10:44:23 +0530
Subject: [PATCH 1/7] conditionally check if compute capability is met.

---
 src/diffusers/utils/torch_utils.py   | 10 ++++++++++
 tests/models/test_modeling_common.py |  5 ++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index 12eef8899bbb..3c8911773e39 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -149,3 +149,13 @@ def apply_freeu(
         res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s2"])

     return hidden_states, res_hidden_states
+
+
+def get_torch_cuda_device_capability():
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        compute_capability = torch.cuda.get_device_capability(device)
+        compute_capability = f"{compute_capability[0]}.{compute_capability[1]}"
+        return float(compute_capability)
+    else:
+        return None
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index b88b6f16b9fb..e8d43fb7aea3 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -68,6 +68,7 @@
     torch_all_close,
     torch_device,
 )
+from diffusers.utils.torch_utils import get_torch_cuda_device_capability

 from ..others.test_utils import TOKEN, USER, is_staging_test

@@ -1412,10 +1413,12 @@ def get_memory_usage(storage_dtype, compute_dtype):
             torch.float8_e4m3fn, torch.bfloat16
         )

+        compute_capability = get_torch_cuda_device_capability()
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
         # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
         # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
-        self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
+        if compute_capability < 8.9:
+            self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
         # bytes. This only happens for some models, so we allow a small tolerance.
         # For any real model being tested, the order would be fp8_e4m3_bf16 < fp8_e4m3_fp32 < fp32.

From f72b0cb89c0dd8837e18f1eb185a465f6c1e17e5 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 10:58:52 +0530
Subject: [PATCH 2/7] log info.

---
 src/diffusers/utils/torch_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index 3c8911773e39..bca12fa156d8 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -154,8 +154,11 @@ def apply_freeu(
 def get_torch_cuda_device_capability():
     if torch.cuda.is_available():
         device = torch.device("cuda")
+        gpu_name = torch.cuda.get_device_name(device)
         compute_capability = torch.cuda.get_device_capability(device)
         compute_capability = f"{compute_capability[0]}.{compute_capability[1]}"
+        print(f"{gpu_name=}, {compute_capability=}")
         return float(compute_capability)
     else:
         return None
+

From 1d4bc033a436d5a42286150e5ada44c71e7c573c Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:00:42 +0530
Subject: [PATCH 3/7] fix condition.

---
 tests/models/test_modeling_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index e8d43fb7aea3..4ab5b3c5527f 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1417,7 +1417,7 @@ def get_memory_usage(storage_dtype, compute_dtype):
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
         # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
         # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
-        if compute_capability < 8.9:
+        if compute_capability >= 8.9:
             self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
         # bytes. This only happens for some models, so we allow a small tolerance.

From 7725271ba0a3455a37b1323df29c422723e1369c Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:11:08 +0530
Subject: [PATCH 4/7] updates

---
 src/diffusers/utils/torch_utils.py   | 2 --
 tests/models/test_modeling_common.py | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index bca12fa156d8..524ffcb81f58 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -154,10 +154,8 @@ def apply_freeu(
 def get_torch_cuda_device_capability():
     if torch.cuda.is_available():
         device = torch.device("cuda")
-        gpu_name = torch.cuda.get_device_name(device)
         compute_capability = torch.cuda.get_device_capability(device)
         compute_capability = f"{compute_capability[0]}.{compute_capability[1]}"
-        print(f"{gpu_name=}, {compute_capability=}")
         return float(compute_capability)
     else:
         return None
diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 4ab5b3c5527f..f390b562fdfb 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1385,6 +1385,7 @@ def test_layerwise_casting(storage_dtype, compute_dtype):
     @require_torch_gpu
     def test_layerwise_casting_memory(self):
         MB_TOLERANCE = 0.2
+        LEAST_COMPUTE_CAPABILITY = 8.0

         def reset_memory_stats():
             gc.collect()
@@ -1417,7 +1418,7 @@ def get_memory_usage(storage_dtype, compute_dtype):
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
         # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
         # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
-        if compute_capability >= 8.9:
+        if compute_capability >= LEAST_COMPUTE_CAPABILITY:
             self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
         # bytes. This only happens for some models, so we allow a small tolerance.

From ebf1db542451ef8ad80264583d0b02d7d4784ed8 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:14:15 +0530
Subject: [PATCH 5/7] updates

---
 tests/models/test_modeling_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index f390b562fdfb..86f89c612e0a 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1418,7 +1418,7 @@ def get_memory_usage(storage_dtype, compute_dtype):
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
         # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
         # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
-        if compute_capability >= LEAST_COMPUTE_CAPABILITY:
+        if compute_capability and compute_capability >= LEAST_COMPUTE_CAPABILITY:
             self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few
         # bytes. This only happens for some models, so we allow a small tolerance.

From f162b53f5718cfbb7956303206c79729818e25f5 Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:15:44 +0530
Subject: [PATCH 6/7] updates

---
 tests/models/test_modeling_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py
index 86f89c612e0a..c3cb082b0ef1 100644
--- a/tests/models/test_modeling_common.py
+++ b/tests/models/test_modeling_common.py
@@ -1416,8 +1416,8 @@ def get_memory_usage(storage_dtype, compute_dtype):

         compute_capability = get_torch_cuda_device_capability()
         self.assertTrue(fp8_e4m3_bf16_memory_footprint < fp8_e4m3_fp32_memory_footprint < fp32_memory_footprint)
-        # NOTE: the following assertion will fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
-        # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes.
+        # NOTE: the following assertion would fail on our CI (running Tesla T4) due to bf16 using more memory than fp32.
+        # On other devices, such as DGX (Ampere) and Audace (Ada), the test passes. So, we conditionally check it.
         if compute_capability and compute_capability >= LEAST_COMPUTE_CAPABILITY:
             self.assertTrue(fp8_e4m3_bf16_max_memory < fp8_e4m3_fp32_max_memory)
         # On this dummy test case with a small model, sometimes fp8_e4m3_fp32 max memory usage is higher than fp32 by a few

From 5ca175630827de000fc216558e18da73914d21bf Mon Sep 17 00:00:00 2001
From: sayakpaul
Date: Tue, 28 Jan 2025 11:42:45 +0530
Subject: [PATCH 7/7] updates

---
 src/diffusers/utils/torch_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py
index 524ffcb81f58..3c8911773e39 100644
--- a/src/diffusers/utils/torch_utils.py
+++ b/src/diffusers/utils/torch_utils.py
@@ -159,4 +159,3 @@ def get_torch_cuda_device_capability():
         return float(compute_capability)
     else:
         return None
-
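
A minimal sketch (not part of the patch series) of how the helper added in PATCH 1/7 can gate a
device-dependent check, mirroring the final test logic; the 8.0 threshold is the
LEAST_COMPUTE_CAPABILITY introduced in PATCH 4/7, and the snippet is illustrative only.

    from diffusers.utils.torch_utils import get_torch_cuda_device_capability

    # Returns e.g. 7.5 on a Tesla T4, 8.0 on an A100 (Ampere), 8.9 on Ada GPUs; None when CUDA is unavailable.
    compute_capability = get_torch_cuda_device_capability()

    LEAST_COMPUTE_CAPABILITY = 8.0
    if compute_capability and compute_capability >= LEAST_COMPUTE_CAPABILITY:
        # The bf16-sensitive memory assertion only holds on Ampere (8.0) or newer.
        print(f"Compute capability {compute_capability}: running bf16 memory checks.")
    else:
        print(f"Compute capability {compute_capability}: skipping bf16 memory checks.")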