From 6e4a88eccc2fb4497da9a137b45c4dfe71fb6c60 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg
Date: Thu, 9 May 2024 21:01:57 +0000
Subject: [PATCH 1/4] [Runtime] Allow query of available device memory through DeviceAPI

Prior to this commit, the total device memory could be queried through
the `DeviceAPI` interface, but the currently available device memory
could not.  This functionality may be useful for debugging, or for
validating that sufficient memory is available prior to model execution.

This commit implements the property `Device.available_global_memory`,
which queries `DeviceAttrKind::kAvailableGlobalMemory`.  Support for
this attribute, like all device attribute queries, may vary across
backends; the property returns `None` on backends that do not support
the query.

This commit currently implements `kAvailableGlobalMemory` support only
for TVM's CUDA backend.
---
 include/tvm/runtime/device_api.h    |  1 +
 python/tvm/_ffi/runtime_ctypes.py   | 16 ++++-
 src/runtime/cuda/cuda_device_api.cc |  6 ++
 .../test_runtime_ndarray.py         | 70 +++++++++++++------
 4 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index b419212602c4..14b2b84b0d36 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -51,6 +51,7 @@ enum DeviceAttrKind : int {
   kDriverVersion = 12,
   kL2CacheSizeBytes = 13,
   kTotalGlobalMemory = 14,
+  kAvailableGlobalMemory = 15,
 };
 
 #ifdef TVM_KALLOC_ALIGNMENT
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 099cbe972a4a..8462c2fe82df 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -539,11 +539,25 @@ def total_global_memory(self):
         Returns
         -------
         total_global_memory : int or None
-            Return the global memory available on device in bytes.
+            Return the total size of global memory on device in bytes.
             Return None if the device does not support this feature.
         """
         return self._GetDeviceAttr(self.device_type, self.device_id, 14)
 
+    @property
+    def available_global_memory(self):
+        """Return size of the total global memory.
+
+        Supported devices include CUDA/ROCm/Metal/OpenCL.
+
+        Returns
+        -------
+        total_global_memory : int or None
+            Return the amount of unallocated global memory on device in bytes.
+            Return None if the device does not support this feature.
+        """
+        return self._GetDeviceAttr(self.device_type, self.device_id, 15)
+
     def texture_spatial_limit(self):
         """Returns limits for textures by spatial dimensions
 
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 1c80397125e4..55efaf78e825 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -121,6 +121,12 @@ class CUDADeviceAPI final : public DeviceAPI {
       *rv = total_global_memory;
       return;
     }
+    case kAvailableGlobalMemory: {
+      size_t free_mem, total_mem;
+      CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem));
+      *rv = static_cast<int64_t>(free_mem);
+      return;
+    }
   }
   *rv = value;
 }
diff --git a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
index 197a2f88e3fa..38a1f32a10c3 100644
--- a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
+++ b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py
@@ -16,33 +16,63 @@
 # under the License.
"""Basic runtime enablement test.""" -import tvm -from tvm import te +import math + +import pytest import numpy as np + +import tvm import tvm.testing +from tvm import te + +dtype = tvm.testing.parameter("uint8", "int8", "uint16", "int16", "uint32", "int32", "float32") + + +def test_nd_create(target, dev, dtype): + x = np.random.randint(0, 10, size=(3, 4)) + x = np.array(x, dtype=dtype) + y = tvm.nd.array(x, device=dev) + z = y.copyto(dev) + assert y.dtype == x.dtype + assert y.shape == x.shape + assert isinstance(y, tvm.nd.NDArray) + np.testing.assert_equal(x, y.numpy()) + np.testing.assert_equal(x, z.numpy()) + + # no need here, just to test usablity + dev.sync() + + +def test_memory_usage(target, dev, dtype): + available_memory_before = dev.available_global_memory + if available_memory_before is None: + pytest.skip(reason=f"Target '{target}' does not support queries of available memory") + + arr = tvm.nd.empty([1024, 1024], dtype=dtype, device=dev) + available_memory_after = dev.available_global_memory + + num_elements = math.prod(arr.shape) + element_nbytes = tvm.runtime.DataType(dtype).itemsize() + expected_memory_after = available_memory_before - num_elements * element_nbytes + + # Allocations may be padded out to provide alignment, to match a + # page boundary, due to additional device-side bookkeeping + # required by the TVM backend or the driver, etc. Therefore, the + # available memory may decrease by more than the requested amount. + assert available_memory_after <= expected_memory_after + # TVM's NDArray type is a reference-counted handle to the + # underlying reference. After the last reference to an NDArray is + # cleared, the backing allocation will be freed. + del arr -@tvm.testing.uses_gpu -def test_nd_create(): - for target, dev in tvm.testing.enabled_targets(): - for dtype in ["uint8", "int8", "uint16", "int16", "uint32", "int32", "float32"]: - x = np.random.randint(0, 10, size=(3, 4)) - x = np.array(x, dtype=dtype) - y = tvm.nd.array(x, device=dev) - z = y.copyto(dev) - assert y.dtype == x.dtype - assert y.shape == x.shape - assert isinstance(y, tvm.nd.NDArray) - np.testing.assert_equal(x, y.numpy()) - np.testing.assert_equal(x, z.numpy()) - # no need here, just to test usablity - dev.sync() + assert dev.available_global_memory == available_memory_before def test_fp16_conversion(): n = 100 - for (src, dst) in [("float32", "float16"), ("float16", "float32")]: + for src, dst in [("float32", "float16"), ("float16", "float32")]: A = te.placeholder((n,), dtype=src) B = te.compute((n,), lambda i: A[i].astype(dst)) @@ -66,6 +96,4 @@ def test_dtype(): if __name__ == "__main__": - test_nd_create() - test_fp16_conversion() - test_dtype() + tvm.testing.main() From 823fd4496698d9726a99bad9e84ade85e0a1dc21 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 13 May 2024 20:22:48 +0000 Subject: [PATCH 2/4] Updated docstring to fix copy/paste typo --- python/tvm/_ffi/runtime_ctypes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 8462c2fe82df..f148e26f3fcb 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -546,13 +546,13 @@ def total_global_memory(self): @property def available_global_memory(self): - """Return size of the total global memory. + """Return size of the available global memory. - Supported devices include CUDA/ROCm/Metal/OpenCL. + Supported devices include CUDA. 
         Returns
         -------
-        total_global_memory : int or None
+        available_global_memory : int or None
             Return the amount of unallocated global memory on device in bytes.
             Return None if the device does not support this feature.
         """

From cd298d8952d61c6f171259e1036ee1e953f3c232 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg
Date: Wed, 15 May 2024 08:40:03 -0500
Subject: [PATCH 3/4] Lint fix, cover all enum values in case/switch

---
 src/runtime/opencl/opencl_device_api.cc | 6 ++++++
 src/runtime/vulkan/vulkan_device_api.cc | 5 +++++
 2 files changed, 11 insertions(+)

diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index ab553052bbda..0057d0a10102 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -214,6 +214,12 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = static_cast<int64_t>(total_global_memory);
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.  Based on
+      // https://stackoverflow.com/a/3568223, this may not be
+      // implementable at all through the OpenCL API.
+      break;
   }
 }
 
diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc
index 4b337dd52455..483668a2a75f 100644
--- a/src/runtime/vulkan/vulkan_device_api.cc
+++ b/src/runtime/vulkan/vulkan_device_api.cc
@@ -168,6 +168,11 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = device(index).compute_memory_size;
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.  Will only be implementable for
+      // devices that support the VK_EXT_memory_budget extension.
+      break;
   }
 }
 

From edd86bf2c8e5a31e9c0f87ba6943017d49583832 Mon Sep 17 00:00:00 2001
From: Eric Lunderberg
Date: Fri, 17 May 2024 07:16:02 -0500
Subject: [PATCH 4/4] Fix ROCm compilation warning

---
 src/runtime/rocm/rocm_device_api.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index ffc8d5a80597..f3cc46f92723 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -136,6 +136,10 @@ class ROCMDeviceAPI final : public DeviceAPI {
       *rv = total_global_memory;
       return;
     }
+
+    case kAvailableGlobalMemory:
+      // Not currently implemented.
+      break;
   }
   *rv = value;
 }
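
For reviewers, a minimal usage sketch of the property added by this series
(illustrative only, not part of the patches; it assumes a CUDA-enabled TVM
build, since CUDA is the only backend wired up to `kAvailableGlobalMemory`
here):

    import tvm

    # Illustrative sketch, not part of this patch series.  Assumes a
    # CUDA-enabled build of TVM; other backends currently return None.
    dev = tvm.cuda(0)

    total = dev.total_global_memory          # existing kTotalGlobalMemory query
    available = dev.available_global_memory  # new kAvailableGlobalMemory query

    if available is None:
        print("This backend does not support querying available memory")
    else:
        print(f"{available} of {total} bytes currently unallocated")

    # Allocating an NDArray reduces the reported available memory, possibly
    # by more than the requested size due to padding/alignment by the driver.
    arr = tvm.nd.empty([1024, 1024], dtype="float32", device=dev)
    print(dev.available_global_memory)

The new test_memory_usage test in
tests/python/all-platform-minimal-test/test_runtime_ndarray.py exercises the
same behavior and skips automatically on targets where the query returns None.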