From 0e1cd889d237ca479e411d81a2e28800b59fc9f9 Mon Sep 17 00:00:00 2001
From: tqchen
Date: Wed, 7 May 2025 09:54:24 -0400
Subject: [PATCH] [FFI][FEAT] AutoDLPack to enable external tensor args.

This PR introduces the AutoDLPack feature to the tvm ffi. When an ffi
Function takes a Tensor argument that conforms to the DLPack protocol,
it automatically imports the value into an NDArray and passes it as the
argument. This allows a compiled function to directly take torch.Tensor
as an input argument, without any extra conversion on the caller's side.

When a function returns an NDArray, the return value still needs to be
converted back via torch.from_dlpack. However, a common use case is
destination passing, where all inputs and outputs are pre-allocated and
passed into the function. AutoDLPack effectively enables zero-overhead
support for a wide range of python arrays.

We also added a benchmark script to measure the overall ffi overhead.

One thing to note is that there are still contiguity and alignment
requirements imposed by the underlying DSL compiler; as of now we use a
global value. So x.contiguous() is still needed before passing the
argument if a transpose or other view-producing ops were performed.
---
 ffi/scripts/benchmark_dlpack.py    | 345 +++++++++++++++++++++++++++++
 python/tvm/ffi/convert.py          |   5 +
 python/tvm/ffi/cython/function.pxi |  16 ++
 python/tvm/ffi/cython/ndarray.pxi  |   2 +
 tests/python/ffi/test_ndarray.py   |  27 +++
 5 files changed, 395 insertions(+)
 create mode 100644 ffi/scripts/benchmark_dlpack.py

diff --git a/ffi/scripts/benchmark_dlpack.py b/ffi/scripts/benchmark_dlpack.py
new file mode 100644
index 000000000000..b19f566364e4
--- /dev/null
+++ b/ffi/scripts/benchmark_dlpack.py
@@ -0,0 +1,345 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
This script benchmarks the calling overhead of different Python FFI
APIs through the DLPack protocol.

Specifically, we would like to understand the overall overhead of
Python/C++ API calls, to map out the design space and get a sense of
what operations are possible.

We pick a function f(x, y, z) where x, y, z are length-1 tensors.
The benchmark runs in eager mode, so it shows what is possible there;
it is orthogonal to other optimizations. For example, cudagraph can
eliminate these overheads completely, so the goal is only to get a
sense of what is achievable under eager mode.

Summary of some takeaways:
- numpy.add takes roughly 0.36 us per call, which gives a baseline for
  what can be done in a python environment.
- torch.add on gpu takes about 3.7 us per call, giving us an idea of
  what we roughly need to get to in eager mode.
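- the tvm ffi numbers below call a NOP defined in C++, so they isolate
  the per-call and per-argument DLPack conversion overhead rather than
  any compute.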
"""

import time

import numpy as np
import torch

from tvm import ffi as tvm_ffi


def print_speed(name, speed):
    print(f"{name:<40} {speed} sec/call")


def print_error(name, error):
    print(f"{name:<40} {error}")


def baseline_torch_add(repeat):
    """Run torch.add with one-element tensors."""

    def run_bench(device):
        x = torch.arange(1, device=device)
        y = torch.arange(1, device=device)
        z = torch.arange(1, device=device)

        torch.add(x, y, out=z)
        if device == "cuda":
            torch.cuda.synchronize()
        start = time.time()
        for i in range(repeat):
            torch.add(x, y, out=z)
        # note: we deliberately do not synchronize after the loop because
        # we want to see the overhead of the call itself, not the kernel.
        end = time.time()
        print_speed(f"torch.add[{device}]", (end - start) / repeat)

    # rough takeaway: add on cuda takes roughly 3e-6 sec/call
    run_bench("cpu")
    run_bench("cuda")


def baseline_numpy_add(repeat):
    """Run numpy.add with one-element arrays."""
    x = np.arange(1)
    y = np.arange(1)
    z = np.arange(1)

    np.add(x, y, out=z)
    start = time.time()
    for i in range(repeat):
        np.add(x, y, out=z)
    end = time.time()
    speed = (end - start) / repeat
    print_speed("numpy.add", speed)


def baseline_cupy_add(repeat):
    """Run cupy.add with one-element arrays."""
    try:
        import cupy
    except ImportError:
        # skip if cupy is not installed
        return
    x = cupy.arange(1)
    y = cupy.arange(1)
    z = cupy.arange(1)

    cupy.add(x, y, out=z)
    start = time.time()
    for i in range(repeat):
        cupy.add(x, y, out=z)
    end = time.time()
    speed = (end - start) / repeat
    print_speed("cupy.add", speed)


def tvm_ffi_nop(repeat):
    """Overhead of a tvm FFI python call via calling a NOP.

    testing.nop is defined in C++ and does nothing.
    """
    nop = tvm_ffi.get_global_func("testing.nop")
    x = tvm_ffi.from_dlpack(torch.arange(1))
    y = tvm_ffi.from_dlpack(torch.arange(1))
    z = tvm_ffi.from_dlpack(torch.arange(1))
    nop(x, y, z)
    start = time.time()
    for i in range(repeat):
        nop(x, y, z)
    end = time.time()
    print_speed("tvm.ffi.nop", (end - start) / repeat)


def bench_ffi_nop_from_dlpack(name, x, y, z, repeat):
    """Run dlpack conversion + tvm.ffi.nop.

    Measures the overhead of a dlpack import for each arg, followed by
    the nop invocation.
    """
    nop = tvm_ffi.get_global_func("testing.nop")
    tx = tvm_ffi.from_dlpack(x)
    ty = tvm_ffi.from_dlpack(y)
    tz = tvm_ffi.from_dlpack(z)
    nop(tx, ty, tz)

    start = time.time()
    for i in range(repeat):
        tx = tvm_ffi.from_dlpack(x)
        ty = tvm_ffi.from_dlpack(y)
        tz = tvm_ffi.from_dlpack(z)
        nop(tx, ty, tz)
    end = time.time()
    print_speed(name, (end - start) / repeat)


def tvm_ffi_nop_from_torch_dlpack(repeat):
    """Measure per-arg dlpack conversion + nop with torch tensors."""
    x = torch.arange(1)
    y = torch.arange(1)
    z = torch.arange(1)
    bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(torch)", x, y, z, repeat)


def tvm_ffi_nop_from_numpy_dlpack(repeat):
    """Measure per-arg dlpack conversion + nop with numpy arrays."""
    x = np.arange(1)
    y = np.arange(1)
    z = np.arange(1)
    bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(numpy)", x, y, z, repeat)


def tvm_ffi_self_dlpack_nop(repeat):
    """Measure per-arg dlpack conversion + nop with tvm NDArrays."""
    x = tvm_ffi.from_dlpack(torch.arange(1))
    y = tvm_ffi.from_dlpack(torch.arange(1))
    z = tvm_ffi.from_dlpack(torch.arange(1))
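    # x, y, z are already tvm NDArrays, so the helper below re-exports
    # and re-imports them through __dlpack__ on every iteration,
    # measuring tvm's own exporter + importer round trip.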
bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(tvm)", x, y, z, repeat) + + +def bench_ffi_nop_from_dlpack(name, x, y, z, repeat): + """run dlpack conversion + tvm.ffi.nop + + Measures overhead of running dlpack for each args then invoke + """ + nop = tvm_ffi.get_global_func("testing.nop") + tx = tvm_ffi.from_dlpack(x) + ty = tvm_ffi.from_dlpack(y) + tz = tvm_ffi.from_dlpack(z) + nop(tx, ty, tz) + + start = time.time() + for i in range(repeat): + tx = tvm_ffi.from_dlpack(x) + ty = tvm_ffi.from_dlpack(y) + tz = tvm_ffi.from_dlpack(z) + nop(tx, ty, tz) + end = time.time() + print_speed(name, (end - start) / repeat) + + +def tvm_ffi_nop_from_torch_utils_to_dlpack(repeat): + """ + Measures overhead of running dlpack for each args then invoke + but uses the legacy torch.utils.dlpack.to_dlpack API + + This helps to measure possible implementation overhead of torch. + """ + nop = tvm_ffi.get_global_func("testing.nop") + x = torch.arange(1) + y = torch.arange(1) + z = torch.arange(1) + + tx = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(x)) + ty = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(y)) + tz = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(z)) + nop(tx, ty, tz) + + start = time.time() + for i in range(repeat): + tx = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(x)) + ty = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(y)) + tz = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(z)) + nop(tx, ty, tz) + end = time.time() + speed = (end - start) / repeat + print_speed("tvm.ffi.nop+from_dlpack(torch.utils)", speed) + + +def bench_tvm_ffi_nop_autodlpack(name, x, y, z, repeat): + """ + Measures overhead of running dlpack via auto convert by directly + take torch.Tensor as inputs. + """ + nop = tvm_ffi.get_global_func("testing.nop") + nop(x, y, z) + start = time.time() + for i in range(repeat): + nop(x, y, z) + end = time.time() + speed = (end - start) / repeat + print_speed(name, speed) + + +def tvm_ffi_nop_autodlpack_from_torch(repeat, device="cpu"): + """ + Measures overhead of running dlpack via auto convert by directly + take torch.Tensor as inputs. + """ + # use larger to ensure alignment req is met + x = torch.arange(1, device=device) + y = torch.arange(1, device=device) + z = torch.arange(1, device=device) + bench_tvm_ffi_nop_autodlpack(f"tvm.ffi.nop.autodlpack(torch[{device}])", x, y, z, repeat) + + +def tvm_ffi_nop_autodlpack_from_numpy(repeat): + """ + Measures overhead of running dlpack via auto convert by directly + take numpy.ndarray as inputs. + """ + # use larger to ensure alignment req is met + x = np.arange(256) + y = np.arange(256) + z = np.arange(256) + bench_tvm_ffi_nop_autodlpack("tvm.ffi.nop.autodlpack(numpy)", x, y, z, repeat) + + +def bench_to_dlpack(x, name, repeat): + x.__dlpack__() + start = time.time() + for i in range(repeat): + x.__dlpack__() + end = time.time() + speed = (end - start) / repeat + print_speed(name, speed) + + +def bench_to_dlpack_versioned(x, name, repeat, max_version=(1, 1)): + """ + Measures overhead of running dlpack with latest 1.1. 
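
    Prints the error instead of a timing when the producer does not
    support the versioned protocol.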
+ """ + try: + x.__dlpack__(max_version=max_version) + start = time.time() + for i in range(repeat): + x.__dlpack__(max_version=max_version) + end = time.time() + speed = (end - start) / repeat + print_speed(name, speed) + except Exception as e: + print_error(name, e) + + +def bench_torch_utils_to_dlpack(repeat): + """ + Measures overhead of running torch.utils.dlpack.to_dlpack + """ + x = torch.arange(1) + torch.utils.dlpack.to_dlpack(x) + start = time.time() + for i in range(repeat): + torch.utils.dlpack.to_dlpack(x) + end = time.time() + speed = (end - start) / repeat + print_speed("torch.utils.dlpack.to_dlpack", speed) + + +def main(): + repeat = 10000 + print("-----------------------------") + print("Benchmark f(x, y, z) overhead") + print("-----------------------------") + baseline_numpy_add(repeat) + baseline_torch_add(repeat) + baseline_cupy_add(repeat) + tvm_ffi_nop(repeat) + tvm_ffi_nop_from_torch_dlpack(repeat) + tvm_ffi_nop_from_numpy_dlpack(repeat) + tvm_ffi_self_dlpack_nop(repeat) + tvm_ffi_nop_from_torch_utils_to_dlpack(repeat) + tvm_ffi_nop_autodlpack_from_torch(repeat, "cpu") + tvm_ffi_nop_autodlpack_from_torch(repeat, "cuda") + tvm_ffi_nop_autodlpack_from_numpy(repeat) + print("-------------------------------") + print("Benchmark x.__dlpack__ overhead") + print("-------------------------------") + bench_torch_utils_to_dlpack(repeat) + bench_to_dlpack(torch.arange(1), "torch.__dlpack__", repeat) + bench_to_dlpack(np.arange(1), "numpy.__dlpack__", repeat) + bench_to_dlpack(tvm_ffi.from_dlpack(torch.arange(1)), "tvm.__dlpack__", repeat) + print("---------------------------------------------------") + print("Benchmark x.__dlpack__(max_version=(1,1)) overhead") + print("---------------------------------------------------") + bench_to_dlpack_versioned(torch.arange(1), "torch.__dlpack__(max_version=(1,1))", repeat) + bench_to_dlpack_versioned(np.arange(1), "numpy.__dlpack__(max_version=(1,1))", repeat) + bench_to_dlpack_versioned( + tvm_ffi.from_dlpack(torch.arange(1)), "tvm.__dlpack__(max_version=(1,1))", repeat + ) + + +if __name__ == "__main__": + main() diff --git a/python/tvm/ffi/convert.py b/python/tvm/ffi/convert.py index 467f7a2fb491..5b25ddae259b 100644 --- a/python/tvm/ffi/convert.py +++ b/python/tvm/ffi/convert.py @@ -54,6 +54,11 @@ def convert(value: Any) -> Any: return core._convert_to_ffi_func(value) elif value is None: return None + elif hasattr(value, "__dlpack__"): + return core.from_dlpack( + value, + required_alignment=core.__dlpack_auto_import_required_alignment__, + ) elif isinstance(value, Exception): return core._convert_to_ffi_error(value) else: diff --git a/python/tvm/ffi/cython/function.pxi b/python/tvm/ffi/cython/function.pxi index be80023c85b4..294a1246b27b 100644 --- a/python/tvm/ffi/cython/function.pxi +++ b/python/tvm/ffi/cython/function.pxi @@ -17,6 +17,11 @@ import ctypes from numbers import Real, Integral +try: + import torch +except ImportError: + torch = None + cdef inline object make_ret(TVMFFIAny result): """convert result to return value.""" @@ -71,6 +76,17 @@ cdef inline int make_args(tuple py_args, TVMFFIAny* out, list temp_args) except elif isinstance(arg, Object): out[i].type_index = TVMFFIObjectGetTypeIndex((arg).chandle) out[i].v_ptr = (arg).chandle + elif torch is not None and isinstance(arg, torch.Tensor): + arg = from_dlpack(torch.utils.dlpack.to_dlpack(arg), + required_alignment=__dlpack_auto_import_required_alignment__) + out[i].type_index = kTVMFFINDArray + out[i].v_ptr = (arg).chandle + temp_args.append(arg) + elif hasattr(arg, 
"__dlpack__"): + arg = from_dlpack(arg, required_alignment=__dlpack_auto_import_required_alignment__) + out[i].type_index = kTVMFFINDArray + out[i].v_ptr = (arg).chandle + temp_args.append(arg) elif isinstance(arg, PyNativeObject): arg = arg.__tvm_ffi_object__ out[i].type_index = TVMFFIObjectGetTypeIndex((arg).chandle) diff --git a/python/tvm/ffi/cython/ndarray.pxi b/python/tvm/ffi/cython/ndarray.pxi index cadf3de4fd6e..b8534b41b38b 100644 --- a/python/tvm/ffi/cython/ndarray.pxi +++ b/python/tvm/ffi/cython/ndarray.pxi @@ -16,8 +16,10 @@ # under the License. __dlpack_version__ = (1, 1) +__dlpack_auto_import_required_alignment__ = 8 _CLASS_NDARRAY = None + def _set_class_ndarray(cls): global _CLASS_NDARRAY _CLASS_NDARRAY = cls diff --git a/tests/python/ffi/test_ndarray.py b/tests/python/ffi/test_ndarray.py index a5a6f5b07438..5b75171b55bb 100644 --- a/tests/python/ffi/test_ndarray.py +++ b/tests/python/ffi/test_ndarray.py @@ -14,6 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import pytest + +try: + import torch +except ImportError: + torch = None from tvm import ffi as tvm_ffi import numpy as np @@ -47,3 +53,24 @@ def test_shape_object(): shape3 = tvm_ffi.convert(shape) assert shape3.__tvm_ffi_object__.same_as(shape.__tvm_ffi_object__) assert isinstance(shape3, tvm_ffi.Shape) + + +@pytest.mark.skipif(torch is None, reason="Torch is not installed") +def test_ndarray_auto_dlpack(): + def check(x, y): + assert isinstance(y, tvm_ffi.NDArray) + assert y.shape == (128,) + assert y.dtype == tvm_ffi.dtype("int64") + assert y.device.device_type == tvm_ffi.Device.kDLCPU + assert y.device.device_id == 0 + x2 = torch.from_dlpack(y) + np.testing.assert_equal(x2.numpy(), x.numpy()) + + x = torch.arange(128) + fecho = tvm_ffi.get_global_func("testing.echo") + y = fecho(x) + check(x, y) + + # pass in list of tensors + y = fecho([x]) + check(x, y[0])