From 3f9519d25dacd728873e0bd06b0f1cf3af5cd46b Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Fri, 7 Apr 2023 12:49:21 -0400
Subject: [PATCH 01/23] GH-34971: [Format] Enhance C-Data API to support
 non-cpu cases

---
 cpp/src/arrow/c/abi.h | 134 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index d58417e6fbc..a786566c007 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -65,6 +65,69 @@ struct ArrowArray {
 
 #endif  // ARROW_C_DATA_INTERFACE
 
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+// ArrowDeviceType is compatible with dlpack DLDeviceType for portability
+// it uses the same values for each enum as the equivalent kDL<type> from dlpack.h
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
+typedef enum {
+#endif
+  // CPU device, same as using ArrowArray directly
+  kArrowCPU = 1,
+  // CUDA GPU Device
+  kArrowCUDA = 2,
+  // Pinned CUDA CPU memory by cudaMallocHost
+  kArrowCUDAHost = 3,
+  // OpenCL Device
+  kArrowOpenCL = 4,
+  // Vulkan buffer for next-gen graphics
+  kArrowVulkan = 7,
+  // Metal for Apple GPU
+  kArrowMetal = 8,
+  // Verilog simulator buffer
+  kArrowVPI = 9,
+  // ROCm GPUs for AMD GPUs
+  kArrowROCM = 10,
+  // Pinned ROCm CPU memory allocated by hipMallocHost
+  kArrowROCMHost = 11,
+  // Reserved for extension
+  // used to quickly test extension devices,
+  // semantics can differ based on the implementation
+  kArrowExtDev = 12,
+  // CUDA managed/unified memory allocated by cudaMallocManaged
+  kArrowCUDAManaged = 13,
+  // unified shared memory allocated on a oneAPI non-partitioned
+  // device. call to oneAPI runtime is required to determine the
+  // device type, the USM allocation type and the sycl context it
+  // is bound to
+  kArrowOneAPI = 14,
+  // GPU support for next-gen WebGPU standard
+  kArrowWebGPU = 15,
+  // Qualcomm Hexagon DSP
+  kArrowHexagon = 16,
+} ArrowDeviceType;
+
+struct ArrowDeviceArray {
+  // the private_date and release callback of the arrow array
+  // should contain any necessary information and structures
+  // related to freeing the array according to the device it
+  // is allocated on, rather than having a separate release
+  // callback embedded here.
+  struct ArrowArray* array;
+  int device_id;
+  ArrowDeviceType device_type;
+  // reserve 128 bytes for future expansion
+  // of this struct as non-CPU development expands
+  // so that we can update without ABI breaking
+  // changes.
+  int64_t reserved[2];
+};
+
+#endif // ARROW_C_DEVICE_DATA_INTERFACE
+
 #ifndef ARROW_C_STREAM_INTERFACE
 #define ARROW_C_STREAM_INTERFACE
 
@@ -106,6 +169,77 @@ struct ArrowArrayStream {
 
 #endif  // ARROW_C_STREAM_INTERFACE
 
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#define ARROW_C_DEVICE_STREAM_INTERFACE
+
+
+struct ArrowDeviceArrayStream {
+  // The device that this stream produces data on.
+  // All ArrowDeviceArrays that are produced by this
+  // stream should have the same device_type as set
+  // here. The device_type needs to be provided here
+  // so that consumers can provide the correct type
+  // of stream_ptr when calling get_next.
+  ArrowDeviceType device_type;
+
+  // Callback to get the stream schema
+  // (will be the same for all arrays in the stream).
+  //
+  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+  //
+  // If successful, the ArrowSchema must be released independently from the stream.
+  int (*get_schema)(struct ArrowDeviceArrayStream*, struct ArrowSchema* out);
+
+  // Callback to get the device id for the next array.
+  // This is necessary so that the proper/correct stream pointer can be provided
+  // to get_next. The parameter provided must not be null.
+  //
+  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+  //
+  // The next call to `get_next` should provide an ArrowDeviceArray whose
+  // device_id matches what is provided here, and whose device_type is the
+  // same as the device_type member of this stream.
+  int (*get_next_device_id)(struct ArrowDeviceArrayStream*, int* out_device_id);
+
+  // Callback to get the next array
+  // (if no error and the array is released, the stream has ended)
+  //
+  // the provided stream_ptr should be the appropriate stream, or 
+  // equivalent object, for the device that the data is allocated on
+  // to indicate where the consumer wants the data to be accessible.
+  // if stream_ptr is NULL then the default stream (e.g. CUDA stream 0)
+  // should be used to ensure that the memory is accessible from any stream.
+  //
+  // because different frameworks use different types to represent this, we
+  // accept a void* which should then be reinterpreted into whatever the
+  // appropriate type is (e.g. cudaStream_t) for use by the producer.
+  //
+  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+  //
+  // If successful, the ArrowArray must be released independently from the stream.
+  int (*get_next)(struct ArrowDeviceArrayStream*, const void* stream_ptr, struct ArrowDeviceArray* out);
+
+  // Callback to get optional detailed error information.
+  // This must only be called if the last stream operation failed
+  // with a non-0 return code.
+  //
+  // Return value: pointer to a null-terminated character array describing
+  // the last error, or NULL if no description is available.
+  //
+  // The returned pointer is only valid until the next operation on this stream
+  // (including release).
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream*);
+
+  // Release callback: release the stream's own resources.
+  // Note that arrays returned by `get_next` must be individually released.
+  void (*release)(struct ArrowDeviceArrayStream*);
+
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
 #ifdef __cplusplus
 }
 #endif

From 63a65dc5bf06ed99f90f5ee96935531883b750de Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Fri, 7 Apr 2023 13:36:36 -0400
Subject: [PATCH 02/23] clang-format

---
 cpp/src/arrow/c/abi.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index a786566c007..bfe6e86451b 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -126,7 +126,7 @@ struct ArrowDeviceArray {
   int64_t reserved[2];
 };
 
-#endif // ARROW_C_DEVICE_DATA_INTERFACE
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
 
 #ifndef ARROW_C_STREAM_INTERFACE
 #define ARROW_C_STREAM_INTERFACE
@@ -172,7 +172,6 @@ struct ArrowArrayStream {
 #ifndef ARROW_C_DEVICE_STREAM_INTERFACE
 #define ARROW_C_DEVICE_STREAM_INTERFACE
 
-
 struct ArrowDeviceArrayStream {
   // The device that this stream produces data on.
   // All ArrowDeviceArrays that are produced by this
@@ -204,7 +203,7 @@ struct ArrowDeviceArrayStream {
   // Callback to get the next array
   // (if no error and the array is released, the stream has ended)
   //
-  // the provided stream_ptr should be the appropriate stream, or 
+  // the provided stream_ptr should be the appropriate stream, or
   // equivalent object, for the device that the data is allocated on
   // to indicate where the consumer wants the data to be accessible.
   // if stream_ptr is NULL then the default stream (e.g. CUDA stream 0)
@@ -217,7 +216,8 @@ struct ArrowDeviceArrayStream {
   // Return value: 0 if successful, an `errno`-compatible error code otherwise.
   //
   // If successful, the ArrowArray must be released independently from the stream.
-  int (*get_next)(struct ArrowDeviceArrayStream*, const void* stream_ptr, struct ArrowDeviceArray* out);
+  int (*get_next)(struct ArrowDeviceArrayStream*, const void* stream_ptr,
+                  struct ArrowDeviceArray* out);
 
   // Callback to get optional detailed error information.
   // This must only be called if the last stream operation failed

From fc16391111fd920dbb688d25ea42d1c21f592a8e Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Mon, 10 Apr 2023 13:11:18 -0400
Subject: [PATCH 03/23] updates from feedback

---
 cpp/src/arrow/c/abi.h | 276 ++++++++++++++++++++++++++----------------
 1 file changed, 170 insertions(+), 106 deletions(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index bfe6e86451b..29cd26e20e7 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -15,10 +15,30 @@
 // specific language governing permissions and limitations
 // under the License.
 
+/// \file abi.h Arrow C-Data Interface
+///
+/// The Arrow C-Data interface defines a very small, stable set
+/// of C definitions which can be easily copied into any project's
+/// source code and vendored to be used for columnar data interchange
+/// in the Arrow format. For non-C/C++ languages and runtimes,
+/// it should be almost as easy to translate the C definitions into
+/// the corresponding C FFI declarations.
+///
+/// Applications and libraries can therefore work with Arrow memory
+/// without necessarily using the Arrow libraries or reinventing
+/// the wheel. Developers can choose between tight integration
+/// with the Arrow software project or minimal integration with
+/// the Arrow format only.
+
 #pragma once
 
 #include <stdint.h>
 
+/// \defgroup Arrow C-Data Interface
+/// Definitions for the C-Data Interface/C-Stream Interface.
+///
+/// @{
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -68,61 +88,84 @@ struct ArrowArray {
 #ifndef ARROW_C_DEVICE_DATA_INTERFACE
 #define ARROW_C_DEVICE_DATA_INTERFACE
 
-// ArrowDeviceType is compatible with dlpack DLDeviceType for portability
-// it uses the same values for each enum as the equivalent kDL<type> from dlpack.h
-#ifdef __cplusplus
-typedef enum : int32_t {
-#else
-typedef enum {
-#endif
-  // CPU device, same as using ArrowArray directly
-  kArrowCPU = 1,
-  // CUDA GPU Device
-  kArrowCUDA = 2,
-  // Pinned CUDA CPU memory by cudaMallocHost
-  kArrowCUDAHost = 3,
-  // OpenCL Device
-  kArrowOpenCL = 4,
-  // Vulkan buffer for next-gen graphics
-  kArrowVulkan = 7,
-  // Metal for Apple GPU
-  kArrowMetal = 8,
-  // Verilog simulator buffer
-  kArrowVPI = 9,
-  // ROCm GPUs for AMD GPUs
-  kArrowROCM = 10,
-  // Pinned ROCm CPU memory allocated by hipMallocHost
-  kArrowROCMHost = 11,
-  // Reserved for extension
-  // used to quickly test extension devices,
-  // semantics can differ based on the implementation
-  kArrowExtDev = 12,
-  // CUDA managed/unified memory allocated by cudaMallocManaged
-  kArrowCUDAManaged = 13,
-  // unified shared memory allocated on a oneAPI non-partitioned
-  // device. call to oneAPI runtime is required to determine the
-  // device type, the USM allocation type and the sycl context it
-  // is bound to
-  kArrowOneAPI = 14,
-  // GPU support for next-gen WebGPU standard
-  kArrowWebGPU = 15,
-  // Qualcomm Hexagon DSP
-  kArrowHexagon = 16,
-} ArrowDeviceType;
+/// \defgroup arrow-device-types Device Types
+/// These macros are compatible with the dlpack DLDeviceType values,
+/// using the same value for each enum as the equivalent kDL<type>
+/// from dlpack.h
+///
+/// To ensure predictability with the ABI we use macros instead of
+/// an enum so the storage type is not compiler dependent.
+///
+/// @{
+
+/// \brief DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
 
+/// \brief CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU = 1
+/// \brief CUDA GPU Device
+#define ARROW_DEVICE_CUDA = 2
+/// \brief Pinned CUDA CPU memory by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST = 3
+/// \brief OpenCL Device
+#define ARROW_DEVICE_OPENCL = 4
+/// \brief Vulkan buffer for next-gen graphics
+#define ARROW_DEVICE_VULKAN = 7
+/// \brief Metal for Apple GPU
+#define ARROW_DEVICE_METAL = 8
+/// \brief Verilog simulator buffer
+#define ARROW_DEVICE_VPI = 9
+/// \brief ROCm GPUs for AMD GPUs
+#define ARROW_DEVICE_ROCM = 10
+/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+#define ARROW_DEVICE_ROCMHOST = 11
+/// \brief Reserved for extension
+///
+/// used to quickly test extension devices, semantics
+/// can differ based on the implementation
+#define ARROW_DEVICE_EXT_DEV = 12
+/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED = 13
+/// \brief unified shared memory allocated on a oneAPI
+/// non-partitioned device.
+///
+/// A call to the oneAPI runtime is required to determine the device
+/// type, the USM allocation type, and the sycl context it is bound to.
+#define ARROW_DEVICE_ONEAPI = 14
+/// \brief GPU support for next-gen WebGPU standard
+#define ARROW_DEVICE_WEBGPU = 15
+/// \brief Qualcomm Hexagon DSP
+#define ARROW_DEVICE_HEXAGON = 16
+
+/// @}
+
+/// \brief Struct for passing an Arrow Array alongside
+/// device memory information.
 struct ArrowDeviceArray {
-  // the private_date and release callback of the arrow array
-  // should contain any necessary information and structures
-  // related to freeing the array according to the device it
-  // is allocated on, rather than having a separate release
-  // callback embedded here.
-  struct ArrowArray* array;
-  int device_id;
+  /// \brief the Allocated Array
+  ///
+  /// the buffers in the array (along with the buffers of any
+  /// children) are what is allocated on the device.
+  ///
+  /// the private_data and release callback of the arrow array
+  /// should contain any necessary information and structures
+  /// related to freeing the array according to the device it
+  /// is allocated on, rather than having a separate release
+  /// callback embedded here.
+  struct ArrowArray array;
+  /// \brief The device id to identify a specific device
+  /// if multiple of this type are on the system.
+  ///
+  /// the semantics of the id will be hardware dependant.
+  int64_t device_id;
+  /// \brief The type of device which can access this memory.
   ArrowDeviceType device_type;
-  // reserve 128 bytes for future expansion
-  // of this struct as non-CPU development expands
-  // so that we can update without ABI breaking
-  // changes.
+  /// \brief Reserved bytes for future expansion.
+  ///
+  /// As non-CPU development expands we can update,
+  /// without ABI breaking changes. These bytes should
+  /// be zero'd out after allocation in order to ensure
+  /// safe evolution of the ABI in the future.
   int64_t reserved[2];
 };
 
@@ -172,69 +215,88 @@ struct ArrowArrayStream {
 #ifndef ARROW_C_DEVICE_STREAM_INTERFACE
 #define ARROW_C_DEVICE_STREAM_INTERFACE
 
+/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+///
+/// This stream is intended to provide a stream of data on a single
+/// device, if a producer wants data to be produced on multiple devices
+/// then multiple streams should be provided. One per device.
 struct ArrowDeviceArrayStream {
-  // The device that this stream produces data on.
-  // All ArrowDeviceArrays that are produced by this
-  // stream should have the same device_type as set
-  // here. The device_type needs to be provided here
-  // so that consumers can provide the correct type
-  // of stream_ptr when calling get_next.
+  /// \brief The device that this stream produces data on.
+  ///
+  /// All ArrowDeviceArrays that are produced by this
+  /// stream should have the same device_type as set
+  /// here. The device_type needs to be provided here
+  /// so that consumers can provide the correct type
+  /// of queue_ptr when calling get_next.
   ArrowDeviceType device_type;
 
-  // Callback to get the stream schema
-  // (will be the same for all arrays in the stream).
-  //
-  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
-  //
-  // If successful, the ArrowSchema must be released independently from the stream.
-  int (*get_schema)(struct ArrowDeviceArrayStream*, struct ArrowSchema* out);
+  /// \brief Callback to get the stream schema
+  /// (will be the same for all arrays in the stream).
+  ///
+  /// If successful, the ArrowSchema must be released independantly from the stream.
+  /// The schema should be accessible via CPU memory.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out C struct to export the schema to
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
 
-  // Callback to get the device id for the next array.
-  // This is necessary so that the proper/correct stream pointer can be provided
-  // to get_next. The parameter provided must not be null.
-  //
-  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
-  //
-  // The next call to `get_next` should provide an ArrowDeviceArray whose
-  // device_id matches what is provided here, and whose device_type is the
-  // same as the device_type member of this stream.
-  int (*get_next_device_id)(struct ArrowDeviceArrayStream*, int* out_device_id);
+  /// \brief Callback to get the device id for the next array.
+  ///
+  /// This is necessary so that the proper/correct stream pointer can be provided
+  /// to get_next.
+  ///
+  /// The next call to `get_next` should provide an ArrowDeviceArray whose
+  /// device_id matches what is provided here, and whose device_type is the
+  /// same as the device_type member of this stream.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[out] out_device_id Pointer to be populated with the device id, must not be
+  /// null \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next_device_id)(struct ArrowDeviceArrayStream* self, int64_t* out_device_id);
 
-  // Callback to get the next array
-  // (if no error and the array is released, the stream has ended)
-  //
-  // the provided stream_ptr should be the appropriate stream, or
-  // equivalent object, for the device that the data is allocated on
-  // to indicate where the consumer wants the data to be accessible.
-  // if stream_ptr is NULL then the default stream (e.g. CUDA stream 0)
-  // should be used to ensure that the memory is accessible from any stream.
-  //
-  // because different frameworks use different types to represent this, we
-  // accept a void* which should then be reinterpreted into whatever the
-  // appropriate type is (e.g. cudaStream_t) for use by the producer.
-  //
-  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
-  //
-  // If successful, the ArrowArray must be released independently from the stream.
-  int (*get_next)(struct ArrowDeviceArrayStream*, const void* stream_ptr,
+  /// \brief Callback to get the next array
+  ///
+  /// If there is no error and the returned array has been released, the stream
+  /// has ended. If successful, the ArrowArray must be released independently
+  /// from the stream.
+  ///
+  /// Because different frameworks use different types to represent this, we
+  /// accept a void* which should then be reinterpreted into whatever the
+  /// appropriate type is (e.g. cudaStream_t) for use by the producer.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \param[in] queue_ptr The appropriate queue, stream, or
+  /// equivalent object for the device that the data is allocated on
+  /// to indicate where the consumer wants the data to be accessible.
+  /// If queue_ptr is NULL then the default stream (e.g. CUDA stream 0)
+  /// should be used to ensure that the memory is accessible from any stream.
+  /// \param[out] out C struct where to export the Array and device info
+  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, const void* queue_ptr,
                   struct ArrowDeviceArray* out);
 
-  // Callback to get optional detailed error information.
-  // This must only be called if the last stream operation failed
-  // with a non-0 return code.
-  //
-  // Return value: pointer to a null-terminated character array describing
-  // the last error, or NULL if no description is available.
-  //
-  // The returned pointer is only valid until the next operation on this stream
-  // (including release).
-  const char* (*get_last_error)(struct ArrowDeviceArrayStream*);
+  /// \brief Callback to get optional detailed error information.
+  ///
+  /// This must only be called if the last stream operation failed
+  /// with a non-0 return code.
+  ///
+  /// The returned pointer is only valid until the next operation on this stream
+  /// (including release).
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  /// \return pointer to a null-terminated character array describing
+  /// the last error, or NULL if no description is available.  
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
 
-  // Release callback: release the stream's own resources.
-  // Note that arrays returned by `get_next` must be individually released.
-  void (*release)(struct ArrowDeviceArrayStream*);
+  /// \brief Release callback: release the stream's own resources.
+  ///
+  /// Note that arrays returned by `get_next` must be individually released.
+  ///
+  /// \param[in] self The ArrowDeviceArrayStream object itself
+  void (*release)(struct ArrowDeviceArrayStream* self);
 
-  // Opaque producer-specific data
+  /// \brief Opaque producer-specific data
   void* private_data;
 };
 
@@ -243,3 +305,5 @@ struct ArrowDeviceArrayStream {
 #ifdef __cplusplus
 }
 #endif
+
+/// @}
\ No newline at end of file

From 26c9aa928fcd66bb7054e48ebfc0ff07b055751f Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Mon, 10 Apr 2023 13:21:30 -0400
Subject: [PATCH 04/23] format/trim

---
 cpp/src/arrow/c/abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 29cd26e20e7..777f3eb9c4c 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -286,7 +286,7 @@ struct ArrowDeviceArrayStream {
   ///
   /// \param[in] self The ArrowDeviceArrayStream object itself
   /// \return pointer to a null-terminated character array describing
-  /// the last error, or NULL if no description is available.  
+  /// the last error, or NULL if no description is available.
   const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
 
   /// \brief Release callback: release the stream's own resources.

From e85d307be179fa9b26043838c3fa625926c770b4 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Mon, 10 Apr 2023 13:32:44 -0400
Subject: [PATCH 05/23] newline at end of file

---
 cpp/src/arrow/c/abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 777f3eb9c4c..873507ff45a 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -306,4 +306,4 @@ struct ArrowDeviceArrayStream {
 }
 #endif
 
-/// @}
\ No newline at end of file
+/// @}

From 13e94a5ff45d26f306ab19597e3088ffcb495364 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Wed, 10 May 2023 13:30:10 -0400
Subject: [PATCH 06/23] update based on feedback.

---
 cpp/src/arrow/c/abi.h | 55 +++++++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 873507ff45a..62cf4d863b4 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -91,7 +91,9 @@ struct ArrowArray {
 /// \defgroup arrow-device-types Device Types
 /// These macros are compatible with the dlpack DLDeviceType values,
 /// using the same value for each enum as the equivalent kDL<type>
-/// from dlpack.h
+/// from dlpack.h. This list should continue to be kept in sync with
+/// the equivalent dlpack.h enum values over time to ensure 
+/// compatibility, rather than potentially diverging.
 ///
 /// To ensure predictability with the ABI we use macros instead of
 /// an enum so the storage type is not compiler dependent.
@@ -160,13 +162,30 @@ struct ArrowDeviceArray {
   int64_t device_id;
   /// \brief The type of device which can access this memory.
   ArrowDeviceType device_type;
+  /// \brief An event-like object to synchronize on if needed.
+  ///
+  /// Many devices, like GPUs, are primarily asynchronous with
+  /// respect to CPU processing. As such in order to safely access
+  /// memory, it is often necessary to have an object to synchronize
+  /// processing on. Since different devices will use different types
+  /// to specify this we use a void* that can be coerced into
+  /// whatever the device appropriate type is (e.g. cudaEvent_t for
+  /// CUDA and hipEvent_t for HIP). 
+  ///
+  /// If synchronization is not needed this can be null. If this is
+  /// non-null, then it should be used to call the appropriate sync
+  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  void* sync_event;
   /// \brief Reserved bytes for future expansion.
   ///
-  /// As non-CPU development expands we can update,
-  /// without ABI breaking changes. These bytes should
-  /// be zero'd out after allocation in order to ensure
-  /// safe evolution of the ABI in the future.
-  int64_t reserved[2];
+  /// As non-CPU development expands we can update this struct
+  /// without ABI breaking changes. This also rounds out the
+  /// total size of this struct to be 128 bytes (power of 2)
+  /// on 64-bit systems. These bytes should be zero'd out after 
+  /// allocation in order to ensure safe evolution of the ABI in 
+  /// the future.
+  int64_t reserved[3];
+  int32_t reserved_addl;
 };
 
 #endif  // ARROW_C_DEVICE_DATA_INTERFACE
@@ -241,20 +260,6 @@ struct ArrowDeviceArrayStream {
   /// \return 0 if successful, an `errno`-compatible error code otherwise.
   int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
 
-  /// \brief Callback to get the device id for the next array.
-  ///
-  /// This is necessary so that the proper/correct stream pointer can be provided
-  /// to get_next.
-  ///
-  /// The next call to `get_next` should provide an ArrowDeviceArray whose
-  /// device_id matches what is provided here, and whose device_type is the
-  /// same as the device_type member of this stream.
-  ///
-  /// \param[in] self The ArrowDeviceArrayStream object itself
-  /// \param[out] out_device_id Pointer to be populated with the device id, must not be
-  /// null \return 0 if successful, an `errno`-compatible error code otherwise.
-  int (*get_next_device_id)(struct ArrowDeviceArrayStream* self, int64_t* out_device_id);
-
   /// \brief Callback to get the next array
   ///
   /// If there is no error and the returned array has been released, the stream
@@ -265,16 +270,10 @@ struct ArrowDeviceArrayStream {
   /// accept a void* which should then be reinterpreted into whatever the
   /// appropriate type is (e.g. cudaStream_t) for use by the producer.
   ///
-  /// \param[in] self The ArrowDeviceArrayStream object itself
-  /// \param[in] queue_ptr The appropriate queue, stream, or
-  /// equivalent object for the device that the data is allocated on
-  /// to indicate where the consumer wants the data to be accessible.
-  /// If queue_ptr is NULL then the default stream (e.g. CUDA stream 0)
-  /// should be used to ensure that the memory is accessible from any stream.
+  /// \param[in] self The ArrowDeviceArrayStream object itself  
   /// \param[out] out C struct where to export the Array and device info
   /// \return 0 if successful, an `errno`-compatible error code otherwise.
-  int (*get_next)(struct ArrowDeviceArrayStream* self, const void* queue_ptr,
-                  struct ArrowDeviceArray* out);
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
 
   /// \brief Callback to get optional detailed error information.
   ///

From 826390ec3fe0cedc9457ce8d070afe36ecf49085 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Thu, 11 May 2023 12:02:30 -0400
Subject: [PATCH 07/23] updates from feedback

---
 cpp/src/arrow/c/abi.h | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 62cf4d863b4..506e30571b2 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -92,7 +92,7 @@ struct ArrowArray {
 /// These macros are compatible with the dlpack DLDeviceType values,
 /// using the same value for each enum as the equivalent kDL<type>
 /// from dlpack.h. This list should continue to be kept in sync with
-/// the equivalent dlpack.h enum values over time to ensure 
+/// the equivalent dlpack.h enum values over time to ensure
 /// compatibility, rather than potentially diverging.
 ///
 /// To ensure predictability with the ABI we use macros instead of
@@ -170,7 +170,7 @@ struct ArrowDeviceArray {
   /// processing on. Since different devices will use different types
   /// to specify this we use a void* that can be coerced into
   /// whatever the device appropriate type is (e.g. cudaEvent_t for
-  /// CUDA and hipEvent_t for HIP). 
+  /// CUDA and hipEvent_t for HIP).
   ///
   /// If synchronization is not needed this can be null. If this is
   /// non-null, then it should be used to call the appropriate sync
@@ -181,11 +181,10 @@ struct ArrowDeviceArray {
   /// As non-CPU development expands we can update this struct
   /// without ABI breaking changes. This also rounds out the
   /// total size of this struct to be 128 bytes (power of 2)
-  /// on 64-bit systems. These bytes should be zero'd out after 
-  /// allocation in order to ensure safe evolution of the ABI in 
+  /// on 64-bit systems. These bytes should be zero'd out after
+  /// allocation in order to ensure safe evolution of the ABI in
   /// the future.
   int64_t reserved[3];
-  int32_t reserved_addl;
 };
 
 #endif  // ARROW_C_DEVICE_DATA_INTERFACE
@@ -264,13 +263,9 @@ struct ArrowDeviceArrayStream {
   ///
   /// If there is no error and the returned array has been released, the stream
   /// has ended. If successful, the ArrowArray must be released independently
-  /// from the stream.
+  /// from the stream.  
   ///
-  /// Because different frameworks use different types to represent this, we
-  /// accept a void* which should then be reinterpreted into whatever the
-  /// appropriate type is (e.g. cudaStream_t) for use by the producer.
-  ///
-  /// \param[in] self The ArrowDeviceArrayStream object itself  
+  /// \param[in] self The ArrowDeviceArrayStream object itself
   /// \param[out] out C struct where to export the Array and device info
   /// \return 0 if successful, an `errno`-compatible error code otherwise.
   int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);

From 09e5ee6197a756870a788cd66a021a4b3d2c762d Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Thu, 11 May 2023 12:07:26 -0400
Subject: [PATCH 08/23] lint

---
 cpp/src/arrow/c/abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 506e30571b2..d52edaf602b 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -263,7 +263,7 @@ struct ArrowDeviceArrayStream {
   ///
   /// If there is no error and the returned array has been released, the stream
   /// has ended. If successful, the ArrowArray must be released independently
-  /// from the stream.  
+  /// from the stream.
   ///
   /// \param[in] self The ArrowDeviceArrayStream object itself
   /// \param[out] out C struct where to export the Array and device info

From 564ce5890bacf384308dc39ef49cc32fd384fddf Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Mon, 15 May 2023 11:29:31 -0400
Subject: [PATCH 09/23] Update cpp/src/arrow/c/abi.h

Co-authored-by: John Zedlewski <904524+JohnZed@users.noreply.github.com>
---
 cpp/src/arrow/c/abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index d52edaf602b..4394bda53ac 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -120,7 +120,7 @@ typedef int32_t ArrowDeviceType;
 /// \brief ROCm GPUs for AMD GPUs
 #define ARROW_DEVICE_ROCM = 10
 /// \brief Pinned ROCm CPU memory allocated by hipMallocHost
-#define ARROW_DEVICE_ROCMHOST = 11
+#define ARROW_DEVICE_ROCM_HOST = 11
 /// \brief Reserved for extension
 ///
 /// used to quickly test extension devices, semantics

From e74c286e7235f4b453e8f3575e2164ec8f8ff8b0 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Mon, 15 May 2023 11:31:06 -0400
Subject: [PATCH 10/23] add event type examples

---
 cpp/src/arrow/c/abi.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 4394bda53ac..d56463e1a42 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -175,6 +175,15 @@ struct ArrowDeviceArray {
   /// If synchronization is not needed this can be null. If this is
   /// non-null, then it should be used to call the appropriate sync
   /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
+  ///
+  /// Expected type to coerce this void* to depending on device type:
+  ///   cuda: cudaEvent_t*
+  ///   ROCm: hipEvent_t*
+  ///   OpenCL: cl_event*
+  ///   Vulkan: VkEvent*
+  ///   Metal: MTLEvent*
+  ///   OneAPI: syscl::event*
+  ///
   void* sync_event;
   /// \brief Reserved bytes for future expansion.
   ///

From 1eb7ee9a5a5ee6884cfac9819b26ddbf65bc2bc4 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Tue, 23 May 2023 15:26:35 -0400
Subject: [PATCH 11/23] add spec writeup and feedback

---
 cpp/src/arrow/c/abi.h                       |  37 +-
 docs/source/format/CDataInterface.rst       |   4 +
 docs/source/format/CDeviceDataInterface.rst | 613 ++++++++++++++++++++
 docs/source/index.rst                       |   1 +
 4 files changed, 637 insertions(+), 18 deletions(-)
 create mode 100644 docs/source/format/CDeviceDataInterface.rst

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index d56463e1a42..0ba4b8a0aed 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -104,40 +104,40 @@ struct ArrowArray {
 typedef int32_t ArrowDeviceType;
 
 /// \brief CPU device, same as using ArrowArray directly
-#define ARROW_DEVICE_CPU = 1
+#define ARROW_DEVICE_CPU 1
 /// \brief CUDA GPU Device
-#define ARROW_DEVICE_CUDA = 2
+#define ARROW_DEVICE_CUDA 2
 /// \brief Pinned CUDA CPU memory by cudaMallocHost
-#define ARROW_DEVICE_CUDA_HOST = 3
+#define ARROW_DEVICE_CUDA_HOST 3
 /// \brief OpenCL Device
-#define ARROW_DEVICE_OPENCL = 4
+#define ARROW_DEVICE_OPENCL 4
 /// \brief Vulkan buffer for next-gen graphics
-#define ARROW_DEVICE_VULKAN = 7
+#define ARROW_DEVICE_VULKAN 7
 /// \brief Metal for Apple GPU
-#define ARROW_DEVICE_METAL = 8
+#define ARROW_DEVICE_METAL 8
 /// \brief Verilog simulator buffer
-#define ARROW_DEVICE_VPI = 9
+#define ARROW_DEVICE_VPI 9
 /// \brief ROCm GPUs for AMD GPUs
-#define ARROW_DEVICE_ROCM = 10
+#define ARROW_DEVICE_ROCM 10
 /// \brief Pinned ROCm CPU memory allocated by hipMallocHost
-#define ARROW_DEVICE_ROCM_HOST = 11
+#define ARROW_DEVICE_ROCM_HOST 11
 /// \brief Reserved for extension
 ///
 /// used to quickly test extension devices, semantics
 /// can differ based on the implementation
-#define ARROW_DEVICE_EXT_DEV = 12
+#define ARROW_DEVICE_EXT_DEV 12
 /// \brief CUDA managed/unified memory allocated by cudaMallocManaged
-#define ARROW_DEVICE_CUDA_MANAGED = 13
+#define ARROW_DEVICE_CUDA_MANAGED 13
 /// \brief unified shared memory allocated on a oneAPI
 /// non-partitioned device.
 ///
 /// A call to the oneAPI runtime is required to determine the device
 /// type, the USM allocation type, and the sycl context it is bound to.
-#define ARROW_DEVICE_ONEAPI = 14
+#define ARROW_DEVICE_ONEAPI 14
 /// \brief GPU support for next-gen WebGPU standard
-#define ARROW_DEVICE_WEBGPU = 15
+#define ARROW_DEVICE_WEBGPU 15
 /// \brief Qualcomm Hexagon DSP
-#define ARROW_DEVICE_HEXAGON = 16
+#define ARROW_DEVICE_HEXAGON 16
 
 /// @}
 
@@ -182,7 +182,7 @@ struct ArrowDeviceArray {
   ///   OpenCL: cl_event*
   ///   Vulkan: VkEvent*
   ///   Metal: MTLEvent*
-  ///   OneAPI: syscl::event*
+  ///   OneAPI: sycl::event*
   ///
   void* sync_event;
   /// \brief Reserved bytes for future expansion.
@@ -252,9 +252,10 @@ struct ArrowDeviceArrayStream {
   ///
   /// All ArrowDeviceArrays that are produced by this
   /// stream should have the same device_type as set
-  /// here. The device_type needs to be provided here
-  /// so that consumers can provide the correct type
-  /// of queue_ptr when calling get_next.
+  /// here. Including it here in the stream object is
+  /// a convenience to allow consumers simpler processing
+  /// since they can assume all arrays that result from
+  /// this stream to be on this device type.
   ArrowDeviceType device_type;
 
   /// \brief Callback to get the stream schema
diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst
index 17a5e37cd7e..fff34656d10 100644
--- a/docs/source/format/CDataInterface.rst
+++ b/docs/source/format/CDataInterface.rst
@@ -246,6 +246,7 @@ Examples
   has format string ``+us:4,5``; its two children have names ``ints`` and
   ``floats``, and format strings ``i`` and ``f`` respectively.
 
+.. _c-data-interface-struct-defs:
 
 Structure definitions
 =====================
@@ -531,6 +532,7 @@ parameterized extension types).
 The ``ArrowArray`` structure exported from an extension array simply points
 to the storage data of the extension array.
 
+.. _c-data-interface-semantics:
 
 Semantics
 =========
@@ -703,6 +705,8 @@ C producer examples
 Exporting a simple ``int32`` array
 ----------------------------------
 
+.. _c-data-interface-export-int32-schema:
+
 Export a non-nullable ``int32`` type with empty metadata.  In this case,
 all ``ArrowSchema`` members point to statically-allocated data, so the
 release callback is trivial.
diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst
new file mode 100644
index 00000000000..333d2be08ed
--- /dev/null
+++ b/docs/source/format/CDeviceDataInterface.rst
@@ -0,0 +1,613 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. highlight:: c
+
+.. _c-device-data-interface:
+
+=================================
+The Arrow C Device data interface
+=================================
+
+.. note:: The Arrow C Device Data Interface should be considered experimental
+
+Rationale
+=========
+
+The current :ref:`C Data Interface <c-data-interface>`, and most
+implementations of it, make the assumption that all data buffers provided
+are CPU buffers. Since Apache Arrow is designed to be a universal in-memory
+format for representing tabular ("columnar") data, there will be the desire
+to leverage this data on Non-CPU hardware such as GPUs. One example of such
+a case is the `RAPIDS cuDF library` which uses the Arrow memory format with
+CUDA for NVIDIA GPUs. Since copying data from host to device and back is
+expensive, the ideal would be to be able to leave the data on the device
+for as long as possible, even when passing it between runtimes and
+libraries.
+
+The Arrow C Device data interface builds on the existing C data interface
+by adding a very small, stable set of C definitions to it. For non-C/C++
+languages and runtimes, translating the C definitions to corresponding
+C FFI declarations should be just as simple as with the current C data
+interface.
+
+Applications and libraries can then use Arrow schemas and Arrow formatted
+memory on non-cpu devices to exchange data just as easily as they do
+now with CPU data. This will enable leaving data on those devices longer
+and avoiding costly copies back and forth between the host and device
+just to leverage new libraries and runtimes.
+
+Goals
+-----
+
+* Expose an ABI-stable interface built on the existing C data interface.
+* Make it easy for third-party projects to implement support with little
+  initial investment.
+* Allow zero-copy sharing of Arrow formatted device memory between
+  independant runtimes and components running in the same process.
+* Avoid the need for one-to-one adaptation layers such as the
+  `CUDA Array Interface` for Python processes to pass CUDA data.
+* Enable integration without explicit dependencies (either at compile-time
+  or runtime) on the Arrow software project itself.
+
+The intent is for the Arrow C Device data interface to expand the reach
+of the current C data interface, allowing it to also become the standard
+low-level building block for columnar processing on devices like GPUs or
+FPGAs.
+
+Structure definitions
+=====================
+
+Because this is built on the C data interface, the C Device data interface
+uses the ``ArrowSchema`` and ``ArrowArray`` structures as defined in the
+:ref:`C data interface spec <c-data-interface-struct-defs>`. It then adds the
+following free-standing definitions. Like the rest of the Arrow project,
+they are available under the Apache License 2.0.
+
+.. code-block:: c
+
+    #ifndef ARROW_C_DEVICE_DATA_INTERFACE
+    #define ARROW_C_DEVICE_DATA_INTERFACE
+
+    // Device type for the allocated memory
+    typedef int32_t ArrowDeviceType;
+
+    // CPU device, same as using ArrowArray directly
+    #define ARROW_DEVICE_CPU 1
+    // CUDA GPU Device
+    #define ARROW_DEVICE_CUDA 2
+    // Pinned CUDA CPU memory by cudaMallocHost
+    #define ARROW_DEVICE_CUDA_HOST 3
+    // OpenCL Device
+    #define ARROW_DEVICE_OPENCL 4
+    // Vulkan buffer for next-gen graphics
+    #define ARROW_DEVICE_VULKAN 7
+    // Metal for Apple GPU
+    #define ARROW_DEVICE_METAL 8
+    // Verilog simulator buffer
+    #define ARROW_DEVICE_VPI 9
+    // ROCm GPUs for AMD GPUs
+    #define ARROW_DEVICE_ROCM 10
+    // Pinned ROCm CPU memory allocated by hipMallocHost
+    #define ARROW_DEVICE_ROCM_HOST 11
+    // Reserved for extension
+    //
+    // used to quickly test extension devices, semantics
+    // can differ based on implementation
+    #define ARROW_DEVICE_EXT_DEV 12
+    // CUDA managed/unified memory allocated by cudaMallocManaged
+    #define ARROW_DEVICE_CUDA_MANAGED 13
+    // Unified shared memory allocated on a oneAPI
+    // non-partitioned device.
+    //
+    // A call to the oneAPI runtime is required to determine the
+    // device type, the USM allocation type and the sycl context
+    // that it is bound to.
+    #define ARROW_DEVICE_ONEAPI 14
+    // GPU support for next-gen WebGPU standard
+    #define ARROW_DEVICE_WEBGPU 15
+    // Qualcomm Hexagon DSP
+    #define ARROW_DEVICE_HEXAGON 16
+
+    struct ArrowDeviceArray {
+      struct ArrowArray array;
+      int64_t device_id;
+      ArrowDeviceType device_type;
+      void* sync_event;
+
+      // reserved bytes for future expansion
+      int64_t reserved[3];
+    };
+
+    #endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+.. note::
+   The canonical guard ``ARROW_C_DEVICE_DATA_INTERFACE`` is meant to avoid
+   duplicate definitions if two projects copy the definitions in their own
+   headers, and a third-party project includes from these two projects. It
+   is therefore important that this guard is kept exactly as-is when these
+   definitions are copied.
+
+ArrowDeviceType
+---------------
+
+The ``ArrowDeviceType`` typedef is used to indicate what type of device the
+provided memory buffers were allocated on. This, in conjunction with the
+``device_id``, should be sufficient to reference the correct data buffers.
+
+We then use macros to define values for different device types. The provided
+macro values are compatible with the widely used `dlpack` ``DLDeviceType``
+definition values, using the same value for each as the equivalent
+``kDL<type>`` enum from dlpack.h. The list will be kept in sync with those
+equivalent enum values over time to ensure compatibility, rather than
+potentially diverging. To avoid the Arrow project having to be in the
+position of vetting new hardware devices, new additions should first be
+added to dlpack before we add a corresponding macro here.
+
+To ensure predictability with the ABI, we use macros instead of an ``enum``
+so the storage type is not compiler dependent.
+
+.. c:macro:: ARROW_DEVICE_CPU
+
+    CPU Device, equivalent to just using ``ArrowArray`` directly instead of
+    using ``ArrowDeviceArray``.
+
+.. c:macro:: ARROW_DEVICE_CUDA
+
+    A `CUDA` GPU Device. This could represent data allocated either with the
+    runtime library (``cudaMalloc``) or the device driver (``cuMemAlloc``).
+
+.. c:macro:: ARROW_DEVICE_CUDA_HOST
+
+    CPU memory that was pinned and page-locked by CUDA by using
+    ``cudaMallocHost`` or ``cuMemAllocHost``.
+
+.. c:macro:: ARROW_DEVICE_OPENCL
+
+    Data allocated on the device by using the `OpenCL (Open Computing Language)`
+    framework.
+
+.. c:macro:: ARROW_DEVICE_VULKAN
+
+    Data allocated by the `Vulkan` framework and libraries.
+
+.. c:macro:: ARROW_DEVICE_METAL
+
+    Data on Apple GPU devices using the `Metal` framework and libraries.
+
+.. c:macro:: ARROW_DEVICE_VPI
+
+    Indicates usage of a Verilog simulator buffer.
+
+.. c:macro:: ARROW_DEVICE_ROCM
+
+    An AMD device using the `ROCm` stack.
+
+.. c:macro:: ARROW_DEVICE_ROCM_HOST
+
+    CPU memory pinned and page-locked allocated ``hipMallocHost``.
+
+.. c:macro:: ARROW_DEVICE_EXT_DEV
+
+    This value is an escape-hatch for devices to extend which aren't
+    currently represented otherwise. Producers would need to provide
+    additional information/context specific to the device if using
+    this device type. This is used to quickly test extension devices
+    and semantics can differ based on the implementation.
+
+.. c:macro:: ARROW_DEVICE_CUDA_MANAGED
+
+    CUDA managed/unified memory which is allocated by ``cudaMallocManaged``.
+
+.. c:macro:: ARROW_DEVICE_ONEAPI
+
+    Unified shared memory allocated on an Intel `oneAPI` non-partitioned
+    device. A call to the ``oneAPI`` runtime is required to determine
+    the specific device type, the USM allocation type and the sycl context
+    that it is bound to.
+
+.. c:macro:: ARROW_DEVICE_WEBGPU
+
+    GPU support for next-gen WebGPU standards
+
+.. c:macro:: ARROW_DEVICE_HEXAGON
+
+    Data allocated on a Qualcomm Hexagon DSP device.
+
+The ArrowDeviceArray structure
+------------------------------
+
+The ``ArrowDeviceArray`` structure embeds the C data ``ArrowArray`` structure
+and adds additional information necessary for consumers to use the data. It
+has the following fields:
+
+.. c:member:: struct ArrowArray ArrowDeviceArray.array
+
+    The allocated array data. The values in the ``void**`` buffers (along
+    with the buffers of any children) are what is allocated on the device.
+    The buffer values should be device pointers. The rest of the structure
+    should be accessible to the CPU.
+
+    The ``private_data`` and ``release`` callback of this structure should
+    contain any necessary information and structures related to freeing
+    the array according to the device it is allocated on, rather than
+    having a separate release callback and ``private_data`` pointer here.
+
+.. c:member:: int64_t ArrowDeviceArray.device_id
+
+    The device id to identify a specific device if multiple devices of this
+    type are on the system. The semantics of the id will be hardware dependent,
+    but we use an ``int64_t`` to future-proof the id as devices change over time.
+
+.. c:member:: ArrowDeviceType ArrowDeviceArray.device_type
+
+    The type of the device which can access the buffers in the array.
+
+.. c:member:: void* ArrowDeviceArray.sync_event
+
+    Optional. An event-like object to synchronize on if needed.
+
+    Many devices, like GPUs, are primarily asynchronous with respect to
+    CPU processing. As such in order to safely access memory, it is often
+    necessary to have an object to synchronize processing with. Since
+    different devices will use different types to specify this, we use a
+    void* which can be coerced into a pointer to whatever the device
+    appropriate type is.
+
+    If synchronization is not needed, this can be null. If this is non-null
+    then it MUST be used to call the appropriate sync method for the device
+    (e.g. ``cudaStreamWaitEvent`` or ``hipStreamWaitEvent``) before attempting
+    to access the memory in the buffers.
+
+    Expected types to coerce this ``void*`` to depending on the reported
+    device type:
+
+    * CUDA: ``cudaEvent_t*``
+    * ROCm: ``hipEvent_t*``
+    * OpenCL: ``cl_event*``
+    * Vulkan: ``VkEvent*``
+    * Metal: ``MTLEvent*``
+    * OneAPI: ``sycl::event*``
+
+    If an event is provided, then the producer MUST ensure that the event
+    is triggered/recorded at the end of the processing stream once the data
+    is considered available for use.
+
+
+.. c:member:: int64_t ArrowDeviceArray.reserved[3]
+
+    As non-CPU development expands, there may be a need to expand this
+    structure. In order to do so without potentially breaking ABI changes,
+    we reserve 24 bytes at the end of the object. This also has the added
+    benefit of bringing the total size of this structure to exactly 128
+    bytes (a power of 2) on 64-bit systems. These bytes should be zero'd
+    out after allocation in order to ensure safe evolution of the ABI in
+    the future.
+
+.. note::
+    Rather than store the shape / types of the data alongside the
+    ``ArrowDeviceArray``, users should utilize the existing ``ArrowSchema``
+    structure to pass any data type and shape information.
+
+Semantics
+=========
+
+Memory management
+-----------------
+
+The ``ArrowDeviceArray`` structure contains an ``ArrowArray`` object which
+itself has :ref:`specific semantics <c-data-interface-semantics>` for releasing
+memory. The term *"base structure"* below refers to the ``ArrowDeviceArray``
+object that is passed directly between the producer and consumer -- not any
+child structure thereof.
+
+It is intended for the base structure to be stack- or heap-allocated by the
+*consumer*. In this case, the producer API should take a pointer to the
+consumer-allocated structure.
+
+However, any data pointed to by the struct MUST be allocated and maintained
+by the producer. This includes the ``sync_event`` member if it is not null,
+along with any pointers in the ``ArrowArray`` object as usual. Data lifetime
+is managed through the ``release`` callback of the ``ArrowArray`` member.
+
+For an ``ArrowDeviceArray``, the semantics of a released structure and the
+callback semantics are identical to those for
+:ref:`ArrowArray itself <c-data-interface-released>`. Any producer specific context
+information necessary for releasing the device data buffers, in addition to
+any allocated event, should be stored in the ``private_data`` member of
+the ``ArrowArray`` and managed by the ``release`` callback.
+
+Moving an array
+'''''''''''''''
+
+The consumer can *move* the ``ArrowDeviceArray`` structure by bitwise copying
+or shallow member-wise copying. Then it MUST mark the source structure released
+by setting the ``release`` member of the embedded ``ArrowArray`` structure to
+``NULL``, but *without* calling that release callback. This ensures that only
+one live copy of the struct is active at any given time and that lifetime is
+correctly communicated to the producer.
+
+As usual, the release callback will be called on the destination structure
+when it is not needed anymore.
+
+Record batches
+--------------
+As with the C data interface itself, a record batch can be trivially considered
+as an equivalent struct array. In this case the metadata of the top-level
+``ArrowSchema`` can be used for schema-level metadata of the record batch.
+
+Mutability
+----------
+
+Both the producer and the consumer SHOULD consider the exported data (that
+is, the data reachable on the device through the ``buffers`` member of
+the embedded ``ArrowArray``) to be immutable, as either party could otherwise
+see inconsistent data while the other is mutating it.
+
+Likewise, if the ``sync_event`` member is non-NULL, the consumer should not
+attempt to access or read the data until they have synchronized on that event.
+
+C producer example
+====================
+
+Exporting a simple ``int32`` device array
+-----------------------------------------
+
+Export a non-nullable ``int32`` type with empty metadata. An example of this
+can be seen in the :ref:`C data interface docs directly <c-data-interface-export-int32-schema>`.
+
+To export the data itself, we transfer ownership to the consumer through
+the release callback. This example will use CUDA, but the equivalent calls
+could be used for any device:
+
+.. code-block:: c
+
+    static void release_int32_device_array(struct ArrowArray* array) {
+        assert(array->n_buffers == 2);
+        // destroy the event
+        cudaEvent_t* ev_ptr = reinterpret_cast<cudaEvent_t*>(array->private_data);
+        cudaError_t status = cudaEventDestroy(*ev_ptr);
+        assert(status == cudaSuccess);
+        free(ev_ptr);
+
+        // free the buffers and the buffers array
+        status = cudaFree(array->buffers[1]);
+        assert(status == cudaSuccess);
+        free(array->buffers);
+
+        // mark released
+        array->release = NULL;
+    }
+
+    __host__ void export_int32_device_array(void* cudaAllocdPtr,
+                                            cudaStream_t stream,
+                                            int64_t length,
+                                            struct ArrowDeviceArray* array) {
+        // get device id
+        int device;
+        cudaError_t status;
+        status = cudaGetDevice(&device);
+        assert(status == cudaSuccess);
+
+        cudaEvent_t* ev_ptr = (cudaEvent_t*)malloc(sizeof(cudaEvent_t));
+        assert(ev_ptr != NULL);
+        status = cudaEventCreate(ev_ptr);
+        assert(status == cudaSuccess);
+
+        // record event on the stream, assuming that the passed in
+        // stream is where the work to produce the data will be processing.        
+        status = cudaEventRecord(*ev_ptr, stream);
+        assert(status == cudaSuccess);
+
+        // initialize fields
+        *array = (struct ArrowDeviceArray) {
+            .array = (struct ArrowArray) {
+                .length = length,
+                .null_count = 0,
+                .offset = 0,
+                .n_buffers = 2,
+                .n_children = 0,
+                .children = NULL,
+                .dictionary = NULL,
+                // bookeeping
+                .release = &release_int32_device_array,
+                .private_data = reinterpret_cast<void*>(ev_ptr),
+            },
+            .device_id = static_cast<int64_t>(device),
+            .device_type = ARROW_DEVICE_CUDA,
+            // pass the event pointer to the consumer
+            .sync_event = reinterpret_cast<void*>(ev_ptr),
+        };
+
+        // allocate list of buffers
+        array->array.buffers = (const void**)malloc(sizeof(void*) * array->array.n_buffers);
+        assert(array->array.buffers != NULL);
+        array->array.buffers[0] = NULL;
+        array->array.buffers[1] = cudaAllocdPtr;
+    }
+
+================
+Device Stream Interface
+================
+
+Like the :ref:`C stream interface <c-stream-interface>`, the C Device data 
+interface also specifies a higher-level structure for easing communication
+of streaming data within a single process. Defining an ``ArrowDeviceArrayStream``
+structure.
+
+Semantics
+=========
+
+An Arrow C device stream exposes a streaming source of data chunks, each with
+the same schema. Chunks are obtained by calling a blocking pull-style iteration
+function. It is expected that all chunks should be providing data on the same
+device type (but not necessarily the same device id). If it is necessary
+to provide a stream of data on multiple device types, a producer should 
+provide a separate stream object for each device type.
+
+Structure definition
+====================
+
+The C device stream interface is defined by a single ``struct`` definition:
+
+.. code-block:: c
+
+    #ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+    #define ARROW_C_DEVICE_STREAM_INTERFACE
+
+    struct ArrowDeviceArrayStream {
+        // device type that all arrays will be accessible from
+        ArrowDeviceType device_type;
+        // callbacks
+        int (*get_schema)(struct ArrowDeviceArrayStream*, struct ArrowSchema*);
+        int (*get_next)(struct ArrowDeviceArrayStream*, struct ArrowDeviceArray*);
+        const char* (*get_last_error)(struct ArrowDeviceArrayStream*);
+
+        // release callback
+        void (*release)(struct ArrowDeviceArrayStream*);
+
+        // opaque producer-specific data
+        void* private_data;
+    };
+
+    #endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+.. note::
+    The canonical guard ``ARROW_C_DEVICE_STREAM_INTERFACE`` is meant to avoid
+    duplicate definitions if two projects copy the C device stream interface
+    definitions into their own headers, and a third-party project includes
+    from these two projects. It is therefore important that this guard is
+    kept exactly as-is when these definitions are copied.
+
+The ArrowDeviceArrayStream structure
+------------------------------------
+
+The ``ArrowDeviceArrayStream`` provides a device type that can access the
+resulting data along with the required callbacks to interact with a
+streaming source of Arrow arrays. It has the following fields:
+
+.. c:member:: ArrowDeviceType device_type
+
+    The device type that this stream produces data on. All 
+    ``ArrowDeviceArray``s that are produced by this stream should have the
+    same device type as is set here. This is a convenience for the consumer
+    to not have to check every array that is retrieved and instead allows
+    higher-level coding constructs for streams.
+
+.. c:member:: int (*ArrowDeviceArrayStream.get_schema)(struct ArrowDeviceArrayStream*, struct ArrowSchema* out)
+
+    *Mandatory.* This callback allows the consumer to query the schema of
+    the chunks of data in the stream. The schema is the same for all data
+    chunks.
+
+    This callback must NOT be called on a released ``ArrowDeviceArrayStream``.
+
+    *Return value:* 0 on success, a non-zero
+    :ref:`error code <c-stream-interface-error-codes>` otherwise.
+
+.. c:member:: int (*ArrowDeviceArrayStream.get_next)(struct ArrowDeviceArrayStream*, struct ArrowDeviceArray* out)
+
+    *Mandatory.* This callback allows the consumer to get the next chunk of
+    data in the stream.
+
+    This callback must NOT be called on a released ``ArrowDeviceArrayStream``.
+
+    The next chunk of data MUST be accessible from a device type matching the
+    :c:member:`ArrowDeviceArrayStream.device_type`.
+
+    *Return value:* 0 on success, a non-zero
+    :ref:`error code <c-stream-interface-error-codes>` otherwise.
+
+    On success, the consumer must check whether the ``ArrowDeviceArray``'s
+    embedded ``ArrowArray`` is marked :ref:`released <c-data-interface-released>`.
+    If the embedded ``ArrowDeviceArray.array`` is released, then the end of the 
+    stream has been reached. Otherwise, the ``ArrowDeviceArray`` contains a
+    valid data chunk.
+
+.. c:member:: const char* (*ArrowDeviceArrayStream.get_last_error)(struct ArrowDeviceArrayStream*)
+
+    *Mandatory.* This callback allows the consumer to get a textual description
+    of the last error.
+
+    This callback must ONLY be called if the last operation on the 
+    ``ArrowDeviceArrayStream`` returned an error. It must NOT be called on a
+    released ``ArrowDeviceArrayStream``.
+
+    *Return value:* a pointer to a NULL-terminated character string 
+    (UTF8-encoded). NULL can also be returned if no detailed description is
+    available.
+
+    The returned pointer is only guaranteed to be valid until the next call
+    of one of the stream's callbacks. The character string it points to should
+    be copied to consumer-managed storage if it is intended to survive longer.
+
+.. c:member:: void (*ArrowDeviceArrayStream.release)(struct ArrowDeviceArrayStream*)
+
+    *Mandatory.* A pointer to a producer-provided release callback.
+
+.. c:member:: void* ArrowDeviceArrayStream.private_data
+
+    *Optional.* An opaque pointer to producer-provided private data.
+
+    Consumers MUST NOT process this member. Lifetime of this member is
+    handled by the producer, and especially by the release callback.
+
+Result lifetimes
+----------------
+
+The data returned by the ``get_schema`` and ``get_next`` callbacks must be
+released independantly. Their lifetimes are not tied to that of 
+``ArrowDeviceArrayStream``.
+
+Stream lifetime
+---------------
+
+Lifetime of the C stream is managed using a release callback with similar 
+usage as in :ref:`C data interface <c-data-interface-released>`.
+
+Thread safety
+-------------
+
+The stream source is not assumed to be thread-safe. Consumers wanting to
+call ``get_next`` from several threads should ensure those calls are 
+serialized.
+
+Updating this specification
+===========================
+
+Once this specification is supported in an official Arrow release, the C ABI
+is frozen. This means that the ``ArrowDeviceArray`` structure definition 
+should not change in any way -- including adding new members. 
+
+Backwards-compatible changes are allowed, for example new macro values for
+:c:typedef:`ArrowDeviceType` or converting the reserved 24 bytes into a
+different type/member without changing the size of the structure.
+
+Any incompatible changes should be part of a new specification, for example
+``ArrowDeviceArrayV2``.
+
+
+
+.. _RAPIDS cuDF library: https://docs.rapids.ai/api/cudf/stable/
+.. _CUDA Array Interface: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
+.. _dlpack: https://dmlc.github.io/dlpack/latest/c_api.html#c-api
+.. _CUDA: https://developer.nvidia.com/cuda-toolkit
+.. _OpenCL (Open Computing Language): https://www.khronos.org/opencl/
+.. _Vulkan: https://www.vulkan.org/
+.. _Metal: https://developer.apple.com/metal/
+.. _ROCm: https://www.amd.com/en/graphics/servers-solutions-rocm
+.. _oneAPI: https://www.intel.com/content/www/us/en/developer/tools/oneapi/overview.html
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8341b9f3543..56079b9b7d0 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -79,6 +79,7 @@ target environment.**
    format/FlightSql
    format/Integration
    format/CDataInterface
+   format/CDeviceDataInterface
    format/CStreamInterface
    format/ADBC
    format/Other

From 05f70f7b217b1f240819ab7da8357bc5b0bfe6d7 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Wed, 24 May 2023 10:59:19 -0400
Subject: [PATCH 12/23] Update docs/source/format/CDeviceDataInterface.rst

Co-authored-by: David Li <li.davidm96@gmail.com>
---
 docs/source/format/CDeviceDataInterface.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst
index 333d2be08ed..500735af64d 100644
--- a/docs/source/format/CDeviceDataInterface.rst
+++ b/docs/source/format/CDeviceDataInterface.rst
@@ -32,7 +32,7 @@ The current :ref:`C Data Interface <c-data-interface>`, and most
 implementations of it, make the assumption that all data buffers provided
 are CPU buffers. Since Apache Arrow is designed to be a universal in-memory
 format for representing tabular ("columnar") data, there will be the desire
-to leverage this data on Non-CPU hardware such as GPUs. One example of such
+to leverage this data on non-CPU hardware such as GPUs. One example of such
 a case is the `RAPIDS cuDF library` which uses the Arrow memory format with
 CUDA for NVIDIA GPUs. Since copying data from host to device and back is
 expensive, the ideal would be to be able to leave the data on the device

From 43ce6c93d6c65d0498241dd8942f78b810b03fd3 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Wed, 24 May 2023 11:00:38 -0400
Subject: [PATCH 13/23] Update docs/source/format/CDeviceDataInterface.rst

Co-authored-by: David Li <li.davidm96@gmail.com>
---
 docs/source/format/CDeviceDataInterface.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst
index 500735af64d..4ff94515b88 100644
--- a/docs/source/format/CDeviceDataInterface.rst
+++ b/docs/source/format/CDeviceDataInterface.rst
@@ -46,7 +46,7 @@ C FFI declarations should be just as simple as with the current C data
 interface.
 
 Applications and libraries can then use Arrow schemas and Arrow formatted
-memory on non-cpu devices to exchange data just as easily as they do
+memory on non-CPU devices to exchange data just as easily as they do
 now with CPU data. This will enable leaving data on those devices longer
 and avoiding costly copies back and forth between the host and device
 just to leverage new libraries and runtimes.

From 2c9a7a6b673faffd3994f5250373c4b5686e3e95 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Wed, 24 May 2023 11:16:49 -0400
Subject: [PATCH 14/23] updates from feedback

---
 docs/source/format/CDeviceDataInterface.rst | 67 +++++++++++++--------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst
index 4ff94515b88..1162a5d4455 100644
--- a/docs/source/format/CDeviceDataInterface.rst
+++ b/docs/source/format/CDeviceDataInterface.rst
@@ -33,17 +33,20 @@ implementations of it, make the assumption that all data buffers provided
 are CPU buffers. Since Apache Arrow is designed to be a universal in-memory
 format for representing tabular ("columnar") data, there will be the desire
 to leverage this data on non-CPU hardware such as GPUs. One example of such
-a case is the `RAPIDS cuDF library` which uses the Arrow memory format with
+a case is the `RAPIDS cuDF library`_ which uses the Arrow memory format with
 CUDA for NVIDIA GPUs. Since copying data from host to device and back is
 expensive, the ideal would be to be able to leave the data on the device
 for as long as possible, even when passing it between runtimes and
 libraries.
 
 The Arrow C Device data interface builds on the existing C data interface
-by adding a very small, stable set of C definitions to it. For non-C/C++
-languages and runtimes, translating the C definitions to corresponding
-C FFI declarations should be just as simple as with the current C data
-interface.
+by adding a very small, stable set of C definitions to it. These definitions
+are equivalents to the ``ArrowArray`` and ``ArrowArrayStream`` structures
+from the C Data Interface which add members to allow specifying the device
+type and pass necessary information to synchronize with the producer.
+For non-C/C++ languages and runtimes, translating the C definitions to
+corresponding C FFI declarations should be just as simple as with the
+current C data interface.
 
 Applications and libraries can then use Arrow schemas and Arrow formatted
 memory on non-CPU devices to exchange data just as easily as they do
@@ -60,7 +63,7 @@ Goals
 * Allow zero-copy sharing of Arrow formatted device memory between
   independant runtimes and components running in the same process.
 * Avoid the need for one-to-one adaptation layers such as the
-  `CUDA Array Interface` for Python processes to pass CUDA data.
+  `CUDA Array Interface`_ for Python processes to pass CUDA data.
 * Enable integration without explicit dependencies (either at compile-time
   or runtime) on the Arrow software project itself.
 
@@ -150,7 +153,7 @@ provided memory buffers were allocated on. This, in conjunction with the
 ``device_id``, should be sufficient to reference the correct data buffers.
 
 We then use macros to define values for different device types. The provided
-macro values are compatible with the widely used `dlpack` ``DLDeviceType``
+macro values are compatible with the widely used `dlpack`_ ``DLDeviceType``
 definition values, using the same value for each as the equivalent
 ``kDL<type>`` enum from dlpack.h. The list will be kept in sync with those
 equivalent enum values over time to ensure compatibility, rather than
@@ -168,7 +171,7 @@ so the storage type is not compiler dependent.
 
 .. c:macro:: ARROW_DEVICE_CUDA
 
-    A `CUDA` GPU Device. This could represent data allocated either with the
+    A `CUDA`_ GPU Device. This could represent data allocated either with the
     runtime library (``cudaMalloc``) or the device driver (``cuMemAlloc``).
 
 .. c:macro:: ARROW_DEVICE_CUDA_HOST
@@ -178,16 +181,16 @@ so the storage type is not compiler dependent.
 
 .. c:macro:: ARROW_DEVICE_OPENCL
 
-    Data allocated on the device by using the `OpenCL (Open Computing Language)`
+    Data allocated on the device by using the `OpenCL (Open Computing Language)`_
     framework.
 
 .. c:macro:: ARROW_DEVICE_VULKAN
 
-    Data allocated by the `Vulkan` framework and libraries.
+    Data allocated by the `Vulkan`_ framework and libraries.
 
 .. c:macro:: ARROW_DEVICE_METAL
 
-    Data on Apple GPU devices using the `Metal` framework and libraries.
+    Data on Apple GPU devices using the `Metal`_ framework and libraries.
 
 .. c:macro:: ARROW_DEVICE_VPI
 
@@ -195,7 +198,7 @@ so the storage type is not compiler dependent.
 
 .. c:macro:: ARROW_DEVICE_ROCM
 
-    An AMD device using the `ROCm` stack.
+    An AMD device using the `ROCm`_ stack.
 
 .. c:macro:: ARROW_DEVICE_ROCM_HOST
 
@@ -215,7 +218,7 @@ so the storage type is not compiler dependent.
 
 .. c:macro:: ARROW_DEVICE_ONEAPI
 
-    Unified shared memory allocated on an Intel `oneAPI` non-partitioned
+    Unified shared memory allocated on an Intel `oneAPI`_ non-partitioned
     device. A call to the ``oneAPI`` runtime is required to determine
     the specific device type, the USM allocation type and the sycl context
     that it is bound to.
@@ -409,7 +412,7 @@ could be used for any device:
         assert(status == cudaSuccess);
 
         // record event on the stream, assuming that the passed in
-        // stream is where the work to produce the data will be processing.        
+        // stream is where the work to produce the data will be processing.
         status = cudaEventRecord(*ev_ptr, stream);
         assert(status == cudaSuccess);
 
@@ -444,7 +447,7 @@ could be used for any device:
 Device Stream Interface
 ================
 
-Like the :ref:`C stream interface <c-stream-interface>`, the C Device data 
+Like the :ref:`C stream interface <c-stream-interface>`, the C Device data
 interface also specifies a higher-level structure for easing communication
 of streaming data within a single process. Defining an ``ArrowDeviceArrayStream``
 structure.
@@ -456,7 +459,7 @@ An Arrow C device stream exposes a streaming source of data chunks, each with
 the same schema. Chunks are obtained by calling a blocking pull-style iteration
 function. It is expected that all chunks should be providing data on the same
 device type (but not necessarily the same device id). If it is necessary
-to provide a stream of data on multiple device types, a producer should 
+to provide a stream of data on multiple device types, a producer should
 provide a separate stream object for each device type.
 
 Structure definition
@@ -502,7 +505,7 @@ streaming source of Arrow arrays. It has the following fields:
 
 .. c:member:: ArrowDeviceType device_type
 
-    The device type that this stream produces data on. All 
+    The device type that this stream produces data on. All
     ``ArrowDeviceArray``s that are produced by this stream should have the
     same device type as is set here. This is a convenience for the consumer
     to not have to check every array that is retrieved and instead allows
@@ -534,7 +537,7 @@ streaming source of Arrow arrays. It has the following fields:
 
     On success, the consumer must check whether the ``ArrowDeviceArray``'s
     embedded ``ArrowArray`` is marked :ref:`released <c-data-interface-released>`.
-    If the embedded ``ArrowDeviceArray.array`` is released, then the end of the 
+    If the embedded ``ArrowDeviceArray.array`` is released, then the end of the
     stream has been reached. Otherwise, the ``ArrowDeviceArray`` contains a
     valid data chunk.
 
@@ -543,11 +546,11 @@ streaming source of Arrow arrays. It has the following fields:
     *Mandatory.* This callback allows the consumer to get a textual description
     of the last error.
 
-    This callback must ONLY be called if the last operation on the 
+    This callback must ONLY be called if the last operation on the
     ``ArrowDeviceArrayStream`` returned an error. It must NOT be called on a
     released ``ArrowDeviceArrayStream``.
 
-    *Return value:* a pointer to a NULL-terminated character string 
+    *Return value:* a pointer to a NULL-terminated character string
     (UTF8-encoded). NULL can also be returned if no detailed description is
     available.
 
@@ -570,28 +573,41 @@ Result lifetimes
 ----------------
 
 The data returned by the ``get_schema`` and ``get_next`` callbacks must be
-released independantly. Their lifetimes are not tied to that of 
+released independantly. Their lifetimes are not tied to that of
 ``ArrowDeviceArrayStream``.
 
 Stream lifetime
 ---------------
 
-Lifetime of the C stream is managed using a release callback with similar 
+Lifetime of the C stream is managed using a release callback with similar
 usage as in :ref:`C data interface <c-data-interface-released>`.
 
 Thread safety
 -------------
 
 The stream source is not assumed to be thread-safe. Consumers wanting to
-call ``get_next`` from several threads should ensure those calls are 
+call ``get_next`` from several threads should ensure those calls are
 serialized.
 
 Updating this specification
 ===========================
 
+.. note::
+    Since this specification is still considered experimental, there is the
+    (still very low) possibility it might change slightly. Once it is
+    supported in an official Arrow release and the "experimental" tag is
+    removed from it, this section will apply and the ABI will be frozen.
+    
+    The reason for the "experimental" tag is because we don't know what we
+    don't know. While it was attempted to ensure this is generic enough to
+    work with a multitude of different frameworks, it's also possible that
+    something was missed. Once there is some usage of this and we are
+    confident there isn't any necessary modifications, the "experimental"
+    tag will be removed and the ABI frozen.
+
 Once this specification is supported in an official Arrow release, the C ABI
-is frozen. This means that the ``ArrowDeviceArray`` structure definition 
-should not change in any way -- including adding new members. 
+is frozen. This means that the ``ArrowDeviceArray`` structure definition
+should not change in any way -- including adding new members.
 
 Backwards-compatible changes are allowed, for example new macro values for
 :c:typedef:`ArrowDeviceType` or converting the reserved 24 bytes into a
@@ -601,7 +617,6 @@ Any incompatible changes should be part of a new specification, for example
 ``ArrowDeviceArrayV2``.
 
 
-
 .. _RAPIDS cuDF library: https://docs.rapids.ai/api/cudf/stable/
 .. _CUDA Array Interface: https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
 .. _dlpack: https://dmlc.github.io/dlpack/latest/c_api.html#c-api

From 4f636a89347ab7d70fc12960040c74b83bb2cd22 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Wed, 31 May 2023 13:12:33 -0400
Subject: [PATCH 15/23] Update docs/source/format/CDeviceDataInterface.rst

Co-authored-by: Antoine Pitrou <pitrou@free.fr>
---
 docs/source/format/CDeviceDataInterface.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst
index 1162a5d4455..30add4297e5 100644
--- a/docs/source/format/CDeviceDataInterface.rst
+++ b/docs/source/format/CDeviceDataInterface.rst
@@ -23,7 +23,7 @@
 The Arrow C Device data interface
 =================================
 
-.. note:: The Arrow C Device Data Interface should be considered experimental
+.. warning:: The Arrow C Device Data Interface should be considered experimental
 
 Rationale
 =========

From 4e1d1f6548eec009ef99d04c2f8a380ba016bd4b Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Thu, 1 Jun 2023 10:09:45 -0400
Subject: [PATCH 16/23] Update cpp/src/arrow/c/abi.h

Co-authored-by: Gang Wu <ustcwg@gmail.com>
---
 cpp/src/arrow/c/abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 0ba4b8a0aed..99c7bb08ed4 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -158,7 +158,7 @@ struct ArrowDeviceArray {
   /// \brief The device id to identify a specific device
   /// if multiple of this type are on the system.
   ///
-  /// the semantics of the id will be hardware dependant.
+  /// the semantics of the id will be hardware dependent.
   int64_t device_id;
   /// \brief The type of device which can access this memory.
   ArrowDeviceType device_type;

From 2b24a10e30d3e6d8cbb6be40cb64406c35ec0bd9 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Thu, 1 Jun 2023 10:10:00 -0400
Subject: [PATCH 17/23] Update cpp/src/arrow/c/abi.h

Co-authored-by: Gang Wu <ustcwg@gmail.com>
---
 cpp/src/arrow/c/abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 99c7bb08ed4..4ed6f7e2bc7 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -261,7 +261,7 @@ struct ArrowDeviceArrayStream {
   /// \brief Callback to get the stream schema
   /// (will be the same for all arrays in the stream).
   ///
-  /// If successful, the ArrowSchema must be released independantly from the stream.
+  /// If successful, the ArrowSchema must be released independently from the stream.
   /// The schema should be accessible via CPU memory.
   ///
   /// \param[in] self The ArrowDeviceArrayStream object itself

From 451d3a3e71ca22b917484a5f35c3b8cc6f6a0f04 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Thu, 1 Jun 2023 10:10:16 -0400
Subject: [PATCH 18/23] Update cpp/src/arrow/c/abi.h

Co-authored-by: Gang Wu <ustcwg@gmail.com>
---
 cpp/src/arrow/c/abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 4ed6f7e2bc7..45258ee7c8e 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -177,7 +177,7 @@ struct ArrowDeviceArray {
   /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
   ///
   /// Expected type to coerce this void* to depending on device type:
-  ///   cuda: cudaEvent_t*
+  ///   CUDA: cudaEvent_t*
   ///   ROCm: hipEvent_t*
   ///   OpenCL: cl_event*
   ///   Vulkan: VkEvent*

From e34746f6d30e0a1366badeb4073f7226ddf0e79c Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Thu, 1 Jun 2023 15:43:23 -0400
Subject: [PATCH 19/23] addressing review feedback

---
 cpp/src/arrow/c/abi.h                       | 194 ++++++--------------
 docs/source/format/CDeviceDataInterface.rst | 116 +++++++++---
 docs/source/index.rst                       |   2 +-
 3 files changed, 146 insertions(+), 166 deletions(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 45258ee7c8e..d876d993652 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -34,10 +34,7 @@
 
 #include <stdint.h>
 
-/// \defgroup Arrow C-Data Interface
-/// Definitions for the C-Data Interface/C-Stream Interface.
-///
-/// @{
+// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html
 
 #ifdef __cplusplus
 extern "C" {
@@ -88,111 +85,53 @@ struct ArrowArray {
 #ifndef ARROW_C_DEVICE_DATA_INTERFACE
 #define ARROW_C_DEVICE_DATA_INTERFACE
 
-/// \defgroup arrow-device-types Device Types
-/// These macros are compatible with the dlpack DLDeviceType values,
-/// using the same value for each enum as the equivalent kDL<type>
-/// from dlpack.h. This list should continue to be kept in sync with
-/// the equivalent dlpack.h enum values over time to ensure
-/// compatibility, rather than potentially diverging.
-///
-/// To ensure predictability with the ABI we use macros instead of
-/// an enum so the storage type is not compiler dependent.
-///
-/// @{
+// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html
 
-/// \brief DeviceType for the allocated memory
+// DeviceType for the allocated memory
 typedef int32_t ArrowDeviceType;
 
-/// \brief CPU device, same as using ArrowArray directly
+// CPU device, same as using ArrowArray directly
 #define ARROW_DEVICE_CPU 1
-/// \brief CUDA GPU Device
+// CUDA GPU Device
 #define ARROW_DEVICE_CUDA 2
-/// \brief Pinned CUDA CPU memory by cudaMallocHost
+// Pinned CUDA CPU memory by cudaMallocHost
 #define ARROW_DEVICE_CUDA_HOST 3
-/// \brief OpenCL Device
+// OpenCL Device
 #define ARROW_DEVICE_OPENCL 4
-/// \brief Vulkan buffer for next-gen graphics
+// Vulkan buffer for next-gen graphics
 #define ARROW_DEVICE_VULKAN 7
-/// \brief Metal for Apple GPU
+// Metal for Apple GPU
 #define ARROW_DEVICE_METAL 8
-/// \brief Verilog simulator buffer
+// Verilog simulator buffer
 #define ARROW_DEVICE_VPI 9
-/// \brief ROCm GPUs for AMD GPUs
+// ROCm GPUs for AMD GPUs
 #define ARROW_DEVICE_ROCM 10
-/// \brief Pinned ROCm CPU memory allocated by hipMallocHost
+// Pinned ROCm CPU memory allocated by hipMallocHost
 #define ARROW_DEVICE_ROCM_HOST 11
-/// \brief Reserved for extension
-///
-/// used to quickly test extension devices, semantics
-/// can differ based on the implementation
+// Reserved for extension
 #define ARROW_DEVICE_EXT_DEV 12
-/// \brief CUDA managed/unified memory allocated by cudaMallocManaged
+// CUDA managed/unified memory allocated by cudaMallocManaged
 #define ARROW_DEVICE_CUDA_MANAGED 13
-/// \brief unified shared memory allocated on a oneAPI
-/// non-partitioned device.
-///
-/// A call to the oneAPI runtime is required to determine the device
-/// type, the USM allocation type, and the sycl context it is bound to.
+// unified shared memory allocated on a oneAPI non-partitioned device.
 #define ARROW_DEVICE_ONEAPI 14
-/// \brief GPU support for next-gen WebGPU standard
+// GPU support for next-gen WebGPU standard
 #define ARROW_DEVICE_WEBGPU 15
-/// \brief Qualcomm Hexagon DSP
+// Qualcomm Hexagon DSP
 #define ARROW_DEVICE_HEXAGON 16
 
-/// @}
-
-/// \brief Struct for passing an Arrow Array alongside
-/// device memory information.
 struct ArrowDeviceArray {
-  /// \brief the Allocated Array
-  ///
-  /// the buffers in the array (along with the buffers of any
-  /// children) are what is allocated on the device.
-  ///
-  /// the private_data and release callback of the arrow array
-  /// should contain any necessary information and structures
-  /// related to freeing the array according to the device it
-  /// is allocated on, rather than having a separate release
-  /// callback embedded here.
+  // the Allocated Array
+  //
+  // the buffers in the array (along with the buffers of any
+  // children) are what is allocated on the device.  
   struct ArrowArray array;
-  /// \brief The device id to identify a specific device
-  /// if multiple of this type are on the system.
-  ///
-  /// the semantics of the id will be hardware dependent.
+  // The device id to identify a specific device  
   int64_t device_id;
-  /// \brief The type of device which can access this memory.
+  // The type of device which can access this memory.
   ArrowDeviceType device_type;
-  /// \brief An event-like object to synchronize on if needed.
-  ///
-  /// Many devices, like GPUs, are primarily asynchronous with
-  /// respect to CPU processing. As such in order to safely access
-  /// memory, it is often necessary to have an object to synchronize
-  /// processing on. Since different devices will use different types
-  /// to specify this we use a void* that can be coerced into
-  /// whatever the device appropriate type is (e.g. cudaEvent_t for
-  /// CUDA and hipEvent_t for HIP).
-  ///
-  /// If synchronization is not needed this can be null. If this is
-  /// non-null, then it should be used to call the appropriate sync
-  /// method for the device (e.g. cudaStreamWaitEvent / hipStreamWaitEvent).
-  ///
-  /// Expected type to coerce this void* to depending on device type:
-  ///   CUDA: cudaEvent_t*
-  ///   ROCm: hipEvent_t*
-  ///   OpenCL: cl_event*
-  ///   Vulkan: VkEvent*
-  ///   Metal: MTLEvent*
-  ///   OneAPI: sycl::event*
-  ///
+  // An event-like object to synchronize on if needed.  
   void* sync_event;
-  /// \brief Reserved bytes for future expansion.
-  ///
-  /// As non-CPU development expands we can update this struct
-  /// without ABI breaking changes. This also rounds out the
-  /// total size of this struct to be 128 bytes (power of 2)
-  /// on 64-bit systems. These bytes should be zero'd out after
-  /// allocation in order to ensure safe evolution of the ABI in
-  /// the future.
+  // Reserved bytes for future expansion.  
   int64_t reserved[3];
 };
 
@@ -242,65 +181,48 @@ struct ArrowArrayStream {
 #ifndef ARROW_C_DEVICE_STREAM_INTERFACE
 #define ARROW_C_DEVICE_STREAM_INTERFACE
 
-/// \brief Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
-///
-/// This stream is intended to provide a stream of data on a single
-/// device, if a producer wants data to be produced on multiple devices
-/// then multiple streams should be provided. One per device.
+// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+//
+// This stream is intended to provide a stream of data on a single
+// device, if a producer wants data to be produced on multiple devices
+// then multiple streams should be provided. One per device.
 struct ArrowDeviceArrayStream {
-  /// \brief The device that this stream produces data on.
-  ///
-  /// All ArrowDeviceArrays that are produced by this
-  /// stream should have the same device_type as set
-  /// here. Including it here in the stream object is
-  /// a convenience to allow consumers simpler processing
-  /// since they can assume all arrays that result from
-  /// this stream to be on this device type.
+  // The device that this stream produces data on.  
   ArrowDeviceType device_type;
 
-  /// \brief Callback to get the stream schema
-  /// (will be the same for all arrays in the stream).
-  ///
-  /// If successful, the ArrowSchema must be released independently from the stream.
-  /// The schema should be accessible via CPU memory.
-  ///
-  /// \param[in] self The ArrowDeviceArrayStream object itself
-  /// \param[out] out C struct to export the schema to
-  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  // Callback to get the stream schema
+  // (will be the same for all arrays in the stream).
+  //
+  // Return value 0 if successful, an `errno`-compatible error code otherwise.
+  //    
+  // If successful, the ArrowSchema must be released independently from the stream.
+  // The schema should be accessible via CPU memory.
   int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
 
-  /// \brief Callback to get the next array
-  ///
-  /// If there is no error and the returned array has been released, the stream
-  /// has ended. If successful, the ArrowArray must be released independently
-  /// from the stream.
-  ///
-  /// \param[in] self The ArrowDeviceArrayStream object itself
-  /// \param[out] out C struct where to export the Array and device info
-  /// \return 0 if successful, an `errno`-compatible error code otherwise.
+  // Callback to get the next array
+  // (if no error and the array is released, the stream has ended)
+  //
+  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+  //
+  // If successful, the ArrowDeviceArray must be released independently from the stream.
   int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
-
-  /// \brief Callback to get optional detailed error information.
-  ///
-  /// This must only be called if the last stream operation failed
-  /// with a non-0 return code.
-  ///
-  /// The returned pointer is only valid until the next operation on this stream
-  /// (including release).
-  ///
-  /// \param[in] self The ArrowDeviceArrayStream object itself
-  /// \return pointer to a null-terminated character array describing
-  /// the last error, or NULL if no description is available.
+  
+  // Callback to get optional detailed error information.
+  // This must only be called if the last stream operation failed
+  // with a non-0 return code.
+  //
+  // Return value: pointer to a null-terminated character array describing
+  // the last error, or NULL if no description is available.
+  //
+  // The returned pointer is only valid until the next operation on this stream
+  // (including release).
   const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
 
-  /// \brief Release callback: release the stream's own resources.
-  ///
-  /// Note that arrays returned by `get_next` must be individually released.
-  ///
-  /// \param[in] self The ArrowDeviceArrayStream object itself
+  // Release callback: release the stream's own resources.
+  // Note that arrays returned by `get_next` must be individually released.
   void (*release)(struct ArrowDeviceArrayStream* self);
 
-  /// \brief Opaque producer-specific data
+  // Opaque producer-specific data
   void* private_data;
 };
 
@@ -309,5 +231,3 @@ struct ArrowDeviceArrayStream {
 #ifdef __cplusplus
 }
 #endif
-
-/// @}
diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst
index 30add4297e5..d2e211689f3 100644
--- a/docs/source/format/CDeviceDataInterface.rst
+++ b/docs/source/format/CDeviceDataInterface.rst
@@ -286,10 +286,10 @@ has the following fields:
     * Metal: ``MTLEvent*``
     * OneAPI: ``sycl::event*``
 
-    If an event is provided, then the producer MUST ensure that the event
-    is triggered/recorded at the end of the processing stream once the data
-    is considered available for use.
-
+    If an event is provided, then the producer MUST ensure that the exported
+    data is available on the device before the event is triggered. The 
+    consumer SHOULD wait on the event before trying to access the exported
+    data.
 
 .. c:member:: int64_t ArrowDeviceArray.reserved[3]
 
@@ -297,15 +297,65 @@ has the following fields:
     structure. In order to do so without potentially breaking ABI changes,
     we reserve 24 bytes at the end of the object. This also has the added
     benefit of bringing the total size of this structure to exactly 128
-    bytes (a power of 2) on 64-bit systems. These bytes should be zero'd
-    out after allocation in order to ensure safe evolution of the ABI in
-    the future.
+    bytes (a power of 2) on 64-bit systems. These bytes MUST be zero'd
+    out after initialization by the producer in order to ensure safe 
+    evolution of the ABI in the future.
 
 .. note::
     Rather than store the shape / types of the data alongside the
     ``ArrowDeviceArray``, users should utilize the existing ``ArrowSchema``
     structure to pass any data type and shape information.
 
+Synchronization event types
+---------------------------
+
+The table below lists the expected event types for each device type.
+If no event type is supported ("N/A"), then the ``sync_event`` member
+should always be null. 
+
+Remember that the event *CAN* be null if synchronization is not needed
+to access the data.
+
++---------------------------+--------------------+---------+
+| Device Type               | Actual Event Type  | Notes   |
++===========================+====================+=========+
+| ARROW_DEVICE_CPU          | N/A                |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_CUDA         | ``cudaEvent_t*``   |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_CUDA_HOST    | ``cudaEvent_t*``   |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_OPENCL       | ``cl_event*``      |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_VULKAN       | ``VkEvent*``       |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_METAL        | ``MTLEvent*``      |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_VPI          | N/A                | (1)     |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_ROCM         | ``hipEvent_t*``    |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_ROCM_HOST    | ``hipEvent_t*``    |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_EXT_DEV      |                    | (2)     |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_CUDA_MANAGED | ``cudaEvent_t*``   |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_ONEAPI       | ``sycl::event*``   |         |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_WEBGPU       | N/A                | (1)     |
++---------------------------+--------------------+---------+
+| ARROW_DEVICE_HEXAGON      | N/A                | (1)     |
++---------------------------+--------------------+---------+
+
+Notes:
+
+* \(1) Currently unknown if framework has an event type to support.
+* \(2) Extension Device has producer defined semantics and thus if
+       synchronization is needed for an extension device, the producer
+       should document the type.
+
+
 Semantics
 =========
 
@@ -361,8 +411,13 @@ is, the data reachable on the device through the ``buffers`` member of
 the embedded ``ArrowArray``) to be immutable, as either party could otherwise
 see inconsistent data while the other is mutating it.
 
-Likewise, if the ``sync_event`` member is non-NULL, the consumer should not
-attempt to access or read the data until they have synchronized on that event.
+Synchronization
+---------------
+
+If the ``sync_event`` member is non-NULL, the consumer should not attempt 
+to access or read the data until they have synchronized on that event. If
+the ``sync_event`` member is NULL, then it MUST be safe to access the data
+without any synchronization necessary on the part of the consumer.
 
 C producer example
 ====================
@@ -382,7 +437,7 @@ could be used for any device:
     static void release_int32_device_array(struct ArrowArray* array) {
         assert(array->n_buffers == 2);
         // destroy the event
-        cudaEvent_t* ev_ptr = reinterpret_cast<cudaEvent_t*>(array->private_data);
+        cudaEvent_t* ev_ptr = (cudaEvent_t*)(array->private_data);
         cudaError_t status = cudaEventDestroy(*ev_ptr);
         assert(status == cudaSuccess);
         free(ev_ptr);
@@ -396,10 +451,10 @@ could be used for any device:
         array->release = NULL;
     }
 
-    __host__ void export_int32_device_array(void* cudaAllocdPtr,
-                                            cudaStream_t stream,
-                                            int64_t length,
-                                            struct ArrowDeviceArray* array) {
+    void export_int32_device_array(void* cudaAllocdPtr,
+                                   cudaStream_t stream,
+                                   int64_t length,
+                                   struct ArrowDeviceArray* array) {
         // get device id
         int device;
         cudaError_t status;
@@ -428,12 +483,14 @@ could be used for any device:
                 .dictionary = NULL,
                 // bookeeping
                 .release = &release_int32_device_array,
-                .private_data = reinterpret_cast<void*>(ev_ptr),
+                // store the event pointer as private data in the array
+                // so that we can access it in the release callback.
+                .private_data = (void*)(ev_ptr),
             },
-            .device_id = static_cast<int64_t>(device),
+            .device_id = (int64_t)(device),
             .device_type = ARROW_DEVICE_CUDA,
             // pass the event pointer to the consumer
-            .sync_event = reinterpret_cast<void*>(ev_ptr),
+            .sync_event = (void*)(ev_ptr),
         };
 
         // allocate list of buffers
@@ -443,9 +500,15 @@ could be used for any device:
         array->array.buffers[1] = cudaAllocdPtr;
     }
 
-================
+    // calling the release callback should be done using the array member
+    // of the device array.
+    static void release_device_array_helper(struct ArrowDeviceArray* arr) {
+        arr->array.release(&arr->array);
+    }
+
+=======================
 Device Stream Interface
-================
+=======================
 
 Like the :ref:`C stream interface <c-stream-interface>`, the C Device data
 interface also specifies a higher-level structure for easing communication
@@ -594,15 +657,12 @@ Updating this specification
 
 .. note::
     Since this specification is still considered experimental, there is the
-    (still very low) possibility it might change slightly. Once it is
-    supported in an official Arrow release and the "experimental" tag is
-    removed from it, this section will apply and the ABI will be frozen.
-    
-    The reason for the "experimental" tag is because we don't know what we
-    don't know. While it was attempted to ensure this is generic enough to
-    work with a multitude of different frameworks, it's also possible that
-    something was missed. Once there is some usage of this and we are
-    confident there isn't any necessary modifications, the "experimental"
+    (still very low) possibility it might change slightly. The reason for
+    tagging this as "experimental" is because we don't know what we don't know.
+    Work and research was done to ensure a generic ABI compatible with many
+    different frameworks, but it is always possible something was missed.
+    Once this is supported in an official Arrow release and usage is observed
+    to confirm there aren't any modifications necessary, the "experimental"
     tag will be removed and the ABI frozen.
 
 Once this specification is supported in an official Arrow release, the C ABI
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 56079b9b7d0..f012d43a862 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -79,8 +79,8 @@ target environment.**
    format/FlightSql
    format/Integration
    format/CDataInterface
-   format/CDeviceDataInterface
    format/CStreamInterface
+   format/CDeviceDataInterface
    format/ADBC
    format/Other
    format/Glossary

From b359239e5f68c39263c43d57e3154b0c82833e8c Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Tue, 6 Jun 2023 11:11:52 -0400
Subject: [PATCH 20/23] Apply suggestions from code review

Co-authored-by: Antoine Pitrou <pitrou@free.fr>
---
 docs/source/format/CDeviceDataInterface.rst | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst
index d2e211689f3..4f97c4590d3 100644
--- a/docs/source/format/CDeviceDataInterface.rst
+++ b/docs/source/format/CDeviceDataInterface.rst
@@ -155,7 +155,7 @@ provided memory buffers were allocated on. This, in conjunction with the
 We then use macros to define values for different device types. The provided
 macro values are compatible with the widely used `dlpack`_ ``DLDeviceType``
 definition values, using the same value for each as the equivalent
-``kDL<type>`` enum from dlpack.h. The list will be kept in sync with those
+``kDL<type>`` enum from ``dlpack.h``. The list will be kept in sync with those
 equivalent enum values over time to ensure compatibility, rather than
 potentially diverging. To avoid the Arrow project having to be in the
 position of vetting new hardware devices, new additions should first be
@@ -202,7 +202,7 @@ so the storage type is not compiler dependent.
 
 .. c:macro:: ARROW_DEVICE_ROCM_HOST
 
-    CPU memory pinned and page-locked allocated ``hipMallocHost``.
+    CPU memory that was pinned and page-locked by ROCm by using ``hipMallocHost``.
 
 .. c:macro:: ARROW_DEVICE_EXT_DEV
 
@@ -265,10 +265,10 @@ has the following fields:
     Optional. An event-like object to synchronize on if needed.
 
     Many devices, like GPUs, are primarily asynchronous with respect to
-    CPU processing. As such in order to safely access memory, it is often
+    CPU processing. As such, in order to safely access device memory, it is often
     necessary to have an object to synchronize processing with. Since
     different devices will use different types to specify this, we use a
-    void* which can be coerced into a pointer to whatever the device
+    ``void*`` which can be coerced into a pointer to whatever the device
     appropriate type is.
 
     If synchronization is not needed, this can be null. If this is non-null
@@ -481,7 +481,7 @@ could be used for any device:
                 .n_children = 0,
                 .children = NULL,
                 .dictionary = NULL,
-                // bookeeping
+                // bookkeeping
                 .release = &release_int32_device_array,
                 // store the event pointer as private data in the array
                 // so that we can access it in the release callback.
@@ -512,8 +512,7 @@ Device Stream Interface
 
 Like the :ref:`C stream interface <c-stream-interface>`, the C Device data
 interface also specifies a higher-level structure for easing communication
-of streaming data within a single process. Defining an ``ArrowDeviceArrayStream``
-structure.
+of streaming data within a single process.
 
 Semantics
 =========

From 4845e197b674a40bbe6552c3616bd4950425d154 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Tue, 6 Jun 2023 11:32:39 -0400
Subject: [PATCH 21/23] linting

---
 cpp/src/arrow/c/abi.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index d876d993652..7061fa9003f 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -123,15 +123,15 @@ struct ArrowDeviceArray {
   // the Allocated Array
   //
   // the buffers in the array (along with the buffers of any
-  // children) are what is allocated on the device.  
+  // children) are what is allocated on the device.
   struct ArrowArray array;
-  // The device id to identify a specific device  
+  // The device id to identify a specific device
   int64_t device_id;
   // The type of device which can access this memory.
   ArrowDeviceType device_type;
-  // An event-like object to synchronize on if needed.  
+  // An event-like object to synchronize on if needed.
   void* sync_event;
-  // Reserved bytes for future expansion.  
+  // Reserved bytes for future expansion.
   int64_t reserved[3];
 };
 
@@ -187,14 +187,14 @@ struct ArrowArrayStream {
 // device, if a producer wants data to be produced on multiple devices
 // then multiple streams should be provided. One per device.
 struct ArrowDeviceArrayStream {
-  // The device that this stream produces data on.  
+  // The device that this stream produces data on.
   ArrowDeviceType device_type;
 
   // Callback to get the stream schema
   // (will be the same for all arrays in the stream).
   //
   // Return value 0 if successful, an `errno`-compatible error code otherwise.
-  //    
+  //
   // If successful, the ArrowSchema must be released independently from the stream.
   // The schema should be accessible via CPU memory.
   int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
@@ -206,7 +206,7 @@ struct ArrowDeviceArrayStream {
   //
   // If successful, the ArrowDeviceArray must be released independently from the stream.
   int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
-  
+
   // Callback to get optional detailed error information.
   // This must only be called if the last stream operation failed
   // with a non-0 return code.

From fa8aab0ef90b81c600cc86b36697944eae3a73c9 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Tue, 6 Jun 2023 11:32:50 -0400
Subject: [PATCH 22/23] applying review feedback

---
 docs/source/format/CDeviceDataInterface.rst | 50 ++++++++++++---------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/docs/source/format/CDeviceDataInterface.rst b/docs/source/format/CDeviceDataInterface.rst
index 4f97c4590d3..ed3f3fb16d4 100644
--- a/docs/source/format/CDeviceDataInterface.rst
+++ b/docs/source/format/CDeviceDataInterface.rst
@@ -240,7 +240,7 @@ has the following fields:
 
 .. c:member:: struct ArrowArray ArrowDeviceArray.array
 
-    The allocated array data. The values in the ``void**`` buffers (along
+    *Mandatory.* The allocated array data. The values in the ``void**`` buffers (along
     with the buffers of any children) are what is allocated on the device.
     The buffer values should be device pointers. The rest of the structure
     should be accessible to the CPU.
@@ -252,17 +252,17 @@ has the following fields:
 
 .. c:member:: int64_t ArrowDeviceArray.device_id
 
-    The device id to identify a specific device if multiple devices of this
+    *Mandatory.* The device id to identify a specific device if multiple devices of this
     type are on the system. The semantics of the id will be hardware dependent,
     but we use an ``int64_t`` to future-proof the id as devices change over time.
 
 .. c:member:: ArrowDeviceType ArrowDeviceArray.device_type
 
-    The type of the device which can access the buffers in the array.
+    *Mandatory.* The type of the device which can access the buffers in the array.
 
 .. c:member:: void* ArrowDeviceArray.sync_event
 
-    Optional. An event-like object to synchronize on if needed.
+    *Optional.* An event-like object to synchronize on if needed.
 
     Many devices, like GPUs, are primarily asynchronous with respect to
     CPU processing. As such, in order to safely access device memory, it is often
@@ -276,35 +276,24 @@ has the following fields:
     (e.g. ``cudaStreamWaitEvent`` or ``hipStreamWaitEvent``) before attempting
     to access the memory in the buffers.
 
-    Expected types to coerce this ``void*`` to depending on the reported
-    device type:
-
-    * CUDA: ``cudaEvent_t*``
-    * ROCm: ``hipEvent_t*``
-    * OpenCL: ``cl_event*``
-    * Vulkan: ``VkEvent*``
-    * Metal: ``MTLEvent*``
-    * OneAPI: ``sycl::event*``
-
     If an event is provided, then the producer MUST ensure that the exported
     data is available on the device before the event is triggered. The 
     consumer SHOULD wait on the event before trying to access the exported
     data.
 
+.. seealso::
+    The :ref:`synchronization event types <_c-device-data-interface-event-types>`
+    section below.
+
 .. c:member:: int64_t ArrowDeviceArray.reserved[3]
 
     As non-CPU development expands, there may be a need to expand this
     structure. In order to do so without potentially breaking ABI changes,
-    we reserve 24 bytes at the end of the object. This also has the added
-    benefit of bringing the total size of this structure to exactly 128
-    bytes (a power of 2) on 64-bit systems. These bytes MUST be zero'd
+    we reserve 24 bytes at the end of the object. These bytes MUST be zero'd
     out after initialization by the producer in order to ensure safe 
     evolution of the ABI in the future.
 
-.. note::
-    Rather than store the shape / types of the data alongside the
-    ``ArrowDeviceArray``, users should utilize the existing ``ArrowSchema``
-    structure to pass any data type and shape information.
+.. _c-device-data-interface-event-types:
 
 Synchronization event types
 ---------------------------
@@ -362,6 +351,11 @@ Semantics
 Memory management
 -----------------
 
+First and foremost: Out of everything in this interface, it is *only* the
+data buffers themselves which reside in device memory (i.e. the ``buffers`` 
+member of the ``ArrowArray`` struct). Everything else should be in CPU
+memory.
+
 The ``ArrowDeviceArray`` structure contains an ``ArrowArray`` object which
 itself has :ref:`specific semantics <c-data-interface-semantics>` for releasing
 memory. The term *"base structure"* below refers to the ``ArrowDeviceArray``
@@ -471,6 +465,7 @@ could be used for any device:
         status = cudaEventRecord(*ev_ptr, stream);
         assert(status == cudaSuccess);
 
+        memset(array, 0, sizeof(struct ArrowDeviceArray));
         // initialize fields
         *array = (struct ArrowDeviceArray) {
             .array = (struct ArrowArray) {
@@ -567,7 +562,7 @@ streaming source of Arrow arrays. It has the following fields:
 
 .. c:member:: ArrowDeviceType device_type
 
-    The device type that this stream produces data on. All
+    *Mandatory.* The device type that this stream produces data on. All
     ``ArrowDeviceArray``s that are produced by this stream should have the
     same device type as is set here. This is a convenience for the consumer
     to not have to check every array that is retrieved and instead allows
@@ -651,6 +646,17 @@ The stream source is not assumed to be thread-safe. Consumers wanting to
 call ``get_next`` from several threads should ensure those calls are
 serialized.
 
+Interoperability with other interchange formats
+===============================================
+
+Other interchange APIs, such as the `CUDA Array Interface`_, include
+members to pass the shape and the data types of the data buffers being
+exported. This information is necessary to interpret the raw bytes in the
+device data buffers that are being shared. Rather than store the 
+shape / types of the data alongside the ``ArrowDeviceArray``, users 
+should utilize the existing ``ArrowSchema`` structure to pass any data 
+type and shape information.
+
 Updating this specification
 ===========================
 

From e068bc3503f02f966d9d9e21e6ba0da74c33357d Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Tue, 6 Jun 2023 11:38:15 -0400
Subject: [PATCH 23/23] Update cpp/src/arrow/c/abi.h

Co-authored-by: Antoine Pitrou <pitrou@free.fr>
---
 cpp/src/arrow/c/abi.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 7061fa9003f..6abe866b5f6 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-/// \file abi.h Arrow C-Data Interface
+/// \file abi.h Arrow C Data Interface
 ///
-/// The Arrow C-Data interface defines a very small, stable set
+/// The Arrow C Data interface defines a very small, stable set
 /// of C definitions which can be easily copied into any project's
 /// source code and vendored to be used for columnar data interchange
 /// in the Arrow format. For non-C/C++ languages and runtimes,