From a7f085761677b6dfa0d780b303ad8aeb6f22644e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= <raulcumplido@gmail.com>
Date: Fri, 4 Apr 2025 09:16:26 +0200
Subject: [PATCH 1/4] GH-46025: [C++] Use ARROW_CUDA_EXPORT instead of
 ARROW_EXPORT for libarrow_cuda

---
 .env                               |  2 +-
 cpp/src/arrow/gpu/CMakeLists.txt   |  2 +-
 cpp/src/arrow/gpu/cuda_arrow_ipc.h |  6 +++---
 cpp/src/arrow/gpu/cuda_context.h   | 22 +++++++++++-----------
 cpp/src/arrow/gpu/cuda_internal.h  |  3 ++-
 cpp/src/arrow/gpu/cuda_memory.h    | 17 +++++++++--------
 6 files changed, 27 insertions(+), 25 deletions(-)
diff --git a/.env b/.env
index bdb74d89e1c..d2badc1e342 100644
--- a/.env
+++ b/.env
@@ -55,7 +55,7 @@ UBUNTU=22.04
 # Default versions for various dependencies
 CLANG_TOOLS=14
 CMAKE=3.25.0
-CUDA=11.2.2
+CUDA=11.7.1
 DASK=latest
 DOTNET=8.0
 GCC=
diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt
index 7f8650c7dc8..f9ebb14be75 100644
--- a/cpp/src/arrow/gpu/CMakeLists.txt
+++ b/cpp/src/arrow/gpu/CMakeLists.txt
@@ -79,7 +79,7 @@ add_arrow_lib(arrow_cuda
 add_dependencies(arrow_cuda ${ARROW_CUDA_LIBRARIES})
 
 foreach(LIB_TARGET ${ARROW_CUDA_LIBRARIES})
-  target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING)
+  target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_CUDA_EXPORTING)
 endforeach()
 
 # CUDA build version
diff --git a/cpp/src/arrow/gpu/cuda_arrow_ipc.h b/cpp/src/arrow/gpu/cuda_arrow_ipc.h
index b7200a94b93..1cdacdf343d 100644
--- a/cpp/src/arrow/gpu/cuda_arrow_ipc.h
+++ b/cpp/src/arrow/gpu/cuda_arrow_ipc.h
@@ -22,9 +22,9 @@
 
 #include "arrow/buffer.h"
 #include "arrow/type_fwd.h"
-#include "arrow/util/visibility.h"
 
 #include "arrow/gpu/cuda_memory.h"
+#include "arrow/gpu/visibility.h"
 
 namespace arrow {
 
@@ -49,7 +49,7 @@ namespace cuda {
 /// \param[in] batch record batch to write
 /// \param[in] ctx CudaContext to allocate device memory from
 /// \return CudaBuffer or Status
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 Result<std::shared_ptr<CudaBuffer>> SerializeRecordBatch(const RecordBatch& batch,
                                                          CudaContext* ctx);
 
@@ -61,7 +61,7 @@ Result<std::shared_ptr<CudaBuffer>> SerializeRecordBatch(const RecordBatch& batc
 /// \param[in] buffer a CudaBuffer containing the complete IPC message
 /// \param[in] pool a MemoryPool to use for allocating space for the metadata
 /// \return RecordBatch or Status
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
     const std::shared_ptr<Schema>& schema, const ipc::DictionaryMemo* dictionary_memo,
     const std::shared_ptr<CudaBuffer>& buffer, MemoryPool* pool = default_memory_pool());
diff --git a/cpp/src/arrow/gpu/cuda_context.h b/cpp/src/arrow/gpu/cuda_context.h
index 56c4f320352..35657590c67 100644
--- a/cpp/src/arrow/gpu/cuda_context.h
+++ b/cpp/src/arrow/gpu/cuda_context.h
@@ -24,8 +24,8 @@
 #include <cuda.h>
 
 #include "arrow/device.h"
+#include "arrow/gpu/visibility.h"
 #include "arrow/result.h"
-#include "arrow/util/visibility.h"
 
 namespace arrow {
 namespace cuda {
@@ -41,7 +41,7 @@ class CudaMemoryManager;
 
 // XXX Should CudaContext be merged into CudaMemoryManager?
 
-class ARROW_EXPORT CudaDeviceManager {
+class ARROW_CUDA_EXPORT CudaDeviceManager {
  public:
   static Result<CudaDeviceManager*> Instance();
 
@@ -88,7 +88,7 @@ class ARROW_EXPORT CudaDeviceManager {
 ///
 /// Each CudaDevice instance is tied to a particular CUDA device
 /// (identified by its logical device number).
-class ARROW_EXPORT CudaDevice : public Device {
+class ARROW_CUDA_EXPORT CudaDevice : public Device {
  public:
   const char* type_name() const override;
   std::string ToString() const override;
@@ -148,7 +148,7 @@ class ARROW_EXPORT CudaDevice : public Device {
   /// and freed using cuStreamCreate and cuStreamDestroy (or equivalent).
   /// Default construction will use the cuda default stream, and does not allow
   /// construction from literal 0 or nullptr.
-  class ARROW_EXPORT Stream : public Device::Stream {
+  class ARROW_CUDA_EXPORT Stream : public Device::Stream {
    public:
     ~Stream() = default;
 
@@ -195,7 +195,7 @@ class ARROW_EXPORT CudaDevice : public Device {
   Result<std::shared_ptr<Device::Stream>> WrapStream(
       void* device_stream, Stream::release_fn_t release_fn) override;
 
-  class ARROW_EXPORT SyncEvent : public Device::SyncEvent {
+  class ARROW_CUDA_EXPORT SyncEvent : public Device::SyncEvent {
    public:
     [[nodiscard]] CUevent value() const {
       if (sync_event_) {
@@ -240,17 +240,17 @@ class ARROW_EXPORT CudaDevice : public Device {
 };
 
 /// \brief Return whether a device instance is a CudaDevice
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 bool IsCudaDevice(const Device& device);
 
 /// \brief Cast a device instance to a CudaDevice
 ///
 /// An error is returned if the device is not a CudaDevice.
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 Result<std::shared_ptr<CudaDevice>> AsCudaDevice(const std::shared_ptr<Device>& device);
 
 /// \brief MemoryManager implementation for CUDA
-class ARROW_EXPORT CudaMemoryManager : public MemoryManager {
+class ARROW_CUDA_EXPORT CudaMemoryManager : public MemoryManager {
  public:
   Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(
       std::shared_ptr<Buffer> buf) override;
@@ -304,19 +304,19 @@ class ARROW_EXPORT CudaMemoryManager : public MemoryManager {
 };
 
 /// \brief Return whether a MemoryManager instance is a CudaMemoryManager
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 bool IsCudaMemoryManager(const MemoryManager& mm);
 
 /// \brief Cast a MemoryManager instance to a CudaMemoryManager
 ///
 /// An error is returned if the MemoryManager is not a CudaMemoryManager.
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 Result<std::shared_ptr<CudaMemoryManager>> AsCudaMemoryManager(
     const std::shared_ptr<MemoryManager>& mm);
 
 /// \class CudaContext
 /// \brief Object-oriented interface to the low-level CUDA driver API
-class ARROW_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext> {
+class ARROW_CUDA_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext> {
  public:
   ~CudaContext();
 
diff --git a/cpp/src/arrow/gpu/cuda_internal.h b/cpp/src/arrow/gpu/cuda_internal.h
index d70873634f8..71a9a8d8a05 100644
--- a/cpp/src/arrow/gpu/cuda_internal.h
+++ b/cpp/src/arrow/gpu/cuda_internal.h
@@ -25,6 +25,7 @@
 #include <cuda.h>
 
 #include "arrow/gpu/cuda_context.h"
+#include "arrow/gpu/visibility.h"
 #include "arrow/status.h"
 
 namespace arrow {
@@ -33,7 +34,7 @@ namespace internal {
 
 std::string CudaErrorDescription(CUresult err);
 
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 Status StatusFromCuda(CUresult res, const char* function_name = nullptr);
 
 #define CU_RETURN_NOT_OK(FUNC_NAME, STMT)                               \
diff --git a/cpp/src/arrow/gpu/cuda_memory.h b/cpp/src/arrow/gpu/cuda_memory.h
index 488f4183730..ce4b4b11cb2 100644
--- a/cpp/src/arrow/gpu/cuda_memory.h
+++ b/cpp/src/arrow/gpu/cuda_memory.h
@@ -22,6 +22,7 @@
 
 #include "arrow/buffer.h"
 #include "arrow/c/abi.h"
+#include "arrow/gpu/visibility.h"
 #include "arrow/io/concurrency.h"
 #include "arrow/type_fwd.h"
 
@@ -35,7 +36,7 @@ class CudaIpcMemHandle;
 /// \brief An Arrow buffer located on a GPU device
 ///
 /// Be careful using this in any Arrow code which may not be GPU-aware
-class ARROW_EXPORT CudaBuffer : public Buffer {
+class ARROW_CUDA_EXPORT CudaBuffer : public Buffer {
  public:
   // XXX deprecate?
   CudaBuffer(uint8_t* data, int64_t size, const std::shared_ptr<CudaContext>& context,
@@ -109,7 +110,7 @@ class ARROW_EXPORT CudaBuffer : public Buffer {
 
 /// \class CudaHostBuffer
 /// \brief Device-accessible CPU memory created using cudaHostAlloc
-class ARROW_EXPORT CudaHostBuffer : public MutableBuffer {
+class ARROW_CUDA_EXPORT CudaHostBuffer : public MutableBuffer {
  public:
   CudaHostBuffer(uint8_t* data, const int64_t size);
 
@@ -121,7 +122,7 @@ class ARROW_EXPORT CudaHostBuffer : public MutableBuffer {
 
 /// \class CudaIpcHandle
 /// \brief A container for a CUDA IPC handle
-class ARROW_EXPORT CudaIpcMemHandle {
+class ARROW_CUDA_EXPORT CudaIpcMemHandle {
  public:
   ~CudaIpcMemHandle();
 
@@ -158,7 +159,7 @@ class ARROW_EXPORT CudaIpcMemHandle {
 /// pointing to CPU memory.
 /// Reading to a raw pointer, though, copies device memory into the host
 /// memory pointed to.
-class ARROW_EXPORT CudaBufferReader
+class ARROW_CUDA_EXPORT CudaBufferReader
     : public ::arrow::io::internal::RandomAccessFileConcurrencyWrapper<CudaBufferReader> {
  public:
   explicit CudaBufferReader(const std::shared_ptr<Buffer>& buffer);
@@ -200,7 +201,7 @@ class ARROW_EXPORT CudaBufferReader
 
 /// \class CudaBufferWriter
 /// \brief File interface for writing to CUDA buffers, with optional buffering
-class ARROW_EXPORT CudaBufferWriter : public io::WritableFile {
+class ARROW_CUDA_EXPORT CudaBufferWriter : public io::WritableFile {
  public:
   explicit CudaBufferWriter(const std::shared_ptr<CudaBuffer>& buffer);
   ~CudaBufferWriter() override;
@@ -247,17 +248,17 @@ class ARROW_EXPORT CudaBufferWriter : public io::WritableFile {
 /// \param[in] device_number device to expose host memory
 /// \param[in] size number of bytes
 /// \return Host buffer or Status
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 Result<std::shared_ptr<CudaHostBuffer>> AllocateCudaHostBuffer(int device_number,
                                                                const int64_t size);
 
 /// Low-level: get a device address through which the CPU data be accessed.
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 Result<uintptr_t> GetDeviceAddress(const uint8_t* cpu_data,
                                    const std::shared_ptr<CudaContext>& ctx);
 
 /// Low-level: get a CPU address through which the device data be accessed.
-ARROW_EXPORT
+ARROW_CUDA_EXPORT
 Result<uint8_t*> GetHostAddress(uintptr_t device_ptr);
 
 ARROW_DEPRECATED(

From 6ce6a12f0487bb6ce4e8150ecd6dc76b7ced8519 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= <raulcumplido@gmail.com>
Date: Fri, 4 Apr 2025 09:19:47 +0200
Subject: [PATCH 2/4] Add missing new file to previous commit :)

---
 cpp/src/arrow/gpu/visibility.h | 49 ++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 cpp/src/arrow/gpu/visibility.h

diff --git a/cpp/src/arrow/gpu/visibility.h b/cpp/src/arrow/gpu/visibility.h
new file mode 100644
index 00000000000..3ba75a12bdc
--- /dev/null
+++ b/cpp/src/arrow/gpu/visibility.h
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_CUDA_STATIC
+#    define ARROW_CUDA_EXPORT
+#  elif defined(ARROW_CUDA_EXPORTING)
+#    define ARROW_CUDA_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_CUDA_EXPORT __declspec(dllimport)
+#  endif
+
+#  define ARROW_CUDA_NO_EXPORT
+
+#  if defined(_MSC_VER)
+#    pragma warning(pop)
+#  endif
+
+#else  // Not Windows
+#  ifndef ARROW_CUDA_EXPORT
+#    define ARROW_CUDA_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_CUDA_NO_EXPORT
+#    define ARROW_CUDA_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif

From f60ca031390ea1c5b1ddb367daa835bcd10083e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= <raulcumplido@gmail.com>
Date: Fri, 4 Apr 2025 12:29:26 +0200
Subject: [PATCH 3/4] Add new flags needed for pkg-config when static linking
 the library

---
 cpp/src/arrow/gpu/arrow-cuda.pc.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/arrow/gpu/arrow-cuda.pc.in b/cpp/src/arrow/gpu/arrow-cuda.pc.in
index 173d7d91ef9..e3a4d7d6a14 100644
--- a/cpp/src/arrow/gpu/arrow-cuda.pc.in
+++ b/cpp/src/arrow/gpu/arrow-cuda.pc.in
@@ -25,3 +25,4 @@ Version: @ARROW_VERSION@
 Requires: arrow cuda
 Libs: -L${libdir} -larrow_cuda
 Cflags: -I${includedir}
+Cflags.private: -DARROW_CUDA_STATIC

From 9aa1470710dc85d8299ce40c7e73c8393451c050 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= <raulcumplido@gmail.com>
Date: Mon, 7 Apr 2025 12:27:27 +0200
Subject: [PATCH 4/4] Fix Cflags and Cflags.private to include correct flags
 when static linking even when only libarrow_cida.a is being built

---
 cpp/CMakeLists.txt                 |  7 +++++++
 cpp/src/arrow/gpu/CMakeLists.txt   | 11 +++++++++++
 cpp/src/arrow/gpu/arrow-cuda.pc.in |  4 ++--
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d1b7f3f31bd..d6ecd2a355f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -553,6 +553,13 @@ if(ARROW_BUILD_STATIC)
   string(APPEND ARROW_ACERO_PC_CFLAGS_PRIVATE " -DARROW_ACERO_STATIC")
 endif()
 
+# For arrow-cuda.pc.
+set(ARROW_CUDA_PC_CFLAGS "")
+set(ARROW_CUDA_PC_CFLAGS_PRIVATE "")
+if(ARROW_BUILD_STATIC)
+  string(APPEND ARROW_CUDA_PC_CFLAGS_PRIVATE " -DARROW_CUDA_STATIC")
+endif()
+
 # For arrow-dataset.pc.
 set(ARROW_DATASET_PC_CFLAGS "")
 set(ARROW_DATASET_PC_CFLAGS_PRIVATE "")
diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt
index f9ebb14be75..9723cea7c23 100644
--- a/cpp/src/arrow/gpu/CMakeLists.txt
+++ b/cpp/src/arrow/gpu/CMakeLists.txt
@@ -45,6 +45,17 @@ else()
   set(ARROW_CUDA_SHARED_LINK_LIBS CUDA::cuda_driver)
 endif()
 
+# If only libarrow_cuda.a is built, "pkg-config --cflags --libs
+# arrow-cuda" will include build flags for static linking not shared
+# linking.
+# The variables ARROW_CUDA_PC_* (except ARROW_CUDA_PC_*_PRIVATE) are used
+# specifically for the static linking case to ensure the correct flags
+# are provided.
+if(NOT ARROW_BUILD_SHARED AND ARROW_BUILD_STATIC)
+  string(APPEND ARROW_CUDA_PC_CFLAGS "${ARROW_CUDA_PC_CFLAGS_PRIVATE}")
+  set(ARROW_CUDA_PC_CFLAGS_PRIVATE "")
+endif()
+
 set(ARROW_CUDA_SRCS cuda_arrow_ipc.cc cuda_context.cc cuda_internal.cc cuda_memory.cc)
 
 set(ARROW_CUDA_PKG_CONFIG_NAME_ARGS)
diff --git a/cpp/src/arrow/gpu/arrow-cuda.pc.in b/cpp/src/arrow/gpu/arrow-cuda.pc.in
index e3a4d7d6a14..5a2b13fff55 100644
--- a/cpp/src/arrow/gpu/arrow-cuda.pc.in
+++ b/cpp/src/arrow/gpu/arrow-cuda.pc.in
@@ -24,5 +24,5 @@ Description: CUDA integration library for Apache Arrow
 Version: @ARROW_VERSION@
 Requires: arrow cuda
 Libs: -L${libdir} -larrow_cuda
-Cflags: -I${includedir}
-Cflags.private: -DARROW_CUDA_STATIC
+Cflags: -I${includedir}@ARROW_CUDA_PC_CFLAGS@
+Cflags.private:@ARROW_CUDA_PC_CFLAGS_PRIVATE@