From 9b15c2daebd3d63ac08c449cdc9b4433e883884c Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Fri, 22 Aug 2025 12:16:41 +0900
Subject: [PATCH] GH-47399: [C++] Update bundled Apache ORC to 2.2.0 with Protobuf patch

Each selected ORC patch is applied by its own "patch -p1 -i <file>"
invocation, chained with COMMAND, because "patch -i" reads exactly one
patch file. This matters when both orc-2345.patch (MSVC) and
orc-2357.patch (Protobuf >= 32) are selected.

---
 cpp/cmake_modules/ThirdpartyToolchain.cmake | 119 +++++++++----------
 cpp/cmake_modules/orc-2345.patch            |  43 +++++++
 cpp/cmake_modules/orc-2357.patch            |  86 ++++++++++++++
 cpp/thirdparty/versions.txt                 |   4 +-
 4 files changed, 187 insertions(+), 65 deletions(-)
 create mode 100644 cpp/cmake_modules/orc-2345.patch
 create mode 100644 cpp/cmake_modules/orc-2357.patch

diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 21bf5f98935..3baf73a358f 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -1024,6 +1024,14 @@ macro(prepare_fetchcontent)
   # We should remove it once we have updated the dependencies:
   # https://github.com/apache/arrow/issues/45985
   set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
+  # Use "NEW" for CMP0077 by default.
+  #
+  # https://cmake.org/cmake/help/latest/policy/CMP0077.html
+  #
+  # option() honors normal variables.
+  set(CMAKE_POLICY_DEFAULT_CMP0077
+      NEW
+      CACHE STRING "")
   set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "")
 
   if(MSVC)
@@ -4599,8 +4607,26 @@ target_include_directories(arrow::hadoop INTERFACE "${HADOOP_HOME}/include")
 # Apache ORC
 
 function(build_orc)
+  list(APPEND CMAKE_MESSAGE_INDENT "Apache ORC: ")
+
   message(STATUS "Building Apache ORC from source")
 
+  set(ORC_PATCHES)
+  if(MSVC)
+    # We can remove this once bundled Apache ORC is 2.2.1 or later.
+    list(APPEND ORC_PATCHES ${CMAKE_CURRENT_LIST_DIR}/orc-2345.patch)
+  endif()
+  if(Protobuf_VERSION VERSION_GREATER_EQUAL 32.0)
+    # We can remove this once bundled Apache ORC is 2.2.1 or later.
+    list(APPEND ORC_PATCHES ${CMAKE_CURRENT_LIST_DIR}/orc-2357.patch)
+  endif()
+  set(ORC_PATCH_COMMAND)
+  foreach(ORC_PATCH IN LISTS ORC_PATCHES)
+    find_program(PATCH patch REQUIRED)
+    list(APPEND ORC_PATCH_COMMAND COMMAND ${PATCH} -p1 -i ${ORC_PATCH})
+  endforeach()
+  list(POP_FRONT ORC_PATCH_COMMAND) # Drop the leading COMMAND separator.
+
   if(LZ4_VENDORED)
     set(ORC_LZ4_TARGET lz4_static)
     set(ORC_LZ4_ROOT "${lz4_SOURCE_DIR}")
@@ -4615,34 +4641,23 @@ function(build_orc)
   if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.29)
     fetchcontent_declare(orc
                          ${FC_DECLARE_COMMON_OPTIONS}
+                         PATCH_COMMAND ${ORC_PATCH_COMMAND}
                          URL ${ORC_SOURCE_URL}
                          URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}")
     prepare_fetchcontent()
 
     set(CMAKE_UNITY_BUILD FALSE)
 
-    set(ORC_PREFER_STATIC_LZ4
-        OFF
-        CACHE BOOL "" FORCE)
-    set(LZ4_HOME
-        "${ORC_LZ4_ROOT}"
-        CACHE STRING "" FORCE)
-    set(LZ4_INCLUDE_DIR
-        "${ORC_LZ4_INCLUDE_DIR}"
-        CACHE STRING "" FORCE)
-    set(LZ4_LIBRARY
-        ${ORC_LZ4_TARGET}
-        CACHE STRING "" FORCE)
+    set(ORC_PREFER_STATIC_LZ4 OFF)
+    set(LZ4_HOME "${ORC_LZ4_ROOT}")
+    set(LZ4_INCLUDE_DIR "${ORC_LZ4_INCLUDE_DIR}")
+    set(LZ4_LIBRARY ${ORC_LZ4_TARGET})
 
-    set(ORC_PREFER_STATIC_PROTOBUF
-        OFF
-        CACHE BOOL "" FORCE)
+    set(ORC_PREFER_STATIC_PROTOBUF OFF)
     get_target_property(PROTOBUF_INCLUDE_DIR ${ARROW_PROTOBUF_LIBPROTOBUF}
                         INTERFACE_INCLUDE_DIRECTORIES)
     get_filename_component(Protobuf_ROOT "${PROTOBUF_INCLUDE_DIR}" DIRECTORY)
-    set(PROTOBUF_HOME
-        ${Protobuf_ROOT}
-        CACHE STRING "" FORCE)
+    set(PROTOBUF_HOME ${Protobuf_ROOT})
     # ORC uses this.
     target_include_directories(${ARROW_PROTOBUF_LIBPROTOC}
                                INTERFACE "${PROTOBUF_INCLUDE_DIR}")
@@ -4650,63 +4665,38 @@ function(build_orc)
     set(PROTOBUF_LIBRARY ${ARROW_PROTOBUF_LIBPROTOBUF})
     set(PROTOC_LIBRARY ${ARROW_PROTOBUF_LIBPROTOC})
 
-    set(ORC_PREFER_STATIC_SNAPPY
-        OFF
-        CACHE BOOL "" FORCE)
+    set(ORC_PREFER_STATIC_SNAPPY OFF)
     get_target_property(SNAPPY_INCLUDE_DIR ${Snappy_TARGET} INTERFACE_INCLUDE_DIRECTORIES)
     get_filename_component(Snappy_ROOT "${SNAPPY_INCLUDE_DIR}" DIRECTORY)
-    set(SNAPPY_HOME
-        ${Snappy_ROOT}
-        CACHE STRING "" FORCE)
-    set(SNAPPY_LIBRARY
-        ${Snappy_TARGET}
-        CACHE STRING "" FORCE)
+    set(SNAPPY_HOME ${Snappy_ROOT})
+    set(SNAPPY_LIBRARY ${Snappy_TARGET})
 
-    set(ORC_PREFER_STATIC_ZLIB
-        OFF
-        CACHE BOOL "" FORCE)
+    set(ORC_PREFER_STATIC_ZLIB OFF)
     get_target_property(ZLIB_INCLUDE_DIR ZLIB::ZLIB INTERFACE_INCLUDE_DIRECTORIES)
     get_filename_component(ZLIB_ROOT "${ZLIB_INCLUDE_DIR}" DIRECTORY)
-    set(ZLIB_HOME
-        ${ZLIB_ROOT}
-        CACHE STRING "" FORCE)
-    # From CMake 3.21 onwards the set(CACHE) command does not remove any normal
-    # variable of the same name from the current scope. We have to manually remove
-    # the variable via unset to avoid ORC not finding the ZLIB_LIBRARY.
+    set(ZLIB_HOME ${ZLIB_ROOT})
+    # From CMake 3.21 onwards the set(CACHE) command does not remove
+    # any normal variable of the same name from the current scope. We
+    # have to manually remove the variable via unset to avoid ORC not
+    # finding the ZLIB_LIBRARY.
     unset(ZLIB_LIBRARY)
     set(ZLIB_LIBRARY
         ZLIB::ZLIB
         CACHE STRING "" FORCE)
 
-    set(ORC_PREFER_STATIC_ZSTD
-        OFF
-        CACHE BOOL "" FORCE)
+    set(ORC_PREFER_STATIC_ZSTD OFF)
     get_target_property(ZSTD_INCLUDE_DIR ${ARROW_ZSTD_LIBZSTD}
                         INTERFACE_INCLUDE_DIRECTORIES)
     get_filename_component(ZSTD_ROOT "${ZSTD_INCLUDE_DIR}" DIRECTORY)
-    set(ZSTD_HOME
-        ${ZSTD_ROOT}
-        CACHE STRING "" FORCE)
+    set(ZSTD_HOME ${ZSTD_ROOT})
     set(ZSTD_LIBRARY ${ARROW_ZSTD_LIBZSTD})
 
-    set(BUILD_CPP_TESTS
-        OFF
-        CACHE BOOL "" FORCE)
-    set(BUILD_JAVA
-        OFF
-        CACHE BOOL "" FORCE)
-    set(BUILD_LIBHDFSPP
-        OFF
-        CACHE BOOL "" FORCE)
-    set(BUILD_TOOLS
-        OFF
-        CACHE BOOL "" FORCE)
-    set(INSTALL_VENDORED_LIBS
-        OFF
-        CACHE BOOL "" FORCE)
-    set(STOP_BUILD_ON_WARNING
-        OFF
-        CACHE BOOL "" FORCE)
+    set(BUILD_CPP_TESTS OFF)
+    set(BUILD_JAVA OFF)
+    set(BUILD_LIBHDFSPP OFF)
+    set(BUILD_TOOLS OFF)
+    set(INSTALL_VENDORED_LIBS OFF)
+    set(STOP_BUILD_ON_WARNING OFF)
 
     fetchcontent_makeavailable(orc)
 
@@ -4769,8 +4759,6 @@ function(build_orc)
 
     externalproject_add(orc_ep
                         ${EP_COMMON_OPTIONS}
-                        URL ${ORC_SOURCE_URL}
-                        URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}"
                         BUILD_BYPRODUCTS ${ORC_STATIC_LIB}
                         CMAKE_ARGS ${ORC_CMAKE_ARGS}
                         DEPENDS ${ARROW_PROTOBUF_LIBPROTOBUF}
@@ -4778,7 +4766,10 @@ function(build_orc)
                                 ${ARROW_ZSTD_LIBZSTD}
                                 ${Snappy_TARGET}
                                 ${ORC_LZ4_TARGET}
-                                ZLIB::ZLIB)
+                                ZLIB::ZLIB
+                        PATCH_COMMAND ${ORC_PATCH_COMMAND}
+                        URL ${ORC_SOURCE_URL}
+                        URL_HASH "SHA256=${ARROW_ORC_BUILD_SHA256_CHECKSUM}")
     add_library(orc::orc STATIC IMPORTED)
     set_target_properties(orc::orc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}")
     target_include_directories(orc::orc BEFORE INTERFACE "${ORC_INCLUDE_DIR}")
@@ -4806,6 +4797,8 @@ function(build_orc)
   set(ARROW_BUNDLED_STATIC_LIBS
       ${ARROW_BUNDLED_STATIC_LIBS}
       PARENT_SCOPE)
+
+  list(POP_BACK CMAKE_MESSAGE_INDENT)
 endfunction()
 
 if(ARROW_ORC)
diff --git a/cpp/cmake_modules/orc-2345.patch b/cpp/cmake_modules/orc-2345.patch
new file mode 100644
index 00000000000..ee5e38d6e6a
--- /dev/null
+++ b/cpp/cmake_modules/orc-2345.patch
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+From a76249e13a6e364e0507a12cb71abaaf1647252e Mon Sep 17 00:00:00 2001
+From: Yuriy Chernyshov
+Date: Thu, 31 Jul 2025 13:20:15 +0200
+Subject: [PATCH] Fix Windows build
+
+See
+https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/byteswap-uint64-byteswap-ulong-byteswap-ushort?view=msvc-170
+---
+ c++/src/Geospatial.cc | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc
+index 6d7d268703..2b110cacb6 100644
+--- a/c++/src/Geospatial.cc
++++ b/c++/src/Geospatial.cc
+@@ -66,8 +66,8 @@ namespace orc::geospatial {
+ 
+ #if defined(_MSC_VER)
+ #include <intrin.h>  // IWYU pragma: keep
+-#define ORC_BYTE_SWAP64 _byteSwap_uint64
+-#define ORC_BYTE_SWAP32 _byteSwap_ulong
++#define ORC_BYTE_SWAP64 _byteswap_uint64
++#define ORC_BYTE_SWAP32 _byteswap_ulong
+ #else
+ #define ORC_BYTE_SWAP64 __builtin_bswap64
+ #define ORC_BYTE_SWAP32 __builtin_bswap32
diff --git a/cpp/cmake_modules/orc-2357.patch b/cpp/cmake_modules/orc-2357.patch
new file mode 100644
index 00000000000..41096e10429
--- /dev/null
+++ b/cpp/cmake_modules/orc-2357.patch
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+From a66baec5731b65a81189f48c242433d01580f344 Mon Sep 17 00:00:00 2001
+From: Dongjoon Hyun
+Date: Fri, 15 Aug 2025 12:31:09 -0700
+Subject: [PATCH] ORC-1973: [C++] Use `int64_t` instead of
+ `google::protobuf::int64`
+
+---
+ c++/src/io/InputStream.cc  | 4 ++--
+ c++/src/io/InputStream.hh  | 2 +-
+ c++/src/io/OutputStream.cc | 4 ++--
+ c++/src/io/OutputStream.hh | 2 +-
+ 4 files changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/c++/src/io/InputStream.cc b/c++/src/io/InputStream.cc
+index 06ef40bd4c..5e1dc00ccd 100644
+--- a/c++/src/io/InputStream.cc
++++ b/c++/src/io/InputStream.cc
+@@ -112,8 +112,8 @@ namespace orc {
+     return false;
+   }
+ 
+-  google::protobuf::int64 SeekableArrayInputStream::ByteCount() const {
+-    return static_cast<google::protobuf::int64>(position_);
++  int64_t SeekableArrayInputStream::ByteCount() const {
++    return static_cast<int64_t>(position_);
+   }
+ 
+   void SeekableArrayInputStream::seek(PositionProvider& seekPosition) {
+diff --git a/c++/src/io/InputStream.hh b/c++/src/io/InputStream.hh
+index 07aa623b5f..8b251c9301 100644
+--- a/c++/src/io/InputStream.hh
++++ b/c++/src/io/InputStream.hh
+@@ -72,7 +72,7 @@ namespace orc {
+     virtual bool Next(const void** data, int* size) override;
+     virtual void BackUp(int count) override;
+     virtual bool Skip(int count) override;
+-    virtual google::protobuf::int64 ByteCount() const override;
++    virtual int64_t ByteCount() const override;
+     virtual void seek(PositionProvider& position) override;
+     virtual std::string getName() const override;
+   };
+diff --git a/c++/src/io/OutputStream.cc b/c++/src/io/OutputStream.cc
+index fbf1ca61dd..a55050d122 100644
+--- a/c++/src/io/OutputStream.cc
++++ b/c++/src/io/OutputStream.cc
+@@ -65,8 +65,8 @@ namespace orc {
+     // PASS
+   }
+ 
+-  google::protobuf::int64 BufferedOutputStream::ByteCount() const {
+-    return static_cast<google::protobuf::int64>(dataBuffer_->size());
++  int64_t BufferedOutputStream::ByteCount() const {
++    return static_cast<int64_t>(dataBuffer_->size());
+   }
+ 
+   bool BufferedOutputStream::WriteAliasedRaw(const void*, int) {
+diff --git a/c++/src/io/OutputStream.hh b/c++/src/io/OutputStream.hh
+index 6319de96d6..b029818125 100644
+--- a/c++/src/io/OutputStream.hh
++++ b/c++/src/io/OutputStream.hh
+@@ -61,7 +61,7 @@ namespace orc {
+ 
+     virtual bool Next(void** data, int* size) override;
+     virtual void BackUp(int count) override;
+-    virtual google::protobuf::int64 ByteCount() const override;
++    virtual int64_t ByteCount() const override;
+     virtual bool WriteAliasedRaw(const void* data, int size) override;
+     virtual bool AllowsAliasing() const override;
+ 
diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt
index 2504e6fd3dd..9f16db79f12 100644
--- a/cpp/thirdparty/versions.txt
+++ b/cpp/thirdparty/versions.txt
@@ -90,8 +90,8 @@ ARROW_OPENTELEMETRY_BUILD_VERSION=v1.21.0
 ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM=98e5546f577a11b52a57faed1f4cc60d8c1daa44760eba393f43eab5a8ec46a2
 ARROW_OPENTELEMETRY_PROTO_BUILD_VERSION=v1.7.0
 ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM=11330d850f5e24d34c4246bc8cb21fcd311e7565d219195713455a576bb11bed
-ARROW_ORC_BUILD_VERSION=2.1.2
-ARROW_ORC_BUILD_SHA256_CHECKSUM=55451e65dea6ed42afb39fe33a88f9dcea8928dca0a0c9c23ef5545587810b4c
+ARROW_ORC_BUILD_VERSION=2.2.0
+ARROW_ORC_BUILD_SHA256_CHECKSUM=b15aca45a7e73ffbd1bbc36a78cd1422d41f07721092a25f43448e6e16f4763b
 ARROW_PROTOBUF_BUILD_VERSION=v21.3
 ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM=2f723218f6cb709ae4cdc4fb5ed56a5951fc5d466f0128ce4c946b8c78c8c49f
 # Because of https://github.com/Tencent/rapidjson/pull/1323, we require