From 11213a41190767110fd863dfddfe7925c3293fda Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 19 Jun 2025 09:20:13 -0400 Subject: [PATCH 1/6] GH-46375: [C++] Add adapters/orc directory to Meson --- cpp/meson.build | 9 +++--- cpp/meson.options | 6 ++++ cpp/src/arrow/adapters/orc/meson.build | 39 ++++++++++++++++++++++++++ cpp/src/arrow/meson.build | 20 +++++++++++++ cpp/src/arrow/util/meson.build | 2 +- cpp/subprojects/apache-orc.wrap | 27 ++++++++++++++++++ 6 files changed, 98 insertions(+), 5 deletions(-) create mode 100644 cpp/src/arrow/adapters/orc/meson.build create mode 100644 cpp/subprojects/apache-orc.wrap diff --git a/cpp/meson.build b/cpp/meson.build index 81143ed1e28..ea7b50ec94b 100644 --- a/cpp/meson.build +++ b/cpp/meson.build @@ -95,12 +95,13 @@ needs_testing = (get_option('testing').enabled() or needs_integration ) needs_json = get_option('json').enabled() or needs_testing +needs_orc = get_option('orc').enabled() needs_brotli = get_option('brotli').enabled() or needs_fuzzing needs_bz2 = get_option('bz2').enabled() -needs_lz4 = get_option('lz4').enabled() -needs_snappy = get_option('snappy').enabled() -needs_zlib = get_option('zlib').enabled() -needs_zstd = get_option('zstd').enabled() +needs_lz4 = get_option('lz4').enabled() or needs_orc +needs_snappy = get_option('snappy').enabled() or needs_orc +needs_zlib = get_option('zlib').enabled() or needs_orc +needs_zstd = get_option('zstd').enabled() or needs_orc needs_utilities = get_option('utilities').enabled() subdir('src/arrow') diff --git a/cpp/meson.options b/cpp/meson.options index 668f440ee72..baf51c8c5bd 100644 --- a/cpp/meson.options +++ b/cpp/meson.options @@ -84,6 +84,12 @@ option('git_description', type: 'string') option('lz4', type: 'feature', description: 'Build with lz4 compression') +option( + 'orc', + type: 'feature', + description: 'Build the Arrow ORC adapter', +) + option( 'package_kind', type: 'string', diff --git a/cpp/src/arrow/adapters/orc/meson.build b/cpp/src/arrow/adapters/orc/meson.build new file mode 100644 index 00000000000..dacea07108b --- /dev/null +++ b/cpp/src/arrow/adapters/orc/meson.build @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers(['adapter.h', 'options.h'], subdir: 'arrow/adapters/orc') + +arrow_orc_dep = declare_dependency( + include_directories: include_directories('.'), + dependencies: [arrow_dep], +) + +meson.override_dependency('arrow-orc', arrow_orc_dep) + +pkg.generate( + filebase: 'arrow-orc', + name: 'Apache Arrow ORC', + description: 'ORC modules for Apache Arrow', + requires: ['arrow'], +) + +exc = executable( + 'arrow-orc-adapter-test', + sources: ['adapter_test.cc'], + dependencies: [arrow_test_dep, orc_dep], +) +test('arrow-orc-adapter-test', exc) diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index a04fdf88c2d..baf9ca1359c 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -472,6 +472,22 @@ if needs_json } endif +if needs_orc + orc_dep = dependency('orc') + arrow_components += { + 'arrow_orc': { + 'sources': files( + 'adapters/orc/adapter.cc', + 'adapters/orc/options.cc', + 'adapters/orc/util.cc', + ), + 'dependencies': [orc_dep], + }, + } +else + orc_dep = disabler() +endif + arrow_srcs = [] include_dir = include_directories('..') arrow_includes = [include_dir] @@ -835,6 +851,10 @@ if needs_json subdir('json') endif +if needs_orc + subdir('adapters/orc') +endif + if needs_ipc subdir('ipc') endif diff --git a/cpp/src/arrow/util/meson.build b/cpp/src/arrow/util/meson.build index 2fbbedbb931..6edcd716896 100644 --- a/cpp/src/arrow/util/meson.build +++ b/cpp/src/arrow/util/meson.build @@ -49,7 +49,7 @@ conf_data.set('ARROW_JEMALLOC', false) conf_data.set('ARROW_JEMALLOC_VENDORED', false) conf_data.set('ARROW_JSON', needs_json) conf_data.set('ARROW_MIMALLOC', false) -conf_data.set('ARROW_ORC', false) +conf_data.set('ARROW_ORC', needs_orc) conf_data.set('ARROW_PARQUET', needs_parquet) conf_data.set('ARROW_SUBSTRAIT', false) conf_data.set('ARROW_AZURE', false) diff --git a/cpp/subprojects/apache-orc.wrap b/cpp/subprojects/apache-orc.wrap new file mode 100644 index 00000000000..b95657cf5bd --- /dev/null +++ b/cpp/subprojects/apache-orc.wrap @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[wrap-file] +directory = orc-2.2.0 +source_url = https://www.apache.org/dyn/closer.lua?action=download&filename=orc/orc-2.2.0/orc-2.2.0.tar.gz +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/apache-orc_2.2.0-1/orc-2.2.0.tar.gz +source_filename = orc-2.2.0.tar.gz +source_hash = b15aca45a7e73ffbd1bbc36a78cd1422d41f07721092a25f43448e6e16f4763b +wrapdb_version = 2.2.0-1 + +[provide] +orc = orc_dep From 421c466559f288956896433e94116c68315fa971 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 27 Aug 2025 11:05:59 -0400 Subject: [PATCH 2/6] Fix UB --- cpp/src/arrow/adapters/orc/util.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/adapters/orc/util.cc b/cpp/src/arrow/adapters/orc/util.cc index 6974faae59b..68d062f125f 100644 --- a/cpp/src/arrow/adapters/orc/util.cc +++ b/cpp/src/arrow/adapters/orc/util.cc @@ -212,7 +212,10 @@ Status AppendTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch, const int64_t* seconds = batch->data.data() + offset; const int64_t* nanos = batch->nanoseconds.data() + offset; - auto transform_timestamp = [seconds, nanos](int64_t index) { + auto transform_timestamp = [seconds, nanos, valid_bytes](int64_t index) -> int64_t { + if (valid_bytes && !valid_bytes[index]) { + return 0; + } return seconds[index] * kOneSecondNanos + nanos[index]; }; From ecbf7d5147fb79329258a5d38efa962b7a17ed74 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 27 Aug 2025 11:15:23 -0400 Subject: [PATCH 3/6] debug with sanitizers --- ci/scripts/cpp_build.sh | 7 +++---- ci/scripts/cpp_test.sh | 3 ++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 3a4431239f1..750859eaad7 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -143,10 +143,9 @@ if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then --prefix=${MESON_PREFIX:-${ARROW_HOME}} \ --buildtype=${ARROW_BUILD_TYPE:-debug} \ --pkg-config-path="${CONDA_PREFIX}/lib/pkgconfig/" \ - -Dauto_features=enabled \ - -Dfuzzing=disabled \ - -Dgcs=disabled \ - -Ds3=disabled \ + -Db_sanitize=address,undefined \ + -Dorc=enabled \ + -Dtests=enabled \ . \ ${source_dir} diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 88c06849c8b..80d4a548399 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -91,9 +91,10 @@ if [ -z "${PYTHON}" ] && ! which python > /dev/null 2>&1; then fi if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then ARROW_BUILD_EXAMPLES=OFF # TODO: Remove this - meson test \ + meson test arrow-orc-adapter-test \ --no-rebuild \ --print-errorlogs \ + --max-lines=0 \ --suite arrow \ "$@" else From 7fab6ea9f51adfc6cebbcbe8e662f048b2d75caf Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 3 Sep 2025 11:43:13 -0400 Subject: [PATCH 4/6] Remove conda-install orc --- ci/conda_env_cpp.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index 731b49fa462..e950be7554b 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -40,7 +40,6 @@ make meson ninja nodejs -orc pkg-config python rapidjson From b16a0db8c5af9eec0cee658b5abcc13748339859 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 3 Sep 2025 13:11:38 -0400 Subject: [PATCH 5/6] try custom detection --- ci/scripts/cpp_build.sh | 3 +++ cpp/src/arrow/meson.build | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 750859eaad7..03f5d374c7d 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -43,6 +43,9 @@ if [ ! -z "${CONDA_PREFIX}" ] && [ "${ARROW_EMSCRIPTEN:-OFF}" = "OFF" ]; then fi export ARROW_CMAKE_ARGS export ARROW_GANDIVA_PC_CXX_FLAGS=$(echo | ${CXX} -E -Wp,-v -xc++ - 2>&1 | grep '^ ' | awk '{print "-isystem;" substr($1, 1)}' | tr '\n' ';') + + export LIBRARY_PATH="${CONDA_PREFIX}/lib" + export CPLUS_INCLUDE_PATH="${CONDA_PREFIX}/include" elif [ -x "$(command -v xcrun)" ]; then export ARROW_GANDIVA_PC_CXX_FLAGS="-isysroot;$(xcrun --show-sdk-path)" fi diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index baf9ca1359c..8da4685fe42 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -473,7 +473,22 @@ if needs_json endif if needs_orc - orc_dep = dependency('orc') + # not all versions of orc installed via conda distribute + # cmake/pkgconfig information, so they are undetectable + # without custom logic + orc_headers = ['orc/orc-config.hh'] + orc_lib = cpp_compiler.find_library( + 'orc', + has_headers: orc_headers, + required: false, + ) + + if orc_lib.found() + orc_dep = declare_dependency(dependencies: orc_lib) + else + orc_dep = dependency('orc') + endif + arrow_components += { 'arrow_orc': { 'sources': files( From 640a4a777ad53350e7699b95429bdf7eb50c2c84 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 12 Sep 2025 17:32:24 -0400 Subject: [PATCH 6/6] try new conda package for grpc --- ci/conda_env_cpp.txt | 3 ++- ci/scripts/cpp_build.sh | 10 ++++------ ci/scripts/cpp_test.sh | 3 +-- cpp/src/arrow/meson.build | 17 +---------------- 4 files changed, 8 insertions(+), 25 deletions(-) diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index e950be7554b..50b91b7a656 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -31,8 +31,8 @@ gflags glog gmock>=1.10.0 google-cloud-cpp>=1.34.0 -grpc-cpp<=1.50.1 gtest>=1.10.0 +libgrpc libprotobuf libutf8proc lz4-c @@ -40,6 +40,7 @@ make meson ninja nodejs +orc pkg-config python rapidjson diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 03f5d374c7d..3a4431239f1 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -43,9 +43,6 @@ if [ ! -z "${CONDA_PREFIX}" ] && [ "${ARROW_EMSCRIPTEN:-OFF}" = "OFF" ]; then fi export ARROW_CMAKE_ARGS export ARROW_GANDIVA_PC_CXX_FLAGS=$(echo | ${CXX} -E -Wp,-v -xc++ - 2>&1 | grep '^ ' | awk '{print "-isystem;" substr($1, 1)}' | tr '\n' ';') - - export LIBRARY_PATH="${CONDA_PREFIX}/lib" - export CPLUS_INCLUDE_PATH="${CONDA_PREFIX}/include" elif [ -x "$(command -v xcrun)" ]; then export ARROW_GANDIVA_PC_CXX_FLAGS="-isysroot;$(xcrun --show-sdk-path)" fi @@ -146,9 +143,10 @@ if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then --prefix=${MESON_PREFIX:-${ARROW_HOME}} \ --buildtype=${ARROW_BUILD_TYPE:-debug} \ --pkg-config-path="${CONDA_PREFIX}/lib/pkgconfig/" \ - -Db_sanitize=address,undefined \ - -Dorc=enabled \ - -Dtests=enabled \ + -Dauto_features=enabled \ + -Dfuzzing=disabled \ + -Dgcs=disabled \ + -Ds3=disabled \ . \ ${source_dir} diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh index 80d4a548399..88c06849c8b 100755 --- a/ci/scripts/cpp_test.sh +++ b/ci/scripts/cpp_test.sh @@ -91,10 +91,9 @@ if [ -z "${PYTHON}" ] && ! which python > /dev/null 2>&1; then fi if [ "${ARROW_USE_MESON:-OFF}" = "ON" ]; then ARROW_BUILD_EXAMPLES=OFF # TODO: Remove this - meson test arrow-orc-adapter-test \ + meson test \ --no-rebuild \ --print-errorlogs \ - --max-lines=0 \ --suite arrow \ "$@" else diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index 8da4685fe42..baf9ca1359c 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -473,22 +473,7 @@ if needs_json endif if needs_orc - # not all versions of orc installed via conda distribute - # cmake/pkgconfig information, so they are undetectable - # without custom logic - orc_headers = ['orc/orc-config.hh'] - orc_lib = cpp_compiler.find_library( - 'orc', - has_headers: orc_headers, - required: false, - ) - - if orc_lib.found() - orc_dep = declare_dependency(dependencies: orc_lib) - else - orc_dep = dependency('orc') - endif - + orc_dep = dependency('orc') arrow_components += { 'arrow_orc': { 'sources': files(