diff --git a/.clang-format b/.clang-format
index 06453dfbb25..9448dc8d8c8 100644
--- a/.clang-format
+++ b/.clang-format
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 ---
-BasedOnStyle: Google
-DerivePointerAlignment: false
+BasedOnStyle: Google
+ColumnLimit: 90
+DerivePointerAlignment: false
+IncludeBlocks: Preserve
diff --git a/.env b/.env
index 43cfefccee4..7dcfa306bf0 100644
--- a/.env
+++ b/.env
@@ -29,47 +29,63 @@ DOCKER_VOLUME_PREFIX=
 
 # turn on inline build cache; this is a docker buildx feature documented
 # at https://github.com/docker/buildx#--cache-tonametypetypekeyvalue
+BUILDKIT_INLINE_CACHE=1
 COMPOSE_DOCKER_CLI_BUILD=1
 DOCKER_BUILDKIT=1
-BUILDKIT_INLINE_CACHE=1
 
 # different architecture notations
 ARCH=amd64
 ARCH_ALIAS=x86_64
-ARCH_SHORT_ALIAS=x64
+ARCH_SHORT=amd64
 
-ULIMIT_CORE=-1
+# Default repository to pull and push images from
 REPO=apache/arrow-dev
-CUDA=9.1
+
+# The setup attempts to generate coredumps by default; to disable
+# coredump generation, set it to 0
+ULIMIT_CORE=-1
+
+# Default versions for platforms
 DEBIAN=11
-UBUNTU=20.04
 FEDORA=33
-PYTHON=3.6
-LLVM=12
-CLANG_TOOLS=8
+UBUNTU=20.04
+
+# Default versions for various dependencies
+CLANG_TOOLS=12
+CUDA=9.1
+DASK=latest
+DOTNET=3.1
+GCC_VERSION=""
 GO=1.15
-NODE=14
-MAVEN=3.5.4
+HDFS=3.2.1
 JDK=8
+KARTOTHEK=latest
+LLVM=12
+MAVEN=3.5.4
+NODE=14
 NUMPY=latest
 PANDAS=latest
-DASK=latest
-TURBODBC=latest
-KARTOTHEK=latest
-HDFS=3.2.1
-SPARK=master
-DOTNET=3.1
+PYTHON=3.6
 R=4.1
-ARROW_R_DEV=TRUE
-GCC_VERSION=""
+SPARK=master
+TURBODBC=latest
+
 # These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-gcc-release:latest
-R_ORG=rhub
+ARROW_R_DEV=TRUE
 R_IMAGE=ubuntu-gcc-release
+R_ORG=rhub
+R_PRUNE_DEPS=FALSE
 R_TAG=latest
 TZ=UTC
+
 # -1 does not attempt to install a devtoolset version; any positive integer will install devtoolset-n
 DEVTOOLSET_VERSION=-1
-# Used for the manylinux and windows wheels, please update the crossbow configuration on update:
+# Used through docker-compose.yml and serves as the default version for the
+# ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the
+# docker tags more readable.
+#
+# Please also update the crossbow configuration in order to keep the GitHub
+# Actions cache up to date for the macOS wheels:
 # https://github.com/ursacomputing/crossbow/blob/master/.github/workflows/cache_vcpkg.yml
-VCPKG="2021.04.30"
+VCPKG="30465138e"
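For reference, these cache settings are what docker-compose picks up from the environment; a minimal sketch of the equivalent manual build (the image tag here is illustrative):

```bash
# BuildKit embeds cache metadata into the image when BUILDKIT_INLINE_CACHE=1,
# so a later build on another machine can reuse it via --cache-from.
export DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1
docker build \
  --build-arg BUILDKIT_INLINE_CACHE=1 \
  --cache-from apache/arrow-dev:amd64-conda-cpp \
  -t apache/arrow-dev:amd64-conda-cpp \
  -f ci/docker/conda-cpp.dockerfile .
```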
diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 9a374ebf282..468b3ea8687 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -108,7 +108,11 @@ jobs:
         uses: actions/checkout@v2
         with:
           fetch-depth: 0
-      - name: Run
+      - name: Check CMake presets
+        run: |
+          cd cpp
+          cmake --list-presets
+      - name: Run minimal example
         run: |
           cd cpp/examples/minimal_build
           docker-compose run --rm minimal
@@ -178,7 +182,7 @@ jobs:
           ci/scripts/cpp_test.sh $(pwd) $(pwd)/build
 
   windows:
-    name: AMD64 ${{ matrix.name }} C++
+    name: AMD64 ${{ matrix.name }} C++17
     runs-on: ${{ matrix.os }}
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
     timeout-minutes: 45
@@ -197,6 +201,7 @@ jobs:
       ARROW_BUILD_SHARED: ON
       ARROW_BUILD_STATIC: OFF
       ARROW_BUILD_TESTS: ON
+      ARROW_CXXFLAGS: "/std:c++17"
       ARROW_DATASET: ON
       ARROW_FLIGHT: OFF
       ARROW_HDFS: ON
@@ -218,7 +223,7 @@ jobs:
       CMAKE_INSTALL_LIBDIR: bin
       CMAKE_INSTALL_PREFIX: /usr
       CMAKE_UNITY_BUILD: ON
-      NPROC: 2
+      NPROC: 3
     steps:
       - name: Disable Crash Dialogs
         run: |
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 00000000000..19bebedb46e
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,77 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Docs
+
+on:
+  push:
+  pull_request:
+    paths:
+      - '.github/workflows/docs.yml'
+      - 'ci/docker/linux-apt-docs.dockerfile'
+      - 'ci/docker/linux-apt-python-3.dockerfile'
+      - 'ci/docker/ubuntu-20.04-cpp.dockerfile'
+      - 'ci/scripts/c_glib_build.sh'
+      - 'ci/scripts/cpp_build.sh'
+      - 'ci/scripts/docs_build.sh'
+      - 'ci/scripts/java_build.sh'
+      - 'ci/scripts/js_build.sh'
+      - 'ci/scripts/python_build.sh'
+      - 'ci/scripts/r_build.sh'
+
+env:
+  ARROW_ENABLE_TIMING_TESTS: OFF
+  DOCKER_VOLUME_PREFIX: ".docker/"
+  ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
+  ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
+
+jobs:
+
+  docker:
+    name: AMD64 Ubuntu 20.04 Complete Documentation
+    runs-on: ubuntu-latest
+    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
+    timeout-minutes: 150
+    env:
+      UBUNTU: "20.04"
+    steps:
+      - name: Checkout Arrow
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Fetch Submodules and Tags
+        run: ci/scripts/util_checkout.sh
+      - name: Free Up Disk Space
+        run: ci/scripts/util_cleanup.sh
+      - name: Cache Docker Volumes
+        uses: actions/cache@v2
+        with:
+          path: .docker
+          key: ubuntu-docs-${{ hashFiles('cpp/**') }}
+          restore-keys: ubuntu-docs-
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - name: Setup Archery
+        run: pip install -e dev/archery[docker]
+      - name: Execute Docker Build
+        run: archery docker run ubuntu-docs
+      - name: Docker Push
+        if: success() && github.event_name == 'push' && github.repository == 'apache/arrow'
+        continue-on-error: true
+        run: archery docker push ubuntu-docs
diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml
index 2afb7532318..5e861ecf2cf 100644
--- a/.github/workflows/ruby.yml
+++ b/.github/workflows/ruby.yml
@@ -58,7 +58,7 @@ jobs:
     name: AMD64 Ubuntu ${{ matrix.ubuntu }} GLib & Ruby
     runs-on: ubuntu-latest
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
-    timeout-minutes: 40
+    timeout-minutes: 60
     strategy:
       fail-fast: false
      matrix:
diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp
index 888b7143fbf..114e884212c 100644
--- a/c_glib/arrow-glib/basic-data-type.cpp
+++ b/c_glib/arrow-glib/basic-data-type.cpp
@@ -78,6 +78,9 @@ G_BEGIN_DECLS
  * #GArrowLargeStringDataType is a class for the 64-bit offsets UTF-8
  * encoded string data type.
  *
+ * #GArrowTemporalDataType is an abstract class for temporal-related data types
+ * such as #GArrowDate32DataType.
+ *
  * #GArrowDate32DataType is a class for the number of days since UNIX
 * epoch in the 32-bit signed integer data type.
* @@ -975,9 +978,24 @@ garrow_large_string_data_type_new(void) } +G_DEFINE_ABSTRACT_TYPE(GArrowTemporalDataType, + garrow_temporal_data_type, + GARROW_TYPE_FIXED_WIDTH_DATA_TYPE) + +static void +garrow_temporal_data_type_init(GArrowTemporalDataType *object) +{ +} + +static void +garrow_temporal_data_type_class_init(GArrowTemporalDataTypeClass *klass) +{ +} + + G_DEFINE_TYPE(GArrowDate32DataType, garrow_date32_data_type, - GARROW_TYPE_DATA_TYPE) + GARROW_TYPE_TEMPORAL_DATA_TYPE) static void garrow_date32_data_type_init(GArrowDate32DataType *object) @@ -1012,7 +1030,7 @@ garrow_date32_data_type_new(void) G_DEFINE_TYPE(GArrowDate64DataType, garrow_date64_data_type, - GARROW_TYPE_DATA_TYPE) + GARROW_TYPE_TEMPORAL_DATA_TYPE) static void garrow_date64_data_type_init(GArrowDate64DataType *object) @@ -1047,7 +1065,7 @@ garrow_date64_data_type_new(void) G_DEFINE_TYPE(GArrowTimestampDataType, garrow_timestamp_data_type, - GARROW_TYPE_DATA_TYPE) + GARROW_TYPE_TEMPORAL_DATA_TYPE) static void garrow_timestamp_data_type_init(GArrowTimestampDataType *object) @@ -1102,7 +1120,7 @@ garrow_timestamp_data_type_get_unit(GArrowTimestampDataType *timestamp_data_type G_DEFINE_ABSTRACT_TYPE(GArrowTimeDataType, garrow_time_data_type, - GARROW_TYPE_DATA_TYPE) + GARROW_TYPE_TEMPORAL_DATA_TYPE) static void garrow_time_data_type_init(GArrowTimeDataType *object) diff --git a/c_glib/arrow-glib/basic-data-type.h b/c_glib/arrow-glib/basic-data-type.h index b498583e265..7dca40f7377 100644 --- a/c_glib/arrow-glib/basic-data-type.h +++ b/c_glib/arrow-glib/basic-data-type.h @@ -357,15 +357,27 @@ GARROW_AVAILABLE_IN_0_17 GArrowLargeStringDataType *garrow_large_string_data_type_new(void); +#define GARROW_TYPE_TEMPORAL_DATA_TYPE (garrow_temporal_data_type_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowTemporalDataType, + garrow_temporal_data_type, + GARROW, + TEMPORAL_DATA_TYPE, + GArrowFixedWidthDataType) +struct _GArrowTemporalDataTypeClass +{ + GArrowFixedWidthDataTypeClass parent_class; +}; + + #define GARROW_TYPE_DATE32_DATA_TYPE (garrow_date32_data_type_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowDate32DataType, garrow_date32_data_type, GARROW, DATE32_DATA_TYPE, - GArrowDataType) + GArrowTemporalDataType) struct _GArrowDate32DataTypeClass { - GArrowDataTypeClass parent_class; + GArrowTemporalDataTypeClass parent_class; }; GArrowDate32DataType *garrow_date32_data_type_new (void); @@ -376,10 +388,10 @@ G_DECLARE_DERIVABLE_TYPE(GArrowDate64DataType, garrow_date64_data_type, GARROW, DATE64_DATA_TYPE, - GArrowDataType) + GArrowTemporalDataType) struct _GArrowDate64DataTypeClass { - GArrowDataTypeClass parent_class; + GArrowTemporalDataTypeClass parent_class; }; GArrowDate64DataType *garrow_date64_data_type_new (void); @@ -390,10 +402,10 @@ G_DECLARE_DERIVABLE_TYPE(GArrowTimestampDataType, garrow_timestamp_data_type, GARROW, TIMESTAMP_DATA_TYPE, - GArrowDataType) + GArrowTemporalDataType) struct _GArrowTimestampDataTypeClass { - GArrowDataTypeClass parent_class; + GArrowTemporalDataTypeClass parent_class; }; GArrowTimestampDataType *garrow_timestamp_data_type_new (GArrowTimeUnit unit); @@ -406,10 +418,10 @@ G_DECLARE_DERIVABLE_TYPE(GArrowTimeDataType, garrow_time_data_type, GARROW, TIME_DATA_TYPE, - GArrowDataType) + GArrowTemporalDataType) struct _GArrowTimeDataTypeClass { - GArrowDataTypeClass parent_class; + GArrowTemporalDataTypeClass parent_class; }; GArrowTimeUnit garrow_time_data_type_get_unit (GArrowTimeDataType *time_data_type); diff --git a/ci/appveyor-cpp-setup.bat b/ci/appveyor-cpp-setup.bat index 
cee9bc28e5f..00f4dff0ac9 100644 --- a/ci/appveyor-cpp-setup.bat +++ b/ci/appveyor-cpp-setup.bat @@ -43,6 +43,11 @@ conda config --set remote_connect_timeout_secs 12 conda config --append disallowed_packages pypy3 conda info -a +@rem +@rem Install mamba to the base environment +@rem +conda install -q -y -c conda-forge mamba + @rem @rem Create conda environment for Build and Toolchain jobs @rem @@ -60,7 +65,7 @@ if "%JOB%" == "Toolchain" ( ) if "%JOB%" NEQ "Build_Debug" ( @rem Arrow conda environment is only required for the Build and Toolchain jobs - conda create -n arrow -q -y -c conda-forge ^ + mamba create -n arrow -q -y -c conda-forge ^ --file=ci\conda_env_python.txt ^ %CONDA_PACKAGES% ^ "cmake=3.17" ^ @@ -74,7 +79,7 @@ if "%JOB%" NEQ "Build_Debug" ( @rem On Windows, GTest is always bundled from source instead of using @rem conda binaries, avoid any interference between the two versions. if "%JOB%" == "Toolchain" ( - conda uninstall -n arrow -q -y -c conda-forge gtest + mamba uninstall -n arrow -q -y -c conda-forge gtest ) ) diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index 6d1ebf35341..cd7136cebbd 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -16,7 +16,7 @@ # under the License. aws-sdk-cpp -benchmark>=1.5.4 +benchmark>=1.6.0 boost-cpp>=1.68.0 brotli bzip2 diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index ca82705e73c..9f6c99b132e 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -19,6 +19,9 @@ breathe doxygen ipython -sphinx=4.2 +sphinx>=4.2 pydata-sphinx-theme -sphinx-tabs + +# Unable to install sphinx-tabs from conda-forge due to: +# - package sphinx-tabs-1.2.1-py_0 requires sphinx >=2,<4, but none of the providers can be installed +# sphinx-tabs diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index 40a855b5dd2..8fd5e46fd6d 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -19,17 +19,23 @@ ARG repo ARG arch FROM ${repo}:${arch}-conda +COPY ci/scripts/install_minio.sh /arrow/ci/scripts +RUN /arrow/ci/scripts/install_minio.sh latest /opt/conda + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + # install the required conda packages into the test environment COPY ci/conda_env_cpp.txt \ ci/conda_env_gandiva.txt \ /arrow/ci/ -RUN conda install \ +RUN mamba install \ --file arrow/ci/conda_env_cpp.txt \ --file arrow/ci/conda_env_gandiva.txt \ compilers \ doxygen \ valgrind && \ - conda clean --all + mamba clean --all ENV ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index d40973330a6..378fe71df70 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -27,7 +27,7 @@ ARG go=1.15 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ -RUN conda install -q \ +RUN mamba install -q \ --file arrow/ci/conda_env_archery.txt \ "python>=3.7" \ numpy \ @@ -36,7 +36,7 @@ RUN conda install -q \ nodejs=${node} \ yarn \ openjdk=${jdk} && \ - conda clean --all --force-pkgs-dirs + mamba clean --all --force-pkgs-dirs # Install Rust with only the needed components # (rustfmt is needed for tonic-build to compile the protobuf definitions) diff --git a/ci/docker/conda-python-hdfs.dockerfile b/ci/docker/conda-python-hdfs.dockerfile index f6ffc71ce62..65a1befa64c 100644 --- a/ci/docker/conda-python-hdfs.dockerfile +++ 
b/ci/docker/conda-python-hdfs.dockerfile @@ -22,11 +22,11 @@ FROM ${repo}:${arch}-conda-python-${python} ARG jdk=8 ARG maven=3.5 -RUN conda install -q \ +RUN mamba install -q \ maven=${maven} \ openjdk=${jdk} \ pandas && \ - conda clean --all + mamba clean --all # installing libhdfs (JNI) ARG hdfs=3.2.1 diff --git a/ci/docker/conda-python-jpype.dockerfile b/ci/docker/conda-python-jpype.dockerfile index f77ef9bf66b..005007e2c4d 100644 --- a/ci/docker/conda-python-jpype.dockerfile +++ b/ci/docker/conda-python-jpype.dockerfile @@ -22,8 +22,8 @@ FROM ${repo}:${arch}-conda-python-${python} ARG jdk=11 ARG maven=3.6 -RUN conda install -q \ +RUN mamba install -q \ maven=${maven} \ openjdk=${jdk} \ jpype1 && \ - conda clean --all + mamba clean --all diff --git a/ci/docker/conda-python-kartothek.dockerfile b/ci/docker/conda-python-kartothek.dockerfile index d523161822c..77dd1f5214f 100644 --- a/ci/docker/conda-python-kartothek.dockerfile +++ b/ci/docker/conda-python-kartothek.dockerfile @@ -21,12 +21,13 @@ ARG python=3.6 FROM ${repo}:${arch}-conda-python-${python} # install kartothek dependencies from conda-forge -RUN conda install -c conda-forge -q \ +RUN mamba install -c conda-forge -q \ attrs \ click \ cloudpickle \ dask \ decorator \ + deprecation \ freezegun \ msgpack-python \ prompt-toolkit \ @@ -39,7 +40,7 @@ RUN conda install -c conda-forge -q \ toolz \ urlquote \ zstandard && \ - conda clean --all + mamba clean --all ARG kartothek=latest COPY ci/scripts/install_kartothek.sh /arrow/ci/scripts/ diff --git a/ci/docker/conda-python-pandas.dockerfile b/ci/docker/conda-python-pandas.dockerfile index 303cc80e48a..9f1ba099116 100644 --- a/ci/docker/conda-python-pandas.dockerfile +++ b/ci/docker/conda-python-pandas.dockerfile @@ -23,5 +23,5 @@ FROM ${repo}:${arch}-conda-python-${python} ARG pandas=latest ARG numpy=latest COPY ci/scripts/install_pandas.sh /arrow/ci/scripts/ -RUN conda uninstall -q -y numpy && \ +RUN mamba uninstall -q -y numpy && \ /arrow/ci/scripts/install_pandas.sh ${pandas} ${numpy} diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile index a2af2ac135c..233a1fc363f 100644 --- a/ci/docker/conda-python-spark.dockerfile +++ b/ci/docker/conda-python-spark.dockerfile @@ -23,11 +23,11 @@ FROM ${repo}:${arch}-conda-python-${python} ARG jdk=8 ARG maven=3.5 -RUN conda install -q \ +RUN mamba install -q \ openjdk=${jdk} \ maven=${maven} \ pandas && \ - conda clean --all + mamba clean --all # installing specific version of spark ARG spark=master diff --git a/ci/docker/conda-python-turbodbc.dockerfile b/ci/docker/conda-python-turbodbc.dockerfile index e748604dee3..e702cd13524 100644 --- a/ci/docker/conda-python-turbodbc.dockerfile +++ b/ci/docker/conda-python-turbodbc.dockerfile @@ -30,12 +30,12 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ rm -rf /var/lib/apt/lists/* # install turbodbc dependencies from conda-forge -RUN conda install -c conda-forge -q \ +RUN mamba install -c conda-forge -q \ pybind11 \ pytest-cov \ mock \ unixodbc && \ - conda clean --all + mamba clean --all RUN service postgresql start && \ sudo -u postgres psql -U postgres -c \ diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index ab3f77be1b6..ae7abd9c9db 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -21,13 +21,20 @@ FROM ${repo}:${arch}-conda-cpp # install python specific packages ARG python=3.6 -COPY ci/conda_env_python.txt /arrow/ci/ -RUN conda install -q \ +COPY ci/conda_env_python.txt \ + 
ci/conda_env_sphinx.txt \
+     /arrow/ci/
+RUN mamba install -q \
     --file arrow/ci/conda_env_python.txt \
+    --file arrow/ci/conda_env_sphinx.txt \
     $([ "$python" == "3.6" -o "$python" == "3.7" ] && echo "pickle5") \
     python=${python} \
     nomkl && \
-    conda clean --all
+    mamba clean --all
+
+# Unable to install from conda-forge due to the sphinx version pin; see the
+# comment in arrow/ci/conda_env_sphinx.txt
+RUN pip install sphinx-tabs
 
 ENV ARROW_PYTHON=ON \
     ARROW_BUILD_STATIC=OFF \
diff --git a/ci/docker/conda.dockerfile b/ci/docker/conda.dockerfile
index adb64f9fad0..d0545e3bf84 100644
--- a/ci/docker/conda.dockerfile
+++ b/ci/docker/conda.dockerfile
@@ -18,10 +18,6 @@
 ARG arch=amd64
 FROM ${arch}/ubuntu:18.04
 
-# arch is unset after the FROM statement, so need to define it again
-ARG arch=amd64
-ARG prefix=/opt/conda
-
 # install build essentials
 RUN export DEBIAN_FRONTEND=noninteractive && \
     apt-get update -y -q && \
@@ -29,27 +25,22 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
-ENV PATH=${prefix}/bin:$PATH
-# install conda and minio
-COPY ci/scripts/install_conda.sh \
-     ci/scripts/install_minio.sh \
-     /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_conda.sh ${arch} linux latest ${prefix}
-RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest ${prefix}
-COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
-RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default
+# install conda and mamba via mambaforge
+COPY ci/scripts/install_conda.sh /arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_conda.sh mambaforge latest /opt/conda
+ENV PATH=/opt/conda/bin:$PATH
 
 # create a conda environment
 ADD ci/conda_env_unix.txt /arrow/ci/
-RUN conda create -n arrow --file arrow/ci/conda_env_unix.txt git && \
-    conda clean --all
+RUN mamba create -n arrow --file arrow/ci/conda_env_unix.txt git && \
+    mamba clean --all
 
 # activate the created environment by default
 RUN echo "conda activate arrow" >> ~/.profile
-ENV CONDA_PREFIX=${prefix}/envs/arrow
+ENV CONDA_PREFIX=/opt/conda/envs/arrow
 
 # use login shell to activate arrow environment in the RUN commands
-SHELL [ "/bin/bash", "-c", "-l" ]
+SHELL ["/bin/bash", "-c", "-l"]
 
 # use login shell when running the container
-ENTRYPOINT [ "/bin/bash", "-c", "-l" ]
+ENTRYPOINT ["/bin/bash", "-c", "-l"]
diff --git a/ci/docker/debian-10-cpp.dockerfile b/ci/docker/debian-10-cpp.dockerfile
index f85408f0348..2bb9e88a711 100644
--- a/ci/docker/debian-10-cpp.dockerfile
+++ b/ci/docker/debian-10-cpp.dockerfile
@@ -17,7 +17,6 @@
 ARG arch=amd64
 FROM ${arch}/debian:10
-ARG arch
 
 ENV DEBIAN_FRONTEND noninteractive
 
@@ -67,13 +66,14 @@ RUN apt-get update -y -q && \
        protobuf-compiler \
        python3-pip \
        rapidjson-dev \
+       rsync \
        tzdata \
        zlib1g-dev && \
        apt-get clean && \
        rm -rf /var/lib/apt/lists/*
 
 COPY ci/scripts/install_minio.sh /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
+RUN /arrow/ci/scripts/install_minio.sh latest /usr/local
 
 ENV ARROW_BUILD_TESTS=ON \
     ARROW_DATASET=ON \
diff --git a/ci/docker/debian-10-js.dockerfile b/ci/docker/debian-10-js.dockerfile
index 5bb31f2e32e..f994de4141d 100644
--- a/ci/docker/debian-10-js.dockerfile
+++ b/ci/docker/debian-10-js.dockerfile
@@ -21,6 +21,12 @@ FROM ${arch}/node:${node}
 
 ENV NODE_NO_WARNINGS=1
 
+# install rsync for copying the generated documentation
+RUN apt-get update -y -q && \
+    apt-get install -y -q --no-install-recommends rsync && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 # TODO(kszucs):
 # 1.
add the files required to install the dependencies to .dockerignore # 2. copy these files to their appropriate path diff --git a/ci/docker/debian-11-cpp.dockerfile b/ci/docker/debian-11-cpp.dockerfile index 659881b0c8b..c2b0e98d9fc 100644 --- a/ci/docker/debian-11-cpp.dockerfile +++ b/ci/docker/debian-11-cpp.dockerfile @@ -65,15 +65,17 @@ RUN apt-get update -y -q && \ protobuf-compiler-grpc \ python3-pip \ rapidjson-dev \ + rsync \ tzdata \ zlib1g-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default +RUN /arrow/ci/scripts/install_gcs_testbench.sh default ENV ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ diff --git a/ci/docker/debian-11-js.dockerfile b/ci/docker/debian-11-js.dockerfile index 5bb31f2e32e..f994de4141d 100644 --- a/ci/docker/debian-11-js.dockerfile +++ b/ci/docker/debian-11-js.dockerfile @@ -21,6 +21,12 @@ FROM ${arch}/node:${node} ENV NODE_NO_WARNINGS=1 +# install rsync for copying the generated documentation +RUN apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends rsync && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # TODO(kszucs): # 1. add the files required to install the dependencies to .dockerignore # 2. copy these files to their appropriate path diff --git a/ci/docker/fedora-33-cpp.dockerfile b/ci/docker/fedora-33-cpp.dockerfile index 61964a476e9..a60a572ec3a 100644 --- a/ci/docker/fedora-33-cpp.dockerfile +++ b/ci/docker/fedora-33-cpp.dockerfile @@ -64,9 +64,10 @@ RUN dnf update -y && \ zlib-devel COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default +RUN /arrow/ci/scripts/install_gcs_testbench.sh default ENV ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 12c797f9651..3cd62a23d7d 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -27,9 +27,8 @@ RUN apt-get update -y && \ dirmngr \ apt-transport-https \ software-properties-common && \ - apt-key adv \ - --keyserver keyserver.ubuntu.com \ - --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ + wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ + tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran40/' && \ apt-get install -y --no-install-recommends \ autoconf-archive \ @@ -75,14 +74,13 @@ RUN wget -q -O - https://deb.nodesource.com/setup_${node}.x | bash - && \ rm -rf /var/lib/apt/lists/* && \ npm install -g yarn -# ARROW-13353: breathe >= 4.29.1 tries to parse template arguments, -# but Sphinx can't parse constructs like `typename...`. 
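As ci/conda_env_sphinx.txt notes above, sphinx-tabs cannot come from conda-forge while its package still pins sphinx <4, so the docs image switches the Sphinx stack to pip in the hunk that follows. A sketch of how to reproduce the solver conflict and the fallback (exact solver output will vary):

```bash
# Expected to fail: sphinx-tabs 1.2.1 on conda-forge requires sphinx >=2,<4
mamba create -n probe --dry-run -c conda-forge "sphinx>=4.2" sphinx-tabs

# The pip fallback used by the CI images instead
pip install sphinx-tabs "sphinx>=4.2"
```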
 RUN pip install \
-    meson \
-    breathe==4.29.0 \
+    breathe \
     ipython \
-    sphinx \
-    pydata-sphinx-theme
+    meson \
+    pydata-sphinx-theme \
+    sphinx-tabs \
+    "sphinx>=4.2"
 
 COPY c_glib/Gemfile /arrow/c_glib/
 RUN gem install --no-document bundler && \
@@ -107,4 +105,4 @@ ENV ARROW_FLIGHT=ON \
     ARROW_BUILD_TESTS=OFF \
     ARROW_BUILD_UTILITIES=OFF \
     ARROW_USE_GLOG=OFF \
-    CMAKE_UNITY_BUILD=ON \
+    CMAKE_UNITY_BUILD=ON
diff --git a/ci/docker/linux-apt-lint.dockerfile b/ci/docker/linux-apt-lint.dockerfile
index 84de6b05f31..a92aa276f9a 100644
--- a/ci/docker/linux-apt-lint.dockerfile
+++ b/ci/docker/linux-apt-lint.dockerfile
@@ -41,9 +41,8 @@ RUN apt-get update && \
     && rm -rf /var/lib/apt/lists/*
 
 ARG r=4.1
-RUN apt-key adv \
-    --keyserver keyserver.ubuntu.com \
-    --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \
+RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \
+    tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \
     # NOTE: R 3.5 and 3.6 are available in the repos with -cran35 suffix
     # for trusty, xenial, bionic, and eoan (as of May 2020)
     # -cran40 has 4.0 versions for bionic and focal
diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile
index 894f53bc0a3..8e290dc00e0 100644
--- a/ci/docker/linux-apt-r.dockerfile
+++ b/ci/docker/linux-apt-r.dockerfile
@@ -17,7 +17,6 @@
 
 ARG base
 FROM ${base}
-ARG arch
 
 ARG tz="UTC"
 ENV TZ=${tz}
@@ -34,9 +33,8 @@ RUN apt-get update -y && \
     dirmngr \
     apt-transport-https \
     software-properties-common && \
-    apt-key adv \
-    --keyserver keyserver.ubuntu.com \
-    --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \
+    wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \
+    tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \
     # NOTE: R 3.5 and 3.6 are available in the repos with -cran35 suffix
     # for trusty, xenial, bionic, and eoan (as of May 2020)
     # -cran40 has 4.0 versions for bionic and focal
@@ -84,18 +82,19 @@ RUN cat /arrow/ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site
 # Also ensure parallel compilation of C/C++ code
 RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Renviron.site
 
+# Set up Python 3 and its dependencies
+RUN ln -s /usr/bin/python3 /usr/local/bin/python && \
+    ln -s /usr/bin/pip3 /usr/local/bin/pip
+
 COPY ci/scripts/r_deps.sh /arrow/ci/scripts/
 COPY r/DESCRIPTION /arrow/r/
 RUN /arrow/ci/scripts/r_deps.sh /arrow
 
 COPY ci/scripts/install_minio.sh /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local
-COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default
+RUN /arrow/ci/scripts/install_minio.sh latest /usr/local
 
-# Set up Python 3 and its dependencies
-RUN ln -s /usr/bin/python3 /usr/local/bin/python && \
-    ln -s /usr/bin/pip3 /usr/local/bin/pip
+COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_gcs_testbench.sh default
 
 COPY python/requirements-build.txt /arrow/python/
 RUN pip install -r arrow/python/requirements-build.txt
diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile
index ca701e259c4..1cbde3207ed 100644
--- a/ci/docker/linux-r.dockerfile
+++ b/ci/docker/linux-r.dockerfile
@@ -46,6 +46,10 @@ COPY ci/scripts/install_minio.sh /arrow/ci/scripts/
 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
 RUN /arrow/ci/scripts/r_docker_configure.sh
 
+# Set up Python 3 and its dependencies
+RUN ln -s /usr/bin/python3 /usr/local/bin/python && \
+    ln -s /usr/bin/pip3
/usr/local/bin/pip + COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow diff --git a/ci/docker/python-wheel-manylinux-201x.dockerfile b/ci/docker/python-wheel-manylinux-201x.dockerfile index ae1b0a7767c..dee2b1244b1 100644 --- a/ci/docker/python-wheel-manylinux-201x.dockerfile +++ b/ci/docker/python-wheel-manylinux-201x.dockerfile @@ -18,56 +18,54 @@ ARG base FROM ${base} -ARG arch_alias -ARG arch_short_alias +ARG arch +ARG arch_short +ARG manylinux +ENV MANYLINUX_VERSION=${manylinux} + +# Install basic dependencies RUN yum install -y git flex curl autoconf zip wget # Install CMake -ARG cmake=3.19.3 -RUN wget -q https://github.com/Kitware/CMake/releases/download/v${cmake}/cmake-${cmake}-Linux-${arch_alias}.tar.gz -O - | \ - tar -xzf - --directory /usr/local --strip-components=1 +# AWS SDK doesn't work with CMake=3.22 due to https://gitlab.kitware.com/cmake/cmake/-/issues/22524 +ARG cmake=3.21.4 +COPY ci/scripts/install_cmake.sh arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_cmake.sh ${arch} linux ${cmake} /usr/local # Install Ninja ARG ninja=1.10.2 -RUN mkdir /tmp/ninja && \ - wget -q https://github.com/ninja-build/ninja/archive/v${ninja}.tar.gz -O - | \ - tar -xzf - --directory /tmp/ninja --strip-components=1 && \ - cd /tmp/ninja && \ - ./configure.py --bootstrap && \ - mv ninja /usr/local/bin && \ - rm -rf /tmp/ninja +COPY ci/scripts/install_ninja.sh arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_ninja.sh ${ninja} /usr/local # Install ccache ARG ccache=4.1 -RUN mkdir /tmp/ccache && \ - wget -q https://github.com/ccache/ccache/archive/v${ccache}.tar.gz -O - | \ - tar -xzf - --directory /tmp/ccache --strip-components=1 && \ - cd /tmp/ccache && \ - mkdir build && \ - cd build && \ - cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DZSTD_FROM_INTERNET=ON .. 
&& \
-    ninja install && \
-    rm -rf /tmp/ccache
+COPY ci/scripts/install_ccache.sh arrow/ci/scripts/
+RUN /arrow/ci/scripts/install_ccache.sh ${ccache} /usr/local
 
-# Install vcpkg
+# Install vcpkg and, in the case of manylinux2010, a more recent glibc (>2.15)
+# required by the prebuilt vcpkg binary
 ARG vcpkg
-RUN git clone https://github.com/microsoft/vcpkg /opt/vcpkg && \
-    git -C /opt/vcpkg checkout ${vcpkg} && \
-    /opt/vcpkg/bootstrap-vcpkg.sh -useSystemBinaries -disableMetrics && \
-    ln -s /opt/vcpkg/vcpkg /usr/bin/vcpkg
-
-# Patch ports files as needed
+ARG glibc=2.18
 COPY ci/vcpkg/*.patch \
      ci/vcpkg/*linux*.cmake \
      arrow/ci/vcpkg/
-RUN cd /opt/vcpkg && git apply --ignore-whitespace /arrow/ci/vcpkg/ports.patch
+COPY ci/scripts/install_vcpkg.sh \
+     ci/scripts/install_glibc.sh \
+     arrow/ci/scripts/
+RUN arrow/ci/scripts/install_vcpkg.sh /opt/vcpkg ${vcpkg} && \
+    if [ "${manylinux}" == "2010" ]; then \
+      arrow/ci/scripts/install_glibc.sh ${glibc} /opt/glibc-${glibc} && \
+      patchelf --set-interpreter /opt/glibc-2.18/lib/ld-linux-x86-64.so.2 /opt/vcpkg/vcpkg && \
+      patchelf --set-rpath /opt/glibc-2.18/lib:/usr/lib64 /opt/vcpkg/vcpkg; \
+    fi
+ENV PATH="/opt/vcpkg:${PATH}"
 
 ARG build_type=release
 ENV CMAKE_BUILD_TYPE=${build_type} \
     VCPKG_FORCE_SYSTEM_BINARIES=1 \
     VCPKG_OVERLAY_TRIPLETS=/arrow/ci/vcpkg \
-    VCPKG_DEFAULT_TRIPLET=${arch_short_alias}-linux-static-${build_type} \
+    VCPKG_DEFAULT_TRIPLET=${arch_short}-linux-static-${build_type} \
     VCPKG_FEATURE_FLAGS=-manifests
 
 # Need to install boost-build prior to installing the boost packages, otherwise
@@ -75,8 +73,6 @@ ENV CMAKE_BUILD_TYPE=${build_type} \
 # TODO(kszucs): factor out the package enumeration to a text file and reuse it
 # from the windows image and potentially in a future macos wheel build
 RUN vcpkg install --clean-after-build \
-        boost-build:${arch_short_alias}-linux && \
-    vcpkg install --clean-after-build \
         abseil \
         aws-sdk-cpp[config,cognito-identity,core,identity-management,s3,sts,transfer] \
         boost-filesystem \
@@ -87,6 +83,7 @@ RUN vcpkg install --clean-after-build \
         flatbuffers \
         gflags \
         glog \
+        google-cloud-cpp[core,storage] \
         grpc \
         lz4 \
         openssl \
diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile
index 9a2afb781fa..e7ff011629b 100644
--- a/ci/docker/python-wheel-windows-vs2017.dockerfile
+++ b/ci/docker/python-wheel-windows-vs2017.dockerfile
@@ -17,7 +17,7 @@
 # based on mcr.microsoft.com/windows/servercore:ltsc2019
 # contains choco and vs2017 preinstalled
-FROM abrarov/msvc-2017:2.10.0
+FROM abrarov/msvc-2017:2.12.1
 
 # Install CMake and Ninja
 RUN choco install --no-progress -r -y cmake --installargs 'ADD_CMAKE_TO_PATH=System' && \
@@ -31,16 +31,12 @@ RUN setx path "%path%;C:\Program Files\Git\usr\bin"
 
 # Compiling vcpkg itself from a git tag doesn't work anymore since vcpkg has
 # started to ship precompiled binaries for the vcpkg-tool.
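The new install_vcpkg.sh helper encapsulates the clone and bootstrap of a pinned revision for both the Linux and Windows wheel images; a sketch of the invocation as the images use it (the revision argument corresponds to the VCPKG value in .env):

```bash
# <target directory> <git revision>
ci/scripts/install_vcpkg.sh /opt/vcpkg 30465138e
```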
ARG vcpkg -RUN git clone https://github.com/Microsoft/vcpkg && \ - vcpkg\bootstrap-vcpkg.bat -disableMetrics && \ - setx PATH "%PATH%;C:\vcpkg" && \ - git -C vcpkg checkout %vcpkg% - -# Patch ports files as needed COPY ci/vcpkg/*.patch \ ci/vcpkg/*windows*.cmake \ arrow/ci/vcpkg/ -RUN cd vcpkg && git apply --ignore-whitespace C:/arrow/ci/vcpkg/ports.patch +COPY ci/scripts/install_vcpkg.sh arrow/ci/scripts/ +RUN bash arrow/ci/scripts/install_vcpkg.sh /c/vcpkg %vcpkg% && \ + setx PATH "%PATH%;C:\vcpkg" # Configure vcpkg and install dependencies # NOTE: use windows batch environment notation for build arguments in RUN @@ -50,7 +46,7 @@ RUN cd vcpkg && git apply --ignore-whitespace C:/arrow/ci/vcpkg/ports.patch ARG build_type=release ENV CMAKE_BUILD_TYPE=${build_type} \ VCPKG_OVERLAY_TRIPLETS=C:\\arrow\\ci\\vcpkg \ - VCPKG_DEFAULT_TRIPLET=x64-windows-static-md-${build_type} \ + VCPKG_DEFAULT_TRIPLET=amd64-windows-static-md-${build_type} \ VCPKG_FEATURE_FLAGS=-manifests RUN vcpkg install --clean-after-build \ @@ -66,6 +62,7 @@ RUN vcpkg install --clean-after-build \ flatbuffers \ gflags \ glog \ + google-cloud-cpp[core,storage] \ grpc \ lz4 \ openssl \ diff --git a/ci/docker/ubuntu-18.04-cpp.dockerfile b/ci/docker/ubuntu-18.04-cpp.dockerfile index 0c05ac4ee6b..d3023f33794 100644 --- a/ci/docker/ubuntu-18.04-cpp.dockerfile +++ b/ci/docker/ubuntu-18.04-cpp.dockerfile @@ -83,6 +83,7 @@ RUN apt-get update -y -q && \ pkg-config \ protobuf-compiler \ rapidjson-dev \ + rsync \ tzdata && \ apt-get clean && \ rm -rf /var/lib/apt/lists* diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index d7076b45bab..f6ca6190559 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -17,7 +17,6 @@ ARG base=amd64/ubuntu:20.04 FROM ${base} -ARG arch SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -38,9 +37,10 @@ RUN apt-get update -y -q && \ rm -rf /var/lib/apt/lists* COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default +RUN /arrow/ci/scripts/install_gcs_testbench.sh default ENV ARROW_BUILD_TESTS=ON \ ARROW_DATASET=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 5a48c648e3b..842be8ddbde 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -17,7 +17,6 @@ ARG base=amd64/ubuntu:20.04 FROM ${base} -ARG arch SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -30,20 +29,27 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \ # while debugging package list with docker build. 
ARG clang_tools ARG llvm -RUN if [ "${llvm}" -gt "10" ]; then \ +RUN latest_system_llvm=10 && \ + if [ ${llvm} -gt ${latest_system_llvm} -o \ + ${clang_tools} -gt ${latest_system_llvm} ]; then \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ apt-transport-https \ ca-certificates \ gnupg \ + lsb-release \ wget && \ wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - echo "deb https://apt.llvm.org/focal/ llvm-toolchain-focal-${llvm} main" > \ - /etc/apt/sources.list.d/llvm.list && \ - if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -gt 10 ]; then \ - echo "deb https://apt.llvm.org/focal/ llvm-toolchain-focal-${clang_tools} main" > \ + code_name=$(lsb_release --codename --short) && \ + if [ ${llvm} -gt 10 ]; then \ + echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \ + /etc/apt/sources.list.d/llvm.list; \ + fi && \ + if [ ${clang_tools} -ne ${llvm} -a \ + ${clang_tools} -gt ${latest_system_llvm} ]; then \ + echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${clang_tools} main" > \ /etc/apt/sources.list.d/clang-tools.list; \ - fi \ + fi; \ fi && \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ @@ -93,15 +99,18 @@ RUN apt-get update -y -q && \ python3-rados \ rados-objclass-dev \ rapidjson-dev \ + rsync \ tzdata \ wget && \ apt-get clean && \ rm -rf /var/lib/apt/lists* COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + COPY ci/scripts/install_ceph.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_ceph.sh diff --git a/ci/docker/ubuntu-20.10-cpp.dockerfile b/ci/docker/ubuntu-20.10-cpp.dockerfile deleted file mode 100644 index 59f5fa4c886..00000000000 --- a/ci/docker/ubuntu-20.10-cpp.dockerfile +++ /dev/null @@ -1,140 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG base=amd64/ubuntu:20.10 -FROM ${base} -ARG arch - -SHELL ["/bin/bash", "-o", "pipefail", "-c"] - -RUN echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -# Installs LLVM toolchain, for Gandiva and testing other compilers -# -# Note that this is installed before the base packages to improve iteration -# while debugging package list with docker build. 
-ARG clang_tools -ARG llvm -RUN if [ "${llvm}" -gt "10" ]; then \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - apt-transport-https \ - ca-certificates \ - gnupg \ - wget && \ - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - echo "deb https://apt.llvm.org/groovy/ llvm-toolchain-groovy-${llvm} main" > \ - /etc/apt/sources.list.d/llvm.list && \ - if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -gt 10 ]; then \ - echo "deb https://apt.llvm.org/groovy/ llvm-toolchain-groovy-${clang_tools} main" > \ - /etc/apt/sources.list.d/clang-tools.list; \ - fi \ - fi && \ - apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - clang-${clang_tools} \ - clang-${llvm} \ - clang-format-${clang_tools} \ - clang-tidy-${clang_tools} \ - llvm-${llvm}-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists* - -# Installs C++ toolchain and dependencies -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends \ - autoconf \ - ca-certificates \ - ccache \ - cmake \ - g++ \ - gcc \ - gdb \ - git \ - libbenchmark-dev \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgflags-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - liblz4-dev \ - libprotobuf-dev \ - libprotoc-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - make \ - ninja-build \ - pkg-config \ - protobuf-compiler \ - protobuf-compiler-grpc \ - python3-pip \ - rapidjson-dev \ - tzdata \ - wget && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists* - -COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default - -# Prioritize system packages and local installation -# The following dependencies will be downloaded due to missing/invalid packages -# provided by the distribution: -# - libc-ares-dev does not install CMake config files -# - flatbuffer is not packaged -# - libgtest-dev only provide sources -# - libprotobuf-dev only provide sources -ENV ARROW_BUILD_TESTS=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ - ARROW_DATASET=ON \ - ARROW_FLIGHT=OFF \ - ARROW_GANDIVA=ON \ - ARROW_HDFS=ON \ - ARROW_HOME=/usr/local \ - ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ - ARROW_ORC=ON \ - ARROW_PARQUET=ON \ - ARROW_PLASMA=ON \ - ARROW_S3=ON \ - ARROW_USE_ASAN=OFF \ - ARROW_USE_CCACHE=ON \ - ARROW_USE_UBSAN=OFF \ - ARROW_WITH_BROTLI=ON \ - ARROW_WITH_BZ2=ON \ - ARROW_WITH_LZ4=ON \ - ARROW_WITH_SNAPPY=ON \ - ARROW_WITH_ZLIB=ON \ - ARROW_WITH_ZSTD=ON \ - AWSSDK_SOURCE=BUNDLED \ - GTest_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ - PARQUET_BUILD_EXAMPLES=ON \ - PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH \ - PYTHON=python3 diff --git a/ci/docker/ubuntu-21.04-cpp.dockerfile b/ci/docker/ubuntu-21.04-cpp.dockerfile index 9fc857c6266..58639cb1cf9 100644 --- a/ci/docker/ubuntu-21.04-cpp.dockerfile +++ b/ci/docker/ubuntu-21.04-cpp.dockerfile @@ -15,9 +15,8 @@ # specific language governing permissions and limitations # under the License. -ARG base=amd64/ubuntu:20.04 +ARG base=amd64/ubuntu:21.04 FROM ${base} -ARG arch SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -30,20 +29,27 @@ RUN echo "debconf debconf/frontend select Noninteractive" | \ # while debugging package list with docker build. 
ARG clang_tools ARG llvm -RUN if [ "${llvm}" -gt "10" ]; then \ +RUN latest_system_llvm=12 && \ + if [ ${llvm} -gt ${latest_system_llvm} -o \ + ${clang_tools} -gt ${latest_system_llvm} ]; then \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ apt-transport-https \ ca-certificates \ gnupg \ + lsb-release \ wget && \ wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - echo "deb https://apt.llvm.org/hirsute/ llvm-toolchain-hirsute-${llvm} main" > \ - /etc/apt/sources.list.d/llvm.list && \ - if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -gt 10 ]; then \ - echo "deb https://apt.llvm.org/hirsute/ llvm-toolchain-hirsute-${clang_tools} main" > \ + code_name=$(lsb_release --codename --short) && \ + if [ ${llvm} -gt 10 ]; then \ + echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${llvm} main" > \ + /etc/apt/sources.list.d/llvm.list; \ + fi && \ + if [ ${clang_tools} -ne ${llvm} -a \ + ${clang_tools} -gt ${latest_system_llvm} ]; then \ + echo "deb https://apt.llvm.org/${code_name}/ llvm-toolchain-${code_name}-${clang_tools} main" > \ /etc/apt/sources.list.d/clang-tools.list; \ - fi \ + fi; \ fi && \ apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ @@ -90,15 +96,17 @@ RUN apt-get update -y -q && \ protobuf-compiler-grpc \ python3-pip \ rapidjson-dev \ + rsync \ tzdata \ wget && \ apt-get clean && \ rm -rf /var/lib/apt/lists* COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local +RUN /arrow/ci/scripts/install_minio.sh latest /usr/local + COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -RUN /arrow/ci/scripts/install_gcs_testbench.sh ${arch} default +RUN /arrow/ci/scripts/install_gcs_testbench.sh default # Prioritize system packages and local installation # The following dependencies will be downloaded due to missing/invalid packages diff --git a/ci/etc/rprofile b/ci/etc/rprofile index 5ef1dca8ff3..e9e98b12e40 100644 --- a/ci/etc/rprofile +++ b/ci/etc/rprofile @@ -1,4 +1,4 @@ - local({ +local({ .pick_cran <- function() { # Return a CRAN repo URL, preferring RSPM binaries if available for this OS rspm_template <- "https://packagemanager.rstudio.com/cran/__linux__/%s/latest" @@ -38,14 +38,14 @@ } } - return("https://cloud.r-project.org") + return(NULL) } options( Ncpus = parallel::detectCores(), - repos = tryCatch(.pick_cran(), error = function(e) "https://cloud.r-project.org"), + repos = c(tryCatch(.pick_cran(), error = function(e) NULL), "https://cloud.r-project.org"), HTTPUserAgent = sprintf( - 'R/%s R (%s)', + "R/%s R (%s)", getRversion(), paste(getRversion(), R.version$platform, R.version$arch, R.version$os) ) diff --git a/ci/scripts/c_glib_build.sh b/ci/scripts/c_glib_build.sh index ce3cea18e71..e0baf7f3d13 100755 --- a/ci/scripts/c_glib_build.sh +++ b/ci/scripts/c_glib_build.sh @@ -21,8 +21,11 @@ set -ex source_dir=${1}/c_glib build_dir=${2}/c_glib -: ${ARROW_GLIB_GTK_DOC:=false} +build_root=${2} + : ${ARROW_GLIB_DEVELOPMENT_MODE:=false} +: ${BUILD_DOCS_C_GLIB:=OFF} +with_gtk_doc=$([ "${BUILD_DOCS_C_GLIB}" == "ON" ] && echo "true" || echo "false") export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig @@ -35,7 +38,7 @@ mkdir -p ${build_dir} meson --prefix=$ARROW_HOME \ --libdir=lib \ -Ddevelopment_mode=${ARROW_GLIB_DEVELOPMENT_MODE} \ - -Dgtk_doc=${ARROW_GLIB_GTK_DOC} \ + -Dgtk_doc=${with_gtk_doc} \ ${build_dir} \ ${source_dir} @@ -43,3 +46,8 @@ pushd ${build_dir} ninja ninja install popd + +if [ 
"${BUILD_DOCS_C_GLIB}" == "ON" ]; then + mkdir -p ${build_root}/docs/c_glib + rsync -a ${ARROW_HOME}/share/gtk-doc/html/ ${build_root}/docs/c_glib +fi diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 0ea9b1b89dc..f791ddd5645 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -21,9 +21,9 @@ set -ex source_dir=${1}/cpp build_dir=${2}/cpp -with_docs=${3:-false} : ${ARROW_USE_CCACHE:=OFF} +: ${BUILD_DOCS_CPP:=OFF} # TODO(kszucs): consider to move these to CMake if [ ! -z "${CONDA_PREFIX}" ]; then @@ -49,97 +49,98 @@ fi mkdir -p ${build_dir} pushd ${build_dir} -cmake -G "${CMAKE_GENERATOR:-Ninja}" \ - -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ - -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ - -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ - -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ - -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ - -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ - -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ - -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ - -DARROW_BUILD_UTILITIES=${ARROW_BUILD_UTILITIES:-ON} \ - -DARROW_COMPUTE=${ARROW_COMPUTE:-ON} \ - -DARROW_CSV=${ARROW_CSV:-ON} \ - -DARROW_CUDA=${ARROW_CUDA:-OFF} \ - -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ - -DARROW_DATASET=${ARROW_DATASET:-ON} \ - -DARROW_ENGINE=${ARROW_ENGINE:-ON} \ - -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ - -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ - -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ - -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ - -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ - -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ - -DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA:-OFF} \ - -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ - -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ - -DARROW_GCS=${ARROW_GCS:-OFF} \ - -DARROW_HDFS=${ARROW_HDFS:-ON} \ - -DARROW_HIVESERVER2=${ARROW_HIVESERVER2:-OFF} \ - -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ - -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ - -DARROW_JNI=${ARROW_JNI:-OFF} \ - -DARROW_JSON=${ARROW_JSON:-ON} \ - -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ - -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ - -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ - -DARROW_ORC=${ARROW_ORC:-OFF} \ - -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ - -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT:-OFF} \ - -DARROW_PLASMA=${ARROW_PLASMA:-OFF} \ - -DARROW_PYTHON=${ARROW_PYTHON:-OFF} \ - -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ - -DARROW_S3=${ARROW_S3:-OFF} \ - -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ - -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ - -DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \ - -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \ - -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \ - -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \ - -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \ - -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \ - -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \ - -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \ - -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \ - -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \ - -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-OFF} \ - -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \ - -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \ - -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \ - -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \ - 
-DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ - -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ - -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ - -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ - -DBOOST_SOURCE=${BOOST_SOURCE:-} \ - -DBrotli_SOURCE=${Brotli_SOURCE:-} \ - -DBUILD_WARNING_LEVEL=${BUILD_WARNING_LEVEL:-CHECKIN} \ - -Dc-ares_SOURCE=${cares_SOURCE:-} \ - -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \ - -DCMAKE_C_FLAGS="${CFLAGS:-}" \ - -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \ - -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \ - -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \ - -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \ - -Dgflags_SOURCE=${gflags_SOURCE:-} \ - -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \ - -DgRPC_SOURCE=${gRPC_SOURCE:-} \ - -DGTest_SOURCE=${GTest_SOURCE:-} \ - -DLz4_SOURCE=${Lz4_SOURCE:-} \ - -DORC_SOURCE=${ORC_SOURCE:-} \ - -DPARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-OFF} \ - -DPARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-OFF} \ - -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} \ - -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \ - -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \ - -Dre2_SOURCE=${re2_SOURCE:-} \ - -DSnappy_SOURCE=${Snappy_SOURCE:-} \ - -DThrift_SOURCE=${Thrift_SOURCE:-} \ - -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \ - -Dzstd_SOURCE=${zstd_SOURCE:-} \ - ${CMAKE_ARGS} \ - ${source_dir} +cmake \ + -DARROW_BOOST_USE_SHARED=${ARROW_BOOST_USE_SHARED:-ON} \ + -DARROW_BUILD_BENCHMARKS_REFERENCE=${ARROW_BUILD_BENCHMARKS:-OFF} \ + -DARROW_BUILD_BENCHMARKS=${ARROW_BUILD_BENCHMARKS:-OFF} \ + -DARROW_BUILD_EXAMPLES=${ARROW_BUILD_EXAMPLES:-OFF} \ + -DARROW_BUILD_INTEGRATION=${ARROW_BUILD_INTEGRATION:-OFF} \ + -DARROW_BUILD_SHARED=${ARROW_BUILD_SHARED:-ON} \ + -DARROW_BUILD_STATIC=${ARROW_BUILD_STATIC:-ON} \ + -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ + -DARROW_BUILD_UTILITIES=${ARROW_BUILD_UTILITIES:-ON} \ + -DARROW_COMPUTE=${ARROW_COMPUTE:-ON} \ + -DARROW_CSV=${ARROW_CSV:-ON} \ + -DARROW_CUDA=${ARROW_CUDA:-OFF} \ + -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ + -DARROW_DATASET=${ARROW_DATASET:-ON} \ + -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ + -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ + -DARROW_ENGINE=${ARROW_ENGINE:-ON} \ + -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ + -DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \ + -DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \ + -DARROW_FUZZING=${ARROW_FUZZING:-OFF} \ + -DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA:-OFF} \ + -DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \ + -DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \ + -DARROW_GCS=${ARROW_GCS:-OFF} \ + -DARROW_HDFS=${ARROW_HDFS:-ON} \ + -DARROW_HIVESERVER2=${ARROW_HIVESERVER2:-OFF} \ + -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC:-ON} \ + -DARROW_JNI=${ARROW_JNI:-OFF} \ + -DARROW_JSON=${ARROW_JSON:-ON} \ + -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ + -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ + -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ + -DARROW_ORC=${ARROW_ORC:-OFF} \ + -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ + -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT:-OFF} \ + -DARROW_PLASMA=${ARROW_PLASMA:-OFF} \ + -DARROW_PYTHON=${ARROW_PYTHON:-OFF} \ + -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ + -DARROW_S3=${ARROW_S3:-OFF} \ + -DARROW_SKYHOOK=${ARROW_SKYHOOK:-OFF} \ + -DARROW_TEST_LINKAGE=${ARROW_TEST_LINKAGE:-shared} \ + 
-DARROW_TEST_MEMCHECK=${ARROW_TEST_MEMCHECK:-OFF} \
+  -DARROW_USE_ASAN=${ARROW_USE_ASAN:-OFF} \
+  -DARROW_USE_CCACHE=${ARROW_USE_CCACHE:-ON} \
+  -DARROW_USE_GLOG=${ARROW_USE_GLOG:-OFF} \
+  -DARROW_USE_LD_GOLD=${ARROW_USE_LD_GOLD:-OFF} \
+  -DARROW_USE_PRECOMPILED_HEADERS=${ARROW_USE_PRECOMPILED_HEADERS:-OFF} \
+  -DARROW_USE_STATIC_CRT=${ARROW_USE_STATIC_CRT:-OFF} \
+  -DARROW_USE_TSAN=${ARROW_USE_TSAN:-OFF} \
+  -DARROW_USE_UBSAN=${ARROW_USE_UBSAN:-OFF} \
+  -DARROW_VERBOSE_THIRDPARTY_BUILD=${ARROW_VERBOSE_THIRDPARTY_BUILD:-OFF} \
+  -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI:-OFF} \
+  -DARROW_WITH_BZ2=${ARROW_WITH_BZ2:-OFF} \
+  -DARROW_WITH_LZ4=${ARROW_WITH_LZ4:-OFF} \
+  -DARROW_WITH_SNAPPY=${ARROW_WITH_SNAPPY:-OFF} \
+  -DARROW_WITH_UTF8PROC=${ARROW_WITH_UTF8PROC:-ON} \
+  -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \
+  -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \
+  -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \
+  -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \
+  -DBOOST_SOURCE=${BOOST_SOURCE:-} \
+  -DBrotli_SOURCE=${Brotli_SOURCE:-} \
+  -DBUILD_WARNING_LEVEL=${BUILD_WARNING_LEVEL:-CHECKIN} \
+  -Dc-ares_SOURCE=${cares_SOURCE:-} \
+  -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \
+  -DCMAKE_C_FLAGS="${CFLAGS:-}" \
+  -DCMAKE_CXX_FLAGS="${CXXFLAGS:-}" \
+  -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \
+  -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \
+  -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD:-OFF} \
+  -Dgflags_SOURCE=${gflags_SOURCE:-} \
+  -Dgoogle_cloud_cpp_storage_SOURCE=${google_cloud_cpp_storage_SOURCE:-} \
+  -DgRPC_SOURCE=${gRPC_SOURCE:-} \
+  -DGTest_SOURCE=${GTest_SOURCE:-} \
+  -DLz4_SOURCE=${Lz4_SOURCE:-} \
+  -DORC_SOURCE=${ORC_SOURCE:-} \
+  -DPARQUET_BUILD_EXAMPLES=${PARQUET_BUILD_EXAMPLES:-OFF} \
+  -DPARQUET_BUILD_EXECUTABLES=${PARQUET_BUILD_EXECUTABLES:-OFF} \
+  -DPARQUET_REQUIRE_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} \
+  -DProtobuf_SOURCE=${Protobuf_SOURCE:-} \
+  -DRapidJSON_SOURCE=${RapidJSON_SOURCE:-} \
+  -Dre2_SOURCE=${re2_SOURCE:-} \
+  -DSnappy_SOURCE=${Snappy_SOURCE:-} \
+  -DThrift_SOURCE=${Thrift_SOURCE:-} \
+  -Dutf8proc_SOURCE=${utf8proc_SOURCE:-} \
+  -Dzstd_SOURCE=${zstd_SOURCE:-} \
+  -G "${CMAKE_GENERATOR:-Ninja}" \
+  ${CMAKE_ARGS} \
+  ${source_dir}
 
 if [ ! -z "${CPP_MAKE_PARALLELISM}" ]; then
   time cmake --build . --target install -- -j${CPP_MAKE_PARALLELISM}
@@ -158,7 +159,7 @@ if [ "${ARROW_USE_CCACHE}" == "ON" ]; then
   ccache -s
 fi
 
-if [ "${with_docs}" == "true" ]; then
+if [ "${BUILD_DOCS_CPP}" == "ON" ]; then
   pushd ${source_dir}/apidoc
   doxygen
   popd
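With the positional with_docs flag replaced by per-language switches, a docs-enabled C++ build is now driven entirely through the environment; a sketch (paths illustrative, mirroring how the CI scripts are invoked elsewhere in this change):

```bash
# <arrow source root> <build root>; BUILD_DOCS_CPP=ON additionally runs
# doxygen in cpp/apidoc after the install step.
BUILD_DOCS_CPP=ON ci/scripts/cpp_build.sh $(pwd) $(pwd)/build
```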
diff --git a/ci/scripts/docs_build.sh b/ci/scripts/docs_build.sh
deleted file mode 100755
index e6ee768ee87..00000000000
--- a/ci/scripts/docs_build.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-set -ex
-
-arrow_dir=${1}
-build_dir=${2}/docs
-
-export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH}
-export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH}
-export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0
-export CFLAGS="-DARROW_NO_DEPRECATED_API"
-export CXXFLAGS="-DARROW_NO_DEPRECATED_API"
-
-ncpus=$(python3 -c "import os; print(os.cpu_count())")
-
-# Sphinx docs
-sphinx-build -b html -j ${ncpus} ${arrow_dir}/docs/source ${build_dir}
-
-# C++ - original doxygen
-# rsync -a ${arrow_dir}/cpp/apidoc/ ${build_dir}/cpp
-
-# R
-rsync -a ${arrow_dir}/r/docs/ ${build_dir}/r
-
-# C GLib
-rsync -a ${ARROW_HOME}/share/gtk-doc/html/ ${build_dir}/c_glib
-
-# Java
-rsync -a ${arrow_dir}/java/target/site/apidocs/ ${build_dir}/java/reference
-
-# Javascript
-rsync -a ${arrow_dir}/js/doc/ ${build_dir}/js
diff --git a/ci/scripts/install_ccache.sh b/ci/scripts/install_ccache.sh
new file mode 100755
index 00000000000..8c64fe56c41
--- /dev/null
+++ b/ci/scripts/install_ccache.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <version> <prefix>"
+  exit 1
+fi
+
+version=$1
+prefix=$2
+url="https://github.com/ccache/ccache/archive/v${version}.tar.gz"
+
+mkdir /tmp/ccache
+wget -q ${url} -O - | tar -xzf - --directory /tmp/ccache --strip-components=1
+
+mkdir /tmp/ccache/build
+pushd /tmp/ccache/build
+cmake \
+  -GNinja \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_INSTALL_PREFIX=${prefix} \
+  -DZSTD_FROM_INTERNET=ON \
+  ..
+ninja install
+popd
+
+rm -rf /tmp/ccache
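The ad-hoc ccache build that previously lived inline in the manylinux Dockerfile is now a reusable script; its invocation mirrors the Dockerfile call above:

```bash
# <version> <prefix>: fetches the release tarball, builds with Ninja,
# and installs under the given prefix.
ci/scripts/install_ccache.sh 4.1 /usr/local
```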
+ +set -e + +declare -A archs +archs=([amd64]=x86_64 + [arm64v8]=aarch64) + +declare -A platforms +platforms=([linux]=linux + [macos]=macos + [windows]=windows) + +if [ "$#" -ne 4 ]; then + echo "Usage: $0 <architecture> <platform> <version> <prefix>" + exit 1 +fi + +arch=${archs[$1]} +platform=${platforms[$2]} +version=$3 +prefix=$4 + +url="https://github.com/Kitware/CMake/releases/download/v${version}/cmake-${version}-${platform}-${arch}.tar.gz" +wget -q ${url} -O - | tar -xzf - --directory ${prefix} --strip-components=1 diff --git a/ci/scripts/install_conda.sh b/ci/scripts/install_conda.sh index f4d313b63df..2e864163022 100755 --- a/ci/scripts/install_conda.sh +++ b/ci/scripts/install_conda.sh @@ -19,48 +19,30 @@ set -e -declare -A archs -archs=([amd64]=x86_64 - [arm32v7]=armv7l - [ppc64le]=ppc64le - [i386]=x86) - -declare -A platforms -platforms=([windows]=Windows - [macos]=MacOSX - [linux]=Linux) - -if [ "$#" -ne 4 ]; then - echo "Usage: $0 <architecture> <platform> <version> <prefix>" - exit 1 -elif [[ -z ${archs[$1]} ]]; then - echo "Unexpected architecture: ${1}" - exit 1 -elif [[ -z ${platforms[$2]} ]]; then - echo "Unexpected platform: ${2}" +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <installer> <version> <prefix>" exit 1 fi -arch=${archs[$1]} -platform=${platforms[$2]} -version=$3 -prefix=$4 +arch=$(uname -m) +platform=$(uname) +installer=$1 +version=$2 +prefix=$3 echo "Downloading Miniconda installer..." -wget -nv https://repo.continuum.io/miniconda/Miniconda3-${version}-${platform}-${arch}.sh -O /tmp/miniconda.sh -bash /tmp/miniconda.sh -b -p ${prefix} -rm /tmp/miniconda.sh +wget -nv https://github.com/conda-forge/miniforge/releases/latest/download/${installer^}-${platform}-${arch}.sh -O /tmp/installer.sh +bash /tmp/installer.sh -b -p ${prefix} +rm /tmp/installer.sh # Like "conda init", but for POSIX sh rather than bash ln -s ${prefix}/etc/profile.d/conda.sh /etc/profile.d/conda.sh +export PATH=/opt/conda/bin:$PATH + # Configure -source /etc/profile.d/conda.sh -conda config --add channels conda-forge -conda config --set channel_priority strict conda config --set show_channel_urls True conda config --set remote_connect_timeout_secs 12 # Update and clean -conda update --all -y conda clean --all -y diff --git a/ci/scripts/install_dask.sh b/ci/scripts/install_dask.sh index 954ce3249d9..ff0def006a8 100755 --- a/ci/scripts/install_dask.sh +++ b/ci/scripts/install_dask.sh @@ -29,8 +29,7 @@ dask=$1 if [ "${dask}" = "master" ]; then pip install https://github.com/dask/dask/archive/main.tar.gz#egg=dask[dataframe] elif [ "${dask}" = "latest" ]; then - conda install -q dask + pip install dask else - conda install -q dask=${dask} + pip install dask==${dask} fi -conda clean --all diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 579a78944b5..16e3c4042cf 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -19,18 +19,17 @@ set -e -if [ "$#" -ne 2 ]; then - echo "Usage: $0 <architecture> <version>" +if [ "$#" -ne 1 ]; then + echo "Usage: $0 <version>" exit 1 fi -arch=$1 -if [ "${arch}" != "amd64" ]; then +if [ "$(uname -m)" != "x86_64" ]; then echo "GCS testbench won't install on non-x86 architecture" exit 0 fi -version=$2 +version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.7.0" fi diff --git a/ci/scripts/install_glibc.sh b/ci/scripts/install_glibc.sh new file mode 100755 index 00000000000..d1a49818178 --- /dev/null +++ b/ci/scripts/install_glibc.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <version> <prefix>" + exit 1 +fi + +version=$1 +prefix=$2 + +url="http://ftp.gnu.org/gnu/glibc/glibc-${version}.tar.gz" + +mkdir /tmp/glibc +wget -q ${url} -O - | tar -xzf - --directory /tmp/glibc --strip-components=1 + +mkdir /tmp/glibc/build +pushd /tmp/glibc/build +../configure --prefix=${prefix} +make -j$(nproc) +make install +popd + +rm -rf /tmp/glibc diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 5cda46e59e6..af99f966661 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -20,34 +20,37 @@ set -e declare -A archs -archs=([amd64]=amd64 - [arm64v8]=arm64 - [arm32v7]=arm +archs=([x86_64]=amd64 + [arm64]=arm64 + [aarch64]=arm64 [s390x]=s390x) declare -A platforms -platforms=([linux]=linux - [macos]=darwin) +platforms=([Linux]=linux + [Darwin]=darwin) -arch=${archs[$1]} -platform=${platforms[$2]} -version=$3 -prefix=$4 +arch=$(uname -m) +platform=$(uname) +version=$1 +prefix=$2 -if [ "$#" -ne 4 ]; then - echo "Usage: $0 <architecture> <platform> <version> <prefix>" +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <version> <prefix>" exit 1 -elif [[ -z ${arch} ]]; then - echo "Unexpected architecture: ${1}" - exit 1 -elif [[ -z ${platform} ]]; then - echo "Unexpected platform: ${2}" - exit 1 -elif [[ ${version} != "latest" ]]; then +elif [ -z ${archs[$arch]} ]; then + echo "Unsupported architecture: ${arch}" + exit 0 +elif [ -z ${platforms[$platform]} ]; then + echo "Unsupported platform: ${platform}" + exit 0 +elif [ "${version}" != "latest" ]; then echo "Cannot fetch specific versions of minio, only latest is supported." exit 1 fi +arch=${archs[$arch]} +platform=${platforms[$platform]} + if [[ ! -x ${prefix}/bin/minio ]]; then url="https://dl.min.io/server/minio/release/${platform}-${arch}/minio" echo "Fetching ${url}..." diff --git a/ci/scripts/install_ninja.sh b/ci/scripts/install_ninja.sh new file mode 100755 index 00000000000..0440d563fb1 --- /dev/null +++ b/ci/scripts/install_ninja.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
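+
+# Illustrative usage (the version shown is an example):
+#
+#   ci/scripts/install_ninja.sh 1.10.2 /usr/local
+#
+# Ninja bootstraps itself: configure.py --bootstrap first builds a minimal
+# ninja binary with the host toolchain, then uses that binary to rebuild
+# ninja for installation.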
+ +set -e + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 <version> <prefix>" + exit 1 +fi + +version=$1 +prefix=$2 + +url="https://github.com/ninja-build/ninja/archive/v${version}.tar.gz" + +mkdir /tmp/ninja +wget -q ${url} -O - | tar -xzf - --directory /tmp/ninja --strip-components=1 + +pushd /tmp/ninja +./configure.py --bootstrap +mv ninja ${prefix}/bin +popd + +rm -rf /tmp/ninja diff --git a/ci/scripts/install_vcpkg.sh b/ci/scripts/install_vcpkg.sh index fe99a7fea2f..cc80582326e 100755 --- a/ci/scripts/install_vcpkg.sh +++ b/ci/scripts/install_vcpkg.sh @@ -19,21 +19,35 @@ set -e -if [ "$#" -ne 2 ]; then - echo "Usage: $0 <version> <destination>" +if [ "$#" -lt 1 ]; then + echo "Usage: $0 <destination> [<version> [<ports-patch>]]" exit 1 fi -vcpkg_version=$1 -vcpkg_destination=$2 -vcpkg_patch=$(realpath $(dirname "${0}")/../vcpkg/ports.patch) +arrow_dir=$(cd -- "$(dirname -- "$0")/../.." && pwd -P) +default_vcpkg_version=$(cat "${arrow_dir}/.env" | grep "VCPKG" | cut -d "=" -f2 | tr -d '"') +default_vcpkg_ports_patch="${arrow_dir}/ci/vcpkg/ports.patch" -git clone --depth 1 --branch ${vcpkg_version} https://github.com/microsoft/vcpkg ${vcpkg_destination} +vcpkg_destination=$1 +vcpkg_version=${2:-$default_vcpkg_version} +vcpkg_ports_patch=${3:-$default_vcpkg_ports_patch} + +# reduce the fetched data using a shallow clone +git clone --shallow-since=2021-04-01 https://github.com/microsoft/vcpkg ${vcpkg_destination} pushd ${vcpkg_destination} -./bootstrap-vcpkg.sh -useSystemBinaries -disableMetrics -git apply --ignore-whitespace ${vcpkg_patch} -echo "Patch successfully applied!" +git checkout "${vcpkg_version}" + +if [[ "$OSTYPE" == "msys" ]]; then + ./bootstrap-vcpkg.bat -disableMetrics +else + ./bootstrap-vcpkg.sh -disableMetrics +fi + +if [ -f "${vcpkg_ports_patch}" ]; then + git apply --verbose --ignore-whitespace ${vcpkg_ports_patch} + echo "Patch successfully applied to the VCPKG port files!" 
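+  # The default patch, ci/vcpkg/ports.patch, adjusts the vcpkg port files,
+  # e.g. dropping the "!arm" guards from the aws-c-* and aws-sdk-cpp
+  # manifests; see the ports.patch hunks later in this diff.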
+fi popd diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 1ba37606d3c..7693bd883a1 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -20,9 +20,11 @@ set -ex arrow_dir=${1} source_dir=${1}/java -cpp_build_dir=${2}/cpp/${ARROW_BUILD_TYPE:-debug} -cdata_dist_dir=${2}/java/c -with_docs=${3:-false} +build_dir=${2} +cpp_build_dir=${build_dir}/cpp/${ARROW_BUILD_TYPE:-debug} +cdata_dist_dir=${build_dir}/java/c + +: ${BUILD_DOCS_JAVA:=OFF} if [[ "$(uname -s)" == "Linux" ]] && [[ "$(uname -m)" == "s390x" ]]; then # Since some files for s390_64 are not available at maven central, @@ -99,9 +101,11 @@ if [ "${ARROW_PLASMA}" = "ON" ]; then popd fi -if [ "${with_docs}" == "true" ]; then +if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned off to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 + mkdir -p ${build_dir}/docs/java/reference ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false install site + rsync -a ${arrow_dir}/java/target/site/apidocs/ ${build_dir}/docs/java/reference fi popd diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index 12f58d54bb8..333c9cf441e 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -19,17 +19,19 @@ set -ex -source_dir=${1}/js -with_docs=${2:-false} +arrow_dir=${1} +source_dir=${arrow_dir}/js +build_dir=${2} + +: ${BUILD_DOCS_JS:=OFF} pushd ${source_dir} yarn --frozen-lockfile -# TODO(kszucs): linting should be moved to archery yarn lint:ci yarn build -if [ "${with_docs}" == "true" ]; then +if [ "${BUILD_DOCS_JS}" == "ON" ]; then if [ "$(git config --get remote.origin.url)" == "https://github.com/apache/arrow.git" ]; then yarn doc elif [ "$(git config --get remote.upstream.url)" == "https://github.com/apache/arrow.git" ]; then @@ -40,6 +42,8 @@ if [ "${with_docs}" == "true" ]; then echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." exit 0 fi + mkdir -p ${build_dir}/docs/js + rsync -a ${arrow_dir}/js/doc/ ${build_dir}/docs/js fi popd diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index ec6d723b2a7..ccf068078d5 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -19,9 +19,12 @@ set -ex -source_dir=${1}/python +arrow_dir=${1} +source_dir=${arrow_dir}/python build_dir=${2}/python +: ${BUILD_DOCS_PYTHON:=OFF} + if [ ! 
-z "${CONDA_PREFIX}" ]; then echo -e "===\n=== Conda environment for build\n===" conda list @@ -52,3 +55,8 @@ ${PYTHON:-python} \ --record $relative_build_dir/record.txt popd + +if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then + ncpus=$(python -c "import os; print(os.cpu_count())") + sphinx-build -b html -j ${ncpus} ${arrow_dir}/docs/source ${build_dir}/docs +fi diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 1a52a2ad52b..4689a251aa0 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -64,6 +64,7 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} +: ${ARROW_GCS:=OFF} : ${ARROW_HDFS:=ON} : ${ARROW_JEMALLOC:=ON} : ${ARROW_MIMALLOC:=ON} @@ -99,6 +100,7 @@ cmake \ -DARROW_DEPENDENCY_USE_SHARED=OFF \ -DARROW_FLIGHT=${ARROW_FLIGHT} \ -DARROW_GANDIVA=${ARROW_GANDIVA} \ + -DARROW_GCS=${ARROW_GCS} \ -DARROW_HDFS=${ARROW_HDFS} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 434605cf202..6b467ffc046 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -51,6 +51,7 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} +: ${ARROW_GCS:=OFF} : ${ARROW_HDFS:=ON} : ${ARROW_JEMALLOC:=ON} : ${ARROW_MIMALLOC:=ON} @@ -88,8 +89,9 @@ cmake \ -DARROW_DATASET=${ARROW_DATASET} \ -DARROW_DEPENDENCY_SOURCE="VCPKG" \ -DARROW_DEPENDENCY_USE_SHARED=OFF \ - -DARROW_FLIGHT==${ARROW_FLIGHT} \ + -DARROW_FLIGHT=${ARROW_FLIGHT} \ -DARROW_GANDIVA=${ARROW_GANDIVA} \ + -DARROW_GCS=${ARROW_GCS} \ -DARROW_HDFS=${ARROW_HDFS} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ diff --git a/ci/scripts/r_build.sh b/ci/scripts/r_build.sh index 2a2b9d7d161..a435b04317d 100755 --- a/ci/scripts/r_build.sh +++ b/ci/scripts/r_build.sh @@ -20,14 +20,17 @@ set -ex : ${R_BIN:=R} source_dir=${1}/r -with_docs=${2:-false} +build_dir=${2} + +: ${BUILD_DOCS_R:=OFF} pushd ${source_dir} ${R_BIN} CMD INSTALL ${INSTALL_ARGS} . -if [ "${with_docs}" == "true" ]; then +if [ "${BUILD_DOCS_R}" == "ON" ]; then ${R_BIN} -e "pkgdown::build_site(install = FALSE)" + rsync -a ${source_dir}/docs/ ${build_dir}/docs/r fi -popd \ No newline at end of file +popd diff --git a/ci/scripts/r_deps.sh b/ci/scripts/r_deps.sh index 79977fca16d..0fa4368cd8a 100755 --- a/ci/scripts/r_deps.sh +++ b/ci/scripts/r_deps.sh @@ -41,6 +41,15 @@ fi # but we want to error/fail the build. 
# options(warn=2) turns warnings into errors ${R_BIN} -e "options(warn=2); install.packages('remotes'); remotes::install_cran(c('glue', 'rcmdcheck', 'sys')); remotes::install_deps(INSTALL_opts = '"${INSTALL_ARGS}"')" + +# (Temporarily) install DuckDB from source to avoid their Unity builds +# (though only if we haven't filtered it out of the deps above, +# and if we can't get a binary from RSPM) +# Remove when there is a DuckDB release > 0.3.1-1 +if grep -q "duckdb" DESCRIPTION; then + ${R_BIN} -e "if (all(!grepl('packagemanager.rstudio', options('repos')))) { remotes::install_github('duckdb/duckdb', subdir = '/tools/rpkg', build = FALSE) }" +fi + # Separately install the optional/test dependencies but don't error on them, # they're not available everywhere and that's ok ${R_BIN} -e "remotes::install_deps(dependencies = TRUE, INSTALL_opts = '"${INSTALL_ARGS}"')" diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 20c987085ae..4d837d9d7f1 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -69,11 +69,11 @@ if [ "$ARROW_S3" == "ON" ] || [ "$ARROW_R_DEV" == "TRUE" ]; then # The Dockerfile should have put this file here if [ -f "/arrow/ci/scripts/install_minio.sh" ] && [ "`which wget`" ]; then - /arrow/ci/scripts/install_minio.sh amd64 linux latest /usr/local + /arrow/ci/scripts/install_minio.sh latest /usr/local fi if [ -f "/arrow/ci/scripts/install_gcs_testbench.sh" ] && [ "`which pip`" ]; then - /arrow/ci/scripts/install_gcs_testbench.sh amd64 default + /arrow/ci/scripts/install_gcs_testbench.sh default fi fi diff --git a/ci/vcpkg/x64-linux-static-debug.cmake b/ci/vcpkg/amd64-linux-static-debug.cmake similarity index 100% rename from ci/vcpkg/x64-linux-static-debug.cmake rename to ci/vcpkg/amd64-linux-static-debug.cmake diff --git a/ci/vcpkg/x64-linux-static-release.cmake b/ci/vcpkg/amd64-linux-static-release.cmake similarity index 100% rename from ci/vcpkg/x64-linux-static-release.cmake rename to ci/vcpkg/amd64-linux-static-release.cmake diff --git a/ci/vcpkg/x64-osx-static-debug.cmake b/ci/vcpkg/amd64-osx-static-debug.cmake similarity index 100% rename from ci/vcpkg/x64-osx-static-debug.cmake rename to ci/vcpkg/amd64-osx-static-debug.cmake diff --git a/ci/vcpkg/x64-osx-static-release.cmake b/ci/vcpkg/amd64-osx-static-release.cmake similarity index 100% rename from ci/vcpkg/x64-osx-static-release.cmake rename to ci/vcpkg/amd64-osx-static-release.cmake diff --git a/ci/vcpkg/x64-windows-static-md-debug.cmake b/ci/vcpkg/amd64-windows-static-md-debug.cmake similarity index 100% rename from ci/vcpkg/x64-windows-static-md-debug.cmake rename to ci/vcpkg/amd64-windows-static-md-debug.cmake diff --git a/ci/vcpkg/x64-windows-static-md-release.cmake b/ci/vcpkg/amd64-windows-static-md-release.cmake similarity index 100% rename from ci/vcpkg/x64-windows-static-md-release.cmake rename to ci/vcpkg/amd64-windows-static-md-release.cmake diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 7bcba49c194..940722abb2e 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,63 +1,245 @@ -diff --git a/ports/aws-c-common/portfile.cmake b/ports/aws-c-common/portfile.cmake -index f3704ef05b..3af543058d 100644 ---- a/ports/aws-c-common/portfile.cmake -+++ b/ports/aws-c-common/portfile.cmake -@@ -1,8 +1,8 @@ - vcpkg_from_github( - OUT_SOURCE_PATH SOURCE_PATH - REPO awslabs/aws-c-common -- REF 4a21a1c0757083a16497fea27886f5f20ccdf334 # v0.4.56 -- SHA512 
68898a8ac15d5490f45676eabfbe0df9e45370a74c543a28909fd0d85fed48dfcf4bcd6ea2d01d1a036dd352e2e4e0b08c48c63ab2a2b477fe150b46a827136e -+ REF 13adef72b7813ec878817c6d50a7a3f241015d8a # v0.4.57 -+ SHA512 28256522ac6af544d7464e3e7dcd4dc802ae2b09728bf8f167f86a6487bb756d0cad5eb4a2480610b2967b9c24c4a7f70621894517aa2828ffdeb0479453803b - HEAD_REF master - PATCHES - disable-error-4068.patch # This patch fixes dependency port compilation failure +diff --git a/ports/aws-c-auth/vcpkg.json b/ports/aws-c-auth/vcpkg.json +index dc8f75e8..be703324 100644 +--- a/ports/aws-c-auth/vcpkg.json ++++ b/ports/aws-c-auth/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "C99 library implementation of AWS client-side authentication: standard credentials providers and signing.", + "homepage": "https://github.com/awslabs/aws-c-auth", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "aws-c-common", + "aws-c-http", +diff --git a/ports/aws-c-cal/vcpkg.json b/ports/aws-c-cal/vcpkg.json +index 48c7406c..40e284dd 100644 +--- a/ports/aws-c-cal/vcpkg.json ++++ b/ports/aws-c-cal/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "C99 wrapper for cryptography primitives.", + "homepage": "https://github.com/awslabs/aws-c-cal", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "aws-c-common", + { +diff --git a/ports/aws-c-common/vcpkg.json b/ports/aws-c-common/vcpkg.json +index 67d58320..3c8e96f7 100644 +--- a/ports/aws-c-common/vcpkg.json ++++ b/ports/aws-c-common/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "AWS common library for C", + "homepage": "https://github.com/awslabs/aws-c-common", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "openssl", + { +diff --git a/ports/aws-c-compression/vcpkg.json b/ports/aws-c-compression/vcpkg.json +index 8b62d038..7aa75eda 100644 +--- a/ports/aws-c-compression/vcpkg.json ++++ b/ports/aws-c-compression/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "C99 implementation of huffman encoding/decoding", + "homepage": "https://github.com/awslabs/aws-c-compression", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "aws-c-common", + { +diff --git a/ports/aws-c-event-stream/vcpkg.json b/ports/aws-c-event-stream/vcpkg.json +index 7018ea0b..df3f85af 100644 +--- a/ports/aws-c-event-stream/vcpkg.json ++++ b/ports/aws-c-event-stream/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "C99 implementation of the vnd.amazon.event-stream content-type.", + "homepage": "https://github.com/awslabs/aws-c-event-stream", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "aws-c-cal", + "aws-c-common", +diff --git a/ports/aws-c-http/vcpkg.json b/ports/aws-c-http/vcpkg.json +index f7cf6547..20ee8be7 100644 +--- a/ports/aws-c-http/vcpkg.json ++++ b/ports/aws-c-http/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "C99 implementation of the HTTP/1.1 and HTTP/2 specifications", + "homepage": "https://github.com/awslabs/aws-c-http", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "aws-c-cal", + "aws-c-common", +diff --git a/ports/aws-c-io/vcpkg.json b/ports/aws-c-io/vcpkg.json +index 60a1fb93..b695deb5 100644 +--- a/ports/aws-c-io/vcpkg.json ++++ b/ports/aws-c-io/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "Handles all IO and TLS work for application protocols.", + "homepage": "https://github.com/awslabs/aws-c-io", +- "supports": "!arm & !uwp", ++ 
"supports": "!uwp", + "dependencies": [ + "aws-c-cal", + "aws-c-common", +diff --git a/ports/aws-c-mqtt/vcpkg.json b/ports/aws-c-mqtt/vcpkg.json +index de7e464e..3f459a79 100644 +--- a/ports/aws-c-mqtt/vcpkg.json ++++ b/ports/aws-c-mqtt/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "C99 implementation of the MQTT 3.1.1 specification.", + "homepage": "https://github.com/awslabs/aws-c-mqtt", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + { + "name": "s2n", +diff --git a/ports/aws-c-s3/vcpkg.json b/ports/aws-c-s3/vcpkg.json +index 31dba925..902c07f9 100644 +--- a/ports/aws-c-s3/vcpkg.json ++++ b/ports/aws-c-s3/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 1, + "description": "C99 library implementation for communicating with the S3 service, designed for maximizing throughput on high bandwidth EC2 instances.", + "homepage": "https://github.com/awslabs/aws-c-s3", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "aws-c-auth", + "aws-c-cal", +diff --git a/ports/aws-checksums/vcpkg.json b/ports/aws-checksums/vcpkg.json +index 3cdea453..1cbed512 100644 +--- a/ports/aws-checksums/vcpkg.json ++++ b/ports/aws-checksums/vcpkg.json +@@ -4,7 +4,6 @@ + "port-version": 2, + "description": "Cross-Platform HW accelerated CRC32c and CRC32 with fallback to efficient SW implementations.", + "homepage": "https://github.com/awslabs/aws-checksums", +- "supports": "!arm", + "dependencies": [ + "aws-c-common", + { +diff --git a/ports/aws-crt-cpp/vcpkg.json b/ports/aws-crt-cpp/vcpkg.json +index 3b8cfafd..38c37ec9 100644 +--- a/ports/aws-crt-cpp/vcpkg.json ++++ b/ports/aws-crt-cpp/vcpkg.json +@@ -4,7 +4,7 @@ + "port-version": 2, + "description": "Cross-Platform HW accelerated CRC32c and CRC32 with fallback to efficient SW implementations.", + "homepage": "https://github.com/awslabs/aws-crt-cpp", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "aws-c-auth", + "aws-c-cal", +diff --git a/ports/aws-sdk-cpp/fix-find-crypto.patch b/ports/aws-sdk-cpp/fix-find-crypto.patch +new file mode 100644 +index 00000000..4566cc66 +--- /dev/null ++++ b/ports/aws-sdk-cpp/fix-find-crypto.patch +@@ -0,0 +1,20 @@ ++diff --git a/cmake/setup_cmake_find_module.cmake b/cmake/setup_cmake_find_module.cmake ++index f5f147a0f..4561b8c39 100644 ++--- a/cmake/setup_cmake_find_module.cmake +++++ b/cmake/setup_cmake_find_module.cmake ++@@ -30,15 +30,6 @@ file(WRITE ++ "set(AWSSDK_INSTALL_BINDIR ${BINARY_DIRECTORY})\n" ++ "set(AWSSDK_INSTALL_INCLUDEDIR ${INCLUDE_DIRECTORY})\n" ++ "set(AWSSDK_INSTALL_ARCHIVEDIR ${ARCHIVE_DIRECTORY})\n" ++- "if (NOT LibCrypto_INCLUDE_DIR)\n" ++- " set(LibCrypto_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR} CACHE INTERNAL \"The OpenSSL include directory\")\n" ++- "endif()\n" ++- "if (NOT LibCrypto_STATIC_LIBRARY)\n" ++- " set(LibCrypto_STATIC_LIBRARY ${OPENSSL_CRYPTO_LIBRARY} CACHE INTERNAL \"The OpenSSL crypto static library\")\n" ++- "endif()\n" ++- "if (NOT LibCrypto_SHARED_LIBRARY)\n" ++- " set(LibCrypto_SHARED_LIBRARY ${OPENSSL_CRYPTO_LIBRARY} CACHE INTERNAL \"The OpenSSL crypto shared library\")\n" ++- "endif()\n" ++ ) ++ ++ if (NOT SIMPLE_INSTALL) +diff --git a/ports/aws-sdk-cpp/portfile.cmake b/ports/aws-sdk-cpp/portfile.cmake +index 2d6bba4d..0ac47887 100644 +--- a/ports/aws-sdk-cpp/portfile.cmake ++++ b/ports/aws-sdk-cpp/portfile.cmake +@@ -8,6 +8,7 @@ vcpkg_from_github( + PATCHES + patch-relocatable-rpath.patch + fix-aws-root.patch ++ fix-find-crypto.patch + ) + + string(COMPARE EQUAL "${VCPKG_CRT_LINKAGE}" "dynamic" 
FORCE_SHARED_CRT) +diff --git a/ports/aws-sdk-cpp/vcpkg.json b/ports/aws-sdk-cpp/vcpkg.json +index 3836e2b6..cd88ef07 100644 +--- a/ports/aws-sdk-cpp/vcpkg.json ++++ b/ports/aws-sdk-cpp/vcpkg.json +@@ -5,7 +5,7 @@ + "port-version": 5, + "description": "AWS SDK for C++", + "homepage": "https://github.com/aws/aws-sdk-cpp", +- "supports": "!arm & !uwp", ++ "supports": "!uwp", + "dependencies": [ + "aws-crt-cpp", + { diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index be66d452be..a5ce325e9d 100644 +index ce8b7570..8255ba24 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -94,6 +94,8 @@ vcpkg_configure_cmake( +@@ -92,6 +92,10 @@ vcpkg_cmake_configure( -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON -DENABLE_DEBUG=ON -DCURL_CA_FALLBACK=ON + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none - OPTIONS_DEBUG - ${EXTRA_ARGS_DEBUG} ++ OPTIONS_DEBUG ++ ${EXTRA_ARGS_DEBUG} OPTIONS_RELEASE + ${OPTIONS_RELEASE} + OPTIONS_DEBUG diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 75dd133027..84345c7caa 100644 +index 45b8c706..b409d8a7 100644 --- a/ports/snappy/portfile.cmake +++ b/ports/snappy/portfile.cmake @@ -4,6 +4,7 @@ vcpkg_from_github( - REF 537f4ad6240e586970fe554614542e9717df7902 # 1.1.8 - SHA512 555d3b69a6759592736cbaae8f41654f0cf14e8be693b5dde37640191e53daec189f895872557b173e905d10681ef502f3e6ed8566811add963ffef96ce4016d + REF 1.1.9 + SHA512 f1f8a90f5f7f23310423574b1d8c9acb84c66ea620f3999d1060395205e5760883476837aba02f0aa913af60819e34c625d8308c18a5d7a9c4e190f35968b024 HEAD_REF master + PATCHES "snappy-disable-bmi.patch" ) - - vcpkg_configure_cmake( + + vcpkg_cmake_configure( diff --git a/ports/snappy/snappy-disable-bmi.patch b/ports/snappy/snappy-disable-bmi.patch new file mode 100644 -index 0000000000..2cbb1533a8 +index 00000000..a57ce0c2 --- /dev/null +++ b/ports/snappy/snappy-disable-bmi.patch -@@ -0,0 +1,17 @@ -+--- snappy.cc 2020-06-27 17:38:49.718993748 -0500 -++++ snappy.cc 2020-06-27 17:37:57.543268213 -0500 -+@@ -717,14 +717,10 @@ -+ static inline uint32 ExtractLowBytes(uint32 v, int n) { +@@ -0,0 +1,19 @@ ++diff --git a/snappy.cc b/snappy.cc ++index 79dc0e8..f3153ea 100644 ++--- a/snappy.cc +++++ b/snappy.cc ++@@ -965,14 +965,10 @@ static inline void Report(const char *algorithm, size_t compressed_size, ++ static inline uint32_t ExtractLowBytes(uint32_t v, int n) { + assert(n >= 0); + assert(n <= 4); +-#if SNAPPY_HAVE_BMI2 +- return _bzhi_u32(v, 8 * n); +-#else -+ // This needs to be wider than uint32 otherwise `mask << 32` will be ++ // This needs to be wider than uint32_t otherwise `mask << 32` will be + // undefined. 
-+ uint64 mask = 0xffffffff; ++ uint64_t mask = 0xffffffff; + return v & ~(mask << (8 * n)); +-#endif + } -+ -+ static inline bool LeftShiftOverflows(uint8 value, uint32 shift) { ++ ++ static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { diff --git a/ci/vcpkg/universal2-osx-static-debug.cmake b/ci/vcpkg/universal2-osx-static-debug.cmake index 706ac47a72c..29e4b0e63c5 100644 --- a/ci/vcpkg/universal2-osx-static-debug.cmake +++ b/ci/vcpkg/universal2-osx-static-debug.cmake @@ -20,7 +20,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) -set(VCPKG_OSX_ARCHITECTURES "x86_64\;arm64") +set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") set(VCPKG_OSX_DEPLOYMENT_TARGET "10.13") set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/universal2-osx-static-release.cmake b/ci/vcpkg/universal2-osx-static-release.cmake index 8670690171e..8111169fab2 100644 --- a/ci/vcpkg/universal2-osx-static-release.cmake +++ b/ci/vcpkg/universal2-osx-static-release.cmake @@ -20,7 +20,7 @@ set(VCPKG_CRT_LINKAGE dynamic) set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) -set(VCPKG_OSX_ARCHITECTURES "x86_64\;arm64") +set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") set(VCPKG_OSX_DEPLOYMENT_TARGET "10.13") set(VCPKG_BUILD_TYPE release) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f3d6b24c48f..0262357d6c7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -765,6 +765,13 @@ if(ARROW_S3) list(APPEND ARROW_STATIC_LINK_LIBS ${AWSSDK_LINK_LIBRARIES}) endif() +if(ARROW_WITH_OPENTELEMETRY) + list(APPEND ARROW_LINK_LIBS opentelemetry-cpp::trace + opentelemetry-cpp::otlp_http_exporter) + list(APPEND ARROW_STATIC_LINK_LIBS opentelemetry-cpp::trace + opentelemetry-cpp::otlp_http_exporter) +endif() + if(ARROW_WITH_UTF8PROC) list(APPEND ARROW_LINK_LIBS utf8proc::utf8proc) list(APPEND ARROW_STATIC_LINK_LIBS utf8proc::utf8proc) diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index a9ca585abdc..ebb3d86283b 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -1,165 +1,266 @@ -{ - "version": 2, - "cmakeMinimumRequired": { - "major": 3, - "minor": 20, - "patch": 0 - }, - "configurePresets": [ - { - "name": "ninja-benchmarks", - "description": "Build for benchmarks", - "inherits": "ninja-release", - "cacheVariables": { - "ARROW_BUILD_BENCHMARKS": "ON", - "ARROW_BUILD_BENCHMARKS_REFERENCE": "ON", - "ARROW_BUILD_TESTS": "OFF" - } - }, - { - "name": "ninja-debug", - "description": "Debug configuration with basic build", - "binaryDir": "${sourceDir}/build/${presetName}", - "generator": "Ninja", - "cacheVariables": { - "ARROW_BUILD_BENCHMARKS": { - "type": "BOOL", - "value": "OFF" - }, - "ARROW_BUILD_TESTS": { - "type": "BOOL", - "value": "ON" - }, - "ARROW_COMPUTE": { - "type": "BOOL", - "value": "ON" - }, - "ARROW_CSV": { - "type": "BOOL", - "value": "ON" - }, - "ARROW_CUDA": { - "type": "BOOL", - "value": "OFF" - }, - "ARROW_DATASET": { - "type": "BOOL", - "value": "OFF" - }, - "ARROW_GANDIVA": { - "type": "BOOL", - "value": "OFF" - }, - "ARROW_GANDIVA_JAVA": { - "type": "BOOL", - "value": "OFF" - }, - "ARROW_GANDIVA_JNI": { - "type": "BOOL", - "value": "OFF" - }, - "ARROW_FILESYSTEM": { - "type": "BOOL", - "value": "ON" - }, - "ARROW_IPC": { - "type": "BOOL", - "value": "ON" - }, - "ARROW_PARQUET": { - "type": "BOOL", - "value": "OFF" - }, - "ARROW_PLASMA_JAVA_CLIENT": { - "type": "BOOL", - "value": "OFF" - }, - "ARROW_PYTHON": { - "type": "BOOL", - "value": "ON" - }, - "ARROW_SKYHOOK": { - "type": "BOOL", - "value": "OFF" 
- }, - "ARROW_WITH_RE2": { - "type": "BOOL", - "value": "ON" - }, - "CMAKE_BUILD_TYPE": { - "type": "String", - "value": "Debug" - }, - "CMAKE_INSTALL_PREFIX": { - "type": "PATH", - "value": "${sourceDir}/build/${presetName}/pkg" - } - } - }, - { - "name": "ninja-debug-cuda", - "description": "Debug Arrow build with CUDA extensions (requires CUDA toolkit)", - "inherits": "ninja-debug", - "cacheVariables": { - "ARROW_CUDA": "ON" - } - }, - { - "name": "ninja-debug-dataset", - "description": "Builds Arrow Dataset modules", - "inherits": "ninja-debug", - "cacheVariables": { - "ARROW_DATASET": "ON" - } - }, - { - "name": "ninja-debug-gandiva", - "description": "Builds Gandiva libraries", - "inherits": "ninja-debug", - "cacheVariables": { - "ARROW_GANDIVA": "ON" - } - }, - { - "name": "ninja-debug-parquet", - "description": "Builds Parquet libraries", - "inherits": "ninja-debug", - "cacheVariables": { - "ARROW_PARQUET": "ON" - } - }, - { - "name": "ninja-debug-skyhook", - "description": "Builds Skyhook libraries", - - "inherits": "ninja-debug", - "cacheVariables": { - "ARROW_SKYHOOK": "ON" - } - }, - { - "name": "ninja-release", - "description": "Release configuration", - "inherits": "ninja-debug", - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Release" - } - }, - { - "name": "ninja-release-gandiva", - "description": "Release configuration with Gandiva", - "inherits": "ninja-release", - "cacheVariables": { - "ARROW_GANDIVA": "ON" - } - }, - { - "name": "ninja-release-parquet", - "description": "Release configuration with Parquet", - "inherits": "ninja-release", - "cacheVariables": { - "ARROW_PARQUET": "ON" - } - } - ] -} +{ + "version": 3, + "cmakeMinimumRequired": { + "major": 3, + "minor": 21, + "patch": 0 + }, + "configurePresets": [ + { + "name": "base", + "hidden": true, + "generator": "Ninja", + "cacheVariables": { + "ARROW_BUILD_STATIC": "OFF" + } + }, + { + "name": "base-debug", + "inherits": "base", + "hidden": true, + "cacheVariables": { + "ARROW_BUILD_INTEGRATION": "ON", + "ARROW_BUILD_TESTS": "ON", + "ARROW_EXTRA_ERROR_CONTEXT": "ON", + "CMAKE_BUILD_TYPE": "Debug" + } + }, + { + "name": "base-release", + "inherits": "base", + "hidden": true, + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } + }, + { + "name": "base-benchmarks", + "inherits": "base", + "hidden": true, + "cacheVariables": { + "ARROW_BUILD_BENCHMARKS": "ON", + "ARROW_BUILD_BENCHMARKS_REFERENCE": "ON", + "CMAKE_BUILD_TYPE": "RelWithDebInfo" + } + }, + { + "name": "features-minimal", + "hidden": true, + "cacheVariables": { + "ARROW_WITH_RE2": "OFF", + "ARROW_WITH_UTF8PROC": "OFF" + } + }, + { + "name": "features-basic", + "inherits": "features-minimal", + "hidden": true, + "cacheVariables": { + "ARROW_COMPUTE": "ON", + "ARROW_CSV": "ON", + "ARROW_DATASET": "ON", + "ARROW_FILESYSTEM": "ON", + "ARROW_JSON": "ON" + } + }, + { + "name": "features-main", + "inherits": "features-basic", + "hidden": true, + "cacheVariables": { + "ARROW_ENGINE": "ON", + "ARROW_MIMALLOC": "ON", + "ARROW_PARQUET": "ON", + "ARROW_WITH_BROTLI": "ON", + "ARROW_WITH_BZ2": "ON", + "ARROW_WITH_LZ4": "ON", + "ARROW_WITH_RE2": "ON", + "ARROW_WITH_SNAPPY": "ON", + "ARROW_WITH_UTF8PROC": "ON", + "ARROW_WITH_ZLIB": "ON", + "ARROW_WITH_ZSTD": "ON" + } + }, + { + "name": "features-cuda", + "inherits": "features-basic", + "hidden": true, + "cacheVariables": { + "ARROW_CUDA": "ON" + } + }, + { + "name": "features-filesystems", + "inherits": "features-basic", + "hidden": true, + "cacheVariables": { + "ARROW_GCS": "ON", + "ARROW_HDFS": "ON", + "ARROW_S3": 
"ON" + } + }, + { + "name": "features-flight", + "inherits": "features-basic", + "hidden": true, + "cacheVariables": { + "ARROW_FLIGHT": "ON" + } + }, + { + "name": "features-gandiva", + "inherits": "features-basic", + "hidden": true, + "cacheVariables": { + "ARROW_GANDIVA": "ON" + } + }, + { + "name": "features-python", + "inherits": "features-main", + "hidden": true, + "cacheVariables": { + "ARROW_PYTHON": "ON" + } + }, + { + "name": "features-maximal", + "inherits": ["features-main", "features-cuda", + "features-filesystems", "features-flight", + "features-gandiva", "features-python"], + "hidden": true, + "displayName": "Debug build with everything enabled (except benchmarks and CUDA)", + "cacheVariables": { + "ARROW_BUILD_EXAMPLES": "ON", + "ARROW_BUILD_UTILITIES": "ON", + "ARROW_HIVESERVER2": "ON", + "ARROW_ORC": "ON", + "ARROW_SKYHOOK": "ON", + "ARROW_TENSORFLOW": "ON", + "PARQUET_BUILD_EXAMPLES": "ON", + "PARQUET_BUILD_EXECUTABLES": "ON", + "PARQUET_REQUIRE_ENCRYPTION": "ON" + } + }, + + { + "name": "ninja-debug-minimal", + "inherits": ["base-debug", "features-minimal"], + "displayName": "Debug build without anything enabled", + "cacheVariables": { + "ARROW_BUILD_INTEGRATION": "OFF", + "ARROW_BUILD_TESTS": "OFF" + } + }, + { + "name": "ninja-debug-basic", + "inherits": ["base-debug", "features-basic"], + "displayName": "Debug build with tests and reduced dependencies", + "cacheVariables": {} + }, + { + "name": "ninja-debug", + "inherits": ["base-debug", "features-main"], + "displayName": "Debug build with tests and more optional components", + "cacheVariables": {} + }, + { + "name": "ninja-debug-cuda", + "inherits": ["base-debug", "features-cuda"], + "displayName": "Debug build with tests and CUDA integration", + "cacheVariables": {} + }, + { + "name": "ninja-debug-filesystems", + "inherits": ["base-debug", "features-filesystems"], + "displayName": "Debug build with tests and filesystems", + "cacheVariables": {} + }, + { + "name": "ninja-debug-flight", + "inherits": ["base-debug", "features-flight"], + "displayName": "Debug build with tests and Flight", + "cacheVariables": {} + }, + { + "name": "ninja-debug-gandiva", + "inherits": ["base-debug", "features-gandiva"], + "displayName": "Debug build with tests and Gandiva", + "cacheVariables": {} + }, + { + "name": "ninja-debug-python", + "inherits": ["base-debug", "features-python"], + "displayName": "Debug build with tests and Python support", + "cacheVariables": {} + }, + { + "name": "ninja-debug-maximal", + "inherits": ["base-debug", "features-maximal"], + "displayName": "Debug build with everything enabled (except benchmarks and CUDA)", + "cacheVariables": {} + }, + + { + "name": "ninja-release-minimal", + "inherits": ["base-release", "features-minimal"], + "displayName": "Release build without anything enabled", + "cacheVariables": {} + }, + { + "name": "ninja-release-basic", + "inherits": ["base-release", "features-basic"], + "displayName": "Release build with reduced dependencies", + "cacheVariables": {} + }, + { + "name": "ninja-release", + "inherits": ["base-release", "features-main"], + "displayName": "Release build with more optional components", + "cacheVariables": {} + }, + { + "name": "ninja-release-cuda", + "inherits": ["base-release", "features-cuda"], + "displayName": "Release build with CUDA integration", + "cacheVariables": {} + }, + { + "name": "ninja-release-flight", + "inherits": ["base-release", "features-flight"], + "displayName": "Release build with Flight", + "cacheVariables": {} + }, + { + "name": 
"ninja-release-gandiva", + "inherits": ["base-release", "features-gandiva"], + "displayName": "Release build with Gandiva", + "cacheVariables": {} + }, + { + "name": "ninja-release-python", + "inherits": ["base-release", "features-python"], + "displayName": "Release build with Python support", + "cacheVariables": {} + }, + { + "name": "ninja-release-maximal", + "inherits": ["base-release", "features-maximal"], + "displayName": "Release build with everything enabled (except benchmarks and CUDA)", + "cacheVariables": {} + }, + + { + "name": "ninja-benchmarks-basic", + "inherits": ["base-benchmarks", "features-basic"], + "displayName": "Benchmarking build with reduced dependencies", + "cacheVariables": {} + }, + { + "name": "ninja-benchmarks", + "inherits": ["base-benchmarks", "features-main"], + "displayName": "Benchmarking build with more optional components", + "cacheVariables": {} + } + ] +} diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index f81a1b15779..f2ddff3997d 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -219,6 +219,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_DATASET "Build the Arrow Dataset Modules" OFF) + define_option(ARROW_ENGINE "Build the Arrow Execution Engine" OFF) + define_option(ARROW_FILESYSTEM "Build the Arrow Filesystem Layer" OFF) define_option(ARROW_FLIGHT @@ -371,6 +373,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_WITH_BACKTRACE "Build with backtrace support" ON) + define_option(ARROW_WITH_OPENTELEMETRY + "Build libraries with OpenTelemetry support for distributed tracing" OFF) + define_option(ARROW_WITH_BROTLI "Build with Brotli compression" OFF) define_option(ARROW_WITH_BZ2 "Build with BZ2 compression" OFF) define_option(ARROW_WITH_LZ4 "Build with lz4 compression" OFF) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 72bd1705e04..267024ce63b 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -63,6 +63,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES GTest LLVM Lz4 + opentelemetry-cpp ORC re2 Protobuf @@ -160,6 +161,8 @@ macro(build_dependency DEPENDENCY_NAME) build_gtest() elseif("${DEPENDENCY_NAME}" STREQUAL "Lz4") build_lz4() + elseif("${DEPENDENCY_NAME}" STREQUAL "opentelemetry-cpp") + build_opentelemetry() elseif("${DEPENDENCY_NAME}" STREQUAL "ORC") build_orc() elseif("${DEPENDENCY_NAME}" STREQUAL "Protobuf") @@ -269,6 +272,11 @@ if(PARQUET_REQUIRE_ENCRYPTION) set(ARROW_JSON ON) endif() +if(ARROW_WITH_OPENTELEMETRY) + set(ARROW_WITH_GRPC ON) + set(ARROW_WITH_PROTOBUF ON) +endif() + if(ARROW_THRIFT) set(ARROW_WITH_ZLIB ON) endif() @@ -286,7 +294,9 @@ endif() if(ARROW_FLIGHT) set(ARROW_WITH_GRPC ON) - # gRPC requires zlib +endif() + +if(ARROW_WITH_GRPC) set(ARROW_WITH_ZLIB ON) endif() @@ -546,6 +556,25 @@ else() ) endif() +if(DEFINED ENV{ARROW_OPENTELEMETRY_URL}) + set(OPENTELEMETRY_SOURCE_URL "$ENV{ARROW_OPENTELEMETRY_URL}") +else() + # TODO: add mirror + set_urls(OPENTELEMETRY_SOURCE_URL + "https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/${ARROW_OPENTELEMETRY_BUILD_VERSION}.tar.gz" + ) +endif() + +if(DEFINED ENV{ARROW_OPENTELEMETRY_PROTO_URL}) + set(OPENTELEMETRY_PROTO_SOURCE_URL "$ENV{ARROW_OPENTELEMETRY_PROTO_URL}") +else() + # TODO: add mirror + # N.B. 
upstream pins to particular commits, not tags + set_urls(OPENTELEMETRY_PROTO_SOURCE_URL + "https://github.com/open-telemetry/opentelemetry-proto/archive/${ARROW_OPENTELEMETRY_PROTO_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_PROTOBUF_URL}) set(PROTOBUF_SOURCE_URL "$ENV{ARROW_PROTOBUF_URL}") else() @@ -969,6 +998,23 @@ if(ARROW_BOOST_REQUIRED) include_directories(SYSTEM ${Boost_INCLUDE_DIR}) endif() +# ---------------------------------------------------------------------- +# cURL + +macro(find_curl) + if(NOT TARGET CURL::libcurl) + find_package(CURL REQUIRED) + if(NOT TARGET CURL::libcurl) + # For CMake 3.11 or older + add_library(CURL::libcurl UNKNOWN IMPORTED) + set_target_properties(CURL::libcurl + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${CURL_INCLUDE_DIRS}" IMPORTED_LOCATION + "${CURL_LIBRARIES}") + endif() + endif() +endmacro() + # ---------------------------------------------------------------------- # Snappy @@ -1931,8 +1977,7 @@ macro(build_benchmark) endmacro() if(ARROW_BUILD_BENCHMARKS) - # ArgsProduct() is available since 1.5.2 - set(BENCHMARK_REQUIRED_VERSION 1.5.2) + set(BENCHMARK_REQUIRED_VERSION 1.6.0) resolve_dependency(benchmark REQUIRED_VERSION ${BENCHMARK_REQUIRED_VERSION} @@ -3600,8 +3645,11 @@ macro(build_nlohmann_json_once) set(NLOHMANN_JSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/nlohmann_json_ep-install") set(NLOHMANN_JSON_INCLUDE_DIR "${NLOHMANN_JSON_PREFIX}/include") set(NLOHMANN_JSON_CMAKE_ARGS - ${EP_COMMON_CMAKE_ARGS} -DCMAKE_CXX_STANDARD=11 - "-DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>" -DBUILD_TESTING=OFF) + ${EP_COMMON_CMAKE_ARGS} + -DCMAKE_CXX_STANDARD=11 + "-DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>" + -DBUILD_TESTING=OFF + -DJSON_BuildTests=OFF) set(NLOHMANN_JSON_BUILD_BYPRODUCTS ${NLOHMANN_JSON_PREFIX}/include/nlohmann/json.hpp) @@ -3635,7 +3683,7 @@ macro(build_google_cloud_cpp_storage) # Curl is required on all platforms, but building it internally might also trip over S3's copy. # For now, force its inclusion from the underlying system or fail. - find_package(CURL 7.47.0 REQUIRED) + find_curl() find_package(OpenSSL ${ARROW_OPENSSL_REQUIRED_VERSION} REQUIRED) # Build google-cloud-cpp, with only storage_client @@ -3672,19 +3720,6 @@ macro(build_google_cloud_cpp_storage) add_dependencies(google_cloud_cpp_dependencies absl_ep) add_dependencies(google_cloud_cpp_dependencies crc32c_ep) add_dependencies(google_cloud_cpp_dependencies nlohmann_json_ep) - # Typically the steps to build the AWSSDK provide `CURL::libcurl`, but if that is - disabled we need to provide our own. 
- if(NOT TARGET CURL::libcurl) - find_package(CURL REQUIRED) - if(NOT TARGET CURL::libcurl) - # For CMake 3.11 or older - add_library(CURL::libcurl UNKNOWN IMPORTED) - set_target_properties(CURL::libcurl - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${CURL_INCLUDE_DIRS}" IMPORTED_LOCATION - "${CURL_LIBRARIES}") - endif() - endif() set(GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE "${GOOGLE_CLOUD_CPP_INSTALL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}google_cloud_cpp_storage${CMAKE_STATIC_LIBRARY_SUFFIX}" @@ -3865,6 +3900,204 @@ if(ARROW_ORC) message(STATUS "Found ORC headers: ${ORC_INCLUDE_DIR}") endif() +# ---------------------------------------------------------------------- +# OpenTelemetry C++ + +macro(build_opentelemetry) + message("Building OpenTelemetry from source") + + build_nlohmann_json_once() + find_curl() + + set(OPENTELEMETRY_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/opentelemetry_ep-install") + set(OPENTELEMETRY_INCLUDE_DIR "${OPENTELEMETRY_PREFIX}/include") + set(OPENTELEMETRY_STATIC_LIB + "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(_OPENTELEMETRY_APIS api ext sdk) + set(_OPENTELEMETRY_LIBS + common + http_client_curl + ostream_span_exporter + otlp_http_exporter + otlp_recordable + proto + resources + trace + version) + set(OPENTELEMETRY_BUILD_BYPRODUCTS) + set(OPENTELEMETRY_LIBRARIES) + + foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_APIS}) + add_library(opentelemetry-cpp::${_OPENTELEMETRY_LIB} INTERFACE IMPORTED) + set_target_properties(opentelemetry-cpp::${_OPENTELEMETRY_LIB} + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${OPENTELEMETRY_INCLUDE_DIR}") + endforeach() + foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) + if(_OPENTELEMETRY_LIB STREQUAL "http_client_curl") + set(_OPENTELEMETRY_STATIC_LIBRARY + "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${_OPENTELEMETRY_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + elseif(_OPENTELEMETRY_LIB STREQUAL "ostream_span_exporter") + set(_OPENTELEMETRY_STATIC_LIBRARY + "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_ostream_span${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + elseif(_OPENTELEMETRY_LIB STREQUAL "otlp_http_exporter") + set(_OPENTELEMETRY_STATIC_LIBRARY + "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_exporter_otlp_http${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + else() + set(_OPENTELEMETRY_STATIC_LIBRARY + "${OPENTELEMETRY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}opentelemetry_${_OPENTELEMETRY_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + endif() + add_library(opentelemetry-cpp::${_OPENTELEMETRY_LIB} STATIC IMPORTED) + set_target_properties(opentelemetry-cpp::${_OPENTELEMETRY_LIB} + PROPERTIES IMPORTED_LOCATION ${_OPENTELEMETRY_STATIC_LIBRARY}) + list(APPEND OPENTELEMETRY_BUILD_BYPRODUCTS ${_OPENTELEMETRY_STATIC_LIBRARY}) + list(APPEND OPENTELEMETRY_LIBRARIES opentelemetry-cpp::${_OPENTELEMETRY_LIB}) + endforeach() + + set(OPENTELEMETRY_CMAKE_ARGS + ${EP_COMMON_TOOLCHAIN} + "-DCMAKE_INSTALL_PREFIX=${OPENTELEMETRY_PREFIX}" + "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" + -DCMAKE_INSTALL_LIBDIR=lib + "-DCMAKE_CXX_FLAGS=${EP_CXX_FLAGS}" + -DBUILD_TESTING=OFF + -DWITH_EXAMPLES=OFF) + + set(OPENTELEMETRY_PREFIX_PATH_LIST) + # Don't specify the DEPENDS unless we actually have dependencies, else + # Ninja/other build systems may consider this target to always be dirty + set(_OPENTELEMETRY_DEPENDENCIES) + add_custom_target(opentelemetry_dependencies) + + set(_OPENTELEMETRY_DEPENDENCIES "opentelemetry_dependencies") + 
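+  # The imported opentelemetry-cpp::* targets declared above are folded into
+  # Arrow's bundled static libraries here; the WITH_OTLP/WITH_OTLP_HTTP flags
+  # enable the OTLP-over-HTTP exporter that cpp/CMakeLists.txt links through
+  # ARROW_LINK_LIBS when ARROW_WITH_OPENTELEMETRY is ON.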
list(APPEND ARROW_BUNDLED_STATIC_LIBS ${OPENTELEMETRY_LIBRARIES}) + list(APPEND OPENTELEMETRY_PREFIX_PATH_LIST ${NLOHMANN_JSON_PREFIX}) + list(APPEND + OPENTELEMETRY_CMAKE_ARGS + -DWITH_OTLP=ON + -DWITH_OTLP_HTTP=ON + -DWITH_OTLP_GRPC=OFF) + + # OpenTelemetry with OTLP enabled requires Protobuf definitions from a + # submodule. This submodule path is hardcoded into their CMake definitions, + # and submodules are not included in their releases. Add a custom build step + # to download and extract the Protobufs. + + # Adding such a step is rather complicated, so instead: create a separate + # ExternalProject that just fetches the Protobufs, then add a custom step + # to the main build to copy the Protobufs. + externalproject_add(opentelemetry_proto_ep + ${EP_LOG_OPTIONS} + URL_HASH "SHA256=${ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM}" + URL ${OPENTELEMETRY_PROTO_SOURCE_URL} + BUILD_COMMAND "" + CONFIGURE_COMMAND "" + INSTALL_COMMAND "" + EXCLUDE_FROM_ALL OFF) + + add_dependencies(opentelemetry_dependencies nlohmann_json_ep opentelemetry_proto_ep) + if(gRPC_SOURCE STREQUAL "BUNDLED") + # TODO: opentelemetry-cpp::proto doesn't declare a dependency on gRPC, so + # even if we provide the location of gRPC, it'll fail to compile. + message(FATAL_ERROR "ARROW_WITH_OPENTELEMETRY cannot be configured with gRPC_SOURCE=BUNDLED. " + "See https://github.com/open-telemetry/opentelemetry-cpp/issues/1045" + ) + endif() + + set(OPENTELEMETRY_PREFIX_PATH_LIST_SEP_CHAR "|") + # JOIN is CMake >=3.12 only + string(REPLACE ";" "${OPENTELEMETRY_PREFIX_PATH_LIST_SEP_CHAR}" + OPENTELEMETRY_PREFIX_PATH "${OPENTELEMETRY_PREFIX_PATH_LIST}") + list(APPEND OPENTELEMETRY_CMAKE_ARGS "-DCMAKE_PREFIX_PATH=${OPENTELEMETRY_PREFIX_PATH}") + + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "s390x") + # OpenTelemetry tries to determine the processor arch for vcpkg, which fails + # on s390x, even though it doesn't use vcpkg there. 
Tell it ARCH manually + externalproject_add(opentelemetry_ep + ${EP_LOG_OPTIONS} + URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}" + LIST_SEPARATOR ${OPENTELEMETRY_PREFIX_PATH_LIST_SEP_CHAR} + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ARCH=s390x + ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} + "<SOURCE_DIR>" + ${OPENTELEMETRY_CMAKE_ARGS} + BUILD_COMMAND ${CMAKE_COMMAND} --build "<BINARY_DIR>" --target all + INSTALL_COMMAND ${CMAKE_COMMAND} --build "<BINARY_DIR>" --target + install + URL ${OPENTELEMETRY_SOURCE_URL} + BUILD_BYPRODUCTS ${OPENTELEMETRY_BUILD_BYPRODUCTS} + EXCLUDE_FROM_ALL NOT + ${ARROW_WITH_OPENTELEMETRY} + DEPENDS ${_OPENTELEMETRY_DEPENDENCIES}) + else() + externalproject_add(opentelemetry_ep + ${EP_LOG_OPTIONS} + URL_HASH "SHA256=${ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM}" + LIST_SEPARATOR ${OPENTELEMETRY_PREFIX_PATH_LIST_SEP_CHAR} + CMAKE_ARGS ${OPENTELEMETRY_CMAKE_ARGS} + URL ${OPENTELEMETRY_SOURCE_URL} + BUILD_BYPRODUCTS ${OPENTELEMETRY_BUILD_BYPRODUCTS} + EXCLUDE_FROM_ALL NOT + ${ARROW_WITH_OPENTELEMETRY} + DEPENDS ${_OPENTELEMETRY_DEPENDENCIES}) + endif() + + externalproject_add_step(opentelemetry_ep download_proto + COMMAND ${CMAKE_COMMAND} -E copy_directory + $<TARGET_PROPERTY:opentelemetry_proto_ep,_EP_SOURCE_DIR>/opentelemetry + $<TARGET_PROPERTY:opentelemetry_ep,_EP_SOURCE_DIR>/third_party/opentelemetry-proto/opentelemetry + DEPENDEES download + DEPENDERS configure) + + add_dependencies(toolchain opentelemetry_ep) + add_dependencies(toolchain-tests opentelemetry_ep) + + set(OPENTELEMETRY_VENDORED 1) + + set_target_properties(opentelemetry-cpp::common + PROPERTIES INTERFACE_LINK_LIBRARIES + "opentelemetry-cpp::api;opentelemetry-cpp::sdk;Threads::Threads" + ) + set_target_properties(opentelemetry-cpp::resources + PROPERTIES INTERFACE_LINK_LIBRARIES "opentelemetry-cpp::common") + set_target_properties(opentelemetry-cpp::trace + PROPERTIES INTERFACE_LINK_LIBRARIES + "opentelemetry-cpp::common;opentelemetry-cpp::resources" + ) + set_target_properties(opentelemetry-cpp::http_client_curl + PROPERTIES INTERFACE_LINK_LIBRARIES + "opentelemetry-cpp::ext;CURL::libcurl") + set_target_properties(opentelemetry-cpp::proto + PROPERTIES INTERFACE_LINK_LIBRARIES + "${ARROW_PROTOBUF_LIBPROTOBUF}") + set_target_properties(opentelemetry-cpp::otlp_recordable + PROPERTIES INTERFACE_LINK_LIBRARIES + "opentelemetry-cpp::trace;opentelemetry-cpp::resources;opentelemetry-cpp::proto" + ) + set_target_properties(opentelemetry-cpp::otlp_http_exporter + PROPERTIES INTERFACE_LINK_LIBRARIES + "opentelemetry-cpp::otlp_recordable;opentelemetry-cpp::http_client_curl;nlohmann_json::nlohmann_json" + ) + + foreach(_OPENTELEMETRY_LIB ${_OPENTELEMETRY_LIBS}) + add_dependencies(opentelemetry-cpp::${_OPENTELEMETRY_LIB} opentelemetry_ep) + endforeach() +endmacro() + +if(ARROW_WITH_OPENTELEMETRY) + set(opentelemetry-cpp_SOURCE "AUTO") + resolve_dependency(opentelemetry-cpp) + get_target_property(OPENTELEMETRY_INCLUDE_DIR opentelemetry-cpp::api + INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${OPENTELEMETRY_INCLUDE_DIR}) + message(STATUS "Found OpenTelemetry headers: ${OPENTELEMETRY_INCLUDE_DIR}") +endif() + # ---------------------------------------------------------------------- # AWS SDK for C++ @@ -4000,15 +4233,7 @@ macro(build_awssdk) set(AWSSDK_LINK_LIBRARIES ${AWSSDK_LIBRARIES}) if(UNIX) # on Linux and macOS curl seems to be required - find_package(CURL REQUIRED) - if(NOT TARGET CURL::libcurl) - # For CMake 3.11 or older - add_library(CURL::libcurl UNKNOWN IMPORTED) - set_target_properties(CURL::libcurl - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${CURL_INCLUDE_DIRS}" IMPORTED_LOCATION - "${CURL_LIBRARIES}") - 
endif() + find_curl() set_property(TARGET aws-cpp-sdk-core APPEND PROPERTY INTERFACE_LINK_LIBRARIES CURL::libcurl) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 8d9cbb32300..5736c557bd0 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -287,6 +287,10 @@ if(ARROW_WITH_LZ4) list(APPEND ARROW_SRCS util/compression_lz4.cc) endif() +if(ARROW_WITH_OPENTELEMETRY) + list(APPEND ARROW_SRCS util/tracing_internal.cc) +endif() + if(ARROW_WITH_SNAPPY) add_definitions(-DARROW_WITH_SNAPPY) list(APPEND ARROW_SRCS util/compression_snappy.cc) diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index 2add572e7a4..b6b769cf033 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -57,7 +57,7 @@ class ARROW_EXPORT Array { /// \brief Return true if value at index is null. Does not boundscheck bool IsNull(int64_t i) const { return null_bitmap_data_ != NULLPTR - ? !BitUtil::GetBit(null_bitmap_data_, i + data_->offset) + ? !bit_util::GetBit(null_bitmap_data_, i + data_->offset) : data_->null_count == data_->length; } @@ -65,7 +65,7 @@ class ARROW_EXPORT Array { /// boundscheck bool IsValid(int64_t i) const { return null_bitmap_data_ != NULLPTR - ? BitUtil::GetBit(null_bitmap_data_, i + data_->offset) + ? bit_util::GetBit(null_bitmap_data_, i + data_->offset) : data_->null_count != data_->length; } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 19d3d65a464..04ee804987f 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -75,6 +75,10 @@ class BaseBinaryArray : public FlatArray { raw_value_offsets_[i + 1] - pos); } + util::optional<util::string_view> operator[](int64_t i) const { + return *IteratorType(*this, i); + } + /// \brief Get binary value as a string_view /// Provided for consistency with other arrays. 
/// @@ -236,6 +240,10 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { return util::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width()); } + util::optional<util::string_view> operator[](int64_t i) const { + return *IteratorType(*this, i); + } + std::string GetString(int64_t i) const { return std::string(GetView(i)); } int32_t byte_width() const { return byte_width_; } diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 7840c60f897..e9f86df3fc4 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -106,6 +106,18 @@ class TestStringArray : public ::testing::Test { AssertZeroPadded(*strings_); } + void TestArrayIndexOperator() { + const auto& arr = *strings_; + for (int64_t i = 0; i < arr.length(); ++i) { + if (valid_bytes_[i]) { + ASSERT_TRUE(arr[i].has_value()); + ASSERT_EQ(expected_[i], arr[i].value()); + } else { + ASSERT_FALSE(arr[i].has_value()); + } + } + } + void TestArrayCtors() { // ARROW-8863: ArrayData::null_count set to 0 when no validity bitmap // provided @@ -328,6 +340,8 @@ TYPED_TEST_SUITE(TestStringArray, BaseBinaryArrowTypes); TYPED_TEST(TestStringArray, TestArrayBasics) { this->TestArrayBasics(); } +TYPED_TEST(TestStringArray, TestArrayIndexOperator) { this->TestArrayIndexOperator(); } + TYPED_TEST(TestStringArray, TestArrayCtors) { this->TestArrayCtors(); } TYPED_TEST(TestStringArray, TestType) { this->TestType(); } @@ -595,7 +609,7 @@ class TestStringBuilder : public TestBuilder { int reps = 15; int64_t length = 0; int64_t capacity = 1000; - int64_t expected_capacity = BitUtil::RoundUpToMultipleOf64(capacity); + int64_t expected_capacity = bit_util::RoundUpToMultipleOf64(capacity); ASSERT_OK(builder_->ReserveData(capacity)); @@ -613,7 +627,7 @@ class TestStringBuilder : public TestBuilder { } int extra_capacity = 500; - expected_capacity = BitUtil::RoundUpToMultipleOf64(length + extra_capacity); + expected_capacity = bit_util::RoundUpToMultipleOf64(length + extra_capacity); ASSERT_OK(builder_->ReserveData(extra_capacity)); diff --git a/cpp/src/arrow/array/array_dict_test.cc b/cpp/src/arrow/array/array_dict_test.cc index d6f7f3c86f5..75f5205a10f 100644 --- a/cpp/src/arrow/array/array_dict_test.cc +++ b/cpp/src/arrow/array/array_dict_test.cc @@ -1141,7 +1141,7 @@ TEST(TestDictionary, FromArrays) { if (checked_cast<const IntegerType&>(*index_ty).is_signed()) { // Invalid index is masked by null, so it's OK auto indices3 = ArrayFromJSON(index_ty, "[1, 2, -1, null, 2, 0]"); - BitUtil::ClearBit(indices3->data()->buffers[0]->mutable_data(), 2); + bit_util::ClearBit(indices3->data()->buffers[0]->mutable_data(), 2); ASSERT_OK_AND_ASSIGN(auto arr3, DictionaryArray::FromArrays(dict_type, indices3, dict)); } diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 34887ad26fc..e1aaa310814 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -245,7 +245,7 @@ class TestListArray : public TestBuilder { AssertArraysEqual(expected3, *list3); // Check that the last offset bit is zero - ASSERT_FALSE(BitUtil::GetBit(list3->null_bitmap()->data(), length + 1)); + ASSERT_FALSE(bit_util::GetBit(list3->null_bitmap()->data(), length + 1)); ArrayType expected4(list_type, length, offsets2->data()->buffers[1], values, offsets4->data()->buffers[0], 1); @@ -425,9 +425,9 @@ class TestListArray : public TestBuilder { ASSERT_EQ(2, array_data->buffers.size()); auto null_bitmap_buffer = array_data->buffers[0]; ASSERT_NE(nullptr, 
diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc
index 34887ad26fc..e1aaa310814 100644
--- a/cpp/src/arrow/array/array_list_test.cc
+++ b/cpp/src/arrow/array/array_list_test.cc
@@ -245,7 +245,7 @@ class TestListArray : public TestBuilder {
     AssertArraysEqual(expected3, *list3);
 
     // Check that the last offset bit is zero
-    ASSERT_FALSE(BitUtil::GetBit(list3->null_bitmap()->data(), length + 1));
+    ASSERT_FALSE(bit_util::GetBit(list3->null_bitmap()->data(), length + 1));
 
     ArrayType expected4(list_type, length, offsets2->data()->buffers[1], values,
                         offsets4->data()->buffers[0], 1);
@@ -425,9 +425,9 @@ class TestListArray : public TestBuilder {
     ASSERT_EQ(2, array_data->buffers.size());
     auto null_bitmap_buffer = array_data->buffers[0];
     ASSERT_NE(nullptr, null_bitmap_buffer);
-    BitUtil::ClearBit(null_bitmap_buffer->mutable_data(), 1);
-    BitUtil::ClearBit(null_bitmap_buffer->mutable_data(), 3);
-    BitUtil::ClearBit(null_bitmap_buffer->mutable_data(), 4);
+    bit_util::ClearBit(null_bitmap_buffer->mutable_data(), 1);
+    bit_util::ClearBit(null_bitmap_buffer->mutable_data(), 3);
+    bit_util::ClearBit(null_bitmap_buffer->mutable_data(), 4);
     array_data->null_count += 3;
     auto list_array = std::dynamic_pointer_cast<ArrayType>(MakeArray(array_data));
     ASSERT_OK(list_array->ValidateFull());
@@ -811,7 +811,7 @@ TEST_F(TestMapArray, FromArrays) {
   AssertArraysEqual(expected3, *map3);
 
   // Check that the last offset bit is zero
-  ASSERT_FALSE(BitUtil::GetBit(map3->null_bitmap()->data(), length + 1));
+  ASSERT_FALSE(bit_util::GetBit(map3->null_bitmap()->data(), length + 1));
 
   MapArray expected4(map_type, length, offsets2->data()->buffers[1], keys, items,
                      offsets4->data()->buffers[0], 1);
diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc
index a3c1fab054e..c80273e9adf 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -75,7 +75,7 @@ Status CleanListOffsets(const Array& offsets, MemoryPool* pool,
     // we have N + 1 offsets)
     ARROW_ASSIGN_OR_RAISE(
         auto clean_valid_bits,
-        offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1)));
+        offsets.null_bitmap()->CopySlice(0, bit_util::BytesForBits(num_offsets - 1)));
     *validity_buf_out = clean_valid_bits;
 
     const offset_type* raw_offsets = typed_offsets.raw_values();
diff --git a/cpp/src/arrow/array/array_primitive.h b/cpp/src/arrow/array/array_primitive.h
index b5385f96514..740a4806a4d 100644
--- a/cpp/src/arrow/array/array_primitive.h
+++ b/cpp/src/arrow/array/array_primitive.h
@@ -48,12 +48,14 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
                int64_t null_count = kUnknownNullCount, int64_t offset = 0);
 
   bool Value(int64_t i) const {
-    return BitUtil::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
-                           i + data_->offset);
+    return bit_util::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
+                            i + data_->offset);
   }
 
   bool GetView(int64_t i) const { return Value(i); }
 
+  util::optional<bool> operator[](int64_t i) const { return *IteratorType(*this, i); }
+
   /// \brief Return the number of false (0) values among the valid
   /// values. Result is not cached.
   int64_t false_count() const;
@@ -109,6 +111,10 @@ class NumericArray : public PrimitiveArray {
   // For API compatibility with BinaryArray etc.
   value_type GetView(int64_t i) const { return Value(i); }
 
+  util::optional<value_type> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
   IteratorType begin() const { return IteratorType(*this); }
 
   IteratorType end() const { return IteratorType(*this, length()); }
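Since NumericArray already exposed begin()/end(), the new operator[] gives both access styles the same util::optional view of a slot. A sketch, again leaning on test-support ArrayFromJSON only for brevity:

    #include <cassert>
    #include "arrow/api.h"
    #include "arrow/testing/gtest_util.h"

    void SumValidSketch() {
      auto arr = arrow::ArrayFromJSON(arrow::int32(), "[1, null, 3]");
      const auto& ints = static_cast<const arrow::Int32Array&>(*arr);
      int64_t sum = 0;
      for (arrow::util::optional<int32_t> v : ints) {  // iterator yields optionals
        if (v.has_value()) sum += *v;
      }
      assert(sum == 4);
      assert(!ints[1].has_value());  // operator[] agrees with IsNull(1)
    }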
@@ -123,6 +129,7 @@ class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
  public:
   using TypeClass = DayTimeIntervalType;
+  using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;
 
   explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);
 
@@ -141,6 +148,14 @@ class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
   // For compatibility with Take kernel.
   TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }
 
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+  util::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
   int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); }
 
   const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
@@ -150,6 +165,7 @@ class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
  public:
   using TypeClass = MonthDayNanoIntervalType;
+  using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;
 
   explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);
 
@@ -168,6 +184,14 @@ class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
   // For compatibility with Take kernel.
   TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }
 
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+  util::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
   int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); }
 
   const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index efe600f1223..059757d1049 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -1050,7 +1050,7 @@ void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr
       ASSERT_FALSE(result->IsNull(i));
     }
     if (!result->IsNull(i)) {
-      bool actual = BitUtil::GetBit(result->values()->data(), i);
+      bool actual = bit_util::GetBit(result->values()->data(), i);
       ASSERT_EQ(draws_[i] != 0, actual) << i;
     }
   }
@@ -1058,11 +1058,11 @@ void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr
 
   // buffers are correctly sized
   if (result->data()->buffers[0]) {
-    ASSERT_EQ(result->data()->buffers[0]->size(), BitUtil::BytesForBits(size));
+    ASSERT_EQ(result->data()->buffers[0]->size(), bit_util::BytesForBits(size));
   } else {
     ASSERT_EQ(result->data()->null_count, 0);
   }
-  ASSERT_EQ(result->data()->buffers[1]->size(), BitUtil::BytesForBits(size));
+  ASSERT_EQ(result->data()->buffers[1]->size(), bit_util::BytesForBits(size));
 
   // Builder is now reset
   ASSERT_EQ(0, builder->length());
@@ -2162,6 +2162,16 @@ TEST_F(TestFWBinaryArray, ArrayDataVisitorSliced) {
   ARROW_UNUSED(visitor);  // Workaround weird MSVC warning
 }
 
+TEST_F(TestFWBinaryArray, ArrayIndexOperator) {
+  auto type = fixed_size_binary(3);
+  auto arr = ArrayFromJSON(type, R"(["abc", null, "def"])");
+  auto fsba = checked_pointer_cast<FixedSizeBinaryArray>(arr);
+
+  ASSERT_EQ("abc", (*fsba)[0].value());
+  ASSERT_EQ(util::nullopt, (*fsba)[1]);
+  ASSERT_EQ("def", (*fsba)[2].value());
+}
+
 // ----------------------------------------------------------------------
 // AdaptiveInt tests
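The two interval array types gain the same iterator plumbing. A sketch of walking a DayTimeIntervalArray through it; the DayTimeIntervalBuilder construction is just one convenient way to get an input and is not part of this diff:

    #include <memory>
    #include "arrow/api.h"

    arrow::Status IntervalIterationSketch() {
      arrow::DayTimeIntervalBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Append({/*days=*/1, /*milliseconds=*/500}));
      ARROW_RETURN_NOT_OK(builder.AppendNull());
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> arr, builder.Finish());
      const auto& intervals = static_cast<const arrow::DayTimeIntervalArray&>(*arr);
      for (auto v : intervals) {  // util::optional<DayMilliseconds>
        if (v.has_value()) {
          // first slot: v->days == 1, v->milliseconds == 500
        }
      }
      return arrow::Status::OK();
    }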
@@ -3348,4 +3358,61 @@ TEST(TestSwapEndianArrayData, InvalidLength) {
   }
 }
 
+template <typename PType>
+class TestPrimitiveArray : public ::testing::Test {
+ public:
+  using ElementType = typename PType::T;
+
+  void SetUp() {
+    pool_ = default_memory_pool();
+    GenerateInput();
+  }
+
+  void GenerateInput() {
+    validity_ = std::vector<bool>{true, false, true, true, false, true};
+    values_ = std::vector<ElementType>{0, 1, 1, 0, 1, 1};
+  }
+
+ protected:
+  MemoryPool* pool_;
+  std::vector<bool> validity_;
+  std::vector<ElementType> values_;
+};
+
+template <>
+void TestPrimitiveArray<PDayTimeInterval>::GenerateInput() {
+  validity_ = std::vector<bool>{true, false};
+  values_ = std::vector<DayTimeIntervalType::DayMilliseconds>{{0, 10}, {1, 0}};
+}
+
+template <>
+void TestPrimitiveArray<PMonthDayNanoInterval>::GenerateInput() {
+  validity_ = std::vector<bool>{false, true};
+  values_ =
+      std::vector<MonthDayNanoIntervalType::MonthDayNanos>{{0, 10, 100}, {1, 0, 10}};
+}
+
+TYPED_TEST_SUITE(TestPrimitiveArray, Primitives);
+
+TYPED_TEST(TestPrimitiveArray, IndexOperator) {
+  typename TypeParam::BuilderType builder;
+  ASSERT_OK(builder.Reserve(this->values_.size()));
+  ASSERT_OK(builder.AppendValues(this->values_, this->validity_));
+  ASSERT_OK_AND_ASSIGN(auto array, builder.Finish());
+
+  const auto& carray = checked_cast<const typename TypeParam::ArrayType&>(*array);
+
+  ASSERT_EQ(this->values_.size(), carray.length());
+  for (int64_t i = 0; i < carray.length(); ++i) {
+    auto res = carray[i];
+    if (this->validity_[i]) {
+      ASSERT_TRUE(res.has_value());
+      ASSERT_EQ(this->values_[i], res.value());
+    } else {
+      ASSERT_FALSE(res.has_value());
+      ASSERT_EQ(res, util::nullopt);
+    }
+  }
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index 6ca65113f1c..54a30aae1ed 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -280,7 +280,7 @@ class BaseBinaryBuilder : public ArrayBuilder {
     auto offsets = array.GetValues<offset_type>(1);
     auto data = array.GetValues<uint8_t>(2, 0);
     for (int64_t i = 0; i < length; i++) {
-      if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) {
+      if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
         const offset_type start = offsets[offset + i];
         const offset_type end = offsets[offset + i + 1];
         ARROW_RETURN_NOT_OK(Append(data + start, end - start));
diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h
index e53b758efa3..9e3a4458d8d 100644
--- a/cpp/src/arrow/array/builder_nested.h
+++ b/cpp/src/arrow/array/builder_nested.h
@@ -127,7 +127,7 @@ class BaseListBuilder : public ArrayBuilder {
     const offset_type* offsets = array.GetValues<offset_type>(1);
     const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
     for (int64_t row = offset; row < offset + length; row++) {
-      if (!validity || BitUtil::GetBit(validity, array.offset + row)) {
+      if (!validity || bit_util::GetBit(validity, array.offset + row)) {
         ARROW_RETURN_NOT_OK(Append());
         int64_t slot_length = offsets[row + 1] - offsets[row];
         ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0],
@@ -297,7 +297,7 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
     const int32_t* offsets = array.GetValues<int32_t>(1);
     const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR;
     for (int64_t row = offset; row < offset + length; row++) {
-      if (!validity || BitUtil::GetBit(validity, array.offset + row)) {
+      if (!validity || bit_util::GetBit(validity, array.offset + row)) {
         ARROW_RETURN_NOT_OK(Append());
         const int64_t slot_length = offsets[row + 1] - offsets[row];
         ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
@@ -413,7 +413,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
   Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final {
     const uint8_t* validity = array.MayHaveNulls() ?
array.buffers[0]->data() : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { - if (!validity || BitUtil::GetBit(validity, array.offset + row)) { + if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( *array.child_data[0], list_size_ * (array.offset + row), list_size_)); ARROW_RETURN_NOT_OK(Append()); diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 54a75f06c90..0631fe6915d 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -83,7 +83,7 @@ Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, int64_t bitmap_offset = 0; for (auto bitmap : bitmaps) { if (bitmap.AllSet()) { - BitUtil::SetBitsTo(dst, bitmap_offset, bitmap.range.length, true); + bit_util::SetBitsTo(dst, bitmap_offset, bitmap.range.length, true); } else { internal::CopyBitmap(bitmap.data, bitmap.range.offset, bitmap.range.length, dst, bitmap_offset); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 305910c247c..218a40ea066 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -77,7 +77,7 @@ class ConcatenateTest : public ::testing::Test { } void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { - if (auto preceding_bits = BitUtil::kPrecedingBitmask[length % 8]) { + if (auto preceding_bits = bit_util::kPrecedingBitmask[length % 8]) { auto last_byte = bitmap->data()[length / 8]; ASSERT_EQ(static_cast(last_byte & preceding_bits), last_byte) << length << " " << int(preceding_bits); diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 0a50de0f1f1..32a95450374 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -290,7 +290,7 @@ class QuadraticSpaceMyersDiff { for (int64_t i = edit_count_; i > 0; --i) { bool insert = insert_[index]; - BitUtil::SetBitTo(insert_buf->mutable_data(), i, insert); + bit_util::SetBitTo(insert_buf->mutable_data(), i, insert); auto insertions_minus_deletions = (endpoint.base - base_begin_) - (endpoint.target - target_begin_); @@ -308,7 +308,7 @@ class QuadraticSpaceMyersDiff { endpoint = previous; } - BitUtil::SetBitTo(insert_buf->mutable_data(), 0, false); + bit_util::SetBitTo(insert_buf->mutable_data(), 0, false); run_length[0] = endpoint.base - base_begin_; return StructArray::Make( diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 2045b8f5c71..d5033e7ff0c 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -119,7 +119,7 @@ class ArrayDataEndianSwapper { // NOTE: data_->length not trusted (see warning above) int64_t length = in_buffer->size() / sizeof(T); for (int64_t i = 0; i < length; i++) { - out_data[i] = BitUtil::ByteSwap(in_data[i]); + out_data[i] = bit_util::ByteSwap(in_data[i]); } return std::move(out_buffer); } @@ -158,12 +158,12 @@ class ArrayDataEndianSwapper { uint64_t tmp; auto idx = i * 2; #if ARROW_LITTLE_ENDIAN - tmp = BitUtil::FromBigEndian(data[idx]); - new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]); + tmp = bit_util::FromBigEndian(data[idx]); + new_data[idx] = bit_util::FromBigEndian(data[idx + 1]); new_data[idx + 1] = tmp; #else - tmp = BitUtil::FromLittleEndian(data[idx]); - new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]); + tmp = bit_util::FromLittleEndian(data[idx]); + new_data[idx] = bit_util::FromLittleEndian(data[idx + 1]); new_data[idx + 1] = tmp; #endif } @@ 
-181,18 +181,18 @@ class ArrayDataEndianSwapper { uint64_t tmp0, tmp1, tmp2; auto idx = i * 4; #if ARROW_LITTLE_ENDIAN - tmp0 = BitUtil::FromBigEndian(data[idx]); - tmp1 = BitUtil::FromBigEndian(data[idx + 1]); - tmp2 = BitUtil::FromBigEndian(data[idx + 2]); - new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]); + tmp0 = bit_util::FromBigEndian(data[idx]); + tmp1 = bit_util::FromBigEndian(data[idx + 1]); + tmp2 = bit_util::FromBigEndian(data[idx + 2]); + new_data[idx] = bit_util::FromBigEndian(data[idx + 3]); new_data[idx + 1] = tmp2; new_data[idx + 2] = tmp1; new_data[idx + 3] = tmp0; #else - tmp0 = BitUtil::FromLittleEndian(data[idx]); - tmp1 = BitUtil::FromLittleEndian(data[idx + 1]); - tmp2 = BitUtil::FromLittleEndian(data[idx + 2]); - new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]); + tmp0 = bit_util::FromLittleEndian(data[idx]); + tmp1 = bit_util::FromLittleEndian(data[idx + 1]); + tmp2 = bit_util::FromLittleEndian(data[idx + 2]); + new_data[idx] = bit_util::FromLittleEndian(data[idx + 3]); new_data[idx + 1] = tmp2; new_data[idx + 2] = tmp1; new_data[idx + 3] = tmp0; @@ -217,13 +217,13 @@ class ArrayDataEndianSwapper { for (int64_t i = 0; i < length; i++) { MonthDayNanos tmp = data[i]; #if ARROW_LITTLE_ENDIAN - tmp.months = BitUtil::FromBigEndian(tmp.months); - tmp.days = BitUtil::FromBigEndian(tmp.days); - tmp.nanoseconds = BitUtil::FromBigEndian(tmp.nanoseconds); + tmp.months = bit_util::FromBigEndian(tmp.months); + tmp.days = bit_util::FromBigEndian(tmp.days); + tmp.nanoseconds = bit_util::FromBigEndian(tmp.nanoseconds); #else - tmp.months = BitUtil::FromLittleEndian(tmp.months); - tmp.days = BitUtil::FromLittleEndian(tmp.days); - tmp.nanoseconds = BitUtil::FromLittleEndian(tmp.nanoseconds); + tmp.months = bit_util::FromLittleEndian(tmp.months); + tmp.days = bit_util::FromLittleEndian(tmp.days); + tmp.nanoseconds = bit_util::FromLittleEndian(tmp.nanoseconds); #endif new_data[i] = tmp; } @@ -324,7 +324,7 @@ class NullArrayFactory { public: struct GetBufferLength { GetBufferLength(const std::shared_ptr& type, int64_t length) - : type_(*type), length_(length), buffer_length_(BitUtil::BytesForBits(length)) {} + : type_(*type), length_(length), buffer_length_(bit_util::BytesForBits(length)) {} Result Finish() && { RETURN_NOT_OK(VisitTypeInline(type_, this)); @@ -550,8 +550,8 @@ class RepeatedArrayFactory { Status Visit(const BooleanType&) { ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBitmap(length_, pool_)); - BitUtil::SetBitsTo(buffer->mutable_data(), 0, length_, - checked_cast(scalar_).value); + bit_util::SetBitsTo(buffer->mutable_data(), 0, length_, + checked_cast(scalar_).value); out_ = std::make_shared(length_, buffer); return Status::OK(); } diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 52fcad5e7eb..7c4ab6d5248 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -385,7 +385,7 @@ struct ValidateArrayImpl { int64_t min_buffer_size = -1; switch (spec.kind) { case DataTypeLayout::BITMAP: - min_buffer_size = BitUtil::BytesForBits(length_plus_offset); + min_buffer_size = bit_util::BytesForBits(length_plus_offset); break; case DataTypeLayout::FIXED_WIDTH: if (MultiplyWithOverflow(length_plus_offset, spec.byte_width, diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index b1b2945d0f5..e663ff9dca8 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -171,7 +171,7 @@ MutableBuffer::MutableBuffer(const std::shared_ptr& parent, const int64_ } Result> AllocateBitmap(int64_t length, 
MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(BitUtil::BytesForBits(length), pool)); + ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(bit_util::BytesForBits(length), pool)); // Zero out any trailing bits if (buf->size() > 0) { buf->mutable_data()[buf->size() - 1] = 0; @@ -180,7 +180,7 @@ Result> AllocateBitmap(int64_t length, MemoryPool* pool) } Result> AllocateEmptyBitmap(int64_t length, MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(BitUtil::BytesForBits(length), pool)); + ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(bit_util::BytesForBits(length), pool)); memset(buf->mutable_data(), 0, static_cast(buf->size())); return std::move(buf); } diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 7b02ad09a82..d92a01a16eb 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -333,7 +333,7 @@ class TypedBufferBuilder { } void UnsafeAppend(bool value) { - BitUtil::SetBitTo(mutable_data(), bit_length_, value); + bit_util::SetBitTo(mutable_data(), bit_length_, value); if (!value) { ++false_count_; } @@ -361,7 +361,7 @@ class TypedBufferBuilder { } void UnsafeAppend(const int64_t num_copies, bool value) { - BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value); + bit_util::SetBitsTo(mutable_data(), bit_length_, num_copies, value); false_count_ += num_copies * !value; bit_length_ += num_copies; } @@ -386,7 +386,7 @@ class TypedBufferBuilder { Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { const int64_t old_byte_capacity = bytes_builder_.capacity(); ARROW_RETURN_NOT_OK( - bytes_builder_.Resize(BitUtil::BytesForBits(new_capacity), shrink_to_fit)); + bytes_builder_.Resize(bit_util::BytesForBits(new_capacity), shrink_to_fit)); // Resize() may have chosen a larger capacity (e.g. for padding), // so ask it again before calling memset(). const int64_t new_byte_capacity = bytes_builder_.capacity(); @@ -414,7 +414,7 @@ class TypedBufferBuilder { Status Finish(std::shared_ptr* out, bool shrink_to_fit = true) { // set bytes_builder_.size_ == byte size of data - bytes_builder_.UnsafeAdvance(BitUtil::BytesForBits(bit_length_) - + bytes_builder_.UnsafeAdvance(bit_util::BytesForBits(bit_length_) - bytes_builder_.length()); bit_length_ = false_count_ = 0; return bytes_builder_.Finish(out, shrink_to_fit); @@ -433,7 +433,7 @@ class TypedBufferBuilder { /// only for memory allocation). 
Result> FinishWithLength(int64_t final_length, bool shrink_to_fit = true) { - const auto final_byte_length = BitUtil::BytesForBits(final_length); + const auto final_byte_length = bit_util::BytesForBits(final_length); bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length()); bit_length_ = false_count_ = 0; return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit); diff --git a/cpp/src/arrow/buffer_test.cc b/cpp/src/arrow/buffer_test.cc index 4295d4ca692..a82c90be55c 100644 --- a/cpp/src/arrow/buffer_test.cc +++ b/cpp/src/arrow/buffer_test.cc @@ -797,12 +797,12 @@ TEST(TestBoolBufferBuilder, Basics) { ASSERT_OK(builder.Finish(&built)); AssertIsCPUBuffer(*built); - ASSERT_EQ(BitUtil::GetBit(built->data(), 0), false); + ASSERT_EQ(bit_util::GetBit(built->data(), 0), false); for (int i = 0; i != nvalues; ++i) { - ASSERT_EQ(BitUtil::GetBit(built->data(), i + 1), static_cast(values[i])); + ASSERT_EQ(bit_util::GetBit(built->data(), i + 1), static_cast(values[i])); } - ASSERT_EQ(built->size(), BitUtil::BytesForBits(nvalues + 1)); + ASSERT_EQ(built->size(), bit_util::BytesForBits(nvalues + 1)); } TEST(TestBoolBufferBuilder, AppendCopies) { @@ -819,10 +819,10 @@ TEST(TestBoolBufferBuilder, AppendCopies) { AssertIsCPUBuffer(*built); for (int i = 0; i != 13 + 17; ++i) { - EXPECT_EQ(BitUtil::GetBit(built->data(), i), i < 13) << "index = " << i; + EXPECT_EQ(bit_util::GetBit(built->data(), i), i < 13) << "index = " << i; } - ASSERT_EQ(built->size(), BitUtil::BytesForBits(13 + 17)); + ASSERT_EQ(built->size(), bit_util::BytesForBits(13 + 17)); } TEST(TestBoolBufferBuilder, Reserve) { @@ -837,7 +837,7 @@ TEST(TestBoolBufferBuilder, Reserve) { ASSERT_OK_AND_ASSIGN(auto built, builder.Finish()); AssertIsCPUBuffer(*built); - ASSERT_EQ(built->size(), BitUtil::BytesForBits(13 + 17)); + ASSERT_EQ(built->size(), bit_util::BytesForBits(13 + 17)); } template diff --git a/cpp/src/arrow/builder_benchmark.cc b/cpp/src/arrow/builder_benchmark.cc index d0edb4b2d08..c131f813927 100644 --- a/cpp/src/arrow/builder_benchmark.cc +++ b/cpp/src/arrow/builder_benchmark.cc @@ -227,7 +227,7 @@ static std::vector MakeSimilarIntDictFodder() { std::uniform_int_distribution values_dist(0, kDistinctElements - 1); auto max_int = std::numeric_limits::max(); auto multiplier = - static_cast(BitUtil::NextPower2(max_int / kDistinctElements / 2)); + static_cast(bit_util::NextPower2(max_int / kDistinctElements / 2)); std::generate(values.begin(), values.end(), [&]() { return multiplier * values_dist(gen); }); } diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index e5bfad81027..f4137867313 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -1455,7 +1455,7 @@ struct ArrayImporter { RETURN_NOT_OK(CheckNumBuffers(2)); RETURN_NOT_OK(AllocateArrayData()); RETURN_NOT_OK(ImportNullBitmap()); - if (BitUtil::IsMultipleOf8(type.bit_width())) { + if (bit_util::IsMultipleOf8(type.bit_width())) { RETURN_NOT_OK(ImportFixedSizeBuffer(1, type.bit_width() / 8)); } else { DCHECK_EQ(type.bit_width(), 1); @@ -1538,7 +1538,7 @@ struct ArrayImporter { Status ImportBitsBuffer(int32_t buffer_id) { // Compute visible size of buffer - int64_t buffer_size = BitUtil::BytesForBits(c_struct_->length + c_struct_->offset); + int64_t buffer_size = bit_util::BytesForBits(c_struct_->length + c_struct_->offset); return ImportBuffer(buffer_id, buffer_size); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 4ecb00a3f08..d4b94e53838 100644 --- a/cpp/src/arrow/compare.cc +++ 
b/cpp/src/arrow/compare.cc
@@ -202,8 +202,8 @@ class RangeDataEqualsImpl {
       if (length <= 8) {
         // Avoid the BitmapUInt64Reader overhead for very small runs
         for (int64_t j = i; j < i + length; ++j) {
-          if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
-              BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+          if (bit_util::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+              bit_util::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
             return false;
           }
         }
diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc
index d80398a24cd..24a5a1ee77e 100644
--- a/cpp/src/arrow/compute/api_scalar.cc
+++ b/cpp/src/arrow/compute/api_scalar.cc
@@ -173,6 +173,30 @@ struct EnumTraits
     return "";
   }
 };
+
+template <>
+struct EnumTraits<compute::Utf8NormalizeOptions::Form>
+    : BasicEnumTraits<compute::Utf8NormalizeOptions::Form,
+                      compute::Utf8NormalizeOptions::Form::NFC,
+                      compute::Utf8NormalizeOptions::Form::NFKC,
+                      compute::Utf8NormalizeOptions::Form::NFD,
+                      compute::Utf8NormalizeOptions::Form::NFKD> {
+  static std::string name() { return "Utf8NormalizeOptions::Form"; }
+  static std::string value_name(compute::Utf8NormalizeOptions::Form value) {
+    switch (value) {
+      case compute::Utf8NormalizeOptions::Form::NFC:
+        return "NFC";
+      case compute::Utf8NormalizeOptions::Form::NFKC:
+        return "NFKC";
+      case compute::Utf8NormalizeOptions::Form::NFD:
+        return "NFD";
+      case compute::Utf8NormalizeOptions::Form::NFKD:
+        return "NFKD";
+    }
+    return "";
+  }
+};
+
 }  // namespace internal
 
 namespace compute {
@@ -250,6 +274,8 @@ static auto kStructFieldOptionsType = GetFunctionOptionsType<StructFieldOptions>(
     DataMember("indices", &StructFieldOptions::indices));
 static auto kTrimOptionsType = GetFunctionOptionsType<TrimOptions>(
     DataMember("characters", &TrimOptions::characters));
+static auto kUtf8NormalizeOptionsType = GetFunctionOptionsType<Utf8NormalizeOptions>(
+    DataMember("form", &Utf8NormalizeOptions::form));
 static auto kWeekOptionsType = GetFunctionOptionsType<WeekOptions>(
     DataMember("week_starts_monday", &WeekOptions::week_starts_monday),
     DataMember("count_from_zero", &WeekOptions::count_from_zero),
@@ -429,6 +455,10 @@ TrimOptions::TrimOptions(std::string characters)
 TrimOptions::TrimOptions() : TrimOptions("") {}
 constexpr char TrimOptions::kTypeName[];
 
+Utf8NormalizeOptions::Utf8NormalizeOptions(Form form)
+    : FunctionOptions(internal::kUtf8NormalizeOptionsType), form(form) {}
+constexpr char Utf8NormalizeOptions::kTypeName[];
+
 WeekOptions::WeekOptions(bool week_starts_monday, bool count_from_zero,
                          bool first_week_is_fully_in_year)
     : FunctionOptions(internal::kWeekOptionsType),
@@ -461,6 +491,7 @@ void RegisterScalarOptions(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType));
   DCHECK_OK(registry->AddFunctionOptionsType(kStructFieldOptionsType));
   DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType));
+  DCHECK_OK(registry->AddFunctionOptionsType(kUtf8NormalizeOptionsType));
   DCHECK_OK(registry->AddFunctionOptionsType(kWeekOptionsType));
 }
 }  // namespace internal
diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h
index d2234a6182d..6e1c1ac74e8 100644
--- a/cpp/src/arrow/compute/api_scalar.h
+++ b/cpp/src/arrow/compute/api_scalar.h
@@ -61,15 +61,16 @@ enum class RoundMode : int8_t {
   UP,
   /// Get the integral part without fractional digits (aka "trunc")
   TOWARDS_ZERO,
-  /// Round negative values with DOWN rule and positive values with UP rule
+  /// Round negative values with DOWN rule
+  /// and positive values with UP rule (aka "away from zero")
   TOWARDS_INFINITY,
-  /// Round ties with DOWN rule
+  /// Round ties with DOWN rule (also called "round half towards negative infinity")
   HALF_DOWN,
-  /// Round ties with UP rule
+  /// Round ties with UP rule (also called "round half towards positive infinity")
   HALF_UP,
-  /// Round ties with TOWARDS_ZERO rule
+  /// Round ties with TOWARDS_ZERO rule (also called "round half away from infinity")
   HALF_TOWARDS_ZERO,
-  /// Round ties with TOWARDS_INFINITY rule
+  /// Round ties with TOWARDS_INFINITY rule (also called "round half away from zero")
   HALF_TOWARDS_INFINITY,
   /// Round ties to nearest even integer
   HALF_TO_EVEN,
@@ -407,6 +408,18 @@ struct ARROW_EXPORT WeekOptions : public FunctionOptions {
   bool first_week_is_fully_in_year;
 };
 
+struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions {
+ public:
+  enum Form { NFC, NFKC, NFD, NFKD };
+
+  explicit Utf8NormalizeOptions(Form form = NFC);
+  static Utf8NormalizeOptions Defaults() { return Utf8NormalizeOptions(); }
+  constexpr static char const kTypeName[] = "Utf8NormalizeOptions";
+
+  /// The Unicode normalization form to apply
+  Form form;
+};
+
 /// @}
 
 /// \brief Get the absolute value of a value.
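Utf8NormalizeOptions follows the standard FunctionOptions pattern above, so it can be passed through the generic CallFunction entry point; the "utf8_normalize" name below refers to the kernel this options class accompanies, which is registered elsewhere in the patch. A sketch:

    #include "arrow/compute/api.h"

    arrow::Result<arrow::Datum> NormalizeNFKC(const arrow::Datum& input) {
      arrow::compute::Utf8NormalizeOptions options(
          arrow::compute::Utf8NormalizeOptions::NFKC);
      // Dispatches by name through the function registry, like any scalar kernel.
      return arrow::compute::CallFunction("utf8_normalize", {input}, &options);
    }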
diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc
index eda355d6eda..7128d2201f4 100644
--- a/cpp/src/arrow/compute/exec.cc
+++ b/cpp/src/arrow/compute/exec.cc
@@ -178,7 +178,7 @@ Result<std::shared_ptr<Buffer>> AllocateDataBuffer(KernelContext* ctx, int64_t l
   if (bit_width == 1) {
     return ctx->AllocateBitmap(length);
   } else {
-    int64_t buffer_size = BitUtil::BytesForBits(length * bit_width);
+    int64_t buffer_size = bit_util::BytesForBits(length * bit_width);
     return ctx->Allocate(buffer_size);
   }
 }
@@ -405,7 +405,7 @@ class NullPropagator {
       output_->null_count = output_->length;
 
       if (bitmap_preallocated_) {
-        BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
+        bit_util::SetBitsTo(bitmap_, output_->offset, output_->length, false);
         return Status::OK();
       }
 
@@ -420,7 +420,7 @@ class NullPropagator {
     }
 
     RETURN_NOT_OK(EnsureAllocated());
-    BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
+    bit_util::SetBitsTo(bitmap_, output_->offset, output_->length, false);
     return Status::OK();
   }
 
@@ -450,7 +450,7 @@ class NullPropagator {
       output_->buffers[0] = arr_bitmap;
     } else if (arr.offset % 8 == 0) {
       output_->buffers[0] =
-          SliceBuffer(arr_bitmap, arr.offset / 8, BitUtil::BytesForBits(arr.length));
+          SliceBuffer(arr_bitmap, arr.offset / 8, bit_util::BytesForBits(arr.length));
     } else {
       RETURN_NOT_OK(EnsureAllocated());
       CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_,
@@ -512,7 +512,7 @@ class NullPropagator {
     // No arrays with nulls case
     output_->null_count = 0;
     if (bitmap_preallocated_) {
-      BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, true);
+      bit_util::SetBitsTo(bitmap_, output_->offset, output_->length, true);
     }
     return Status::OK();
   }
diff --git a/cpp/src/arrow/compute/exec/aggregate_node.cc b/cpp/src/arrow/compute/exec/aggregate_node.cc
index ddf6f7934a7..59b2ff8b8af 100644
--- a/cpp/src/arrow/compute/exec/aggregate_node.cc
+++ b/cpp/src/arrow/compute/exec/aggregate_node.cc
@@ -441,7 +441,7 @@ class GroupByNode : public ExecNode {
     state->grouper.reset();
 
     if (output_counter_.SetTotal(
-            static_cast<int>(BitUtil::CeilDiv(out_data.length, output_batch_size())))) {
+            static_cast<int>(bit_util::CeilDiv(out_data.length, output_batch_size())))) {
       // this will be hit if out_data.length == 0
       finished_.MarkFinished();
     }
diff --git a/cpp/src/arrow/compute/exec/expression_test.cc b/cpp/src/arrow/compute/exec/expression_test.cc
index 94ca2074835..fa05ddf6422 100644
--- a/cpp/src/arrow/compute/exec/expression_test.cc
+++ b/cpp/src/arrow/compute/exec/expression_test.cc
@@ -1222,8 +1222,9 @@ TEST(Expression, SingleComparisonGuarantees) {
all = false; } } - Simplify{filter}.WithGuarantee(guarantee).Expect( - all ? literal(true) : none ? literal(false) : filter); + Simplify{filter}.WithGuarantee(guarantee).Expect(all ? literal(true) + : none ? literal(false) + : filter); } } } diff --git a/cpp/src/arrow/compute/exec/hash_join.cc b/cpp/src/arrow/compute/exec/hash_join.cc index 72226dda3a7..02f97dd6f86 100644 --- a/cpp/src/arrow/compute/exec/hash_join.cc +++ b/cpp/src/arrow/compute/exec/hash_join.cc @@ -191,8 +191,8 @@ class HashJoinBasicImpl : public HashJoinImpl { bool no_match = hash_table_empty_; for (int icol = 0; icol < num_cols; ++icol) { bool is_null = non_null_bit_vectors[icol] && - !BitUtil::GetBit(non_null_bit_vectors[icol], - non_null_bit_vector_offsets[icol] + irow); + !bit_util::GetBit(non_null_bit_vectors[icol], + non_null_bit_vector_offsets[icol] + irow); if (key_cmp_[icol] == JoinKeyCmp::EQ && is_null) { no_match = true; break; @@ -378,8 +378,8 @@ class HashJoinBasicImpl : public HashJoinImpl { bool passed = false; for (; static_cast(irow) < num_rows && match_left[irow] == curr_left; irow++) { - bool is_valid = !validity || BitUtil::GetBit(validity, irow); - bool is_cmp_true = BitUtil::GetBit(comparisons, irow); + bool is_valid = !validity || bit_util::GetBit(validity, irow); + bool is_cmp_true = bit_util::GetBit(comparisons, irow); // We treat a null comparison result as false, like in SQL if (is_valid && is_cmp_true) { match_left[match_idx] = match_left[irow]; @@ -516,9 +516,9 @@ class HashJoinBasicImpl : public HashJoinImpl { ARROW_DCHECK(batch[i].is_scalar()); if (!batch[i].scalar_as().is_valid) { if (nn_bit_vector_all_nulls->empty()) { - nn_bit_vector_all_nulls->resize(BitUtil::BytesForBits(batch.length)); + nn_bit_vector_all_nulls->resize(bit_util::BytesForBits(batch.length)); memset(nn_bit_vector_all_nulls->data(), 0, - BitUtil::BytesForBits(batch.length)); + bit_util::BytesForBits(batch.length)); } nn = nn_bit_vector_all_nulls->data(); } @@ -579,7 +579,7 @@ class HashJoinBasicImpl : public HashJoinImpl { for (auto i : local_state.match_right) { // Mark row in hash table as having a match - BitUtil::SetBit(local_state.has_match.data(), i); + bit_util::SetBit(local_state.has_match.data(), i); } RETURN_NOT_OK(ProbeBatch_OutputAll(thread_index, local_state.exec_batch_keys, @@ -723,7 +723,7 @@ class HashJoinBasicImpl : public HashJoinImpl { join_type_ != JoinType::RIGHT_OUTER && join_type_ != JoinType::FULL_OUTER) { return 0; } - return BitUtil::CeilDiv(hash_table_keys_.num_rows(), hash_table_scan_unit_); + return bit_util::CeilDiv(hash_table_keys_.num_rows(), hash_table_scan_unit_); } Status ScanHashTable_exec_task(size_t thread_index, int64_t task_id) { @@ -747,7 +747,7 @@ class HashJoinBasicImpl : public HashJoinImpl { bool match_search_value = (join_type_ == JoinType::RIGHT_SEMI); for (int32_t row_id = start_row_id; row_id < end_row_id; ++row_id) { - if (BitUtil::GetBit(has_match_.data(), row_id) == match_search_value) { + if (bit_util::GetBit(has_match_.data(), row_id) == match_search_value) { id_right.push_back(row_id); } } @@ -821,8 +821,8 @@ class HashJoinBasicImpl : public HashJoinImpl { } if (!hash_table_empty_) { int32_t num_rows = hash_table_keys_.num_rows(); - local_state->has_match.resize(BitUtil::BytesForBits(num_rows)); - memset(local_state->has_match.data(), 0, BitUtil::BytesForBits(num_rows)); + local_state->has_match.resize(bit_util::BytesForBits(num_rows)); + memset(local_state->has_match.data(), 0, bit_util::BytesForBits(num_rows)); } local_state->is_has_match_initialized = true; } @@ 
-833,8 +833,8 @@ class HashJoinBasicImpl : public HashJoinImpl { } int32_t num_rows = hash_table_keys_.num_rows(); - has_match_.resize(BitUtil::BytesForBits(num_rows)); - memset(has_match_.data(), 0, BitUtil::BytesForBits(num_rows)); + has_match_.resize(bit_util::BytesForBits(num_rows)); + memset(has_match_.data(), 0, bit_util::BytesForBits(num_rows)); for (size_t tid = 0; tid < local_states_.size(); ++tid) { if (!local_states_[tid].is_initialized) { diff --git a/cpp/src/arrow/compute/exec/hash_join_dict.cc b/cpp/src/arrow/compute/exec/hash_join_dict.cc index 195331a5976..b923433b493 100644 --- a/cpp/src/arrow/compute/exec/hash_join_dict.cc +++ b/cpp/src/arrow/compute/exec/hash_join_dict.cc @@ -71,13 +71,13 @@ Result> HashJoinDictUtil::IndexRemapUsingLUT( uint8_t* nns = result->buffers[0]->mutable_data(); int32_t* ids = reinterpret_cast(result->buffers[1]->mutable_data()); for (int64_t i = 0; i < batch_length; ++i) { - bool is_null = !BitUtil::GetBit(nns, i); + bool is_null = !bit_util::GetBit(nns, i); if (is_null) { ids[i] = kNullId; } else { ARROW_DCHECK(ids[i] >= 0 && ids[i] < map_array->length); - if (!BitUtil::GetBit(map_non_nulls, ids[i])) { - BitUtil::ClearBit(nns, i); + if (!bit_util::GetBit(map_non_nulls, ids[i])) { + bit_util::ClearBit(nns, i); ids[i] = kNullId; } else { ids[i] = map[ids[i]]; @@ -102,7 +102,7 @@ static Result> ConvertImp( ARROW_ASSIGN_OR_RAISE(std::shared_ptr to_nn_buf, AllocateBitmap(batch_length, ctx->memory_pool())); uint8_t* to_nn = to_nn_buf->mutable_data(); - memset(to_nn, 0xff, BitUtil::BytesForBits(batch_length)); + memset(to_nn, 0xff, bit_util::BytesForBits(batch_length)); if (!is_scalar) { const ArrayData& arr = *input.array(); @@ -115,9 +115,9 @@ static Result> ConvertImp( ARROW_DCHECK(static_cast(to[i]) == from[i]); bool is_null = (arr.buffers[0] != NULLPTR) && - !BitUtil::GetBit(arr.buffers[0]->data(), arr.offset + i); + !bit_util::GetBit(arr.buffers[0]->data(), arr.offset + i); if (is_null) { - BitUtil::ClearBit(to_nn, i); + bit_util::ClearBit(to_nn, i); } } @@ -138,11 +138,11 @@ static Result> ConvertImp( to[i] = to_value; } - memset(to_nn, 0xff, BitUtil::BytesForBits(batch_length)); + memset(to_nn, 0xff, bit_util::BytesForBits(batch_length)); return ArrayData::Make(to_type, batch_length, {std::move(to_nn_buf), std::move(to_buf)}); } else { - memset(to_nn, 0, BitUtil::BytesForBits(batch_length)); + memset(to_nn, 0, bit_util::BytesForBits(batch_length)); return ArrayData::Make(to_type, batch_length, {std::move(to_nn_buf), std::move(to_buf)}); } @@ -245,7 +245,7 @@ Status HashJoinDictBuild::Init(ExecContext* ctx, std::shared_ptr dictiona AllocateBuffer(length * sizeof(int32_t), ctx->memory_pool())); uint8_t* non_nulls = non_nulls_buf->mutable_data(); int32_t* ids = reinterpret_cast(ids_buf->mutable_data()); - memset(non_nulls, 0xff, BitUtil::BytesForBits(length)); + memset(non_nulls, 0xff, bit_util::BytesForBits(length)); int32_t num_entries = 0; for (int64_t i = 0; i < length; ++i) { @@ -257,7 +257,7 @@ Status HashJoinDictBuild::Init(ExecContext* ctx, std::shared_ptr dictiona // if (internal::KeyEncoder::IsNull(reinterpret_cast(str.data()))) { ids[i] = HashJoinDictUtil::kNullId; - BitUtil::ClearBit(non_nulls, i); + bit_util::ClearBit(non_nulls, i); continue; } @@ -307,7 +307,7 @@ Result> HashJoinDictBuild::RemapInputValues( AllocateBuffer(batch_length * sizeof(int32_t), ctx->memory_pool())); uint8_t* non_nulls = non_nulls_buf->mutable_data(); int32_t* ids = reinterpret_cast(ids_buf->mutable_data()); - memset(non_nulls, 0xff, 
BitUtil::BytesForBits(batch_length)); + memset(non_nulls, 0xff, bit_util::BytesForBits(batch_length)); // Populate output buffers (for scalar only the first entry is populated) // @@ -315,7 +315,7 @@ Result> HashJoinDictBuild::RemapInputValues( std::string str = encoder.encoded_row(static_cast(i)); if (internal::KeyEncoder::IsNull(reinterpret_cast(str.data()))) { // Map nulls to nulls - BitUtil::ClearBit(non_nulls, i); + bit_util::ClearBit(non_nulls, i); ids[i] = HashJoinDictUtil::kNullId; } else { auto iter = hash_table_.find(str); @@ -330,8 +330,8 @@ Result> HashJoinDictBuild::RemapInputValues( // Generate array of repeated values for scalar input // if (is_scalar) { - if (!BitUtil::GetBit(non_nulls, 0)) { - memset(non_nulls, 0, BitUtil::BytesForBits(batch_length)); + if (!bit_util::GetBit(non_nulls, 0)) { + memset(non_nulls, 0, bit_util::BytesForBits(batch_length)); } for (int64_t i = 1; i < batch_length; ++i) { ids[i] = ids[0]; @@ -447,7 +447,7 @@ Result> HashJoinDictProbe::RemapInput( reinterpret_cast(row_ids_arr->buffers[1]->mutable_data()); const uint8_t* non_nulls = row_ids_arr->buffers[0]->data(); for (int64_t i = 0; i < batch_length; ++i) { - if (!BitUtil::GetBit(non_nulls, i)) { + if (!bit_util::GetBit(non_nulls, i)) { row_ids[i] = internal::RowEncoder::kRowIdForNulls(); } } diff --git a/cpp/src/arrow/compute/exec/hash_join_node_test.cc b/cpp/src/arrow/compute/exec/hash_join_node_test.cc index 481cd94d5c9..b8a18893da8 100644 --- a/cpp/src/arrow/compute/exec/hash_join_node_test.cc +++ b/cpp/src/arrow/compute/exec/hash_join_node_test.cc @@ -281,8 +281,10 @@ struct RandomDataTypeConstraints { void OnlyInt(int int_size, bool allow_nulls) { Default(); - data_type_enabled_mask = - int_size == 8 ? kInt8 : int_size == 4 ? kInt4 : int_size == 2 ? kInt2 : kInt1; + data_type_enabled_mask = int_size == 8 ? kInt8 + : int_size == 4 ? kInt4 + : int_size == 2 ? 
kInt2 + : kInt1; if (!allow_nulls) { max_null_probability = 0.0; } @@ -459,7 +461,7 @@ void TakeUsingVector(ExecContext* ctx, const std::vector> ASSERT_OK_AND_ASSIGN(std::shared_ptr null_buf, AllocateBitmap(indices.size(), ctx->memory_pool())); uint8_t* non_nulls = null_buf->mutable_data(); - memset(non_nulls, 0xFF, BitUtil::BytesForBits(indices.size())); + memset(non_nulls, 0xFF, bit_util::BytesForBits(indices.size())); if ((*result)[i]->data()->buffers.size() == 2) { (*result)[i] = MakeArray( ArrayData::Make((*result)[i]->type(), indices.size(), @@ -477,7 +479,7 @@ void TakeUsingVector(ExecContext* ctx, const std::vector> if (indices[i] < 0) { for (size_t col = 0; col < result->size(); ++col) { uint8_t* non_nulls = (*result)[col]->data()->buffers[0]->mutable_data(); - BitUtil::ClearBit(non_nulls, i); + bit_util::ClearBit(non_nulls, i); } } } @@ -546,7 +548,7 @@ std::vector NullInKey(const std::vector& cmp, continue; } for (size_t j = 0; j < result.size(); ++j) { - if (!BitUtil::GetBit(nulls, j)) { + if (!bit_util::GetBit(nulls, j)) { result[j] = true; } } @@ -1105,8 +1107,8 @@ TEST(HashJoin, Random) { std::shared_ptr output_rows_test; HashJoinWithExecPlan(rng, parallel, join_options, output_schema, shuffled_input_arrays[0], shuffled_input_arrays[1], - static_cast(BitUtil::CeilDiv(num_rows_l, batch_size)), - static_cast(BitUtil::CeilDiv(num_rows_r, batch_size)), + static_cast(bit_util::CeilDiv(num_rows_l, batch_size)), + static_cast(bit_util::CeilDiv(num_rows_r, batch_size)), &output_rows_test); // Compare results @@ -1166,12 +1168,12 @@ void TestHashJoinDictionaryHelper( int expected_num_r_no_match, // Whether to swap two inputs to the hash join bool swap_sides) { - int64_t l_length = l_key.is_array() - ? l_key.array()->length - : l_payload.is_array() ? l_payload.array()->length : -1; - int64_t r_length = r_key.is_array() - ? r_key.array()->length - : r_payload.is_array() ? r_payload.array()->length : -1; + int64_t l_length = l_key.is_array() ? l_key.array()->length + : l_payload.is_array() ? l_payload.array()->length + : -1; + int64_t r_length = r_key.is_array() ? r_key.array()->length + : r_payload.is_array() ? r_payload.array()->length + : -1; ARROW_DCHECK(l_length >= 0 && r_length >= 0); constexpr int batch_multiplicity_for_parallel = 2; diff --git a/cpp/src/arrow/compute/exec/key_compare.cc b/cpp/src/arrow/compute/exec/key_compare.cc index 55b0e5e998b..368c63b8b92 100644 --- a/cpp/src/arrow/compute/exec/key_compare.cc +++ b/cpp/src/arrow/compute/exec/key_compare.cc @@ -57,7 +57,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; int64_t bitid = irow_right * null_mask_num_bytes * 8 + id_col; - match_bytevector[i] &= (BitUtil::GetBit(null_masks, bitid) ? 0 : 0xff); + match_bytevector[i] &= (bit_util::GetBit(null_masks, bitid) ? 0 : 0xff); } } else if (!rows.has_any_nulls(ctx)) { // Remove rows from the result for which the column value on left side is null @@ -66,7 +66,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com for (uint32_t i = num_processed; i < num_rows_to_compare; ++i) { uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; match_bytevector[i] &= - BitUtil::GetBit(non_nulls, irow_left + col.bit_offset(0)) ? 0xff : 0; + bit_util::GetBit(non_nulls, irow_left + col.bit_offset(0)) ? 
0xff : 0; } } else { const uint8_t* null_masks = rows.null_masks(); @@ -77,9 +77,9 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; uint32_t irow_right = left_to_right_map[irow_left]; int64_t bitid_right = irow_right * null_mask_num_bytes * 8 + id_col; - int right_null = BitUtil::GetBit(null_masks, bitid_right) ? 0xff : 0; + int right_null = bit_util::GetBit(null_masks, bitid_right) ? 0xff : 0; int left_null = - BitUtil::GetBit(non_nulls, irow_left + col.bit_offset(0)) ? 0 : 0xff; + bit_util::GetBit(non_nulls, irow_left + col.bit_offset(0)) ? 0 : 0xff; match_bytevector[i] |= left_null & right_null; match_bytevector[i] &= ~(left_null ^ right_null); } @@ -140,7 +140,8 @@ void KeyCompare::CompareBinaryColumnToRow( left_to_right_map, ctx, col, rows, match_bytevector, [bit_offset](const uint8_t* left_base, const uint8_t* right_base, uint32_t irow_left, uint32_t offset_right) { - uint8_t left = BitUtil::GetBit(left_base, irow_left + bit_offset) ? 0xff : 0x00; + uint8_t left = + bit_util::GetBit(left_base, irow_left + bit_offset) ? 0xff : 0x00; uint8_t right = right_base[offset_right]; return left == right ? 0xff : 0; }); @@ -200,7 +201,7 @@ void KeyCompare::CompareBinaryColumnToRow( // Non-zero length guarantees no underflow int32_t num_loops_less_one = - static_cast(BitUtil::CeilDiv(length, 8)) - 1; + static_cast(bit_util::CeilDiv(length, 8)) - 1; uint64_t tail_mask = ~0ULL >> (64 - 8 * (length - num_loops_less_one * 8)); @@ -271,7 +272,7 @@ void KeyCompare::CompareVarBinaryColumnToRow( if (length > 0) { int32_t j; // length can be zero - for (j = 0; j < static_cast(BitUtil::CeilDiv(length, 8)) - 1; ++j) { + for (j = 0; j < static_cast(bit_util::CeilDiv(length, 8)) - 1; ++j) { uint64_t key_left = util::SafeLoad(key_left_ptr + j); uint64_t key_right = key_right_ptr[j]; result_or |= key_left ^ key_right; @@ -296,7 +297,7 @@ void KeyCompare::AndByteVectors(KeyEncoder::KeyEncoderContext* ctx, uint32_t num } #endif - for (uint32_t i = num_processed / 8; i < BitUtil::CeilDiv(num_elements, 8); ++i) { + for (uint32_t i = num_processed / 8; i < bit_util::CeilDiv(num_elements, 8); ++i) { uint64_t* a = reinterpret_cast(bytevector_A); const uint64_t* b = reinterpret_cast(bytevector_B); a[i] &= b[i]; @@ -403,19 +404,19 @@ void KeyCompare::CompareColumnsToRows(uint32_t num_rows_to_compare, } } - util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, - match_bytevector_A, match_bitvector); + util::bit_util::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, + match_bytevector_A, match_bitvector); if (sel_left_maybe_null) { int out_num_rows_int; - util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare, - match_bitvector, sel_left_maybe_null, - &out_num_rows_int, out_sel_left_maybe_same); + util::bit_util::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare, + match_bitvector, sel_left_maybe_null, + &out_num_rows_int, out_sel_left_maybe_same); *out_num_rows = out_num_rows_int; } else { int out_num_rows_int; - util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare, - match_bitvector, &out_num_rows_int, - out_sel_left_maybe_same); + util::bit_util::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare, + match_bitvector, &out_num_rows_int, + out_sel_left_maybe_same); *out_num_rows = out_num_rows_int; } } diff --git a/cpp/src/arrow/compute/exec/key_encode.cc b/cpp/src/arrow/compute/exec/key_encode.cc index 8ab76cd27b3..d5e208bfb74 
100644 --- a/cpp/src/arrow/compute/exec/key_encode.cc +++ b/cpp/src/arrow/compute/exec/key_encode.cc @@ -204,7 +204,7 @@ Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from, uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; auto src64 = reinterpret_cast(src + from_offsets[row_id]); auto dst64 = reinterpret_cast(dst); - for (uint32_t j = 0; j < BitUtil::CeilDiv(length, 8); ++j) { + for (uint32_t j = 0; j < bit_util::CeilDiv(length, 8); ++j) { dst64[j] = src64[j]; } dst += length; @@ -218,7 +218,7 @@ Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from, uint32_t length = metadata_.fixed_length; auto src64 = reinterpret_cast(src + length * row_id); auto dst64 = reinterpret_cast(dst); - for (uint32_t j = 0; j < BitUtil::CeilDiv(length, 8); ++j) { + for (uint32_t j = 0; j < bit_util::CeilDiv(length, 8); ++j) { dst64[j] = src64[j]; } dst += length; @@ -263,7 +263,7 @@ bool KeyEncoder::KeyRowArray::has_any_nulls(const KeyEncoderContext* ctx) const } if (num_rows_for_has_any_nulls_ < num_rows_) { auto size_per_row = metadata().null_masks_bytes_per_row; - has_any_nulls_ = !util::BitUtil::are_all_bytes_zero( + has_any_nulls_ = !util::bit_util::are_all_bytes_zero( ctx->hardware_flags, null_masks() + size_per_row * num_rows_for_has_any_nulls_, static_cast(size_per_row * (num_rows_ - num_rows_for_has_any_nulls_))); num_rows_for_has_any_nulls_ = num_rows_; @@ -374,7 +374,7 @@ void KeyEncoder::TransformBoolean::PostDecode(const KeyColumnArray& input, DCHECK(input.data(buffer_index) != nullptr); DCHECK(output->mutable_data(buffer_index) != nullptr); - util::BitUtil::bytes_to_bits( + util::bit_util::bytes_to_bits( ctx->hardware_flags, static_cast(input.length()), input.data(buffer_index), output->mutable_data(buffer_index), output->bit_offset(buffer_index)); } @@ -543,7 +543,7 @@ void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows, DecodeHelper( start_row, num_rows, offset_within_row, &rows, nullptr, col, col, [](uint8_t* dst, const uint8_t* src, int64_t length) { - for (uint32_t istripe = 0; istripe < BitUtil::CeilDiv(length, 8); ++istripe) { + for (uint32_t istripe = 0; istripe < bit_util::CeilDiv(length, 8); ++istripe) { auto dst64 = reinterpret_cast(dst); auto src64 = reinterpret_cast(src); util::SafeStore(dst64 + istripe, src64[istripe]); @@ -572,10 +572,14 @@ void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows uint32_t col_width1 = col_prep[0].metadata().fixed_length; uint32_t col_width2 = col_prep[1].metadata().fixed_length; - int log_col_width1 = - col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0; - int log_col_width2 = - col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0; + int log_col_width1 = col_width1 == 8 ? 3 + : col_width1 == 4 ? 2 + : col_width1 == 2 ? 1 + : 0; + int log_col_width2 = col_width2 == 8 ? 3 + : col_width2 == 4 ? 2 + : col_width2 == 2 ? 
1 + : 0; bool is_row_fixed_length = rows.metadata().is_fixed_length; @@ -746,7 +750,7 @@ void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_ro DecodeHelper( start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col, [](uint8_t* dst, const uint8_t* src, int64_t length) { - for (uint32_t istripe = 0; istripe < BitUtil::CeilDiv(length, 8); ++istripe) { + for (uint32_t istripe = 0; istripe < bit_util::CeilDiv(length, 8); ++istripe) { auto dst64 = reinterpret_cast(dst); auto src64 = reinterpret_cast(src); util::SafeStore(dst64 + istripe, src64[istripe]); @@ -774,14 +778,14 @@ void KeyEncoder::EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, non_nulls[0] |= 0xff << (bit_offset); if (bit_offset + num_rows > 8) { int bits_in_first_byte = 8 - bit_offset; - memset(non_nulls + 1, 0xff, BitUtil::BytesForBits(num_rows - bits_in_first_byte)); + memset(non_nulls + 1, 0xff, bit_util::BytesForBits(num_rows - bits_in_first_byte)); } for (uint32_t row = 0; row < num_rows; ++row) { uint32_t null_masks_bit_id = (start_row + row) * null_masks_bytes_per_row * 8 + static_cast(col); - bool is_set = BitUtil::GetBit(null_masks, null_masks_bit_id); + bool is_set = bit_util::GetBit(null_masks, null_masks_bit_id); if (is_set) { - BitUtil::ClearBit(non_nulls, bit_offset + row); + bit_util::ClearBit(non_nulls, bit_offset + row); } } } @@ -1055,7 +1059,7 @@ void KeyEncoder::EncoderBinary::EncodeSelectedImp( const uint8_t* non_null_bits = col.data(0); uint8_t* dst = rows->mutable_data(1) + offset_within_row; for (uint32_t i = 0; i < num_selected; ++i) { - bool is_null = !BitUtil::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); + bool is_null = !bit_util::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); if (is_null) { set_null_fn(dst); } @@ -1074,7 +1078,7 @@ void KeyEncoder::EncoderBinary::EncodeSelectedImp( uint8_t* dst = rows->mutable_data(2) + offset_within_row; const uint32_t* offsets = rows->offsets(); for (uint32_t i = 0; i < num_selected; ++i) { - bool is_null = !BitUtil::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); + bool is_null = !bit_util::GetBit(non_null_bits, selection[i] + col.bit_offset(0)); if (is_null) { set_null_fn(dst + offsets[i]); } @@ -1094,7 +1098,7 @@ void KeyEncoder::EncoderBinary::EncodeSelected(uint32_t offset_within_row, EncodeSelectedImp( offset_within_row, rows, col, num_selected, selection, [bit_offset](uint8_t* dst, const uint8_t* src_base, uint16_t irow) { - *dst = BitUtil::GetBit(src_base, irow + bit_offset) ? 0xff : 0x00; + *dst = bit_util::GetBit(src_base, irow + bit_offset) ? 0xff : 0x00; }, [](uint8_t* dst) { *dst = 0xae; }); } else if (col_width == 1) { @@ -1168,7 +1172,8 @@ void KeyEncoder::EncoderOffsets::GetRowOffsetsSelected( const uint32_t* col_offsets = cols[icol].offsets(); for (uint32_t i = 0; i < num_selected; ++i) { uint32_t irow = selection[i]; - bool is_null = !BitUtil::GetBit(non_null_bits, irow + cols[icol].bit_offset(0)); + bool is_null = + !bit_util::GetBit(non_null_bits, irow + cols[icol].bit_offset(0)); if (is_null) { uint32_t length = col_offsets[irow + 1] - col_offsets[irow]; row_offsets[i] -= length; @@ -1205,8 +1210,8 @@ void KeyEncoder::EncoderOffsets::EncodeSelectedImp( uint32_t length = col_offsets[irow + 1] - col_offsets[irow]; if (has_nulls) { uint32_t null_multiplier = - BitUtil::GetBit(col_non_null_bits, irow + cols[ivarbinary].bit_offset(0)) ? 1 - : 0; + bit_util::GetBit(col_non_null_bits, irow + cols[ivarbinary].bit_offset(0)) ? 
1 + : 0; length *= null_multiplier; } uint32_t* row = reinterpret_cast(row_base + row_offsets[i]); @@ -1289,9 +1294,9 @@ void KeyEncoder::EncoderNulls::EncodeSelected(KeyRowArray* rows, if (non_null_bits) { for (uint32_t i = 0; i < num_selected; ++i) { uint32_t irow = selection[i]; - bool is_null = !BitUtil::GetBit(non_null_bits, irow + cols[icol].bit_offset(0)); + bool is_null = !bit_util::GetBit(non_null_bits, irow + cols[icol].bit_offset(0)); if (is_null) { - BitUtil::SetBit(null_masks, i * null_mask_num_bytes * 8 + icol); + bit_util::SetBit(null_masks, i * null_mask_num_bytes * 8 + icol); } } } diff --git a/cpp/src/arrow/compute/exec/key_hash.cc b/cpp/src/arrow/compute/exec/key_hash.cc index 76c8ed1ef30..12065cdd3cb 100644 --- a/cpp/src/arrow/compute/exec/key_hash.cc +++ b/cpp/src/arrow/compute/exec/key_hash.cc @@ -280,9 +280,9 @@ void Hashing::HashMultiColumn(const std::vector& col if (cols[icol].metadata().is_fixed_length) { uint32_t col_width = cols[icol].metadata().fixed_length; if (col_width == 0) { - util::BitUtil::bits_to_bytes(ctx->hardware_flags, num_rows, cols[icol].data(1), - byte_temp_buf.mutable_data(), - cols[icol].bit_offset(1)); + util::bit_util::bits_to_bytes(ctx->hardware_flags, num_rows, cols[icol].data(1), + byte_temp_buf.mutable_data(), + cols[icol].bit_offset(1)); } Hashing::hash_fixed( ctx->hardware_flags, num_rows, col_width == 0 ? 1 : col_width, @@ -299,9 +299,9 @@ void Hashing::HashMultiColumn(const std::vector& col if (cols[icol].data(0)) { uint32_t* dst_hash = is_first ? out_hash : hash_temp_buf.mutable_data(); int num_nulls; - util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows, cols[icol].data(0), - &num_nulls, hash_null_index_buf.mutable_data(), - cols[icol].bit_offset(0)); + util::bit_util::bits_to_indexes( + 0, ctx->hardware_flags, num_rows, cols[icol].data(0), &num_nulls, + hash_null_index_buf.mutable_data(), cols[icol].bit_offset(0)); for (int i = 0; i < num_nulls; ++i) { uint16_t row_id = hash_null_index_buf.mutable_data()[i]; dst_hash[row_id] = 0; diff --git a/cpp/src/arrow/compute/exec/key_map.cc b/cpp/src/arrow/compute/exec/key_map.cc index bff352e0155..fe5ed98bb3e 100644 --- a/cpp/src/arrow/compute/exec/key_map.cc +++ b/cpp/src/arrow/compute/exec/key_map.cc @@ -28,7 +28,7 @@ namespace arrow { -using BitUtil::CountLeadingZeros; +using bit_util::CountLeadingZeros; namespace compute { @@ -198,7 +198,7 @@ void SwissTable::init_slot_ids(const int num_keys, const uint16_t* selection, if (log_blocks_ == 0) { for (int i = 0; i < num_keys; ++i) { uint16_t id = selection[i]; - uint32_t match = ::arrow::BitUtil::GetBit(match_bitvector, id) ? 1 : 0; + uint32_t match = ::arrow::bit_util::GetBit(match_bitvector, id) ? 1 : 0; uint32_t slot_id = local_slots[id] + match; out_slot_ids[id] = slot_id; } @@ -207,7 +207,7 @@ void SwissTable::init_slot_ids(const int num_keys, const uint16_t* selection, uint16_t id = selection[i]; uint32_t hash = hashes[id]; uint32_t iblock = (hash >> (bits_hash_ - log_blocks_)); - uint32_t match = ::arrow::BitUtil::GetBit(match_bitvector, id) ? 1 : 0; + uint32_t match = ::arrow::bit_util::GetBit(match_bitvector, id) ? 
1 : 0; uint32_t slot_id = iblock * 8 + local_slots[id] + match; out_slot_ids[id] = slot_id; } @@ -371,9 +371,9 @@ void SwissTable::run_comparisons(const int num_keys, equal_impl_(num_keys, nullptr, groupids, &out_num, out_not_equal_selection); *out_num_not_equal = static_cast(out_num); } else { - util::BitUtil::bits_to_indexes(1, hardware_flags_, num_keys, - optional_selection_bitvector, out_num_not_equal, - out_not_equal_selection); + util::bit_util::bits_to_indexes(1, hardware_flags_, num_keys, + optional_selection_bitvector, out_num_not_equal, + out_not_equal_selection); uint32_t out_num; equal_impl_(*out_num_not_equal, out_not_equal_selection, groupids, &out_num, out_not_equal_selection); @@ -500,8 +500,8 @@ void SwissTable::find(const int num_keys, const uint32_t* hashes, run_comparisons(num_keys, nullptr, inout_match_bitvector, out_group_ids, &num_ids, ids); } else { - util::BitUtil::bits_to_indexes(1, hardware_flags_, num_keys, inout_match_bitvector, - &num_ids, ids); + util::bit_util::bits_to_indexes(1, hardware_flags_, num_keys, inout_match_bitvector, + &num_ids, ids); extract_group_ids(num_ids, ids, hashes, local_slots, out_group_ids); run_comparisons(num_ids, ids, nullptr, out_group_ids, &num_ids, ids); } @@ -525,7 +525,7 @@ void SwissTable::find(const int num_keys, const uint32_t* hashes, slot_ids[id] = next_slot_id; // If next match was not found then clear match bit in a bit vector if (!match_found) { - ::arrow::BitUtil::ClearBit(inout_match_bitvector, id); + ::arrow::bit_util::ClearBit(inout_match_bitvector, id); } else { ids[num_ids++] = id; } @@ -580,7 +580,7 @@ Status SwissTable::map_new_keys_helper(const uint32_t* hashes, // out_group_ids[id] = num_inserted_ + num_inserted_new; insert_into_empty_slot(inout_next_slot_ids[id], hashes[id], out_group_ids[id]); - ::arrow::BitUtil::ClearBit(match_bitvector, num_processed); + ::arrow::bit_util::ClearBit(match_bitvector, num_processed); ++num_inserted_new; // We need to break processing and have the caller of this function @@ -600,15 +600,15 @@ Status SwissTable::map_new_keys_helper(const uint32_t* hashes, // Copy keys for newly inserted rows using callback // - util::BitUtil::bits_filter_indexes(0, hardware_flags_, num_processed, match_bitvector, - inout_selection, &num_temp_ids, temp_ids); + util::bit_util::bits_filter_indexes(0, hardware_flags_, num_processed, match_bitvector, + inout_selection, &num_temp_ids, temp_ids); ARROW_DCHECK(static_cast(num_inserted_new) == num_temp_ids); RETURN_NOT_OK(append_impl_(num_inserted_new, temp_ids)); num_inserted_ += num_inserted_new; // Evaluate comparisons and append ids of rows that failed it to the non-match set. - util::BitUtil::bits_filter_indexes(1, hardware_flags_, num_processed, match_bitvector, - inout_selection, &num_temp_ids, temp_ids); + util::bit_util::bits_filter_indexes(1, hardware_flags_, num_processed, match_bitvector, + inout_selection, &num_temp_ids, temp_ids); run_comparisons(num_temp_ids, temp_ids, nullptr, out_group_ids, &num_temp_ids, temp_ids); diff --git a/cpp/src/arrow/compute/exec/key_map.h b/cpp/src/arrow/compute/exec/key_map.h index cf539f4a99b..12c1e393c4a 100644 --- a/cpp/src/arrow/compute/exec/key_map.h +++ b/cpp/src/arrow/compute/exec/key_map.h @@ -153,8 +153,10 @@ class SwissTable { static int num_groupid_bits_from_log_blocks(int log_blocks) { int required_bits = log_blocks + 3; - return required_bits <= 8 ? 8 - : required_bits <= 16 ? 16 : required_bits <= 32 ? 32 : 64; + return required_bits <= 8 ? 8 + : required_bits <= 16 ? 
+           : required_bits <= 32 ? 32
+                                 : 64;
   }
 
   // Use 32-bit hash for now
diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc
index 64060d44564..6e26927e40c 100644
--- a/cpp/src/arrow/compute/exec/util.cc
+++ b/cpp/src/arrow/compute/exec/util.cc
@@ -25,12 +25,12 @@
 
 namespace arrow {
 
-using BitUtil::CountTrailingZeros;
+using bit_util::CountTrailingZeros;
 
 namespace util {
 
-inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
-                                            int* num_indexes, uint16_t* indexes) {
+inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+                                             int* num_indexes, uint16_t* indexes) {
   int n = *num_indexes;
   while (word) {
     indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
@@ -39,9 +39,9 @@ inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
   *num_indexes = n;
 }
 
-inline void BitUtil::bits_filter_indexes_helper(uint64_t word,
-                                                const uint16_t* input_indexes,
-                                                int* num_indexes, uint16_t* indexes) {
+inline void bit_util::bits_filter_indexes_helper(uint64_t word,
+                                                 const uint16_t* input_indexes,
+                                                 int* num_indexes, uint16_t* indexes) {
   int n = *num_indexes;
   while (word) {
     indexes[n++] = input_indexes[CountTrailingZeros(word)];
@@ -51,10 +51,10 @@ inline void BitUtil::bits_filter_indexes_helper(uint64_t word,
 }
 
 template <int bit_to_search, bool filter_input_indexes>
-void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
-                                       const uint8_t* bits, const uint16_t* input_indexes,
-                                       int* num_indexes, uint16_t* indexes,
-                                       uint16_t base_index) {
+void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+                                        const uint8_t* bits,
+                                        const uint16_t* input_indexes, int* num_indexes,
+                                        uint16_t* indexes, uint16_t base_index) {
   // 64 bits at a time
   constexpr int unroll = 64;
   int tail = num_bits % unroll;
@@ -101,9 +101,9 @@ void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bit
   }
 }
 
-void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits,
-                              const uint8_t* bits, int* num_indexes, uint16_t* indexes,
-                              int bit_offset) {
+void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits,
+                               const uint8_t* bits, int* num_indexes, uint16_t* indexes,
+                               int bit_offset) {
   bits += bit_offset / 8;
   bit_offset %= 8;
   *num_indexes = 0;
@@ -135,10 +135,10 @@ void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num
   *num_indexes += num_indexes_new;
 }
 
-void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
-                                  const int num_bits, const uint8_t* bits,
-                                  const uint16_t* input_indexes, int* num_indexes,
-                                  uint16_t* indexes, int bit_offset) {
+void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+                                   const int num_bits, const uint8_t* bits,
+                                   const uint16_t* input_indexes, int* num_indexes,
+                                   uint16_t* indexes, int bit_offset) {
   bits += bit_offset / 8;
   bit_offset %= 8;
   if (bit_offset != 0) {
@@ -169,10 +169,10 @@ void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
   }
 }
 
-void BitUtil::bits_split_indexes(int64_t hardware_flags, const int num_bits,
-                                 const uint8_t* bits, int* num_indexes_bit0,
-                                 uint16_t* indexes_bit0, uint16_t* indexes_bit1,
-                                 int bit_offset) {
+void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits,
+                                  const uint8_t* bits, int* num_indexes_bit0,
+                                  uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+                                  int bit_offset) {
   bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
                   bit_offset);
   int num_indexes_bit1;
@@ -180,8 +180,8 @@ void BitUtil::bits_split_indexes(int64_t hardware_flags, const int num_bits,
                   bit_offset);
 }
 
-void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits,
-                            const uint8_t* bits, uint8_t* bytes, int bit_offset) {
+void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits,
+                             const uint8_t* bits, uint8_t* bytes, int bit_offset) {
   bits += bit_offset / 8;
   bit_offset %= 8;
   if (bit_offset != 0) {
@@ -221,8 +221,8 @@ void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits,
   }
 }
 
-void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits,
-                            const uint8_t* bytes, uint8_t* bits, int bit_offset) {
+void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits,
+                             const uint8_t* bytes, uint8_t* bits, int bit_offset) {
   bits += bit_offset / 8;
   bit_offset %= 8;
   if (bit_offset != 0) {
@@ -260,8 +260,8 @@ void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits,
   }
 }
 
-bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
-                                 uint32_t num_bytes) {
+bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+                                  uint32_t num_bytes) {
 #if defined(ARROW_HAVE_AVX2)
   if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
     return are_all_bytes_zero_avx2(bytes, num_bytes);
diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h
index 800c6f0e91a..3efb3115830 100644
--- a/cpp/src/arrow/compute/exec/util.h
+++ b/cpp/src/arrow/compute/exec/util.h
@@ -89,7 +89,7 @@ class TempVectorStack {
     // using SIMD when number of vector elements is not divisible
    // by the number of SIMD lanes.
     //
-    return ::arrow::BitUtil::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
+    return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
   }
   void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
     int64_t old_top = top_;
@@ -144,7 +144,7 @@ class TempVectorHolder {
   uint32_t num_elements_;
 };
 
-class BitUtil {
+class bit_util {
  public:
   static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
                               const int num_bits, const uint8_t* bits, int* num_indexes,
diff --git a/cpp/src/arrow/compute/exec/util_avx2.cc b/cpp/src/arrow/compute/exec/util_avx2.cc
index bdc0e41f576..2357bc936a2 100644
--- a/cpp/src/arrow/compute/exec/util_avx2.cc
+++ b/cpp/src/arrow/compute/exec/util_avx2.cc
@@ -25,9 +25,9 @@ namespace util {
 
 #if defined(ARROW_HAVE_AVX2)
 
-void BitUtil::bits_to_indexes_avx2(int bit_to_search, const int num_bits,
-                                   const uint8_t* bits, int* num_indexes,
-                                   uint16_t* indexes, uint16_t base_index) {
+void bit_util::bits_to_indexes_avx2(int bit_to_search, const int num_bits,
+                                    const uint8_t* bits, int* num_indexes,
+                                    uint16_t* indexes, uint16_t base_index) {
   if (bit_to_search == 0) {
     bits_to_indexes_imp_avx2<0>(num_bits, bits, num_indexes, indexes, base_index);
   } else {
@@ -37,9 +37,9 @@ void BitUtil::bits_to_indexes_avx2(int bit_to_search, const int num_bits,
   }
 }
 
 template <int bit_to_search>
-void BitUtil::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
-                                       int* num_indexes, uint16_t* indexes,
-                                       uint16_t base_index) {
+void bit_util::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+                                        int* num_indexes, uint16_t* indexes,
+                                        uint16_t base_index) {
   // 64 bits at a time
   constexpr int unroll = 64;
@@ -67,7 +67,7 @@ void BitUtil::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
           _pext_u64(mask, _pdep_u64(word, kEachByteIs1) * 0xff) + base;
       *reinterpret_cast<uint64_t*>(byte_indexes + num_indexes_loop) = byte_indexes_next;
       base += incr;
-      num_indexes_loop += static_cast<int>(arrow::BitUtil::PopCount(word & 0xff));
+      num_indexes_loop += static_cast<int>(arrow::bit_util::PopCount(word & 0xff));
       word >>= 8;
     }
     // Unpack indexes to 16-bits and either add the base of i * 64 or shuffle input
@@ -82,9 +82,10 @@ void BitUtil::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
   }
 }
 
-void BitUtil::bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
-                                       const uint8_t* bits, const uint16_t* input_indexes,
-                                       int* num_indexes, uint16_t* indexes) {
+void bit_util::bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
+                                        const uint8_t* bits,
+                                        const uint16_t* input_indexes, int* num_indexes,
+                                        uint16_t* indexes) {
   if (bit_to_search == 0) {
     bits_filter_indexes_imp_avx2<0>(num_bits, bits, input_indexes, num_indexes, indexes);
   } else {
@@ -93,9 +94,9 @@ void BitUtil::bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
   }
 }
 
 template <int bit_to_search>
-void BitUtil::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
-                                           const uint16_t* input_indexes,
-                                           int* out_num_indexes, uint16_t* indexes) {
+void bit_util::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+                                            const uint16_t* input_indexes,
+                                            int* out_num_indexes, uint16_t* indexes) {
   // 64 bits at a time
   constexpr int unroll = 64;
@@ -157,7 +158,7 @@ void BitUtil::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bi
                                              kByteSequence_0_8_1_9_2_10_3_11,
                                              kByteSequence_4_12_5_13_6_14_7_15));
       _mm256_storeu_si256((__m256i*)(indexes + num_indexes), output);
-      num_indexes += static_cast<int>(arrow::BitUtil::PopCount(word & 0xffff));
+      num_indexes += static_cast<int>(arrow::bit_util::PopCount(word & 0xffff));
       word >>= 16;
       ++loop_id;
     }
@@ -166,8 +167,8 @@ void BitUtil::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bi
   *out_num_indexes = num_indexes;
 }
 
-void BitUtil::bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
-                                 uint8_t* bytes) {
+void bit_util::bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
+                                  uint8_t* bytes) {
   constexpr int unroll = 32;
 
   constexpr uint64_t kEachByteIs1 = 0x0101010101010101ULL;
@@ -187,8 +188,8 @@ void BitUtil::bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
   }
 }
 
-void BitUtil::bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
-                                 uint8_t* bits) {
+void bit_util::bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
+                                  uint8_t* bits) {
   constexpr int unroll = 32;
   // Processing 32 bits at a time
   for (int i = 0; i < num_bits / unroll; ++i) {
@@ -197,7 +198,7 @@ void BitUtil::bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
   }
 }
 
-bool BitUtil::are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) {
+bool bit_util::are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) {
   __m256i result_or = _mm256_setzero_si256();
   uint32_t i;
   for (i = 0; i < num_bytes / 32; ++i) {
diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc
index 3769517a9e4..198cb84ff5e 100644
--- a/cpp/src/arrow/compute/exec_test.cc
+++ b/cpp/src/arrow/compute/exec_test.cc
@@ -95,7 +95,7 @@ void AssertValidityZeroExtraBits(const ArrayData& arr) {
 
   const int64_t bit_extent = ((arr.offset + arr.length + 7) / 8) * 8;
   for (int64_t i = arr.offset + arr.length; i < bit_extent; ++i) {
-    EXPECT_FALSE(BitUtil::GetBit(buf.data(), i)) << i;
+    EXPECT_FALSE(bit_util::GetBit(buf.data(), i)) << i;
   }
 }
 
@@ -253,7 +253,7 @@ TEST_F(TestPropagateNulls, SingleValueWithNulls) {
     if (preallocate) {
      ASSERT_OK_AND_ASSIGN(
          preallocated_bitmap,
-          AllocateBuffer(BitUtil::BytesForBits(sliced->length() + out_offset)));
+          AllocateBuffer(bit_util::BytesForBits(sliced->length() + out_offset)));
       std::memset(preallocated_bitmap->mutable_data(), 0, preallocated_bitmap->size());
       output.buffers[0] = preallocated_bitmap;
     } else {
diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc
index 626824d73ec..b503a1732d9 100644
--- a/cpp/src/arrow/compute/function_test.cc
+++ b/cpp/src/arrow/compute/function_test.cc
@@ -125,6 +125,8 @@ TEST(FunctionOptions, Equality) {
   options.emplace_back(new PartitionNthOptions(/*pivot=*/42));
   options.emplace_back(new SelectKOptions(0, {}));
   options.emplace_back(new SelectKOptions(5, {{SortKey("key", SortOrder::Ascending)}}));
+  options.emplace_back(new Utf8NormalizeOptions());
+  options.emplace_back(new Utf8NormalizeOptions(Utf8NormalizeOptions::NFD));
 
   for (size_t i = 0; i < options.size(); i++) {
     const size_t prev_i = i == 0 ? options.size() - 1 : i - 1;
diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc
index 666b73e415c..909c2399c8e 100644
--- a/cpp/src/arrow/compute/kernel.cc
+++ b/cpp/src/arrow/compute/kernel.cc
@@ -49,7 +49,7 @@ Result<std::shared_ptr<ResizableBuffer>> KernelContext::Allocate(int64_t nbytes)
 }
 
 Result<std::shared_ptr<ResizableBuffer>> KernelContext::AllocateBitmap(int64_t num_bits) {
-  const int64_t nbytes = BitUtil::BytesForBits(num_bits);
+  const int64_t nbytes = bit_util::BytesForBits(num_bits);
   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ResizableBuffer> result,
                         AllocateResizableBuffer(nbytes, exec_ctx_->memory_pool()));
   // Since bitmaps are typically written bit by bit, we could leak uninitialized bits.
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
index 38575553b3e..c5aaffda8fc 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
@@ -259,7 +259,7 @@ Result<std::unique_ptr<KernelState>> SumInit(KernelContext* ctx,
 
 Result<std::unique_ptr<KernelState>> MeanInit(KernelContext* ctx,
                                               const KernelInitArgs& args) {
-  SumLikeInit<MeanImplDefault> visitor(
+  MeanKernelInit<MeanImplDefault> visitor(
       ctx, args.inputs[0].type,
       static_cast<const ScalarAggregateOptions&>(*args.options));
   return visitor.Create();
@@ -929,6 +929,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
   AddArrayScalarAggKernels(SumInit, SignedIntTypes(), int64(), func.get());
   AddArrayScalarAggKernels(SumInit, UnsignedIntTypes(), uint64(), func.get());
   AddArrayScalarAggKernels(SumInit, FloatingPointTypes(), float64(), func.get());
+  AddArrayScalarAggKernels(SumInit, {null()}, int64(), func.get());
   // Add the SIMD variants for sum
 #if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512)
   auto cpu_info = arrow::internal::CpuInfo::GetInstance();
@@ -955,6 +956,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
   AddAggKernel(
       KernelSignature::Make({InputType(Type::DECIMAL256)}, OutputType(ScalarFirstType)),
       MeanInit, func.get(), SimdLevel::NONE);
+  AddArrayScalarAggKernels(MeanInit, {null()}, float64(), func.get());
   // Add the SIMD variants for mean
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
index e56fab21c16..8f020160255 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
@@ -121,6 +121,40 @@ struct SumImpl : public ScalarAggregator {
   ScalarAggregateOptions options;
 };
 
+template <typename SumType>
+struct NullSumImpl : public ScalarAggregator {
+  using ScalarType = typename TypeTraits<SumType>::ScalarType;
+
+  explicit NullSumImpl(const ScalarAggregateOptions& options_) : options(options_) {}
+
+  Status Consume(KernelContext*, const ExecBatch& batch) override {
+    if (batch[0].is_scalar() || batch[0].array()->GetNullCount() > 0) {
+      // If the batch is a scalar or an array with elements, set is_empty to false
+      is_empty = false;
+    }
+    return Status::OK();
+  }
+
+  Status MergeFrom(KernelContext*, KernelState&& src) override {
+    const auto& other = checked_cast<const NullSumImpl&>(src);
+    this->is_empty &= other.is_empty;
+    return Status::OK();
+  }
+
+  Status Finalize(KernelContext*, Datum* out) override {
+    if ((options.skip_nulls || this->is_empty) && options.min_count == 0) {
+      // Return 0 if the remaining data is empty
+      out->value = std::make_shared<ScalarType>(0);
+    } else {
+      out->value = MakeNullScalar(TypeTraits<SumType>::type_singleton());
+    }
+    return Status::OK();
+  }
+
+  bool is_empty = true;
+  ScalarAggregateOptions options;
+};
+
 template <typename ArrowType, SimdLevel::type SimdLevel>
 struct MeanImpl : public SumImpl<ArrowType, SimdLevel> {
   using SumImpl<ArrowType, SimdLevel>::SumImpl;
@@ -200,12 +234,29 @@ struct SumLikeInit {
     return Status::OK();
   }
 
+  virtual Status Visit(const NullType&) {
+    state.reset(new NullSumImpl<Int64Type>(options));
+    return Status::OK();
+  }
+
   Result<std::unique_ptr<KernelState>> Create() {
     RETURN_NOT_OK(VisitTypeInline(*type, this));
     return std::move(state);
   }
 };
 
+template