diff --git a/.asf.yaml b/.asf.yaml index 4bd5191a7a6..2c66ce5be63 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -15,6 +15,10 @@ # specific language governing permissions and limitations # under the License. +github: + description: "Apache Arrow is a multi-language toolbox for accelerated data interchange and in-memory processing" + homepage: https://arrow.apache.org/ + notifications: commits: commits@arrow.apache.org issues: github@arrow.apache.org diff --git a/.dockerignore b/.dockerignore index eb71138c679..a369d7d59a6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -27,7 +27,7 @@ # include explicitly !ci/** !c_glib/Gemfile -!dev/archery/requirements*.txt +!dev/archery/setup.py !python/requirements*.txt !python/manylinux1/** !python/manylinux2010/** diff --git a/.env b/.env index cd6b57e004a..0af36084bd7 100644 --- a/.env +++ b/.env @@ -42,12 +42,11 @@ ULIMIT_CORE=-1 REPO=apache/arrow-dev CUDA=9.1 DEBIAN=10 -UBUNTU=18.04 +UBUNTU=20.04 FEDORA=33 PYTHON=3.6 -LLVM=11 +LLVM=12 CLANG_TOOLS=8 -RUST=nightly-2020-11-24 GO=1.15 NODE=14 MAVEN=3.5.4 @@ -60,15 +59,17 @@ KARTOTHEK=latest HDFS=3.2.1 SPARK=master DOTNET=3.1 -R=4.0 +R=4.1 ARROW_R_DEV=TRUE +GCC_VERSION="" # These correspond to images on Docker Hub that contain R, e.g. 
rhub/ubuntu-gcc-release:latest R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest +TZ=UTC # -1 does not attempt to install a devtoolset version, any positive integer will install devtoolset-n DEVTOOLSET_VERSION=-1 # Used for the manylinux and windows wheels, please update the crossbow configuration on update: # https://github.com/ursacomputing/crossbow/blob/master/.github/workflows/cache_vcpkg.yml -VCPKG=fced4bef1606260f110d74de1ae1975c2b9ac549 +VCPKG="2021.04.30" diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index 761e0459543..66cd04a37c9 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -31,21 +31,25 @@ on: - 'dev/tasks/**' - 'docker-compose.yml' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + jobs: test: if: ${{ !contains(github.event.pull_request.title, 'WIP') }} name: Archery Unittests and Crossbow Check Config runs-on: ubuntu-latest + timeout-minutes: 15 steps: - name: Checkout Arrow uses: actions/checkout@v2 with: fetch-depth: 0 - name: Git Fixup - if: ${{ github.event_name == 'pull_request' }} shell: bash - run: git branch master origin/master + run: git branch master origin/master || true - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Setup Python @@ -58,7 +62,7 @@ jobs: working-directory: dev/archery run: pytest -v archery - name: Archery Docker Validation - run: archery docker + run: archery docker check-config - name: Crossbow Check Config working-directory: dev/tasks run: archery crossbow check-config diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml deleted file mode 100644 index de980eb6d05..00000000000 --- a/.github/workflows/cancel.yml +++ /dev/null @@ -1,123 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Cancel stale runs - -on: - workflow_run: - # The name of another workflow (whichever one) that always runs on PRs - workflows: ['Dev'] - types: ['requested'] - -jobs: - cancel-stale-workflow-runs: - name: "Cancel stale workflow runs" - runs-on: ubuntu-latest - steps: - # Unfortunately, we need to define a separate cancellation step for - # each workflow where we want to cancel stale runs. 
- - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale C++ runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: cpp.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale C# runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: csharp.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Dev runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: dev.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Go runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: go.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Integration runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: integration.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Java JNI runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: java_jni.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Java runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: java.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale JS runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: js.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Julia runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: julia.yml - skipEventTypes: '["push", 
"schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Python runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: python.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale R runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: r.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Ruby runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: ruby.yml - skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Rust runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: rust.yml - skipEventTypes: '["push", "schedule"]' diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 9e103003eee..35d889152fb 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -68,6 +68,7 @@ jobs: } if changed '^r/.*\.R$'; then echo "R_DOCS=true" >> $GITHUB_ENV + echo "R_CODE=true" >> $GITHUB_ENV fi if changed 'cmake' || changed 'CMake'; then echo "CMAKE_FORMAT=true" >> $GITHUB_ENV @@ -78,6 +79,16 @@ jobs: if changed '^r/src'; then echo "CLANG_FORMAT_R=true" >> $GITHUB_ENV fi + - name: Ensure clang-format has the appropriate version + if: env.CMAKE_FORMAT == 'true' || + env.CLANG_FORMAT_CPP == 'true' || + env.CLANG_FORMAT_R == 'true' || + endsWith(github.event.comment.body, 'everything') + run: | + set -e + . 
.env # To get the clang version we use + sudo apt update + sudo apt install -y clang-format-${CLANG_TOOLS} - name: Run cmake_format if: env.CMAKE_FORMAT == 'true' || endsWith(github.event.comment.body, 'everything') run: | @@ -103,15 +114,30 @@ jobs: --exclude_glob=cpp/build-support/lint_exclusions.txt \ --source_dir=r/src --quiet --fix - uses: r-lib/actions/setup-r@v1 - if: env.R_DOCS == 'true' || endsWith(github.event.comment.body, 'everything') + if: env.R_DOCS == 'true' || env.R_CODE == 'true' || endsWith(github.event.comment.body, 'everything') - name: Update R docs if: env.R_DOCS == 'true' || endsWith(github.event.comment.body, 'everything') shell: Rscript {0} run: | source("ci/etc/rprofile") install.packages(c("remotes", "roxygen2")) + # We currently need dev roxygen2 (> 7.1.1) until they release + remotes::install_github("r-lib/roxygen2") remotes::install_deps("r") roxygen2::roxygenize("r") + - name: Style R code + if: env.R_CODE == 'true' || endsWith(github.event.comment.body, 'everything') + shell: Rscript {0} + run: | + changed_files <- system("git diff --name-only HEAD..upstream/master 2>&1", intern = TRUE) + # only grab the .R files under r/ + changed_files <- grep('^r/.*\\.R$', changed_files, value = TRUE) + # remove latin1 which is unstylable due to encoding and codegen.R which is unique + changed_files <- changed_files[!changed_files %in% file.path("r", source("r/.styler_excludes.R")$value)] + source("ci/etc/rprofile") + install.packages(c("remotes", "styler")) + remotes::install_deps("r") + styler::style_file(changed_files) - name: Commit results run: | git config user.name "$(git log -1 --pretty=format:%an)" diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 5f25deb4512..086f45d6fee 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -37,6 +37,10 @@ on: - 'cpp/**' - 'format/Flight.proto' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + 
env: ARROW_ENABLE_TIMING_TESTS: OFF DOCKER_VOLUME_PREFIX: ".docker/" @@ -49,6 +53,7 @@ jobs: name: ${{ matrix.title }} runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 45 strategy: fail-fast: false matrix: @@ -59,7 +64,7 @@ jobs: - image: conda-cpp title: AMD64 Conda C++ - image: ubuntu-cpp-sanitizer - title: AMD64 Ubuntu 18.04 C++ ASAN UBSAN + title: AMD64 Ubuntu 20.04 C++ ASAN UBSAN steps: - name: Checkout Arrow uses: actions/checkout@v2 @@ -91,84 +96,11 @@ jobs: continue-on-error: true run: archery docker push ${{ matrix.image }} - docker-arm: - # NOTE: this job is specific for self-hosted runners - # CACHING: don't use the cache plugin because of various permission - # issues and keep the cached docker volumes permanently on the - # host - # PYTHON: no distributions are built for arm machines by the github - # actions team, so python>3.6 must be preinstalled on the self - # hosted machines - name: ${{ matrix.title }} - runs-on: ${{ matrix.runner }} - # TODO(kszucs): re-enable once the self-hosted workers are properly - # registered to github - if: false && github.event_name == 'push' - defaults: - # To use certain environment variables set by .bashrc, an interactive - # bash shell must be used - run: - shell: bash -i {0} - strategy: - fail-fast: false - matrix: - name: - - arm32v7-debian-10-cpp - - arm64v8-ubuntu-20.04-cpp - include: - - name: arm32v7-debian-10-cpp - debian: 10 - title: ARM32v7 Debian 10 C++ - image: | - -e CPP_MAKE_PARALLELISM=2 \ - -e CXXFLAGS=-Wno-psabi \ - -e ARROW_PARQUET=OFF \ - -e ARROW_FLIGHT=OFF \ - -e ARROW_GANDIVA=OFF \ - -e ARROW_ORC=OFF \ - -e CMAKE_ARGS=-DARROW_CPU_FLAG=armv7 \ - debian-cpp - arch: 'arm32v7' - runner: [self-hosted, linux, ARM] - - name: arm64v8-ubuntu-20.04-cpp - ubuntu: 20.04 - title: ARM64v8 Ubuntu 20.04 C++ - image: | - -e CPP_MAKE_PARALLELISM=1 \ - -e ARROW_PARQUET=OFF \ - ubuntu-cpp - arch: 'arm64v8' - runner: [self-hosted, linux, ARM64] - env: - # the 
defaults here should correspond to the values in .env - ARCH: ${{ matrix.arch || 'arm64v8' }} - DEBIAN: ${{ matrix.debian || 10 }} - FEDORA: ${{ matrix.fedora || 32 }} - UBUNTU: ${{ matrix.ubuntu || 18.04 }} - LLVM: 8 - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Setup Archery - run: pip install -U -e dev/archery[docker] - - name: Execute Docker Build - # parallelism is reduced because the ARM builders are low on memory - run: | - ulimit -c unlimited - archery docker run ${{ matrix.image }} - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ${{ matrix.image }} - build-example: name: C++ Minimal Build Example runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 45 strategy: fail-fast: false steps: @@ -185,6 +117,7 @@ jobs: name: AMD64 MacOS 10.15 C++ runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 45 strategy: fail-fast: false env: @@ -219,7 +152,6 @@ jobs: run: | rm -f /usr/local/bin/2to3 brew update --preinstall - brew unlink gcc@8 gcc@9 brew bundle --file=cpp/Brewfile - name: Build shell: bash @@ -236,6 +168,7 @@ jobs: name: AMD64 ${{ matrix.name }} C++ runs-on: ${{ matrix.os }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 45 strategy: fail-fast: false matrix: @@ -305,6 +238,7 @@ jobs: name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} C++ runs-on: windows-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 45 strategy: fail-fast: false matrix: diff --git a/.github/workflows/cpp_cron.yml b/.github/workflows/cpp_cron.yml index c229ad93be3..c031e5961cb 100644 --- a/.github/workflows/cpp_cron.yml +++ b/.github/workflows/cpp_cron.yml @@ -36,76 +36,12 @@ env: 
ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: - docker: - name: ${{ matrix.title }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.repository == 'apache/arrow' }} - strategy: - fail-fast: false - matrix: - name: - - amd64-debian-10-cpp - - amd64-fedora-33-cpp - - amd64-ubuntu-16.04-cpp - - amd64-ubuntu-18.04-cpp - include: - - name: amd64-debian-10-cpp - image: debian-cpp - title: AMD64 Debian 10 C++ - debian: 10 - - name: amd64-fedora-33-cpp - image: fedora-cpp - title: AMD64 Fedora 33 C++ - fedora: 33 - - name: amd64-ubuntu-16.04-cpp - image: ubuntu-cpp - title: AMD64 Ubuntu 16.04 C++ - ubuntu: 16.04 - - name: amd64-ubuntu-18.04-cpp - image: ubuntu-cpp - title: AMD64 Ubuntu 18.04 C++ - ubuntu: 18.04 - env: - # the defaults here should correspond to the values in .env - ARCH: 'amd64' - DEBIAN: ${{ matrix.debian || 10 }} - FEDORA: ${{ matrix.fedora || 33 }} - UBUNTU: ${{ matrix.ubuntu || 18.04 }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ${{ matrix.name }}-${{ hashFiles('cpp/**') }} - restore-keys: ${{ matrix.name }}- - - name: Setup Python - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ${{ matrix.image }} - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ${{ matrix.image }} oss-fuzz: name: OSS-Fuzz build check runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.repository == 
'apache/arrow' }} + timeout-minutes: 60 strategy: fail-fast: false matrix: diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 03a297bb914..b339b8f4655 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -29,12 +29,17 @@ on: - 'ci/scripts/csharp_*' - 'csharp/**' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + jobs: ubuntu: name: AMD64 Ubuntu 18.04 C# ${{ matrix.dotnet }} runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 15 strategy: fail-fast: false matrix: @@ -65,6 +70,7 @@ jobs: name: AMD64 Windows 2019 18.04 C# ${{ matrix.dotnet }} runs-on: windows-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 15 strategy: fail-fast: false matrix: @@ -94,6 +100,7 @@ jobs: name: AMD64 MacOS 10.15 C# ${{ matrix.dotnet }} runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 15 strategy: fail-fast: false matrix: diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 37016efcbfe..9ef46c31fa3 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -22,6 +22,10 @@ on: push: pull_request: +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -29,7 +33,7 @@ env: jobs: lint: - name: Lint C++, Python, R, Rust, Docker, RAT + name: Lint C++, Python, R, Docker, RAT runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: @@ -51,7 +55,7 @@ jobs: run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited - archery docker run ubuntu-lint + archery docker run -e GITHUB_ACTIONS=true ubuntu-lint - name: Docker Push if: success() && github.event_name == 'push' && 
github.repository == 'apache/arrow' continue-on-error: true @@ -79,13 +83,13 @@ jobs: with: python-version: '3.6' - name: Install Ruby - uses: actions/setup-ruby@v1 + uses: ruby/setup-ruby@v1 with: ruby-version: '2.6' - name: Install Dependencies shell: bash run: | - pip install cython setuptools pytest jira + pip install cython setuptools six pytest jira - name: Run Release Test shell: bash run: | diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml index 7b92b897051..5f3acd7bebf 100644 --- a/.github/workflows/dev_pr.yml +++ b/.github/workflows/dev_pr.yml @@ -28,6 +28,9 @@ on: - edited - synchronize +# NOTE: not using the "cancel-in-progress" feature here as the group key +# does not have enough information for linking it to a particular PR + jobs: process: name: Process diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 098e1bad7f4..8860d91f813 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -36,6 +36,9 @@ lang-js: lang-julia: - julia/**/* +lang-matlab: + - matlab/**/* + lang-python: - python/**/* @@ -45,19 +48,9 @@ lang-R: lang-ruby: - ruby/**/* -lang-rust: - - rust/**/* - -datafusion: - - rust/datafusion/**/* - -ballista: - - rust/ballista/**/* - flight: - cpp/src/arrow/flight/**/* - r/R/flight.* - - rust/arrow-flight/**/* - python/pyarrow/*flight.* gandiva: @@ -71,4 +64,3 @@ parquet: - cpp/src/parquet/**/* - r/R/parquet.* - ruby/red-parquet/**/* - - rust/parquet*/**/* diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 574795f5e9b..3c9100c20b7 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -32,6 +32,10 @@ on: - 'ci/scripts/go_*' - 'go/**' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -42,6 +46,7 @@ jobs: name: AMD64 Debian 
10 Go ${{ matrix.go }} runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 15 strategy: fail-fast: false matrix: @@ -74,6 +79,7 @@ jobs: name: AMD64 Windows 2019 Go ${{ matrix.go }} runs-on: windows-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 15 strategy: fail-fast: false matrix: @@ -101,6 +107,7 @@ jobs: name: AMD64 MacOS 10.15 Go ${{ matrix.go }} runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 15 strategy: fail-fast: false matrix: diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 20112553ea2..7a4deb8e3ea 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -29,7 +29,6 @@ on: - 'cpp/**' - 'java/**' - 'format/**' - - 'rust/**' pull_request: paths: - '.github/workflows/integration.yml' @@ -41,7 +40,10 @@ on: - 'cpp/**' - 'java/**' - 'format/**' - - 'rust/**' + +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true env: DOCKER_VOLUME_PREFIX: ".docker/" @@ -54,6 +56,7 @@ jobs: name: AMD64 Conda Integration Test runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 steps: - name: Checkout Arrow uses: actions/checkout@v2 @@ -61,6 +64,11 @@ jobs: fetch-depth: 0 - name: Fetch Submodules and Tags run: ci/scripts/util_checkout.sh + - name: Checkout Arrow Rust + uses: actions/checkout@v2 + with: + repository: apache/arrow-rs + path: rust - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes @@ -76,7 +84,7 @@ jobs: - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build - run: archery docker run conda-integration + run: archery docker run -e ARCHERY_INTEGRATION_WITH_RUST=1 conda-integration - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 
'apache/arrow' continue-on-error: true diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 7f6f29f0f44..72f4df7e36e 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -35,6 +35,10 @@ on: - 'format/Flight.proto' - 'java/**' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + env: DOCKER_VOLUME_PREFIX: ".docker/" ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} @@ -46,6 +50,7 @@ jobs: name: AMD64 Debian 9 Java JDK ${{ matrix.jdk }} Maven ${{ matrix.maven }} runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 30 strategy: fail-fast: false matrix: @@ -88,6 +93,7 @@ jobs: name: AMD64 MacOS 10.15 Java JDK ${{ matrix.jdk }} runs-on: macos-latest if: github.event_name == 'push' + timeout-minutes: 30 strategy: fail-fast: false matrix: diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 5f25e8c053d..48351f3c22a 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -35,6 +35,10 @@ on: - 'cpp/**' - 'java/**' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + env: DOCKER_VOLUME_PREFIX: ".docker/" ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} @@ -46,14 +50,7 @@ jobs: name: AMD64 Debian 9 Java JNI (Gandiva, Plasma, ORC, Dataset) runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - strategy: - fail-fast: false - matrix: - jdk: [8] - maven: [3.5.2] - env: - JDK: ${{ matrix.jdk }} - MAVEN: ${{ matrix.maven }} + timeout-minutes: 90 steps: - name: Checkout Arrow uses: actions/checkout@v2 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 354c45c60d3..95414909d39 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -31,6 +31,10 @@ on: - 'ci/scripts/js_*' - 'js/**' +concurrency: + group: ${{ github.repository }}-${{ 
github.ref }}-${{ github.workflow }} + cancel-in-progress: true + env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} @@ -41,6 +45,7 @@ jobs: name: AMD64 Debian 10 NodeJS 14 runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 steps: - name: Checkout Arrow uses: actions/checkout@v2 @@ -70,6 +75,7 @@ jobs: name: AMD64 MacOS 10.15 NodeJS ${{ matrix.node }} runs-on: macos-latest if: github.event_name == 'push' + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -93,30 +99,26 @@ jobs: shell: bash run: ci/scripts/js_test.sh $(pwd) - # TODO(kszucs): the windows build fails with platform specific npm error - # windows: - # name: AMD64 Windows 2019 NodeJS ${{ matrix.node }} - # runs-on: windows-latest - # if: github.event_name == 'push' - # strategy: - # fail-fast: false - # matrix: - # node: [14] - # steps: - # - name: Checkout Arrow - # uses: actions/checkout@v1 - # with: - # submodules: true - # - name: Install NodeJS - # uses: actions/setup-node@v1 - # with: - # node-version: ${{ matrix.node }} - # - name: Install Platform Dependencies - # shell: bash - # run: yarn add -g cross-env - # - name: Build - # shell: bash - # run: ci/scripts/js_build.sh $(pwd) - # - name: Test - # shell: bash - # run: ci/scripts/js_test.sh $(pwd) + windows: + name: AMD64 Windows 2019 NodeJS ${{ matrix.node }} + runs-on: windows-latest + if: github.event_name == 'push' + strategy: + fail-fast: false + matrix: + node: [14] + steps: + - name: Checkout Arrow + uses: actions/checkout@v1 + with: + submodules: true + - name: Install NodeJS + uses: actions/setup-node@v1 + with: + node-version: ${{ matrix.node }} + - name: Build + shell: bash + run: ci/scripts/js_build.sh $(pwd) + - name: Test + shell: bash + run: ci/scripts/js_test.sh $(pwd) diff --git a/.github/workflows/julia.yml b/.github/workflows/julia.yml index 64ea6c947a1..226ec3e6ad0 100644 --- a/.github/workflows/julia.yml 
+++ b/.github/workflows/julia.yml @@ -26,12 +26,17 @@ on: - '.github/workflows/julia.yml' - 'julia/**' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + jobs: test: name: AMD64 ${{ matrix.os }} Julia ${{ matrix.version }} env: JULIA_NUM_THREADS: 2 runs-on: ${{ matrix.os }} + timeout-minutes: 30 strategy: fail-fast: false matrix: diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 9062e93e665..59b14dc3287 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -31,6 +31,10 @@ on: - 'cpp/**' - 'python/**' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + env: DOCKER_VOLUME_PREFIX: ".docker/" ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} @@ -42,6 +46,7 @@ jobs: name: ${{ matrix.title }} runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -61,6 +66,7 @@ jobs: title: AMD64 Conda Python 3.6 Pandas 0.23 python: 3.6 pandas: 0.23 + numpy: 1.16 - name: conda-python-3.7-pandas-latest cache: conda-python-3.7 image: conda-python-pandas @@ -107,6 +113,7 @@ jobs: name: AMD64 MacOS 10.15 Python 3 runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 env: ARROW_HOME: /usr/local ARROW_DATASET: ON @@ -124,6 +131,7 @@ jobs: ARROW_WITH_BROTLI: ON ARROW_BUILD_TESTS: OFF CMAKE_ARGS: "-DPython3_EXECUTABLE=/usr/local/bin/python3" + PYARROW_TEST_LARGE_MEMORY: ON steps: - name: Checkout Arrow uses: actions/checkout@v2 @@ -137,7 +145,6 @@ jobs: run: | rm -f /usr/local/bin/2to3 brew update --preinstall - brew unlink gcc@8 gcc@9 brew bundle --file=cpp/Brewfile brew install coreutils python3 -mpip install \ diff --git a/.github/workflows/python_cron.yml b/.github/workflows/python_cron.yml deleted file mode 100644 index 7a4401af1c3..00000000000 --- 
a/.github/workflows/python_cron.yml +++ /dev/null @@ -1,141 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Python Cron - -on: - push: - paths: - - '.github/workflows/python_cron.yml' - pull_request: - paths: - - '.github/workflows/python_cron.yml' - schedule: - - cron: | - 0 */12 * * * - -env: - DOCKER_VOLUME_PREFIX: ".docker/" - ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} - ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - -jobs: - - docker: - name: ${{ matrix.title }} - runs-on: ubuntu-latest - if: ${{ !contains(github.event.pull_request.title, 'WIP') && github.repository == 'apache/arrow' }} - strategy: - fail-fast: false - matrix: - name: - - debian-10-python-3 - - fedora-33-python-3 - - ubuntu-18.04-python-3 - - conda-python-3.7-dask-latest - - conda-python-3.7-turbodbc-latest - - conda-python-3.7-kartothek-latest - - conda-python-3.7-pandas-0.24 - - conda-python-3.7-pandas-master - - conda-python-3.7-hdfs-2.9.2 - include: - - name: debian-10-python-3 - cache: debian-10-python-3 - image: debian-python - title: AMD64 Debian 10 Python 3 - debian: 10 - - name: fedora-33-python-3 - cache: fedora-33-python-3 - image: fedora-python - title: AMD64 Fedora 33 Python 3 - fedora: 33 - - name: 
ubuntu-18.04-python-3 - cache: ubuntu-18.04-python-3 - image: ubuntu-python - title: AMD64 Ubuntu 18.04 Python 3 - ubuntu: 18.04 - - name: conda-python-3.7-dask-latest - cache: conda-python-3.7 - image: conda-python-dask - title: AMD64 Conda Python 3.7 Dask latest - dask: latest - - name: conda-python-3.7-turbodbc-latest - cache: conda-python-3.7 - image: conda-python-turbodbc - title: AMD64 Conda Python 3.7 Turbodbc latest - turbodbc: latest - - name: conda-python-3.7-kartothek-latest - cache: conda-python-3.7 - image: conda-python-kartothek - title: AMD64 Conda Python 3.7 Kartothek latest - kartothek: latest - - name: conda-python-3.7-pandas-0.24 - cache: conda-python-3.7 - image: conda-python-pandas - title: AMD64 Conda Python 3.7 Pandas 0.24 - pandas: 0.24 - - name: conda-python-3.7-pandas-master - cache: conda-python-3.7 - image: --no-leaf-cache conda-python-pandas - title: AMD64 Conda Python 3.7 Pandas master - pandas: master - - name: conda-python-3.7-hdfs-2.9.2 - cache: conda-python-3.7 - image: conda-python-hdfs - title: AMD64 Conda Python 3.7 HDFS 2.9.2 - hdfs: 2.9.2 - env: - # the defaults here should correspond to the values in .env - DEBIAN: ${{ matrix.debian || 10 }} - FEDORA: ${{ matrix.fedora || 33 }} - UBUNTU: ${{ matrix.ubuntu || 18.04 }} - PYTHON: ${{ matrix.python || 3.7 }} - HDFS: ${{ matrix.hdfs || '2.9.2' }} - DASK: ${{ matrix.dask || 'latest' }} - TURBODBC: ${{ matrix.turbodbc || 'latest' }} - PANDAS: ${{ matrix.pandas || 'latest' }} - KARTOTHEK: ${{ matrix.kartothek || 'latest' }} - steps: - - name: Checkout Arrow - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - name: Fetch Submodules and Tags - run: ci/scripts/util_checkout.sh - - name: Free Up Disk Space - run: ci/scripts/util_cleanup.sh - - name: Cache Docker Volumes - uses: actions/cache@v2 - with: - path: .docker - key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} - restore-keys: ${{ matrix.cache }}- - - name: Setup Python - uses: actions/setup-python@v1 - with: - 
python-version: 3.8 - - name: Setup Archery - run: pip install -e dev/archery[docker] - - name: Execute Docker Build - run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited - archery docker run ${{ matrix.image }} - - name: Docker Push - if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' - continue-on-error: true - run: archery docker push ${{ matrix.image }} diff --git a/.github/workflows/r-without-arrow.yml b/.github/workflows/r-without-arrow.yml new file mode 100644 index 00000000000..309c6ece5d0 --- /dev/null +++ b/.github/workflows/r-without-arrow.yml @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: R without Arrow + +on: + push: + paths: + - ".github/workflows/r-without-arrow.yml" + - "r/src/**" + pull_request: + paths: + - ".github/workflows/r-without-arrow.yml" + - "r/src/**" + +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + +env: + DOCKER_VOLUME_PREFIX: ".docker/" + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + +jobs: + bundled: + name: "R package without arrow" + runs-on: ubuntu-latest + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 + strategy: + fail-fast: false + env: + R_ORG: rhub + R_IMAGE: ubuntu-gcc-release + R_TAG: latest + steps: + - name: Checkout Arrow + uses: actions/checkout@v2 + with: + fetch-depth: 0 + - name: Fetch Submodules and Tags + run: ci/scripts/util_checkout.sh + - name: Free Up Disk Space + run: ci/scripts/util_cleanup.sh + - name: Cache Docker Volumes + uses: actions/cache@v2 + with: + path: .docker + key: ubuntu-gcc-release-r-${{ hashFiles('cpp/**') }} + restore-keys: ubuntu-gcc-release-r- + - name: Setup Python + uses: actions/setup-python@v1 + with: + python-version: 3.8 + - name: Setup Archery + run: pip install -e dev/archery[docker] + - name: Execute Docker Build + run: | + sudo sysctl -w kernel.core_pattern="core.%e.%p" + ulimit -c unlimited + archery docker run -e LIBARROW_DOWNLOAD=FALSE -e LIBARROW_BUILD=FALSE -e TEST_R_WITH_ARROW=FALSE -e NOT_CRAN=FALSE r + - name: Dump install logs + run: cat r/check/arrow.Rcheck/00install.out + if: always() + - name: Dump test logs + run: cat r/check/arrow.Rcheck/tests/testthat.Rout* + if: always() + - name: Save the test output + if: always() + uses: actions/upload-artifact@v2 + with: + name: test-output + path: r/check/arrow.Rcheck/tests/testthat.Rout* + - name: Docker Push + if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' + continue-on-error: true + run: archery docker 
push r diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 7851b6b1915..9a2fcf5daec 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -39,6 +39,10 @@ on: - "cpp/**" - "r/**" +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + env: DOCKER_VOLUME_PREFIX: ".docker/" ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} @@ -49,11 +53,12 @@ jobs: name: AMD64 Ubuntu ${{ matrix.ubuntu }} R ${{ matrix.r }} runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 strategy: fail-fast: false matrix: - r: ["3.6"] - ubuntu: [18.04] + r: ["4.1"] + ubuntu: [20.04] env: R: ${{ matrix.r }} UBUNTU: ${{ matrix.ubuntu }} @@ -84,7 +89,9 @@ jobs: run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited - archery docker run ubuntu-r + # Setting a non-default and non-probable Marquesas French Polynesia time + # it has both with a .45 offset and very very few people who live there. + archery docker run -e TZ=MART ubuntu-r - name: Dump install logs run: cat r/check/arrow.Rcheck/00install.out if: always() @@ -106,6 +113,7 @@ jobs: name: "${{ matrix.config.org }}/${{ matrix.config.image }}:${{ matrix.config.tag }}" runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -141,7 +149,10 @@ jobs: run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited - archery docker run r + # Don't set a TZ here to test that case. These builds will have the following warning in them: + # System has not been booted with systemd as init system (PID 1). Can't operate. 
+ # Failed to connect to bus: Host is down + archery docker run -e TZ="" r - name: Dump install logs run: cat r/check/arrow.Rcheck/00install.out if: always() @@ -163,6 +174,7 @@ jobs: name: AMD64 Windows RTools ${{ matrix.rtools }} runs-on: windows-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -207,7 +219,7 @@ jobs: - uses: r-lib/actions/setup-r@master with: rtools-version: 40 - r-version: "4.0" + r-version: "4.1" Ncpus: 2 - uses: r-lib/actions/setup-r@master if: ${{ matrix.rtools == 35 }} @@ -235,7 +247,8 @@ jobs: run: | Sys.setenv( RWINLIB_LOCAL = file.path(Sys.getenv("GITHUB_WORKSPACE"), "libarrow.zip"), - MAKEFLAGS = paste0("-j", parallel::detectCores()) + MAKEFLAGS = paste0("-j", parallel::detectCores()), + "_R_CHECK_FORCE_SUGGESTS_" = FALSE ) rcmdcheck::rcmdcheck("r", build_args = '--no-build-vignettes', diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index d9430f536b2..067b40aefe9 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -43,6 +43,10 @@ on: - 'cpp/**' - 'ruby/**' +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + env: DOCKER_VOLUME_PREFIX: ".docker/" ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} @@ -54,11 +58,11 @@ jobs: name: AMD64 Ubuntu ${{ matrix.ubuntu }} GLib & Ruby runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 40 strategy: fail-fast: false matrix: ubuntu: - - 18.04 - 20.04 env: UBUNTU: ${{ matrix.ubuntu }} @@ -89,7 +93,11 @@ jobs: run: | sudo sysctl -w kernel.core_pattern="core.%e.%p" ulimit -c unlimited - archery docker run ubuntu-ruby + archery docker run \ + -e ARROW_FLIGHT=ON \ + -e Protobuf_SOURCE=BUNDLED \ + -e gRPC_SOURCE=BUNDLED \ + ubuntu-ruby - name: Docker Push if: success() && github.event_name == 'push' && github.repository == 'apache/arrow' continue-on-error: true @@ -100,10 
+108,12 @@ jobs: name: AMD64 MacOS 10.15 GLib & Ruby runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 40 strategy: fail-fast: false env: ARROW_BUILD_TESTS: OFF + ARROW_FLIGHT: ON ARROW_GANDIVA: ON ARROW_GLIB_DEVELOPMENT_MODE: true ARROW_GLIB_GTK_DOC: true @@ -130,7 +140,6 @@ jobs: run: | rm -f /usr/local/bin/2to3 brew update --preinstall - brew unlink gcc@8 gcc@9 brew bundle --file=cpp/Brewfile brew bundle --file=c_glib/Brewfile - name: Install Ruby Dependencies @@ -171,19 +180,19 @@ jobs: name: AMD64 Windows MinGW ${{ matrix.mingw-n-bits }} GLib & Ruby runs-on: windows-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + timeout-minutes: 40 strategy: fail-fast: false matrix: mingw-n-bits: - 64 ruby-version: - - 2.6 + - "3.0" env: ARROW_BUILD_SHARED: ON ARROW_BUILD_STATIC: OFF ARROW_BUILD_TESTS: OFF ARROW_BUILD_TYPE: release - ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON ARROW_HDFS: OFF @@ -223,7 +232,7 @@ jobs: shell: bash run: ci/scripts/util_checkout.sh - name: Setup Ruby - uses: actions/setup-ruby@v1 + uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} - name: Upgrade MSYS2 diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml deleted file mode 100644 index 6d87e6b6260..00000000000 --- a/.github/workflows/rust.yml +++ /dev/null @@ -1,470 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Rust - -on: - push: - paths: - - '.github/workflows/rust.yml' - - 'rust/**' - - 'format/Flight.proto' - pull_request: - paths: - - '.github/workflows/rust.yml' - - 'rust/**' - - 'format/Flight.proto' - -jobs: - - # build the library, a compilation step used by multiple steps below - linux-build-lib: - name: Build Libraries on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - steps: - - uses: actions/checkout@v2 - - name: Cache Cargo - uses: actions/cache@v2 - with: - # these represent dependencies downloaded by cargo - # and thus do not depend on the OS, arch nor rust version. - path: /github/home/.cargo - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - # these represent compiled steps of both dependencies and arrow - # and thus are specific for a particular OS, arch and rust version. 
- path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }}- - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Build Workspace - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust - cargo build - # Ballista is currently not part of the main workspace so requires a separate build step - - name: Build Ballista - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/ballista/rust - # snmalloc requires cmake so build without default features - cargo build --no-default-features - - # test the crate - linux-test: - name: Test Workspace on AMD64 Rust ${{ matrix.rust }} - needs: [linux-build-lib] - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Run tests - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust - # run tests on all workspace members with default feature list - cargo test - # test datafusion examples - cd datafusion-examples - cargo test --no-default-features - cargo run --example csv_sql - cargo run --example parquet_sql - cd .. - cd arrow - # re-run tests on arrow workspace with additional features - cargo test --features=prettyprint - cargo run --example builders - cargo run --example dynamic_types - cargo run --example read_csv - cargo run --example read_csv_infer_schema - # Ballista is currently not part of the main workspace so requires a separate test step - - name: Run Ballista tests - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/ballista/rust - # snmalloc requires cmake so build without default features - cargo test --no-default-features - - # test the --features "simd" of the arrow crate. This requires nightly. 
- linux-test-simd: - name: Test SIMD on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [nightly-2020-11-24] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Run tests - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo test --features "simd" - - windows-and-macos: - name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [windows-latest, macos-latest] - rust: [stable] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - # TODO: this won't cache anything, which is expensive. Setup this action - # with a OS-dependent path. 
- - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Run tests - shell: bash - run: | - export ARROW_TEST_DATA=$(pwd)/testing/data - export PARQUET_TEST_DATA=$(pwd)/cpp/submodules/parquet-testing/data - # do not produce debug symbols to keep memory usage down - export RUSTFLAGS="-C debuginfo=0" - cd rust - cargo test - - clippy: - name: Clippy - needs: [linux-build-lib] - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt clippy - - name: Run clippy - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust - cargo clippy --all-targets --workspace -- -D warnings -A clippy::redundant_field_names - - miri-checks: - name: MIRI - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [nightly-2021-01-19] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - uses: actions/cache@v2 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: ${{ runner.os }}-cargo-miri-${{ hashFiles('**/Cargo.lock') }} - - name: 
Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt clippy miri - - name: Run Miri Checks - env: - RUST_BACKTRACE: full - RUST_LOG: 'trace' - run: | - export MIRIFLAGS="-Zmiri-disable-isolation" - cd rust - cargo miri setup - cargo clean - # Ignore MIRI errors until we can get a clean run - cargo miri test || true - - coverage: - name: Coverage - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [stable] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /home/runner/.cargo - # this key is not equal because the user is different than on a container (runner vs github) - key: cargo-coverage-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /home/runner/target - # this key is not equal because coverage uses different compilation flags. - key: ${{ runner.os }}-${{ matrix.arch }}-target-coverage-cache-${{ matrix.rust }}- - - name: Run coverage - run: | - export CARGO_HOME="/home/runner/.cargo" - export CARGO_TARGET_DIR="/home/runner/target" - - export ARROW_TEST_DATA=$(pwd)/testing/data - export PARQUET_TEST_DATA=$(pwd)/cpp/submodules/parquet-testing/data - - # 2020-11-15: There is a cargo-tarpaulin regression in 0.17.0 - # see https://github.com/xd009642/tarpaulin/issues/618 - cargo install --version 0.16.0 cargo-tarpaulin - cd rust - cargo tarpaulin --out Xml - - name: Report coverage - continue-on-error: true - run: bash <(curl -s https://codecov.io/bash) - - # test FFI against the C-Data interface exposed by pyarrow - pyarrow-integration-test: - name: Test Pyarrow C Data Interface - runs-on: ubuntu-latest - strategy: - matrix: - rust: [stable] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add 
rustfmt clippy - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /home/runner/.cargo - key: cargo-maturin-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /home/runner/target - # this key is not equal because maturin uses different compilation flags. - key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}- - - uses: actions/setup-python@v2 - with: - python-version: '3.7' - - name: Install Python dependencies - run: python -m pip install --upgrade pip setuptools wheel - - name: Run tests - run: | - export CARGO_HOME="/home/runner/.cargo" - export CARGO_TARGET_DIR="/home/runner/target" - - cd rust/arrow-pyarrow-integration-testing - - python -m venv venv - source venv/bin/activate - - pip install maturin==0.8.2 toml==0.10.1 pyarrow==1.0.0 - maturin develop - python -m unittest discover tests - - # test the arrow crate builds against wasm32 in stable rust - wasm32-build: - name: Build wasm32 on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [nightly-2020-11-24] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-wasm32-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup override set ${{ matrix.rust }} - rustup component add rustfmt - rustup target add wasm32-unknown-unknown - - name: Build arrow crate - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo build --target wasm32-unknown-unknown - - # test the projects can build without default features - default-build: - name: Check No Defaults on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-wasm32-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup override set ${{ matrix.rust }} - rustup component add rustfmt - - name: Build arrow crate - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo check --all-targets --no-default-features diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d2d2d81d68..0718072308a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,21 +29,6 @@ repos: entry: bash -c "git archive HEAD --prefix=apache-arrow/ --output=arrow-src.tar && ./dev/release/run-rat.sh arrow-src.tar" always_run: true pass_filenames: false - - id: rustfmt - name: Rust Format - language: system - entry: bash -c "cd rust && cargo +stable fmt --all -- --check" - files: ^rust/.*\.rs$ - types: - - file - - rust - - id: cmake-format - name: CMake Format - language: python - entry: python run-cmake-format.py - types: [cmake] - additional_dependencies: - - cmake_format==0.5.2 - id: hadolint name: Docker Format language: docker_image diff --git a/.travis.yml b/.travis.yml index 2cf70cca982..6a279a2f87b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-dist: bionic +dist: focal language: minimal @@ -43,7 +43,13 @@ jobs: include: - name: "C++ on ARM" os: linux - arch: arm64 + arch: arm64-graviton2 + # This is required for arm64-graviton2. + # https://docs.travis-ci.com/user/multi-cpu-architectures/#example-multi-architecture-build-matrix + group: edge + # This is required for arm64-graviton2. + # https://docs.travis-ci.com/user/multi-cpu-architectures/#testing-on-multiple-cpu-architectures + virt: vm env: <<: *global_env ARCH: arm64v8 @@ -51,28 +57,29 @@ jobs: DOCKER_IMAGE_ID: ubuntu-cpp # ARROW_USE_GLOG=OFF is needed to avoid build error caused by # glog and CMAKE_UNITY_BUILD=ON. - # - # Disable ARROW_S3 because it often causes "No output has - # been received in the last 10m0s, this potentially indicates - # a stalled build or something wrong with the build itself." - # on Travis CI. - # - # Limiting CPP_MAKE_PARALLELISM is required to avoid random compiler - # crashes. DOCKER_RUN_ARGS: >- " -e ARROW_BUILD_STATIC=OFF -e ARROW_ORC=OFF - -e ARROW_S3=OFF -e ARROW_USE_GLOG=OFF -e CMAKE_UNITY_BUILD=ON - -e CPP_MAKE_PARALLELISM=4 " - # The LLVM's APT repository provides only arm64 binaries. + # The LLVM's APT repository doesn't provide arm64 binaries. # We should use LLVM provided by Ubuntu. LLVM: "10" UBUNTU: "20.04" + - name: "Go on ARM" + os: linux + arch: arm64-graviton2 + group: edge + virt: vm + env: + <<: *global_env + ARCH: arm64v8 + ARROW_CI_MODULES: "GO" + DOCKER_IMAGE_ID: debian-go + - name: "C++ on s390x" os: linux arch: s390x @@ -97,11 +104,10 @@ jobs: -e PARQUET_BUILD_EXAMPLES=OFF -e PARQUET_BUILD_EXECUTABLES=OFF -e Protobuf_SOURCE=BUNDLED - -e cares_SOURCE=BUNDLED -e gRPC_SOURCE=BUNDLED " - # The LLVM's APT repository provides only arm64 binaries. - # We should use LLVM provided by Ubuntu. 
+ # The LLVM's APT repository causes download error for s390x binary + # We should use the LLVM provided by the default APT repository LLVM: "10" UBUNTU: "20.04" @@ -125,6 +131,7 @@ jobs: JDK: 11 allow_failures: + - name: "Go on ARM" - name: "Go on s390x" - name: "Java on s390x" @@ -143,14 +150,11 @@ before_install: fi install: - - pip3 install -e dev/archery[docker] + - sudo -H pip3 install --upgrade pip + - sudo -H pip3 install 'docker-compose>=1.27.0' + - sudo -H pip3 install -e dev/archery[docker] script: - - sudo sysctl -w kernel.core_pattern="core.%e.%p" - # This isn't allowed on Travis CI: - # /home/travis/.travis/functions: line 109: ulimit: core file size: cannot modify limit: Operation not permitted - - | - ulimit -c unlimited || : - | archery docker run \ ${DOCKER_RUN_ARGS} \ diff --git a/LICENSE.txt b/LICENSE.txt index 4cec07fd0c9..5d4de206545 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -2218,3 +2218,25 @@ https://github.com/pypa/packaging/ which is made available under both the Apache license v2.0 and the BSD 2-clause license. + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/pcg contain code from + +https://github.com/imneme/pcg-cpp + +and have the following copyright notice: + +Copyright 2014-2019 Melissa O'Neill , + and the PCG Project contributors. + +SPDX-License-Identifier: (Apache-2.0 OR MIT) + +Licensed under the Apache License, Version 2.0 (provided in +LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) +or under the MIT license (provided in LICENSE-MIT.txt and at +http://opensource.org/licenses/MIT), at your option. This file may not +be copied, modified, or distributed except according to those terms. + +Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either +express or implied. See your chosen license for details. 
diff --git a/README.md b/README.md index 133018c72df..7d10b81c6e4 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,6 @@ # Apache Arrow -[![Build Status](https://ci.appveyor.com/api/projects/status/github/apache/arrow/branch/master?svg=true)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/arrow/branch/master) -[![Coverage Status](https://codecov.io/gh/apache/arrow/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow?branch=master) [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/arrow.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:arrow) [![License](http://img.shields.io/:license-Apache%202-blue.svg)](https://github.com/apache/arrow/blob/master/LICENSE.txt) [![Twitter Follow](https://img.shields.io/twitter/follow/apachearrow.svg?style=social&label=Follow)](https://twitter.com/apachearrow) @@ -53,7 +51,7 @@ Major components of the project include: - [Python libraries](https://github.com/apache/arrow/tree/master/python) - [R libraries](https://github.com/apache/arrow/tree/master/r) - [Ruby libraries](https://github.com/apache/arrow/tree/master/ruby) - - [Rust libraries](https://github.com/apache/arrow/tree/master/rust) + - [Rust libraries](https://github.com/apache/arrow-rs) Arrow is an [Apache Software Foundation](https://www.apache.org) project. Learn more at [arrow.apache.org](https://arrow.apache.org). 
diff --git a/c_glib/Gemfile b/c_glib/Gemfile index 4b570902bcd..6864cfd3244 100644 --- a/c_glib/Gemfile +++ b/c_glib/Gemfile @@ -20,4 +20,4 @@ source "https://rubygems.org/" gem "test-unit" -gem "gobject-introspection" +gem "gobject-introspection", ">= 3.4.7" diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h index ff160452845..03e56516112 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.h +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.h @@ -21,6 +21,8 @@ #include +#include +#include #include #include #include diff --git a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp index c221825bc2a..65341b9b77e 100644 --- a/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp +++ b/c_glib/arrow-dataset-glib/arrow-dataset-glib.hpp @@ -21,6 +21,8 @@ #include +#include +#include #include #include #include diff --git a/c_glib/arrow-dataset-glib/dataset-factory.cpp b/c_glib/arrow-dataset-glib/dataset-factory.cpp new file mode 100644 index 00000000000..146db69adfc --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.cpp @@ -0,0 +1,468 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: dataset-factory + * @section_id: dataset-factory + * @title: Dataset factory related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDatasetFactory is a base class for dataset factories. + * + * #GADatasetFileSystemDatasetFactory is a class for + * #GADatasetFileSystemDataset factory. + * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetFactoryPrivate_ { + std::shared_ptr factory; +} GADatasetDatasetFactoryPrivate; + +enum { + PROP_DATASET_FACTORY = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDatasetFactory, + gadataset_dataset_factory, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_dataset_factory_get_instance_private( \ + GADATASET_DATASET_FACTORY(obj))) + +static void +gadataset_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + priv->factory.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET_FACTORY: + { + auto arrow_factory_pointer = + static_cast *>( + g_value_get_pointer(value)); + if (arrow_factory_pointer) { + priv->factory = *arrow_factory_pointer; + } + } + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_factory_init(GADatasetDatasetFactory *object) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->factory) std::shared_ptr; +} + +static void +gadataset_dataset_factory_class_init(GADatasetDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = 
gadataset_dataset_factory_finalize; + gobject_class->set_property = gadataset_dataset_factory_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset-factory", + "Dataset factory", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET_FACTORY, spec); +} + +/** + * gadataset_dataset_factory_finish: + * @factory: A #GADatasetDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetDataset on success, %NULL on error. + * + * Since: 5.0.0 + */ +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error) +{ + auto arrow_factory = gadataset_dataset_factory_get_raw(factory); + auto arrow_dataset_result = arrow_factory->Finish(); + if (garrow::check(error, arrow_dataset_result, "[dataset-factory][finish]")) { + auto arrow_dataset = *arrow_dataset_result; + return gadataset_dataset_new_raw(&arrow_dataset); + } else { + return NULL; + } +} + + +typedef struct GADatasetFileSystemDatasetFactoryPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; + GList *files; + arrow::dataset::FileSystemFactoryOptions options; +} GADatasetFileSystemDatasetFactoryPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET_TYPE_DATASET_FACTORY) + +#define GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_factory_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET_FACTORY(obj))) + +static void +gadataset_file_system_dataset_factory_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) 
{ + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + if (priv->files) { + g_list_free_full(priv->files, g_object_unref); + priv->files = NULL; + } + + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_factory_finalize(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + priv->options.~FileSystemFactoryOptions(); + G_OBJECT_CLASS( + gadataset_file_system_dataset_factory_parent_class)->finalize(object); +} + +static void +gadataset_file_system_dataset_factory_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_factory_init( + GADatasetFileSystemDatasetFactory *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(object); + new(&priv->options) arrow::dataset::FileSystemFactoryOptions; +} + +static void +gadataset_file_system_dataset_factory_class_init( + GADatasetFileSystemDatasetFactoryClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_factory_dispose; + gobject_class->finalize = 
gadataset_file_system_dataset_factory_finalize; + gobject_class->set_property = gadataset_file_system_dataset_factory_set_property; + gobject_class->get_property = gadataset_file_system_dataset_factory_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDatasetFactory:format: + * + * Format passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format passed to GADatasetFileSystemDataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDatasetFactory:file-system: + * + * File system passed to #GADatasetFileSystemDataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system passed to GADatasetFileSystemDataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READABLE)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); +} + +/** + * gadataset_file_system_dataset_factory_new: + * @format: A #GADatasetFileFormat. + * + * + * Returns: A newly created #GADatasetFileSystemDatasetFactory. + * It never returns %NULL. + * + * Since: 5.0.0 + */ +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *format) +{ + return GADATASET_FILE_SYSTEM_DATASET_FACTORY( + g_object_new(GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY, + "format", format, + NULL)); +} + +/** + * gadataset_file_system_dataset_factory_set_file_system: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @file_system: A #GArrowFileSystem. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise.
+ * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + priv->file_system = file_system; + g_object_ref(priv->file_system); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_set_file_system_uri: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @uri: An URI for file system. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][set-file-system-uri]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system is already set"), + context); + return FALSE; + } + std::string internal_path; + auto arrow_file_system_result = + arrow::fs::FileSystemFromUri(uri, &internal_path); + if (!garrow::check(error, arrow_file_system_result, context)) { + return FALSE; + } + auto arrow_file_system = *arrow_file_system_result; + auto arrow_file_info_result = arrow_file_system->GetFileInfo(internal_path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + priv->file_system = garrow_file_system_new_raw(&arrow_file_system); + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * 
gadataset_file_system_dataset_factory_add_path: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @path: A path to be added. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][add-path]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return FALSE; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_file_info_result = arrow_file_system->GetFileInfo(path); + if (!garrow::check(error, arrow_file_info_result, context)) { + return FALSE; + } + auto file_info = garrow_file_info_new_raw(*arrow_file_info_result); + priv->files = g_list_prepend(priv->files, file_info); + return TRUE; +} + +/** + * gadataset_file_system_dataset_factory_finish: + * @factory: A #GADatasetFileSystemDatasetFactory. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetFileSystemDataset on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error) +{ + const gchar *context = "[file-system-dataset-factory][finish]"; + auto priv = GADATASET_FILE_SYSTEM_DATASET_FACTORY_GET_PRIVATE(factory); + if (!priv->file_system) { + garrow::check(error, + arrow::Status::Invalid("file system isn't set"), + context); + return NULL; + } + auto arrow_file_system = garrow_file_system_get_raw(priv->file_system); + auto arrow_format = gadataset_file_format_get_raw(priv->format); + std::vector arrow_files; + priv->files = g_list_reverse(priv->files); + for (auto node = priv->files; node; node = node->next) { + auto file = GARROW_FILE_INFO(node->data); + arrow_files.push_back(*garrow_file_info_get_raw(file)); + } + priv->files = g_list_reverse(priv->files); + auto arrow_factory_result = + arrow::dataset::FileSystemDatasetFactory::Make(arrow_file_system, + arrow_files, + arrow_format, + priv->options); + if (!garrow::check(error, arrow_factory_result, context)) { + return NULL; + } + auto arrow_dataset_result = (*arrow_factory_result)->Finish(); + if (!garrow::check(error, arrow_dataset_result, context)) { + return NULL; + } + auto arrow_dataset = *arrow_dataset_result; + return GADATASET_FILE_SYSTEM_DATASET( + gadataset_dataset_new_raw(&arrow_dataset, + "dataset", &arrow_dataset, + "file-system", priv->file_system, + "format", priv->format, + NULL)); +} + + +G_END_DECLS + +std::shared_ptr +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory) +{ + auto priv = GADATASET_DATASET_FACTORY_GET_PRIVATE(factory); + return priv->factory; +} diff --git a/c_glib/arrow-dataset-glib/dataset-factory.h b/c_glib/arrow-dataset-glib/dataset-factory.h new file mode 100644 index 00000000000..e2ee3ed9806 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.h @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +#define GADATASET_TYPE_DATASET_FACTORY (gadataset_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDatasetFactory, + gadataset_dataset_factory, + GADATASET, + DATASET_FACTORY, + GObject) +struct _GADatasetDatasetFactoryClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetDataset * +gadataset_dataset_factory_finish(GADatasetDatasetFactory *factory, + GError **error); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET_FACTORY \ + (gadataset_file_system_dataset_factory_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDatasetFactory, + gadataset_file_system_dataset_factory, + GADATASET, + FILE_SYSTEM_DATASET_FACTORY, + GADatasetDatasetFactory) +struct _GADatasetFileSystemDatasetFactoryClass +{ + GADatasetDatasetFactoryClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDatasetFactory * +gadataset_file_system_dataset_factory_new(GADatasetFileFormat *file_format); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_set_file_system( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSystem *file_system, + GError **error); +gboolean +gadataset_file_system_dataset_factory_set_file_system_uri( + 
GADatasetFileSystemDatasetFactory *factory, + const gchar *uri, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_path( + GADatasetFileSystemDatasetFactory *factory, + const gchar *path, + GError **error); +/* +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_file( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileInfo *file, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gboolean +gadataset_file_system_dataset_factory_add_selector( + GADatasetFileSystemDatasetFactory *factory, + GArrowFileSelector *selector, + GError **error); +*/ + +GARROW_AVAILABLE_IN_5_0 +GADatasetFileSystemDataset * +gadataset_file_system_dataset_factory_finish( + GADatasetFileSystemDatasetFactory *factory, + GError **error); + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset-factory.hpp b/c_glib/arrow-dataset-glib/dataset-factory.hpp new file mode 100644 index 00000000000..114db35bc59 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset-factory.hpp @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +#pragma once + +#include + +#include + +std::shared_ptr +gadataset_dataset_factory_get_raw(GADatasetDatasetFactory *factory); diff --git a/c_glib/arrow-dataset-glib/dataset.cpp b/c_glib/arrow-dataset-glib/dataset.cpp new file mode 100644 index 00000000000..3bd62f99ef3 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.cpp @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: dataset + * @section_id: dataset + * @title: Dataset related classes + * @include: arrow-dataset-glib/arrow-dataset-glib.h + * + * #GADatasetDataset is a base class for datasets. + * + * #GADatasetFileSystemDataset is a class for file system dataset. + * + * #GADatasetFileFormat is a base class for file formats. + * + * #GADatasetCSVFileFormat is a class for CSV file format. + * + * #GADatasetIPCFileFormat is a class for IPC file format. + * + * #GADatasetParquetFileFormat is a class for Apache Parquet file format. 
+ * + * Since: 5.0.0 + */ + +typedef struct GADatasetDatasetPrivate_ { + std::shared_ptr dataset; +} GADatasetDatasetPrivate; + +enum { + PROP_DATASET = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetDataset, + gadataset_dataset, + G_TYPE_OBJECT) + +#define GADATASET_DATASET_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_dataset_get_instance_private( \ + GADATASET_DATASET(obj))) + +static void +gadataset_dataset_finalize(GObject *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + priv->dataset.~shared_ptr(); + G_OBJECT_CLASS(gadataset_dataset_parent_class)->finalize(object); +} + +static void +gadataset_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DATASET: + priv->dataset = + *static_cast *>( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_dataset_init(GADatasetDataset *object) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(object); + new(&priv->dataset) std::shared_ptr; +} + +static void +gadataset_dataset_class_init(GADatasetDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_dataset_finalize; + gobject_class->set_property = gadataset_dataset_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("dataset", + "Dataset", + "The raw " + "std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATASET, spec); +} + +/** + * gadataset_dataset_begin_scan: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScannerBuilder on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error) +{ + return gadataset_scanner_builder_new(dataset, error); +} + +/** + * gadataset_dataset_to_table: + * @dataset: A #GADatasetDataset. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full) (nullable): + * A loaded #GArrowTable on success, %NULL on error. + * + * Since: 5.0.0 + */ +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error) +{ + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (!garrow::check(error, + arrow_scanner_builder_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner_builder = *arrow_scanner_builder_result; + auto arrow_scanner_result = arrow_scanner_builder->Finish(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + auto arrow_scanner = *arrow_scanner_result; + auto arrow_table_result = arrow_scanner->ToTable(); + if (!garrow::check(error, + arrow_scanner_result, + "[dataset][to-table]")) { + return NULL; + } + return garrow_table_new_raw(&(*arrow_table_result)); +} + +/** + * gadataset_dataset_get_type_name: + * @dataset: A #GADatasetDataset. + * + * Returns: The type name of @dataset. + * + * It should be freed with g_free() when no longer needed. 
+ * + * Since: 5.0.0 + */ +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset) +{ + const auto arrow_dataset = gadataset_dataset_get_raw(dataset); + const auto &type_name = arrow_dataset->type_name(); + return g_strndup(type_name.data(), type_name.size()); +} + + +typedef struct GADatasetFileSystemDatasetPrivate_ { + GADatasetFileFormat *format; + GArrowFileSystem *file_system; +} GADatasetFileSystemDatasetPrivate; + +enum { + PROP_FORMAT = 1, + PROP_FILE_SYSTEM, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET_TYPE_DATASET) + +#define GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_system_dataset_get_instance_private( \ + GADATASET_FILE_SYSTEM_DATASET(obj))) + +static void +gadataset_file_system_dataset_dispose(GObject *object) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + if (priv->format) { + g_object_unref(priv->format); + priv->format = NULL; + } + + if (priv->file_system) { + g_object_unref(priv->file_system); + priv->file_system = NULL; + } + + G_OBJECT_CLASS(gadataset_file_system_dataset_parent_class)->dispose(object); +} + +static void +gadataset_file_system_dataset_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + priv->format = GADATASET_FILE_FORMAT(g_value_dup_object(value)); + break; + case PROP_FILE_SYSTEM: + priv->file_system = GARROW_FILE_SYSTEM(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GADATASET_FILE_SYSTEM_DATASET_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_FORMAT: + g_value_set_object(value, priv->format); + break; + case 
PROP_FILE_SYSTEM: + g_value_set_object(value, priv->file_system); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gadataset_file_system_dataset_init(GADatasetFileSystemDataset *object) +{ +} + +static void +gadataset_file_system_dataset_class_init(GADatasetFileSystemDatasetClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = gadataset_file_system_dataset_dispose; + gobject_class->set_property = gadataset_file_system_dataset_set_property; + gobject_class->get_property = gadataset_file_system_dataset_get_property; + + GParamSpec *spec; + /** + * GADatasetFileSystemDataset:format: + * + * Format of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("format", + "Format", + "Format of the dataset", + GADATASET_TYPE_FILE_FORMAT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FORMAT, spec); + + /** + * GADatasetFileSystemDataset:file-system: + * + * File system of the dataset. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("file-system", + "File system", + "File system of the dataset", + GARROW_TYPE_FILE_SYSTEM, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_FILE_SYSTEM, spec); +} + + +G_END_DECLS + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset) +{ + return gadataset_dataset_new_raw(arrow_dataset, + "dataset", arrow_dataset, + NULL); +} + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + ...) 
+{ + va_list args; + va_start(args, first_property_name); + auto array = gadataset_dataset_new_raw_valist(arrow_dataset, + first_property_name, + args); + va_end(args); + return array; +} + +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + va_list args) +{ + GType type = GADATASET_TYPE_DATASET; + const auto type_name = (*arrow_dataset)->type_name(); + if (type_name == "filesystem") { + type = GADATASET_TYPE_FILE_SYSTEM_DATASET; + } + return GADATASET_DATASET(g_object_new_valist(type, + first_property_name, + args)); +} + +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset) +{ + auto priv = GADATASET_DATASET_GET_PRIVATE(dataset); + return priv->dataset; +} diff --git a/c_glib/arrow-dataset-glib/dataset.h b/c_glib/arrow-dataset-glib/dataset.h new file mode 100644 index 00000000000..97cf35d74d7 --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.h @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +G_BEGIN_DECLS + +typedef struct _GADatasetScannerBuilder GADatasetScannerBuilder; + +#define GADATASET_TYPE_DATASET (gadataset_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetDataset, + gadataset_dataset, + GADATASET, + DATASET, + GObject) +struct _GADatasetDatasetClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_dataset_begin_scan(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_dataset_to_table(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gchar * +gadataset_dataset_get_type_name(GADatasetDataset *dataset); + + +#define GADATASET_TYPE_FILE_SYSTEM_DATASET \ + (gadataset_file_system_dataset_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileSystemDataset, + gadataset_file_system_dataset, + GADATASET, + FILE_SYSTEM_DATASET, + GADatasetDataset) +struct _GADatasetFileSystemDatasetClass +{ + GADatasetDatasetClass parent_class; +}; + + +G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/dataset.hpp b/c_glib/arrow-dataset-glib/dataset.hpp new file mode 100644 index 00000000000..94dddd2eb7a --- /dev/null +++ b/c_glib/arrow-dataset-glib/dataset.hpp @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset); +GADatasetDataset * +gadataset_dataset_new_raw( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + ...); +GADatasetDataset * +gadataset_dataset_new_raw_valist( + std::shared_ptr *arrow_dataset, + const gchar *first_property_name, + va_list arg); +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset); + +GADatasetFileFormat * +gadataset_file_format_new_raw( + std::shared_ptr *arrow_format); +std::shared_ptr +gadataset_dataset_get_raw(GADatasetDataset *dataset); + + diff --git a/c_glib/arrow-dataset-glib/file-format.cpp b/c_glib/arrow-dataset-glib/file-format.cpp index 7f10c9debbe..43f6a198f23 100644 --- a/c_glib/arrow-dataset-glib/file-format.cpp +++ b/c_glib/arrow-dataset-glib/file-format.cpp @@ -29,56 +29,57 @@ G_BEGIN_DECLS * @title: File format classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * - * #GADFileFormat is a base class for file format classes. + * #GADatasetFileFormat is a base class for file format classes. * - * #GADCSVFileFormat is a class for CSV file format. + * #GADatasetCSVFileFormat is a class for CSV file format. * - * #GADIPCFileFormat is a class for IPC file format. + * #GADatasetIPCFileFormat is a class for IPC file format. * - * #GADParquetFileFormat is a class for Parquet file format. + * #GADatasetParquetFileFormat is a class for Parquet file format. 
* - * * Since: 3.0.0 + * Since: 3.0.0 */ -typedef struct GADFileFormatPrivate_ { +typedef struct GADatasetFileFormatPrivate_ { std::shared_ptr file_format; -} GADFileFormatPrivate; +} GADatasetFileFormatPrivate; enum { PROP_FILE_FORMAT = 1, }; -G_DEFINE_TYPE_WITH_PRIVATE(GADFileFormat, - gad_file_format, +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetFileFormat, + gadataset_file_format, G_TYPE_OBJECT) -#define GAD_FILE_FORMAT_GET_PRIVATE(obj) \ - static_cast( \ - gad_file_format_get_instance_private( \ - GAD_FILE_FORMAT(obj))) +#define GADATASET_FILE_FORMAT_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_file_format_get_instance_private( \ + GADATASET_FILE_FORMAT(obj))) static void -gad_file_format_finalize(GObject *object) +gadataset_file_format_finalize(GObject *object) { - auto priv = GAD_FILE_FORMAT_GET_PRIVATE(object); + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); priv->file_format.~shared_ptr(); - G_OBJECT_CLASS(gad_file_format_parent_class)->finalize(object); + G_OBJECT_CLASS(gadataset_file_format_parent_class)->finalize(object); } static void -gad_file_format_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_file_format_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GAD_FILE_FORMAT_GET_PRIVATE(object); + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); switch (prop_id) { case PROP_FILE_FORMAT: priv->file_format = - *static_cast *>(g_value_get_pointer(value)); + *static_cast *>( + g_value_get_pointer(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -87,19 +88,19 @@ gad_file_format_set_property(GObject *object, } static void -gad_file_format_init(GADFileFormat *object) +gadataset_file_format_init(GADatasetFileFormat *object) { - auto priv = GAD_FILE_FORMAT_GET_PRIVATE(object); + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(object); new(&priv->file_format) std::shared_ptr; } static void 
-gad_file_format_class_init(GADFileFormatClass *klass) +gadataset_file_format_class_init(GADatasetFileFormatClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - gobject_class->finalize = gad_file_format_finalize; - gobject_class->set_property = gad_file_format_set_property; + gobject_class->finalize = gadataset_file_format_finalize; + gobject_class->set_property = gadataset_file_format_set_property; GParamSpec *spec; spec = g_param_spec_pointer("file-format", @@ -111,8 +112,8 @@ gad_file_format_class_init(GADFileFormatClass *klass) } /** - * gad_file_format_get_type_name: - * @file_format: A #GADFileFormat. + * gadataset_file_format_get_type_name: + * @file_format: A #GADatasetFileFormat. * * Returns: The type name of @file_format. * @@ -121,145 +122,149 @@ gad_file_format_class_init(GADFileFormatClass *klass) * Since: 3.0.0 */ gchar * -gad_file_format_get_type_name(GADFileFormat *file_format) +gadataset_file_format_get_type_name(GADatasetFileFormat *file_format) { - const auto arrow_file_format = gad_file_format_get_raw(file_format); + const auto arrow_file_format = gadataset_file_format_get_raw(file_format); const auto &type_name = arrow_file_format->type_name(); return g_strndup(type_name.data(), type_name.size()); } /** - * gad_file_format_equal: - * @file_format: A #GADFileFormat. - * @other_file_format: A #GADFileFormat to be compared. + * gadataset_file_format_equal: + * @file_format: A #GADatasetFileFormat. + * @other_file_format: A #GADatasetFileFormat to be compared. * * Returns: %TRUE if they are the same content file format, %FALSE otherwise. 
* * Since: 3.0.0 */ gboolean -gad_file_format_equal(GADFileFormat *file_format, - GADFileFormat *other_file_format) +gadataset_file_format_equal(GADatasetFileFormat *file_format, + GADatasetFileFormat *other_file_format) { - const auto arrow_file_format = gad_file_format_get_raw(file_format); - const auto arrow_other_file_format = gad_file_format_get_raw(other_file_format); + const auto arrow_file_format = gadataset_file_format_get_raw(file_format); + const auto arrow_other_file_format = + gadataset_file_format_get_raw(other_file_format); return arrow_file_format->Equals(*arrow_other_file_format); } -G_DEFINE_TYPE(GADCSVFileFormat, - gad_csv_file_format, - GAD_TYPE_FILE_FORMAT) +G_DEFINE_TYPE(GADatasetCSVFileFormat, + gadataset_csv_file_format, + GADATASET_TYPE_FILE_FORMAT) static void -gad_csv_file_format_init(GADCSVFileFormat *object) +gadataset_csv_file_format_init(GADatasetCSVFileFormat *object) { } static void -gad_csv_file_format_class_init(GADCSVFileFormatClass *klass) +gadataset_csv_file_format_class_init(GADatasetCSVFileFormatClass *klass) { } /** - * gad_csv_file_format_new: + * gadataset_csv_file_format_new: * * Returns: The newly created CSV file format. 
* * Since: 3.0.0 */ -GADCSVFileFormat * -gad_csv_file_format_new(void) +GADatasetCSVFileFormat * +gadataset_csv_file_format_new(void) { std::shared_ptr arrow_file_format = std::make_shared(); - return GAD_CSV_FILE_FORMAT(gad_file_format_new_raw(&arrow_file_format)); + return GADATASET_CSV_FILE_FORMAT( + gadataset_file_format_new_raw(&arrow_file_format)); } -G_DEFINE_TYPE(GADIPCFileFormat, - gad_ipc_file_format, - GAD_TYPE_FILE_FORMAT) +G_DEFINE_TYPE(GADatasetIPCFileFormat, + gadataset_ipc_file_format, + GADATASET_TYPE_FILE_FORMAT) static void -gad_ipc_file_format_init(GADIPCFileFormat *object) +gadataset_ipc_file_format_init(GADatasetIPCFileFormat *object) { } static void -gad_ipc_file_format_class_init(GADIPCFileFormatClass *klass) +gadataset_ipc_file_format_class_init(GADatasetIPCFileFormatClass *klass) { } /** - * gad_ipc_file_format_new: + * gadataset_ipc_file_format_new: * * Returns: The newly created IPC file format. * * Since: 3.0.0 */ -GADIPCFileFormat * -gad_ipc_file_format_new(void) +GADatasetIPCFileFormat * +gadataset_ipc_file_format_new(void) { std::shared_ptr arrow_file_format = std::make_shared(); - return GAD_IPC_FILE_FORMAT(gad_file_format_new_raw(&arrow_file_format)); + return GADATASET_IPC_FILE_FORMAT( + gadataset_file_format_new_raw(&arrow_file_format)); } -G_DEFINE_TYPE(GADParquetFileFormat, - gad_parquet_file_format, - GAD_TYPE_FILE_FORMAT) +G_DEFINE_TYPE(GADatasetParquetFileFormat, + gadataset_parquet_file_format, + GADATASET_TYPE_FILE_FORMAT) static void -gad_parquet_file_format_init(GADParquetFileFormat *object) +gadataset_parquet_file_format_init(GADatasetParquetFileFormat *object) { } static void -gad_parquet_file_format_class_init(GADParquetFileFormatClass *klass) +gadataset_parquet_file_format_class_init(GADatasetParquetFileFormatClass *klass) { } /** - * gad_parquet_file_format_new: + * gadataset_parquet_file_format_new: * * Returns: The newly created Parquet file format. 
* * Since: 3.0.0 */ -GADParquetFileFormat * -gad_parquet_file_format_new(void) +GADatasetParquetFileFormat * +gadataset_parquet_file_format_new(void) { std::shared_ptr arrow_file_format = std::make_shared(); - return GAD_PARQUET_FILE_FORMAT(gad_file_format_new_raw(&arrow_file_format)); + return GADATASET_PARQUET_FILE_FORMAT( + gadataset_file_format_new_raw(&arrow_file_format)); } G_END_DECLS -GADFileFormat * -gad_file_format_new_raw( +GADatasetFileFormat * +gadataset_file_format_new_raw( std::shared_ptr *arrow_file_format) { - GType type = GAD_TYPE_FILE_FORMAT; + GType type = GADATASET_TYPE_FILE_FORMAT; const auto &type_name = (*arrow_file_format)->type_name(); if (type_name == "csv") { - type = GAD_TYPE_CSV_FILE_FORMAT; + type = GADATASET_TYPE_CSV_FILE_FORMAT; } else if (type_name == "ipc") { - type = GAD_TYPE_IPC_FILE_FORMAT; + type = GADATASET_TYPE_IPC_FILE_FORMAT; } else if (type_name == "parquet") { - type = GAD_TYPE_PARQUET_FILE_FORMAT; + type = GADATASET_TYPE_PARQUET_FILE_FORMAT; } - return GAD_FILE_FORMAT(g_object_new(type, - "file-format", arrow_file_format, - NULL)); + return GADATASET_FILE_FORMAT(g_object_new(type, + "file-format", arrow_file_format, + NULL)); } std::shared_ptr -gad_file_format_get_raw(GADFileFormat *file_format) +gadataset_file_format_get_raw(GADatasetFileFormat *file_format) { - auto priv = GAD_FILE_FORMAT_GET_PRIVATE(file_format); + auto priv = GADATASET_FILE_FORMAT_GET_PRIVATE(file_format); return priv->file_format; } diff --git a/c_glib/arrow-dataset-glib/file-format.h b/c_glib/arrow-dataset-glib/file-format.h index f77addc8da6..7a6f46f56e9 100644 --- a/c_glib/arrow-dataset-glib/file-format.h +++ b/c_glib/arrow-dataset-glib/file-format.h @@ -23,70 +23,71 @@ G_BEGIN_DECLS -#define GAD_TYPE_FILE_FORMAT (gad_file_format_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADFileFormat, - gad_file_format, - GAD, +#define GADATASET_TYPE_FILE_FORMAT (gadataset_file_format_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFileFormat, + 
gadataset_file_format, + GADATASET, FILE_FORMAT, GObject) -struct _GADFileFormatClass +struct _GADatasetFileFormatClass { GObjectClass parent_class; }; GARROW_AVAILABLE_IN_3_0 gchar * -gad_file_format_get_type_name(GADFileFormat *file_format); +gadataset_file_format_get_type_name(GADatasetFileFormat *file_format); GARROW_AVAILABLE_IN_3_0 gboolean -gad_file_format_equal(GADFileFormat *file_format, - GADFileFormat *other_file_format); +gadataset_file_format_equal(GADatasetFileFormat *file_format, + GADatasetFileFormat *other_file_format); -#define GAD_TYPE_CSV_FILE_FORMAT (gad_csv_file_format_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADCSVFileFormat, - gad_csv_file_format, - GAD, +#define GADATASET_TYPE_CSV_FILE_FORMAT (gadataset_csv_file_format_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetCSVFileFormat, + gadataset_csv_file_format, + GADATASET, CSV_FILE_FORMAT, - GADFileFormat) -struct _GADCSVFileFormatClass + GADatasetFileFormat) +struct _GADatasetCSVFileFormatClass { - GADFileFormatClass parent_class; + GADatasetFileFormatClass parent_class; }; GARROW_AVAILABLE_IN_3_0 -GADCSVFileFormat *gad_csv_file_format_new(void); +GADatasetCSVFileFormat *gadataset_csv_file_format_new(void); -#define GAD_TYPE_IPC_FILE_FORMAT (gad_ipc_file_format_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADIPCFileFormat, - gad_ipc_file_format, - GAD, +#define GADATASET_TYPE_IPC_FILE_FORMAT (gadataset_ipc_file_format_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetIPCFileFormat, + gadataset_ipc_file_format, + GADATASET, IPC_FILE_FORMAT, - GADFileFormat) -struct _GADIPCFileFormatClass + GADatasetFileFormat) +struct _GADatasetIPCFileFormatClass { - GADFileFormatClass parent_class; + GADatasetFileFormatClass parent_class; }; GARROW_AVAILABLE_IN_3_0 -GADIPCFileFormat *gad_ipc_file_format_new(void); +GADatasetIPCFileFormat *gadataset_ipc_file_format_new(void); -#define GAD_TYPE_PARQUET_FILE_FORMAT (gad_parquet_file_format_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADParquetFileFormat, - 
gad_parquet_file_format, - GAD, +#define GADATASET_TYPE_PARQUET_FILE_FORMAT \ + (gadataset_parquet_file_format_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetParquetFileFormat, + gadataset_parquet_file_format, + GADATASET, PARQUET_FILE_FORMAT, - GADFileFormat) -struct _GADParquetFileFormatClass + GADatasetFileFormat) +struct _GADatasetParquetFileFormatClass { - GADFileFormatClass parent_class; + GADatasetFileFormatClass parent_class; }; GARROW_AVAILABLE_IN_3_0 -GADParquetFileFormat *gad_parquet_file_format_new(void); +GADatasetParquetFileFormat *gadataset_parquet_file_format_new(void); G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/file-format.hpp b/c_glib/arrow-dataset-glib/file-format.hpp index e7e73f4ed98..5dfb20b3caa 100644 --- a/c_glib/arrow-dataset-glib/file-format.hpp +++ b/c_glib/arrow-dataset-glib/file-format.hpp @@ -23,8 +23,8 @@ #include -GADFileFormat * -gad_file_format_new_raw( +GADatasetFileFormat * +gadataset_file_format_new_raw( std::shared_ptr *arrow_file_format); std::shared_ptr -gad_file_format_get_raw(GADFileFormat *file_format); +gadataset_file_format_get_raw(GADatasetFileFormat *file_format); diff --git a/c_glib/arrow-dataset-glib/fragment.cpp b/c_glib/arrow-dataset-glib/fragment.cpp index 515a370d8e6..f2f0cd1c3e9 100644 --- a/c_glib/arrow-dataset-glib/fragment.cpp +++ b/c_glib/arrow-dataset-glib/fragment.cpp @@ -30,54 +30,55 @@ G_BEGIN_DECLS * @title: Fragment classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * - * #GADFragment is a base class for all fragment classes. + * #GADatasetFragment is a base class for all fragment classes. * - * #GADInMemoryFragment is a class for in-memory fragment. + * #GADatasetInMemoryFragment is a class for in-memory fragment. 
* * Since: 4.0.0 */ /* arrow::dataset::Fragment */ -typedef struct GADFragmentPrivate_ { +typedef struct GADatasetFragmentPrivate_ { std::shared_ptr fragment; -} GADFragmentPrivate; +} GADatasetFragmentPrivate; enum { PROP_FRAGMENT = 1, }; -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADFragment, - gad_fragment, +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADatasetFragment, + gadataset_fragment, G_TYPE_OBJECT) -#define GAD_FRAGMENT_GET_PRIVATE(obj) \ - static_cast( \ - gad_fragment_get_instance_private( \ - GAD_FRAGMENT(obj))) +#define GADATASET_FRAGMENT_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_fragment_get_instance_private( \ + GADATASET_FRAGMENT(obj))) static void -gad_fragment_finalize(GObject *object) +gadataset_fragment_finalize(GObject *object) { - auto priv = GAD_FRAGMENT_GET_PRIVATE(object); + auto priv = GADATASET_FRAGMENT_GET_PRIVATE(object); priv->fragment.~shared_ptr(); - G_OBJECT_CLASS(gad_fragment_parent_class)->finalize(object); + G_OBJECT_CLASS(gadataset_fragment_parent_class)->finalize(object); } static void -gad_fragment_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_fragment_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GAD_FRAGMENT_GET_PRIVATE(object); + auto priv = GADATASET_FRAGMENT_GET_PRIVATE(object); switch (prop_id) { case PROP_FRAGMENT: priv->fragment = - *static_cast *>(g_value_get_pointer(value)); + *static_cast *>( + g_value_get_pointer(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -86,19 +87,19 @@ gad_fragment_set_property(GObject *object, } static void -gad_fragment_init(GADFragment *object) +gadataset_fragment_init(GADatasetFragment *object) { - auto priv = GAD_FRAGMENT_GET_PRIVATE(object); + auto priv = GADATASET_FRAGMENT_GET_PRIVATE(object); new(&priv->fragment) std::shared_ptr; } static void -gad_fragment_class_init(GADFragmentClass *klass) 
+gadataset_fragment_class_init(GADatasetFragmentClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - gobject_class->finalize = gad_fragment_finalize; - gobject_class->set_property = gad_fragment_set_property; + gobject_class->finalize = gadataset_fragment_finalize; + gobject_class->set_property = gadataset_fragment_set_property; GParamSpec *spec; spec = g_param_spec_pointer("fragment", @@ -111,35 +112,35 @@ gad_fragment_class_init(GADFragmentClass *klass) /* arrow::dataset::InMemoryFragment */ -G_DEFINE_TYPE(GADInMemoryFragment, - gad_in_memory_fragment, - GAD_TYPE_FRAGMENT) +G_DEFINE_TYPE(GADatasetInMemoryFragment, + gadataset_in_memory_fragment, + GADATASET_TYPE_FRAGMENT) static void -gad_in_memory_fragment_init(GADInMemoryFragment *object) +gadataset_in_memory_fragment_init(GADatasetInMemoryFragment *object) { } static void -gad_in_memory_fragment_class_init(GADInMemoryFragmentClass *klass) +gadataset_in_memory_fragment_class_init(GADatasetInMemoryFragmentClass *klass) { } /** - * gad_in_memory_fragment_new: + * gadataset_in_memory_fragment_new: * @schema: A #GArrowSchema. * @record_batches: (array length=n_record_batches): * (element-type GArrowRecordBatch): The record batches of the table. * @n_record_batches: The number of record batches. * - * Returns: A newly created #GADInMemoryFragment. + * Returns: A newly created #GADatasetInMemoryFragment. 
* * Since: 4.0.0 */ -GADInMemoryFragment * -gad_in_memory_fragment_new(GArrowSchema *schema, - GArrowRecordBatch **record_batches, - gsize n_record_batches) +GADatasetInMemoryFragment * +gadataset_in_memory_fragment_new(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches) { auto arrow_schema = garrow_schema_get_raw(schema); std::vector> arrow_record_batches; @@ -151,34 +152,36 @@ gad_in_memory_fragment_new(GArrowSchema *schema, auto arrow_in_memory_fragment = std::make_shared(arrow_schema, arrow_record_batches); - return gad_in_memory_fragment_new_raw(&arrow_in_memory_fragment); + return gadataset_in_memory_fragment_new_raw(&arrow_in_memory_fragment); } G_END_DECLS -GADFragment * -gad_fragment_new_raw(std::shared_ptr *arrow_fragment) +GADatasetFragment * +gadataset_fragment_new_raw( + std::shared_ptr *arrow_fragment) { auto fragment = - GAD_FRAGMENT(g_object_new(GAD_TYPE_FRAGMENT, - "fragment", arrow_fragment, - NULL)); + GADATASET_FRAGMENT(g_object_new(GADATASET_TYPE_FRAGMENT, + "fragment", arrow_fragment, + NULL)); return fragment; } std::shared_ptr -gad_fragment_get_raw(GADFragment *fragment) +gadataset_fragment_get_raw(GADatasetFragment *fragment) { - auto priv = GAD_FRAGMENT_GET_PRIVATE(fragment); + auto priv = GADATASET_FRAGMENT_GET_PRIVATE(fragment); return priv->fragment; } -GADInMemoryFragment * -gad_in_memory_fragment_new_raw(std::shared_ptr *arrow_fragment) +GADatasetInMemoryFragment * +gadataset_in_memory_fragment_new_raw( + std::shared_ptr *arrow_fragment) { auto fragment = - GAD_IN_MEMORY_FRAGMENT(g_object_new(GAD_TYPE_IN_MEMORY_FRAGMENT, - "fragment", arrow_fragment, - NULL)); + GADATASET_IN_MEMORY_FRAGMENT(g_object_new(GADATASET_TYPE_IN_MEMORY_FRAGMENT, + "fragment", arrow_fragment, + NULL)); return fragment; } diff --git a/c_glib/arrow-dataset-glib/fragment.h b/c_glib/arrow-dataset-glib/fragment.h index c0ee8769db1..9376b6cf3ee 100644 --- a/c_glib/arrow-dataset-glib/fragment.h +++ 
b/c_glib/arrow-dataset-glib/fragment.h @@ -25,34 +25,35 @@ G_BEGIN_DECLS /* arrow::dataset::Fragment */ -#define GAD_TYPE_FRAGMENT (gad_fragment_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADFragment, - gad_fragment, - GAD, +#define GADATASET_TYPE_FRAGMENT (gadataset_fragment_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetFragment, + gadataset_fragment, + GADATASET, FRAGMENT, GObject) -struct _GADFragmentClass +struct _GADatasetFragmentClass { GObjectClass parent_class; }; /* arrow::dataset::InMemoryFragment */ -#define GAD_TYPE_IN_MEMORY_FRAGMENT (gad_in_memory_fragment_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADInMemoryFragment, - gad_in_memory_fragment, - GAD, +#define GADATASET_TYPE_IN_MEMORY_FRAGMENT \ + (gadataset_in_memory_fragment_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetInMemoryFragment, + gadataset_in_memory_fragment, + GADATASET, IN_MEMORY_FRAGMENT, - GADFragment) -struct _GADInMemoryFragmentClass + GADatasetFragment) +struct _GADatasetInMemoryFragmentClass { - GADFragmentClass parent_class; + GADatasetFragmentClass parent_class; }; GARROW_AVAILABLE_IN_4_0 -GADInMemoryFragment * -gad_in_memory_fragment_new(GArrowSchema *schema, - GArrowRecordBatch **record_batches, - gsize n_record_batches); +GADatasetInMemoryFragment * +gadataset_in_memory_fragment_new(GArrowSchema *schema, + GArrowRecordBatch **record_batches, + gsize n_record_batches); G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/fragment.hpp b/c_glib/arrow-dataset-glib/fragment.hpp index 441b7c99cb8..904f8365396 100644 --- a/c_glib/arrow-dataset-glib/fragment.hpp +++ b/c_glib/arrow-dataset-glib/fragment.hpp @@ -24,10 +24,12 @@ #include std::shared_ptr -gad_fragment_get_raw(GADFragment *fragment); +gadataset_fragment_get_raw(GADatasetFragment *fragment); -GADFragment* -gad_fragment_new_raw(std::shared_ptr *arrow_fragment); +GADatasetFragment* +gadataset_fragment_new_raw( + std::shared_ptr *arrow_fragment); -GADInMemoryFragment* -gad_in_memory_fragment_new_raw(std::shared_ptr *arrow_fragment); 
+GADatasetInMemoryFragment* +gadataset_in_memory_fragment_new_raw( + std::shared_ptr *arrow_fragment); diff --git a/c_glib/arrow-dataset-glib/meson.build b/c_glib/arrow-dataset-glib/meson.build index 83b57504f81..b3f617330cf 100644 --- a/c_glib/arrow-dataset-glib/meson.build +++ b/c_glib/arrow-dataset-glib/meson.build @@ -18,6 +18,8 @@ # under the License. sources = files( + 'dataset-factory.cpp', + 'dataset.cpp', 'file-format.cpp', 'fragment.cpp', 'scanner.cpp', @@ -25,6 +27,8 @@ sources = files( c_headers = files( 'arrow-dataset-glib.h', + 'dataset-factory.h', + 'dataset.h', 'file-format.h', 'fragment.h', 'scanner.h', @@ -32,6 +36,8 @@ c_headers = files( cpp_headers = files( 'arrow-dataset-glib.hpp', + 'dataset-factory.hpp', + 'dataset.hpp', 'file-format.hpp', 'fragment.hpp', 'scanner.hpp', @@ -68,8 +74,8 @@ if have_gi sources: sources + c_headers, namespace: 'ArrowDataset', nsversion: api_version, - identifier_prefix: 'GAD', - symbol_prefix: 'gad', + identifier_prefix: 'GADataset', + symbol_prefix: 'gadataset', export_packages: 'arrow-dataset-glib', includes: [ 'Arrow-1.0', diff --git a/c_glib/arrow-dataset-glib/scanner.cpp b/c_glib/arrow-dataset-glib/scanner.cpp index 36701ca373a..efa2a5c3287 100644 --- a/c_glib/arrow-dataset-glib/scanner.cpp +++ b/c_glib/arrow-dataset-glib/scanner.cpp @@ -17,13 +17,10 @@ * under the License. */ -#include - #include -#include -#include +#include -#include +#include #include G_BEGIN_DECLS @@ -31,70 +28,54 @@ G_BEGIN_DECLS /** * SECTION: scanner * @section_id: scanner - * @title: Scanner classes + * @title: Scanner related classes * @include: arrow-dataset-glib/arrow-dataset-glib.h * - * #GADScanOptions is a class for a set of scan options. - * - * #GADScanTask is an abstract class for a scan task. + * #GADatasetScanner is a class for scanning dataset. * - * #GADInMemoryScanTask is a class for a scan task of record batches. + * #GADatasetScannerBuilder is a class for building a scanner. 
* - * Since: 1.0.0 + * Since: 5.0.0 */ -/* arrow::dataset::ScanOptions */ - -typedef struct GADScanOptionsPrivate_ { - std::shared_ptr scan_options; -} GADScanOptionsPrivate; +typedef struct GADatasetScannerPrivate_ { + std::shared_ptr scanner; +} GADatasetScannerPrivate; enum { - PROP_SCAN_OPTIONS = 1, - PROP_FILTER, - PROP_EVALUATOR, - PROP_PROJECTOR, - PROP_BATCH_SIZE, - PROP_USE_THREADS, + PROP_SCANNER = 1, }; -G_DEFINE_TYPE_WITH_PRIVATE(GADScanOptions, - gad_scan_options, +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScanner, + gadataset_scanner, G_TYPE_OBJECT) -#define GAD_SCAN_OPTIONS_GET_PRIVATE(obj) \ - static_cast( \ - gad_scan_options_get_instance_private( \ - GAD_SCAN_OPTIONS(obj))) +#define GADATASET_SCANNER_GET_PRIVATE(obj) \ + static_cast( \ + gadataset_scanner_get_instance_private( \ + GADATASET_SCANNER(obj))) static void -gad_scan_options_finalize(GObject *object) +gadataset_scanner_finalize(GObject *object) { - auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(object); - - priv->scan_options.~shared_ptr(); - - G_OBJECT_CLASS(gad_scan_options_parent_class)->finalize(object); + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + priv->scanner.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_parent_class)->finalize(object); } static void -gad_scan_options_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_scanner_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(object); + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); switch (prop_id) { - case PROP_SCAN_OPTIONS: - priv->scan_options = - *static_cast *>(g_value_get_pointer(value)); - break; - case PROP_BATCH_SIZE: - priv->scan_options->batch_size = g_value_get_int64(value); - break; - case PROP_USE_THREADS: - priv->scan_options->use_threads = g_value_get_boolean(value); + case PROP_SCANNER: + priv->scanner = + *static_cast *>( + g_value_get_pointer(value)); break; default: 
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -103,212 +84,91 @@ gad_scan_options_set_property(GObject *object, } static void -gad_scan_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_scanner_init(GADatasetScanner *object) { - auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_BATCH_SIZE: - g_value_set_int64(value, priv->scan_options->batch_size); - break; - case PROP_USE_THREADS: - g_value_set_boolean(value, priv->scan_options->use_threads); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } + auto priv = GADATASET_SCANNER_GET_PRIVATE(object); + new(&priv->scanner) std::shared_ptr; } static void -gad_scan_options_init(GADScanOptions *object) +gadataset_scanner_class_init(GADatasetScannerClass *klass) { - auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(object); - new(&priv->scan_options) std::shared_ptr; -} + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = gadataset_scanner_finalize; + gobject_class->set_property = gadataset_scanner_set_property; -static void -gad_scan_options_class_init(GADScanOptionsClass *klass) -{ - GObjectClass *gobject_class; GParamSpec *spec; - - gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = gad_scan_options_finalize; - gobject_class->set_property = gad_scan_options_set_property; - gobject_class->get_property = gad_scan_options_get_property; - - auto scan_options = std::make_shared(); - - spec = g_param_spec_pointer("scan-options", - "ScanOptions", - "The raw std::shared *", + spec = g_param_spec_pointer("scanner", + "Scanner", + "The raw std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_OPTIONS, spec); - - // TODO: PROP_FILTER - // TODO: PROP_EVALUATOR - // TODO: PROP_PROJECTOR - - /** - * GADScanOptions:batch-size: - * - * Maximum row count for scanned batches. 
- * - * Since: 1.0.0 - */ - spec = g_param_spec_int64("batch-size", - "Batch size", - "Maximum row count for scanned batches", - 0, - G_MAXINT64, - scan_options->batch_size, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_BATCH_SIZE, spec); - - /** - * GADScanOptions:use-threads: - * - * Indicate if the Scanner should make use of a ThreadPool. - * - * Since: 4.0.0 - */ - spec = g_param_spec_boolean("use-threads", - "Use threads", - "Indicate if the Scanner should make use of a ThreadPool", - scan_options->use_threads, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_USE_THREADS, spec); + g_object_class_install_property(gobject_class, PROP_SCANNER, spec); } /** - * gad_scan_options_new: - * @schema: A #GArrowSchema. - * - * Returns: A newly created #GADScanOptions. - * - * Since: 1.0.0 - */ -GADScanOptions * -gad_scan_options_new(GArrowSchema *schema) -{ - auto arrow_schema = garrow_schema_get_raw(schema); - auto arrow_scan_options = std::make_shared(); - arrow_scan_options->dataset_schema = arrow_schema; - return gad_scan_options_new_raw(&arrow_scan_options); -} - -/** - * gad_scan_options_get_schema: - * @scan_options: A #GADScanOptions. + * gadataset_scanner_to_table: + * @scanner: A #GADatasetScanner. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): A #GArrowSchema. + * Returns: (transfer full) (nullable): + * A newly created #GArrowTable on success, %NULL on error. 
* - * Since: 1.0.0 + * Since: 5.0.0 */ -GArrowSchema * -gad_scan_options_get_schema(GADScanOptions *scan_options) +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error) { - auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(scan_options); - auto arrow_schema = priv->scan_options->dataset_schema; - return garrow_schema_new_raw(&arrow_schema); + auto arrow_scanner = gadataset_scanner_get_raw(scanner); + auto arrow_table_result = arrow_scanner->ToTable(); + if (garrow::check(error, arrow_table_result, "[scanner][to-table]")) { + auto arrow_table = *arrow_table_result; + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } } -/* arrow::dataset::ScanTask */ -typedef struct GADScanTaskPrivate_ { - std::shared_ptr scan_task; - GADScanOptions *options; - GADFragment *fragment; -} GADScanTaskPrivate; +typedef struct GADatasetScannerBuilderPrivate_ { + std::shared_ptr scanner_builder; +} GADatasetScannerBuilderPrivate; enum { - PROP_SCAN_TASK = 1, - PROP_OPTIONS, - PROP_FRAGMENT, + PROP_SCANNER_BUILDER = 1, }; -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GADScanTask, - gad_scan_task, - G_TYPE_OBJECT) - -#define GAD_SCAN_TASK_GET_PRIVATE(obj) \ - static_cast( \ - gad_scan_task_get_instance_private( \ - GAD_SCAN_TASK(obj))) - -static void -gad_scan_task_dispose(GObject *object) -{ - auto priv = GAD_SCAN_TASK_GET_PRIVATE(object); - - if (priv->options) { - g_object_unref(priv->options); - priv->options = NULL; - } - - if (priv->fragment) { - g_object_unref(priv->fragment); - priv->fragment = NULL; - } - - G_OBJECT_CLASS(gad_scan_task_parent_class)->dispose(object); -} - -static void -gad_scan_task_finalize(GObject *object) -{ - auto priv = GAD_SCAN_TASK_GET_PRIVATE(object); - - priv->scan_task.~shared_ptr(); +G_DEFINE_TYPE_WITH_PRIVATE(GADatasetScannerBuilder, + gadataset_scanner_builder, + G_TYPE_OBJECT) - G_OBJECT_CLASS(gad_scan_task_parent_class)->finalize(object); -} +#define GADATASET_SCANNER_BUILDER_GET_PRIVATE(obj) \ + static_cast( \ + 
gadataset_scanner_builder_get_instance_private( \ + GADATASET_SCANNER_BUILDER(obj))) static void -gad_scan_task_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +gadataset_scanner_builder_finalize(GObject *object) { - auto priv = GAD_SCAN_TASK_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_SCAN_TASK: - priv->scan_task = - *static_cast *>(g_value_get_pointer(value)); - break; - case PROP_OPTIONS: - priv->options = GAD_SCAN_OPTIONS(g_value_dup_object(value)); - break; - case PROP_FRAGMENT: - priv->fragment = GAD_FRAGMENT(g_value_dup_object(value)); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + priv->scanner_builder.~shared_ptr(); + G_OBJECT_CLASS(gadataset_scanner_builder_parent_class)->finalize(object); } static void -gad_scan_task_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +gadataset_scanner_builder_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GAD_SCAN_TASK_GET_PRIVATE(object); + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); switch (prop_id) { - case PROP_OPTIONS: - g_value_set_object(value, priv->options); - break; - case PROP_FRAGMENT: - g_value_set_object(value, priv->fragment); + case PROP_SCANNER_BUILDER: + priv->scanner_builder = + *static_cast *>( + g_value_get_pointer(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -317,206 +177,112 @@ gad_scan_task_get_property(GObject *object, } static void -gad_scan_task_init(GADScanTask *object) +gadataset_scanner_builder_init(GADatasetScannerBuilder *object) { - auto priv = GAD_SCAN_TASK_GET_PRIVATE(object); - new(&priv->scan_task) std::shared_ptr; + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(object); + new(&priv->scanner_builder) std::shared_ptr; } static void 
-gad_scan_task_class_init(GADScanTaskClass *klass) +gadataset_scanner_builder_class_init(GADatasetScannerBuilderClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->dispose = gad_scan_task_dispose; - gobject_class->finalize = gad_scan_task_finalize; - gobject_class->set_property = gad_scan_task_set_property; - gobject_class->get_property = gad_scan_task_get_property; + gobject_class->finalize = gadataset_scanner_builder_finalize; + gobject_class->set_property = gadataset_scanner_builder_set_property; GParamSpec *spec; - spec = g_param_spec_pointer("scan-task", - "ScanTask", - "The raw std::shared *", + spec = g_param_spec_pointer("scanner-builder", + "Scanner builder", + "The raw " + "std::shared *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_SCAN_TASK, spec); - - /** - * GADScanTask:options: - * - * The options of the scan task. - * - * Since: 1.0.0 - */ - spec = g_param_spec_object("options", - "Options", - "The options of the scan task", - GAD_TYPE_SCAN_OPTIONS, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_OPTIONS, spec); - - /** - * GADScanTask:fragment: - * - * The fragment of the scan task. - * - * Since: 4.0.0 - */ - spec = g_param_spec_object("fragment", - "Fragment", - "The fragment of the scan task", - GAD_TYPE_FRAGMENT, - static_cast(G_PARAM_READWRITE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_FRAGMENT, spec); + g_object_class_install_property(gobject_class, PROP_SCANNER_BUILDER, spec); } /** - * gad_scan_task_get_options: - * @scan_task: A #GADScanTask. - * - * Returns: (transfer full): A #GADScanOptions. 
- * - * Since: 1.0.0 - */ -GADScanOptions * -gad_scan_task_get_options(GADScanTask *scan_task) -{ - auto priv = GAD_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->options) { - g_object_ref(priv->options); - return priv->options; - } - - auto arrow_options = priv->scan_task->options(); - return gad_scan_options_new_raw(&arrow_options); -} - -/** - * gad_scan_task_get_fragment: - * @scan_task: A #GADFragment. + * gadataset_scanner_builder_new: + * @dataset: A #GADatasetDataset to be scanned. + * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (transfer full): A #GADFragment. + * Returns: (nullable): A newly created #GADatasetScannerBuilder on success, + * %NULL on error. * - * Since: 4.0.0 + * Since: 5.0.0 */ -GADFragment * -gad_scan_task_get_fragment(GADScanTask *scan_task) +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, GError **error) { - auto priv = GAD_SCAN_TASK_GET_PRIVATE(scan_task); - if (priv->fragment) { - g_object_ref(priv->fragment); - return priv->fragment; + auto arrow_dataset = gadataset_dataset_get_raw(dataset); + auto arrow_scanner_builder_result = arrow_dataset->NewScan(); + if (garrow::check(error, + arrow_scanner_builder_result, + "[scanner-builder][new]")) { + auto arrow_scanner_builder = *arrow_scanner_builder_result; + return gadataset_scanner_builder_new_raw(&arrow_scanner_builder); + } else { + return NULL; } - - auto arrow_fragment = priv->scan_task->fragment(); - return gad_fragment_new_raw(&arrow_fragment); } /** - * gad_scan_task_execute: - * @scan_task: A #GADScanTask. + * gadataset_scanner_builder_finish: + * @builder: A #GADatasetScannerBuilder. * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (nullable) (transfer full): A newly created #GArrowRecordBatchIterator, - * or %NULL on error. + * Returns: (transfer full) (nullable): + * A newly created #GADatasetScanner on success, %NULL on error. 
* - * Since: 1.0.0 + * Since: 5.0.0 */ -GArrowRecordBatchIterator *gad_scan_task_execute(GADScanTask *scan_task, - GError **error) +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error) { - auto priv = GAD_SCAN_TASK_GET_PRIVATE(scan_task); - auto arrow_result = priv->scan_task->Execute(); - if (garrow::check(error, arrow_result, "[datasets][scan-task][execute]")) { - auto arrow_record_batch_iteraor = std::move(*arrow_result); - return garrow_record_batch_iterator_new_raw(&arrow_record_batch_iteraor); + auto arrow_builder = gadataset_scanner_builder_get_raw(builder); + auto arrow_scanner_result = arrow_builder->Finish(); + if (garrow::check(error, arrow_scanner_result, "[scanner-builder][finish]")) { + auto arrow_scanner = *arrow_scanner_result; + return gadataset_scanner_new_raw(&arrow_scanner); } else { return NULL; } } -/* arrow::dataset::InMemoryScanTask */ - -G_DEFINE_TYPE(GADInMemoryScanTask, - gad_in_memory_scan_task, - GAD_TYPE_SCAN_TASK) - -static void -gad_in_memory_scan_task_init(GADInMemoryScanTask *object) -{ -} -static void -gad_in_memory_scan_task_class_init(GADInMemoryScanTaskClass *klass) -{ -} +G_END_DECLS -/** - * gad_in_memory_scan_task_new: - * @record_batches: (array length=n_record_batches): - * (element-type GArrowRecordBatch): The record batches of the table. - * @n_record_batches: The number of record batches. - * @options: A #GADScanOptions. - * @fragment: A #GADInMemoryFragment. - * - * Returns: A newly created #GADInMemoryScanTask. 
- * - * Since: 1.0.0 - */ -GADInMemoryScanTask * -gad_in_memory_scan_task_new(GArrowRecordBatch **record_batches, - gsize n_record_batches, - GADScanOptions *options, - GADInMemoryFragment *fragment) +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr *arrow_scanner) { - std::vector> arrow_record_batches; - arrow_record_batches.reserve(n_record_batches); - for (gsize i = 0; i < n_record_batches; ++i) { - auto arrow_record_batch = garrow_record_batch_get_raw(record_batches[i]); - arrow_record_batches.push_back(arrow_record_batch); - } - auto arrow_options = gad_scan_options_get_raw(options); - auto arrow_fragment = gad_fragment_get_raw(GAD_FRAGMENT(fragment)); - auto arrow_in_memory_scan_task = - std::make_shared(arrow_record_batches, - arrow_options, - arrow_fragment); - return gad_in_memory_scan_task_new_raw(&arrow_in_memory_scan_task, - options, - fragment); + auto scanner = + GADATASET_SCANNER(g_object_new(GADATASET_TYPE_SCANNER, + "scanner", arrow_scanner, + NULL)); + return scanner; } -G_END_DECLS - -GADScanOptions * -gad_scan_options_new_raw(std::shared_ptr *arrow_scan_options) +std::shared_ptr +gadataset_scanner_get_raw(GADatasetScanner *scanner) { - auto scan_options = - GAD_SCAN_OPTIONS(g_object_new(GAD_TYPE_SCAN_OPTIONS, - "scan-options", arrow_scan_options, - NULL)); - return scan_options; + auto priv = GADATASET_SCANNER_GET_PRIVATE(scanner); + return priv->scanner; } -std::shared_ptr -gad_scan_options_get_raw(GADScanOptions *scan_options) +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr *arrow_scanner_builder) { - auto priv = GAD_SCAN_OPTIONS_GET_PRIVATE(scan_options); - return priv->scan_options; + return GADATASET_SCANNER_BUILDER( + g_object_new(GADATASET_TYPE_SCANNER_BUILDER, + "scanner-builder", arrow_scanner_builder, + NULL)); } -GADInMemoryScanTask * -gad_in_memory_scan_task_new_raw(std::shared_ptr *arrow_in_memory_scan_task, - GADScanOptions *options, - GADInMemoryFragment *fragment) +std::shared_ptr 
+gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder) { - auto in_memory_scan_task = - GAD_IN_MEMORY_SCAN_TASK(g_object_new(GAD_TYPE_IN_MEMORY_SCAN_TASK, - "scan-task", arrow_in_memory_scan_task, - "options", options, - "fragment", fragment, - NULL)); - return in_memory_scan_task; + auto priv = GADATASET_SCANNER_BUILDER_GET_PRIVATE(scanner_builder); + return priv->scanner_builder; } diff --git a/c_glib/arrow-dataset-glib/scanner.h b/c_glib/arrow-dataset-glib/scanner.h index f387e8948f2..446815d6db1 100644 --- a/c_glib/arrow-dataset-glib/scanner.h +++ b/c_glib/arrow-dataset-glib/scanner.h @@ -19,70 +19,45 @@ #pragma once -#include - +#include #include G_BEGIN_DECLS -/* arrow::dataset::ScanOptions */ - -#define GAD_TYPE_SCAN_OPTIONS (gad_scan_options_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADScanOptions, - gad_scan_options, - GAD, - SCAN_OPTIONS, +#define GADATASET_TYPE_SCANNER (gadataset_scanner_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScanner, + gadataset_scanner, + GADATASET, + SCANNER, GObject) -struct _GADScanOptionsClass +struct _GADatasetScannerClass { GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_5_0 +GArrowTable * +gadataset_scanner_to_table(GADatasetScanner *scanner, + GError **error); -GARROW_AVAILABLE_IN_1_0 -GADScanOptions *gad_scan_options_new(GArrowSchema *schema); -GARROW_AVAILABLE_IN_1_0 -GArrowSchema *gad_scan_options_get_schema(GADScanOptions *scan_options); - -/* arrow::dataset::ScanTask */ - -#define GAD_TYPE_SCAN_TASK (gad_scan_task_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADScanTask, - gad_scan_task, - GAD, - SCAN_TASK, +#define GADATASET_TYPE_SCANNER_BUILDER (gadataset_scanner_builder_get_type()) +G_DECLARE_DERIVABLE_TYPE(GADatasetScannerBuilder, + gadataset_scanner_builder, + GADATASET, + SCANNER_BUILDER, GObject) -struct _GADScanTaskClass +struct _GADatasetScannerBuilderClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_1_0 -GADScanOptions *gad_scan_task_get_options(GADScanTask *scan_task); 
-GARROW_AVAILABLE_IN_4_0 -GADFragment *gad_scan_task_get_fragment(GADScanTask *scan_task); -GARROW_AVAILABLE_IN_1_0 -GArrowRecordBatchIterator *gad_scan_task_execute(GADScanTask *scan_task, - GError **error); - -/* arrow::dataset::InMemoryScanTask */ - -#define GAD_TYPE_IN_MEMORY_SCAN_TASK (gad_in_memory_scan_task_get_type()) -G_DECLARE_DERIVABLE_TYPE(GADInMemoryScanTask, - gad_in_memory_scan_task, - GAD, - IN_MEMORY_SCAN_TASK, - GADScanTask) -struct _GADInMemoryScanTaskClass -{ - GADScanTaskClass parent_class; -}; - -GARROW_AVAILABLE_IN_1_0 -GADInMemoryScanTask * -gad_in_memory_scan_task_new(GArrowRecordBatch **record_batches, - gsize n_record_batches, - GADScanOptions *options, - GADInMemoryFragment *fragment); +GARROW_AVAILABLE_IN_5_0 +GADatasetScannerBuilder * +gadataset_scanner_builder_new(GADatasetDataset *dataset, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GADatasetScanner * +gadataset_scanner_builder_finish(GADatasetScannerBuilder *builder, + GError **error); G_END_DECLS diff --git a/c_glib/arrow-dataset-glib/scanner.hpp b/c_glib/arrow-dataset-glib/scanner.hpp index f10351ee99b..663ab6fc44b 100644 --- a/c_glib/arrow-dataset-glib/scanner.hpp +++ b/c_glib/arrow-dataset-glib/scanner.hpp @@ -24,12 +24,14 @@ #include #include -GADScanOptions * -gad_scan_options_new_raw(std::shared_ptr *arrow_scan_options); -std::shared_ptr -gad_scan_options_get_raw(GADScanOptions *scan_options); +GADatasetScanner * +gadataset_scanner_new_raw( + std::shared_ptr *arrow_scanner); +std::shared_ptr +gadataset_scanner_get_raw(GADatasetScanner *scanner); -GADInMemoryScanTask * -gad_in_memory_scan_task_new_raw(std::shared_ptr *arrow_in_memory_scan_task, - GADScanOptions *scan_options, - GADInMemoryFragment *fragment); +GADatasetScannerBuilder * +gadataset_scanner_builder_new_raw( + std::shared_ptr *arrow_scanner_builder); +std::shared_ptr +gadataset_scanner_builder_get_raw(GADatasetScannerBuilder *scanner_builder); diff --git a/c_glib/arrow-flight-glib/arrow-flight-glib.h 
b/c_glib/arrow-flight-glib/arrow-flight-glib.h new file mode 100644 index 00000000000..6fc8f43d840 --- /dev/null +++ b/c_glib/arrow-flight-glib/arrow-flight-glib.h @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include diff --git a/c_glib/arrow-flight-glib/arrow-flight-glib.hpp b/c_glib/arrow-flight-glib/arrow-flight-glib.hpp new file mode 100644 index 00000000000..11e1fe94d52 --- /dev/null +++ b/c_glib/arrow-flight-glib/arrow-flight-glib.hpp @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include diff --git a/c_glib/arrow-flight-glib/client.cpp b/c_glib/arrow-flight-glib/client.cpp new file mode 100644 index 00000000000..7610fc98570 --- /dev/null +++ b/c_glib/arrow-flight-glib/client.cpp @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: client + * @section_id: client + * @title: Client related classes + * @include: arrow-flight-glib/arrow-flight-glib.h + * + * #GAFlightStreamReader is a class for reading record batches from a + * server. + * + * #GAFlightCallOptions is a class for options of each call. + * + * #GAFlightClientOptions is a class for options of each client. + * + * #GAFlightClient is a class for Apache Arrow Flight client. 
+ * + * Since: 5.0.0 + */ + +G_DEFINE_TYPE(GAFlightStreamReader, + gaflight_stream_reader, + GAFLIGHT_TYPE_RECORD_BATCH_READER) + +static void +gaflight_stream_reader_init(GAFlightStreamReader *object) +{ +} + +static void +gaflight_stream_reader_class_init(GAFlightStreamReaderClass *klass) +{ +} + +typedef struct GAFlightCallOptionsPrivate_ { + arrow::flight::FlightCallOptions options; +} GAFlightCallOptionsPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightCallOptions, + gaflight_call_options, + G_TYPE_OBJECT) + +#define GAFLIGHT_CALL_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_call_options_get_instance_private( \ + GAFLIGHT_CALL_OPTIONS(obj))) + +static void +gaflight_call_options_finalize(GObject *object) +{ + auto priv = GAFLIGHT_CALL_OPTIONS_GET_PRIVATE(object); + + priv->options.~FlightCallOptions(); + + G_OBJECT_CLASS(gaflight_call_options_parent_class)->finalize(object); +} + +static void +gaflight_call_options_init(GAFlightCallOptions *object) +{ + auto priv = GAFLIGHT_CALL_OPTIONS_GET_PRIVATE(object); + new(&priv->options) arrow::flight::FlightCallOptions; +} + +static void +gaflight_call_options_class_init(GAFlightCallOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_call_options_finalize; +} + +/** + * gaflight_call_options_new: + * + * Returns: The newly created options for a call. 
+ * + * Since: 5.0.0 + */ +GAFlightCallOptions * +gaflight_call_options_new(void) +{ + return static_cast( + g_object_new(GAFLIGHT_TYPE_CALL_OPTIONS, NULL)); +} + + +typedef struct GAFlightClientOptionsPrivate_ { + arrow::flight::FlightClientOptions options; +} GAFlightClientOptionsPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightClientOptions, + gaflight_client_options, + G_TYPE_OBJECT) + +#define GAFLIGHT_CLIENT_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_client_options_get_instance_private( \ + GAFLIGHT_CLIENT_OPTIONS(obj))) + +static void +gaflight_client_options_finalize(GObject *object) +{ + auto priv = GAFLIGHT_CLIENT_OPTIONS_GET_PRIVATE(object); + + priv->options.~FlightClientOptions(); + + G_OBJECT_CLASS(gaflight_client_options_parent_class)->finalize(object); +} + +static void +gaflight_client_options_init(GAFlightClientOptions *object) +{ + auto priv = GAFLIGHT_CLIENT_OPTIONS_GET_PRIVATE(object); + new(&(priv->options)) arrow::flight::FlightClientOptions; + priv->options = arrow::flight::FlightClientOptions::Defaults(); +} + +static void +gaflight_client_options_class_init(GAFlightClientOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_client_options_finalize; +} + +/** + * gaflight_client_options_new: + * + * Returns: The newly created options for a client. 
+ * + * Since: 5.0.0 + */ +GAFlightClientOptions * +gaflight_client_options_new(void) +{ + return static_cast( + g_object_new(GAFLIGHT_TYPE_CLIENT_OPTIONS, NULL)); +} + + +typedef struct GAFlightClientPrivate_ { + arrow::flight::FlightClient *client; +} GAFlightClientPrivate; + +enum { + PROP_CLIENT = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightClient, + gaflight_client, + G_TYPE_OBJECT) + +#define GAFLIGHT_CLIENT_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_client_get_instance_private( \ + GAFLIGHT_CLIENT(obj))) + +static void +gaflight_client_finalize(GObject *object) +{ + auto priv = GAFLIGHT_CLIENT_GET_PRIVATE(object); + + delete priv->client; + + G_OBJECT_CLASS(gaflight_client_parent_class)->finalize(object); +} + +static void +gaflight_client_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_CLIENT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CLIENT: + priv->client = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_client_init(GAFlightClient *object) +{ +} + +static void +gaflight_client_class_init(GAFlightClientClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_client_finalize; + gobject_class->set_property = gaflight_client_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("client", + "Client", + "The raw arrow::flight::FlightClient *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CLIENT, spec); +} + +/** + * gaflight_client_new: + * @location: A #GAFlightLocation to be connected. + * @options: (nullable): A #GAFlightClientOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): The newly created client, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GAFlightClient * +gaflight_client_new(GAFlightLocation *location, + GAFlightClientOptions *options, + GError **error) +{ + const auto flight_location = gaflight_location_get_raw(location); + std::unique_ptr flight_client; + arrow::Status status; + if (options) { + const auto flight_options = gaflight_client_options_get_raw(options); + status = arrow::flight::FlightClient::Connect(*flight_location, + *flight_options, + &flight_client); + } else { + status = arrow::flight::FlightClient::Connect(*flight_location, + &flight_client); + } + if (garrow::check(error, status, "[flight-client][new]")) { + return gaflight_client_new_raw(flight_client.release()); + } else { + return NULL; + } +} + +/** + * gaflight_client_list_flights: + * @client: A #GAFlightClient. + * @criteria: (nullable): A #GAFlightCriteria. + * @options: (nullable): A #GAFlightCallOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (element-type GAFlightInfo) (transfer full): + * The returned list of #GAFlightInfo on success, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GList * +gaflight_client_list_flights(GAFlightClient *client, + GAFlightCriteria *criteria, + GAFlightCallOptions *options, + GError **error) +{ + auto flight_client = gaflight_client_get_raw(client); + arrow::flight::Criteria flight_default_criteria; + auto flight_criteria = &flight_default_criteria; + if (criteria) { + flight_criteria = gaflight_criteria_get_raw(criteria); + } + arrow::flight::FlightCallOptions flight_default_options; + auto flight_options = &flight_default_options; + if (options) { + flight_options = gaflight_call_options_get_raw(options); + } + std::unique_ptr flight_listing; + auto status = flight_client->ListFlights(*flight_options, + *flight_criteria, + &flight_listing); + if (!garrow::check(error, + status, + "[flight-client][list-flights]")) { + return NULL; + } + GList *listing = NULL; + std::unique_ptr flight_info; + while (true) { + status = flight_listing->Next(&flight_info); + if (!garrow::check(error, + status, + "[flight-client][list-flights]")) { + g_list_free_full(listing, g_object_unref); + return NULL; + } + if (!flight_info) { + break; + } + auto info = gaflight_info_new_raw(flight_info.release()); + listing = g_list_prepend(listing, info); + } + return g_list_reverse(listing); +} + +/** + * gaflight_client_do_get: + * @client: A #GAFlightClient. + * @ticket: A #GAFlightTicket. + * @options: (nullable): A #GAFlightCallOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): + * The #GAFlightStreamReader to read record batched from the server + * on success, %NULL on error. 
+ * + * Since: 6.0.0 + */ +GAFlightStreamReader * +gaflight_client_do_get(GAFlightClient *client, + GAFlightTicket *ticket, + GAFlightCallOptions *options, + GError **error) +{ + auto flight_client = gaflight_client_get_raw(client); + const auto flight_ticket = gaflight_ticket_get_raw(ticket); + arrow::flight::FlightCallOptions flight_default_options; + auto flight_options = &flight_default_options; + if (options) { + flight_options = gaflight_call_options_get_raw(options); + } + std::unique_ptr flight_reader; + auto status = flight_client->DoGet(*flight_options, + *flight_ticket, + &flight_reader); + if (garrow::check(error, + status, + "[flight-client][do-get]")) { + return gaflight_stream_reader_new_raw(flight_reader.release()); + } else { + return NULL; + } +} + + +G_END_DECLS + + +GAFlightStreamReader * +gaflight_stream_reader_new_raw( + arrow::flight::FlightStreamReader *flight_reader) +{ + return GAFLIGHT_STREAM_READER( + g_object_new(GAFLIGHT_TYPE_STREAM_READER, + "reader", flight_reader, + NULL)); +} + +arrow::flight::FlightCallOptions * +gaflight_call_options_get_raw(GAFlightCallOptions *options) +{ + auto priv = GAFLIGHT_CALL_OPTIONS_GET_PRIVATE(options); + return &(priv->options); +} + +arrow::flight::FlightClientOptions * +gaflight_client_options_get_raw(GAFlightClientOptions *options) +{ + auto priv = GAFLIGHT_CLIENT_OPTIONS_GET_PRIVATE(options); + return &(priv->options); +} + +arrow::flight::FlightClient * +gaflight_client_get_raw(GAFlightClient *client) +{ + auto priv = GAFLIGHT_CLIENT_GET_PRIVATE(client); + return priv->client; +} + +GAFlightClient * +gaflight_client_new_raw(arrow::flight::FlightClient *flight_client) +{ + return GAFLIGHT_CLIENT(g_object_new(GAFLIGHT_TYPE_CLIENT, + "client", flight_client, + NULL)); +} diff --git a/c_glib/arrow-flight-glib/client.h b/c_glib/arrow-flight-glib/client.h new file mode 100644 index 00000000000..bc297116135 --- /dev/null +++ b/c_glib/arrow-flight-glib/client.h @@ -0,0 +1,104 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + + +#define GAFLIGHT_TYPE_STREAM_READER \ + (gaflight_stream_reader_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightStreamReader, + gaflight_stream_reader, + GAFLIGHT, + STREAM_READER, + GAFlightRecordBatchReader) +struct _GAFlightStreamReaderClass +{ + GAFlightRecordBatchReaderClass parent_class; +}; + + +#define GAFLIGHT_TYPE_CALL_OPTIONS (gaflight_call_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightCallOptions, + gaflight_call_options, + GAFLIGHT, + CALL_OPTIONS, + GObject) +struct _GAFlightCallOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightCallOptions * +gaflight_call_options_new(void); + + +#define GAFLIGHT_TYPE_CLIENT_OPTIONS (gaflight_client_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightClientOptions, + gaflight_client_options, + GAFLIGHT, + CLIENT_OPTIONS, + GObject) +struct _GAFlightClientOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightClientOptions * +gaflight_client_options_new(void); + + +#define GAFLIGHT_TYPE_CLIENT (gaflight_client_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightClient, + gaflight_client, + GAFLIGHT, + CLIENT, + GObject) 
+struct _GAFlightClientClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightClient * +gaflight_client_new(GAFlightLocation *location, + GAFlightClientOptions *options, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +GList * +gaflight_client_list_flights(GAFlightClient *client, + GAFlightCriteria *criteria, + GAFlightCallOptions *options, + GError **error); + +GARROW_AVAILABLE_IN_6_0 +GAFlightStreamReader * +gaflight_client_do_get(GAFlightClient *client, + GAFlightTicket *ticket, + GAFlightCallOptions *options, + GError **error); + + +G_END_DECLS diff --git a/c_glib/arrow-flight-glib/client.hpp b/c_glib/arrow-flight-glib/client.hpp new file mode 100644 index 00000000000..1e68761b7ee --- /dev/null +++ b/c_glib/arrow-flight-glib/client.hpp @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include + + +GAFlightStreamReader * +gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader); + +arrow::flight::FlightCallOptions * +gaflight_call_options_get_raw(GAFlightCallOptions *options); + +arrow::flight::FlightClientOptions * +gaflight_client_options_get_raw(GAFlightClientOptions *options); + +arrow::flight::FlightClient * +gaflight_client_get_raw(GAFlightClient *client); +GAFlightClient * +gaflight_client_new_raw(arrow::flight::FlightClient *flight_client); diff --git a/c_glib/arrow-flight-glib/common.cpp b/c_glib/arrow-flight-glib/common.cpp new file mode 100644 index 00000000000..81b00f7a369 --- /dev/null +++ b/c_glib/arrow-flight-glib/common.cpp @@ -0,0 +1,1467 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include + +G_BEGIN_DECLS + +/** + * SECTION: common + * @section_id: common + * @title: Classes both for client and server + * @include: arrow-flight-glib/arrow-flight-glib.h + * + * #GAFlightCriteria is a class for criteria. + * + * #GAFlightLocation is a class for location. + * + * #GAFlightDescriptor is a base class for all descriptor classes such + * as #GAFlightPathDescriptor. 
+ * + * #GAFlightPathDescriptor is a class for path descriptor. + * + * #GAFlightCommandDescriptor is a class for command descriptor. + * + * #GAFlightTicket is a class for ticket. + * + * #GAFlightEndpoint is a class for endpoint. + * + * #GAFlightInfo is a class for flight information. + * + * #GAFlightStreamChunk is a class for a chunk in stream. + * + * #GAFlightRecordBatchReader is a class for reading record batches. + * + * Since: 5.0.0 + */ + +typedef struct GAFlightCriteriaPrivate_ { + arrow::flight::Criteria criteria; + GBytes *expression; +} GAFlightCriteriaPrivate; + +enum { + PROP_EXPRESSION = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightCriteria, + gaflight_criteria, + G_TYPE_OBJECT) + +#define GAFLIGHT_CRITERIA_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_criteria_get_instance_private( \ + GAFLIGHT_CRITERIA(obj))) + +static void +gaflight_criteria_dispose(GObject *object) +{ + auto priv = GAFLIGHT_CRITERIA_GET_PRIVATE(object); + + if (priv->expression) { + g_bytes_unref(priv->expression); + priv->expression = NULL; + } + + G_OBJECT_CLASS(gaflight_criteria_parent_class)->dispose(object); +} + +static void +gaflight_criteria_finalize(GObject *object) +{ + auto priv = GAFLIGHT_CRITERIA_GET_PRIVATE(object); + + priv->criteria.~Criteria(); + + G_OBJECT_CLASS(gaflight_criteria_parent_class)->finalize(object); +} + +static void +gaflight_criteria_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_CRITERIA_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_EXPRESSION: + if (priv->expression) { + g_bytes_unref(priv->expression); + } + priv->expression = static_cast(g_value_dup_boxed(value)); + { + gsize size; + auto data = g_bytes_get_data(priv->expression, &size); + priv->criteria.expression.assign(static_cast(data), + size); + } + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_criteria_get_property(GObject *object, 
+ guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_CRITERIA_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_EXPRESSION: + g_value_set_boxed(value, priv->expression); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_criteria_init(GAFlightCriteria *object) +{ + auto priv = GAFLIGHT_CRITERIA_GET_PRIVATE(object); + new(&priv->criteria) arrow::flight::Criteria; +} + +static void +gaflight_criteria_class_init(GAFlightCriteriaClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gaflight_criteria_dispose; + gobject_class->finalize = gaflight_criteria_finalize; + gobject_class->set_property = gaflight_criteria_set_property; + gobject_class->get_property = gaflight_criteria_get_property; + + GParamSpec *spec; + /** + * GAFlightCriteria:expression: + * + * Opaque criteria expression, dependent on server implementation. + * + * Since: 5.0.0 + */ + spec = g_param_spec_boxed("expression", + "Expression", + "Opaque criteria expression, " + "dependent on server implementation", + G_TYPE_BYTES, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_EXPRESSION, spec); +} + +/** + * gaflight_criteria_new: + * @expression: A #GBytes. + * + * Returns: The newly created #GAFlightCriteria, %NULL on error. 
+ * + * Since: 5.0.0 + */ +GAFlightCriteria * +gaflight_criteria_new(GBytes *expression) +{ + return GAFLIGHT_CRITERIA( + g_object_new(GAFLIGHT_TYPE_CRITERIA, + "expression", expression, + NULL)); +} + + +typedef struct GAFlightLocationPrivate_ { + arrow::flight::Location location; +} GAFlightLocationPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightLocation, + gaflight_location, + G_TYPE_OBJECT) + +#define GAFLIGHT_LOCATION_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_location_get_instance_private( \ + GAFLIGHT_LOCATION(obj))) + +static void +gaflight_location_finalize(GObject *object) +{ + auto priv = GAFLIGHT_LOCATION_GET_PRIVATE(object); + + priv->location.~Location(); + + G_OBJECT_CLASS(gaflight_location_parent_class)->finalize(object); +} + +static void +gaflight_location_init(GAFlightLocation *object) +{ + auto priv = GAFLIGHT_LOCATION_GET_PRIVATE(object); + new(&priv->location) arrow::flight::Location; +} + +static void +gaflight_location_class_init(GAFlightLocationClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_location_finalize; +} + +/** + * gaflight_location_new: + * @uri: An URI to specify location. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): The newly created location, %NULL on error. + * + * Since: 5.0.0 + */ +GAFlightLocation * +gaflight_location_new(const gchar *uri, + GError **error) +{ + auto location = GAFLIGHT_LOCATION(g_object_new(GAFLIGHT_TYPE_LOCATION, NULL)); + auto flight_location = gaflight_location_get_raw(location); + if (garrow::check(error, + arrow::flight::Location::Parse(uri, flight_location), + "[flight-location][new]")) { + return location; + } else { + g_object_unref(location); + return NULL; + } +} + +/** + * gaflight_location_to_string: + * @location: A #GAFlightLocation. + * + * Returns: A representation of this URI as a string. + * + * It should be freed with g_free() when no longer needed. 
+ * + * Since: 5.0.0 + */ +gchar * +gaflight_location_to_string(GAFlightLocation *location) +{ + const auto flight_location = gaflight_location_get_raw(location); + return g_strdup(flight_location->ToString().c_str()); +} + +/** + * gaflight_location_get_scheme: + * @location: A #GAFlightLocation. + * + * Returns: The scheme of this URI. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 5.0.0 + */ +gchar * +gaflight_location_get_scheme(GAFlightLocation *location) +{ + const auto flight_location = gaflight_location_get_raw(location); + return g_strdup(flight_location->scheme().c_str()); +} + +/** + * gaflight_location_equal: + * @location: A #GAFlightLocation. + * @other_location: A #GAFlightLocation to be compared. + * + * Returns: %TRUE if both of them represents the same URI, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +gaflight_location_equal(GAFlightLocation *location, + GAFlightLocation *other_location) +{ + const auto flight_location = gaflight_location_get_raw(location); + const auto flight_other_location = gaflight_location_get_raw(other_location); + return flight_location->Equals(*flight_other_location); +} + + +typedef struct GAFlightDescriptorPrivate_ { + arrow::flight::FlightDescriptor descriptor; +} GAFlightDescriptorPrivate; + +enum { + PROP_DESCRIPTOR = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightDescriptor, + gaflight_descriptor, + G_TYPE_OBJECT) + +#define GAFLIGHT_DESCRIPTOR_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_descriptor_get_instance_private( \ + GAFLIGHT_DESCRIPTOR(obj))) + +static void +gaflight_descriptor_finalize(GObject *object) +{ + auto priv = GAFLIGHT_DESCRIPTOR_GET_PRIVATE(object); + + priv->descriptor.~FlightDescriptor(); + + G_OBJECT_CLASS(gaflight_descriptor_parent_class)->finalize(object); +} + +static void +gaflight_descriptor_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DESCRIPTOR_GET_PRIVATE(object); + 
+  switch (prop_id) {
+  case PROP_DESCRIPTOR:
+    priv->descriptor = *static_cast<arrow::flight::FlightDescriptor *>(
+      g_value_get_pointer(value));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gaflight_descriptor_init(GAFlightDescriptor *object)
+{
+  auto priv = GAFLIGHT_DESCRIPTOR_GET_PRIVATE(object);
+  new(&priv->descriptor) arrow::flight::FlightDescriptor;
+}
+
+static void
+gaflight_descriptor_class_init(GAFlightDescriptorClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize = gaflight_descriptor_finalize;
+  gobject_class->set_property = gaflight_descriptor_set_property;
+
+  GParamSpec *spec;
+  spec = g_param_spec_pointer("descriptor",
+                              "Descriptor",
+                              "The raw arrow::flight::FlightDescriptor",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));
+  /* Must register under this class's own property ID (PROP_DESCRIPTOR),
+   * not GAFlightCriteria's PROP_EXPRESSION: both happen to equal 1 today,
+   * so the copy-paste only worked by coincidence. */
+  g_object_class_install_property(gobject_class, PROP_DESCRIPTOR, spec);
+}
+
+/**
+ * gaflight_descriptor_to_string:
+ * @descriptor: A #GAFlightDescriptor.
+ *
+ * Returns: A descriptor as a string.
+ *
+ * It should be freed with g_free() when no longer needed.
+ *
+ * Since: 5.0.0
+ */
+gchar *
+gaflight_descriptor_to_string(GAFlightDescriptor *descriptor)
+{
+  auto flight_descriptor = gaflight_descriptor_get_raw(descriptor);
+  return g_strdup(flight_descriptor->ToString().c_str());
+}
+
+/**
+ * gaflight_descriptor_equal:
+ * @descriptor: A #GAFlightDescriptor.
+ * @other_descriptor: A #GAFlightDescriptor to be compared.
+ *
+ * Returns: %TRUE if both of them represents the same descriptor,
+ *   %FALSE otherwise.
+ * + * Since: 5.0.0 + */ +gboolean +gaflight_descriptor_equal(GAFlightDescriptor *descriptor, + GAFlightDescriptor *other_descriptor) +{ + const auto flight_descriptor = + gaflight_descriptor_get_raw(descriptor); + const auto flight_other_descriptor = + gaflight_descriptor_get_raw(other_descriptor); + return flight_descriptor->Equals(*flight_other_descriptor); +} + + +G_DEFINE_TYPE(GAFlightPathDescriptor, + gaflight_path_descriptor, + GAFLIGHT_TYPE_DESCRIPTOR) + +static void +gaflight_path_descriptor_init(GAFlightPathDescriptor *object) +{ +} + +static void +gaflight_path_descriptor_class_init(GAFlightPathDescriptorClass *klass) +{ +} + +/** + * gaflight_path_descriptor_new: + * @paths: (array length=n_paths): List of paths identifying a + * particular dataset. + * @n_paths: The number of @paths. + * + * Returns: The newly created #GAFlightPathDescriptor. + * + * Since: 5.0.0 + */ +GAFlightPathDescriptor * +gaflight_path_descriptor_new(const gchar **paths, + gsize n_paths) +{ + std::vector flight_paths; + for (gsize i = 0; i < n_paths; i++) { + flight_paths.push_back(paths[i]); + } + auto flight_descriptor = arrow::flight::FlightDescriptor::Path(flight_paths); + return GAFLIGHT_PATH_DESCRIPTOR( + gaflight_descriptor_new_raw(&flight_descriptor)); +} + +/** + * gaflight_path_descriptor_get_paths: + * @descriptor: A #GAFlightPathDescriptor. + * + * Returns: (nullable) (array zero-terminated=1) (transfer full): + * The paths in this descriptor. + * + * It must be freed with g_strfreev() when no longer needed. 
+ * + * Since: 5.0.0 + */ +gchar ** +gaflight_path_descriptor_get_paths(GAFlightPathDescriptor *descriptor) +{ + const auto flight_descriptor = + gaflight_descriptor_get_raw(GAFLIGHT_DESCRIPTOR(descriptor)); + const auto &flight_paths = flight_descriptor->path; + if (flight_paths.empty()) { + return NULL; + } else { + auto paths = g_new(gchar *, flight_paths.size() + 1); + gsize i = 0; + for (const auto &flight_path : flight_paths) { + paths[i++] = g_strdup(flight_path.c_str()); + } + paths[i] = NULL; + return paths; + } +} + + +G_DEFINE_TYPE(GAFlightCommandDescriptor, + gaflight_command_descriptor, + GAFLIGHT_TYPE_DESCRIPTOR) + +static void +gaflight_command_descriptor_init(GAFlightCommandDescriptor *object) +{ +} + +static void +gaflight_command_descriptor_class_init(GAFlightCommandDescriptorClass *klass) +{ +} + +/** + * gaflight_command_descriptor_new: + * @command: Opaque value used to express a command. + * + * Returns: The newly created #GAFlightCommandDescriptor. + * + * Since: 5.0.0 + */ +GAFlightCommandDescriptor * +gaflight_command_descriptor_new(const gchar *command) +{ + auto flight_descriptor = arrow::flight::FlightDescriptor::Command(command); + return GAFLIGHT_COMMAND_DESCRIPTOR( + gaflight_descriptor_new_raw(&flight_descriptor)); +} + +/** + * gaflight_command_descriptor_get_command: + * @descriptor: A #GAFlightCommandDescriptor. + * + * Returns: The opaque value used to express a command. + * + * It should be freed with g_free() when no longer needed. 
+ *
+ * Since: 5.0.0
+ */
+gchar *
+gaflight_command_descriptor_get_command(GAFlightCommandDescriptor *descriptor)
+{
+  const auto flight_descriptor =
+    gaflight_descriptor_get_raw(GAFLIGHT_DESCRIPTOR(descriptor));
+  const auto &flight_command = flight_descriptor->cmd;
+  return g_strdup(flight_command.c_str());
+}
+
+
+typedef struct GAFlightTicketPrivate_ {
+  arrow::flight::Ticket ticket;
+  GBytes *data;
+} GAFlightTicketPrivate;
+
+enum {
+  PROP_DATA = 1,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GAFlightTicket,
+                           gaflight_ticket,
+                           G_TYPE_OBJECT)
+
+#define GAFLIGHT_TICKET_GET_PRIVATE(obj)     \
+  static_cast<GAFlightTicketPrivate *>(      \
+    gaflight_ticket_get_instance_private(    \
+      GAFLIGHT_TICKET(obj)))
+
+static void
+gaflight_ticket_dispose(GObject *object)
+{
+  auto priv = GAFLIGHT_TICKET_GET_PRIVATE(object);
+
+  if (priv->data) {
+    g_bytes_unref(priv->data);
+    priv->data = NULL;
+  }
+
+  G_OBJECT_CLASS(gaflight_ticket_parent_class)->dispose(object);
+}
+
+static void
+gaflight_ticket_finalize(GObject *object)
+{
+  auto priv = GAFLIGHT_TICKET_GET_PRIVATE(object);
+
+  priv->ticket.~Ticket();
+
+  G_OBJECT_CLASS(gaflight_ticket_parent_class)->finalize(object);
+}
+
+static void
+gaflight_ticket_set_property(GObject *object,
+                             guint prop_id,
+                             const GValue *value,
+                             GParamSpec *pspec)
+{
+  auto priv = GAFLIGHT_TICKET_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_DATA:
+    if (priv->data) {
+      g_bytes_unref(priv->data);
+    }
+    priv->data = static_cast<GBytes *>(g_value_dup_boxed(value));
+    {
+      gsize size;
+      auto data = g_bytes_get_data(priv->data, &size);
+      priv->ticket.ticket.assign(static_cast<const gchar *>(data),
+                                 size);
+    }
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gaflight_ticket_get_property(GObject *object,
+                             guint prop_id,
+                             GValue *value,
+                             GParamSpec *pspec)
+{
+  auto priv = GAFLIGHT_TICKET_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_DATA:
+    g_value_set_boxed(value, priv->data);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gaflight_ticket_init(GAFlightTicket *object)
+{
+  auto priv = GAFLIGHT_TICKET_GET_PRIVATE(object);
+  new(&priv->ticket) arrow::flight::Ticket;
+}
+
+static void
+gaflight_ticket_class_init(GAFlightTicketClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->dispose = gaflight_ticket_dispose;
+  gobject_class->finalize = gaflight_ticket_finalize;
+  gobject_class->set_property = gaflight_ticket_set_property;
+  gobject_class->get_property = gaflight_ticket_get_property;
+
+  GParamSpec *spec;
+  /**
+   * GAFlightTicket:data:
+   *
+   * Opaque identifier or credential to use when requesting a data
+   * stream with the DoGet RPC.
+   *
+   * Since: 5.0.0
+   */
+  spec = g_param_spec_boxed("data",
+                            "Data",
+                            "Opaque identifier or credential to use "
+                            "when requesting a data stream with the DoGet RPC",
+                            G_TYPE_BYTES,
+                            static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_DATA, spec);
+}
+
+/**
+ * gaflight_ticket_new:
+ * @data: A #GBytes.
+ *
+ * Returns: The newly created #GAFlightTicket, %NULL on error.
+ *
+ * Since: 5.0.0
+ */
+GAFlightTicket *
+gaflight_ticket_new(GBytes *data)
+{
+  return GAFLIGHT_TICKET(
+    g_object_new(GAFLIGHT_TYPE_TICKET,
+                 "data", data,
+                 NULL));
+}
+
+/**
+ * gaflight_ticket_equal:
+ * @ticket: A #GAFlightTicket.
+ * @other_ticket: A #GAFlightTicket to be compared.
+ *
+ * Returns: %TRUE if both of them represents the same ticket, %FALSE otherwise.
+ *
+ * Since: 5.0.0
+ */
+gboolean
+gaflight_ticket_equal(GAFlightTicket *ticket,
+                      GAFlightTicket *other_ticket)
+{
+  const auto flight_ticket = gaflight_ticket_get_raw(ticket);
+  const auto flight_other_ticket = gaflight_ticket_get_raw(other_ticket);
+  return flight_ticket->Equals(*flight_other_ticket);
+}
+
+
+typedef struct GAFlightEndpointPrivate_ {
+  arrow::flight::FlightEndpoint endpoint;
+  GAFlightTicket *ticket;
+  GList *locations;
+} GAFlightEndpointPrivate;
+
+enum {
+  PROP_TICKET = 1,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GAFlightEndpoint,
+                           gaflight_endpoint,
+                           G_TYPE_OBJECT)
+
+#define GAFLIGHT_ENDPOINT_GET_PRIVATE(obj)     \
+  static_cast<GAFlightEndpointPrivate *>(      \
+    gaflight_endpoint_get_instance_private(    \
+      GAFLIGHT_ENDPOINT(obj)))
+
+static void
+gaflight_endpoint_dispose(GObject *object)
+{
+  auto priv = GAFLIGHT_ENDPOINT_GET_PRIVATE(object);
+
+  if (priv->ticket) {
+    g_object_unref(priv->ticket);
+    priv->ticket = NULL;
+  }
+
+  if (priv->locations) {
+    g_list_free_full(priv->locations, g_object_unref);
+    priv->locations = NULL;
+  }
+
+  G_OBJECT_CLASS(gaflight_endpoint_parent_class)->dispose(object);
+}
+
+static void
+gaflight_endpoint_finalize(GObject *object)
+{
+  auto priv = GAFLIGHT_ENDPOINT_GET_PRIVATE(object);
+
+  priv->endpoint.~FlightEndpoint();
+
+  G_OBJECT_CLASS(gaflight_endpoint_parent_class)->finalize(object);
+}
+
+static void
+gaflight_endpoint_get_property(GObject *object,
+                               guint prop_id,
+                               GValue *value,
+                               GParamSpec *pspec)
+{
+  auto priv = GAFLIGHT_ENDPOINT_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_TICKET:
+    g_value_set_object(value, priv->ticket);
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gaflight_endpoint_init(GAFlightEndpoint *object)
+{
+  auto priv = GAFLIGHT_ENDPOINT_GET_PRIVATE(object);
+  new(&priv->endpoint) arrow::flight::FlightEndpoint;
+}
+
+static void
+gaflight_endpoint_class_init(GAFlightEndpointClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->dispose = gaflight_endpoint_dispose;
+  gobject_class->finalize = gaflight_endpoint_finalize;
+  gobject_class->get_property = gaflight_endpoint_get_property;
+
+  GParamSpec *spec;
+  /**
+   * GAFlightEndpoint:ticket:
+   *
+   * Opaque ticket identify; use with DoGet RPC.
+   *
+   * Since: 5.0.0
+   */
+  spec = g_param_spec_object("ticket",
+                             "Ticket",
+                             "Opaque ticket identify; use with DoGet RPC",
+                             GAFLIGHT_TYPE_TICKET,
+                             static_cast<GParamFlags>(G_PARAM_READABLE));
+  g_object_class_install_property(gobject_class, PROP_TICKET, spec);
+}
+
+/**
+ * gaflight_endpoint_new:
+ * @ticket: A #GAFlightTicket.
+ * @locations: (element-type GAFlightLocation): A list of #GAFlightLocation.
+ *
+ * Returns: The newly created #GAFlightEndpoint, %NULL on error.
+ *
+ * Since: 5.0.0
+ */
+GAFlightEndpoint *
+gaflight_endpoint_new(GAFlightTicket *ticket,
+                      GList *locations)
+{
+  auto endpoint = gaflight_endpoint_new_raw(nullptr, ticket);
+  auto priv = GAFLIGHT_ENDPOINT_GET_PRIVATE(endpoint);
+  for (auto node = locations; node; node = node->next) {
+    auto location = GAFLIGHT_LOCATION(node->data);
+    priv->endpoint.locations.push_back(*gaflight_location_get_raw(location));
+  }
+  return endpoint;
+}
+
+/**
+ * gaflight_endpoint_equal:
+ * @endpoint: A #GAFlightEndpoint.
+ * @other_endpoint: A #GAFlightEndpoint to be compared.
+ *
+ * Returns: %TRUE if both of them represents the same endpoint,
+ *   %FALSE otherwise.
+ *
+ * Since: 5.0.0
+ */
+gboolean
+gaflight_endpoint_equal(GAFlightEndpoint *endpoint,
+                        GAFlightEndpoint *other_endpoint)
+{
+  const auto flight_endpoint = gaflight_endpoint_get_raw(endpoint);
+  const auto flight_other_endpoint = gaflight_endpoint_get_raw(other_endpoint);
+  return flight_endpoint->Equals(*flight_other_endpoint);
+}
+
+/**
+ * gaflight_endpoint_get_locations:
+ * @endpoint: A #GAFlightEndpoint.
+ *
+ * Returns: (nullable) (element-type GAFlightLocation) (transfer full):
+ *   The locations in this endpoint.
+ *
+ * It must be freed with g_list_free() and g_object_unref() when no
+ * longer needed. You can use `g_list_free_full(locations,
+ * g_object_unref)`.
+ *
+ * Since: 5.0.0
+ */
+GList *
+gaflight_endpoint_get_locations(GAFlightEndpoint *endpoint)
+{
+  const auto flight_endpoint = gaflight_endpoint_get_raw(endpoint);
+  GList *locations = NULL;
+  for (const auto &flight_location : flight_endpoint->locations) {
+    auto location = gaflight_location_new(flight_location.ToString().c_str(),
+                                          nullptr);
+    locations = g_list_prepend(locations, location);
+  }
+  return g_list_reverse(locations);
+}
+
+
+typedef struct GAFlightInfoPrivate_ {
+  arrow::flight::FlightInfo info;
+} GAFlightInfoPrivate;
+
+enum {
+  PROP_INFO = 1,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GAFlightInfo,
+                           gaflight_info,
+                           G_TYPE_OBJECT)
+
+#define GAFLIGHT_INFO_GET_PRIVATE(obj)     \
+  static_cast<GAFlightInfoPrivate *>(      \
+    gaflight_info_get_instance_private(    \
+      GAFLIGHT_INFO(obj)))
+
+static void
+gaflight_info_finalize(GObject *object)
+{
+  auto priv = GAFLIGHT_INFO_GET_PRIVATE(object);
+
+  priv->info.~FlightInfo();
+
+  G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object);
+}
+
+static void
+gaflight_info_set_property(GObject *object,
+                           guint prop_id,
+                           const GValue *value,
+                           GParamSpec *pspec)
+{
+  auto priv = GAFLIGHT_INFO_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_INFO:
+    {
+      auto info =
+        static_cast<arrow::flight::FlightInfo *>(g_value_get_pointer(value));
+      new(&(priv->info)) arrow::flight::FlightInfo(*info);
+    }
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gaflight_info_init(GAFlightInfo *object)
+{
+}
+
+static void
+gaflight_info_class_init(GAFlightInfoClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize = gaflight_info_finalize;
+  gobject_class->set_property = gaflight_info_set_property;
+
+  GParamSpec *spec;
+  spec = g_param_spec_pointer("info",
+                              "Info",
+                              "The raw arrow::flight::FlightInfo *",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_INFO, spec);
+}
+
+/**
+ * gaflight_info_new:
+ * @schema: A #GArrowSchema.
+ * @descriptor: A #GAFlightDescriptor.
+ * @endpoints: (element-type GAFlightEndpoint): A list of #GAFlightEndpoint.
+ * @total_records: The number of total records.
+ * @total_bytes: The number of total bytes.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): The newly created #GAFlightInfo, %NULL on error.
+ *
+ * Since: 5.0.0
+ */
+GAFlightInfo *
+gaflight_info_new(GArrowSchema *schema,
+                  GAFlightDescriptor *descriptor,
+                  GList *endpoints,
+                  gint64 total_records,
+                  gint64 total_bytes,
+                  GError **error)
+{
+  auto arrow_schema = garrow_schema_get_raw(schema);
+  auto flight_descriptor = gaflight_descriptor_get_raw(descriptor);
+  std::vector<arrow::flight::FlightEndpoint> flight_endpoints;
+  for (auto node = endpoints; node; node = node->next) {
+    auto endpoint = GAFLIGHT_ENDPOINT(node->data);
+    flight_endpoints.push_back(*gaflight_endpoint_get_raw(endpoint));
+  }
+  auto flight_info_result =
+    arrow::flight::FlightInfo::Make(*arrow_schema,
+                                    *flight_descriptor,
+                                    flight_endpoints,
+                                    total_records,
+                                    total_bytes);
+  if (!garrow::check(error,
+                     flight_info_result,
+                     "[flight-info][new]")) {
+    return NULL;
+  }
+  return gaflight_info_new_raw(&(*flight_info_result));
+}
+
+/**
+ * gaflight_info_equal:
+ * @info: A #GAFlightInfo.
+ * @other_info: A #GAFlightInfo to be compared.
+ *
+ * Returns: %TRUE if both of them represents the same information,
+ *   %FALSE otherwise.
+ *
+ * Since: 5.0.0
+ */
+gboolean
+gaflight_info_equal(GAFlightInfo *info,
+                    GAFlightInfo *other_info)
+{
+  const auto flight_info = gaflight_info_get_raw(info);
+  const auto flight_other_info = gaflight_info_get_raw(other_info);
+  return
+    (flight_info->serialized_schema() ==
+     flight_other_info->serialized_schema()) &&
+    (flight_info->descriptor() ==
+     flight_other_info->descriptor()) &&
+    (flight_info->endpoints() ==
+     flight_other_info->endpoints()) &&
+    (flight_info->total_records() ==
+     flight_other_info->total_records()) &&
+    (flight_info->total_bytes() ==
+     flight_other_info->total_bytes());
+}
+
+/**
+ * gaflight_info_get_schema:
+ * @info: A #GAFlightInfo.
+ * @options: (nullable): A #GArrowReadOptions.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): Deserialized #GArrowSchema, %NULL on error.
+ *
+ * Since: 5.0.0
+ */
+GArrowSchema *
+gaflight_info_get_schema(GAFlightInfo *info,
+                         GArrowReadOptions *options,
+                         GError **error)
+{
+  const auto flight_info = gaflight_info_get_raw(info);
+  arrow::Status status;
+  std::shared_ptr<arrow::Schema> arrow_schema;
+  if (options) {
+    auto arrow_memo = garrow_read_options_get_dictionary_memo_raw(options);
+    status = flight_info->GetSchema(arrow_memo, &arrow_schema);
+  } else {
+    arrow::ipc::DictionaryMemo arrow_memo;
+    status = flight_info->GetSchema(&arrow_memo, &arrow_schema);
+  }
+  if (garrow::check(error, status, "[flight-info][get-schema]")) {
+    return garrow_schema_new_raw(&arrow_schema);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * gaflight_info_get_descriptor:
+ * @info: A #GAFlightInfo.
+ *
+ * Returns: (transfer full): The #GAFlightDescriptor of the information.
+ *
+ * Since: 5.0.0
+ */
+GAFlightDescriptor *
+gaflight_info_get_descriptor(GAFlightInfo *info)
+{
+  const auto flight_info = gaflight_info_get_raw(info);
+  return gaflight_descriptor_new_raw(&(flight_info->descriptor()));
+}
+
+/**
+ * gaflight_info_get_endpoints:
+ * @info: A #GAFlightInfo.
+ *
+ * Returns: (element-type GAFlightEndpoint) (transfer full):
+ *   The list of #GAFlightEndpoint of the information.
+ *
+ * Since: 5.0.0
+ */
+GList *
+gaflight_info_get_endpoints(GAFlightInfo *info)
+{
+  const auto flight_info = gaflight_info_get_raw(info);
+  GList *endpoints = NULL;
+  for (const auto &flight_endpoint : flight_info->endpoints()) {
+    auto endpoint = gaflight_endpoint_new_raw(&flight_endpoint, nullptr);
+    endpoints = g_list_prepend(endpoints, endpoint);
+  }
+  return g_list_reverse(endpoints);
+}
+
+/**
+ * gaflight_info_get_total_records:
+ * @info: A #GAFlightInfo.
+ *
+ * Returns: The number of total records of the information.
+ *
+ * Since: 5.0.0
+ */
+gint64
+gaflight_info_get_total_records(GAFlightInfo *info)
+{
+  const auto flight_info = gaflight_info_get_raw(info);
+  return flight_info->total_records();
+}
+
+/**
+ * gaflight_info_get_total_bytes:
+ * @info: A #GAFlightInfo.
+ *
+ * Returns: The number of total bytes of the information.
+ *
+ * Since: 5.0.0
+ */
+gint64
+gaflight_info_get_total_bytes(GAFlightInfo *info)
+{
+  const auto flight_info = gaflight_info_get_raw(info);
+  return flight_info->total_bytes();
+}
+
+typedef struct GAFlightStreamChunkPrivate_ {
+  arrow::flight::FlightStreamChunk chunk;
+} GAFlightStreamChunkPrivate;
+
+enum {
+  PROP_CHUNK = 1,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GAFlightStreamChunk,
+                           gaflight_stream_chunk,
+                           G_TYPE_OBJECT)
+
+#define GAFLIGHT_STREAM_CHUNK_GET_PRIVATE(obj)     \
+  static_cast<GAFlightStreamChunkPrivate *>(       \
+    gaflight_stream_chunk_get_instance_private(    \
+      GAFLIGHT_STREAM_CHUNK(obj)))
+
+static void
+gaflight_stream_chunk_finalize(GObject *object)
+{
+  auto priv = GAFLIGHT_STREAM_CHUNK_GET_PRIVATE(object);
+
+  priv->chunk.~FlightStreamChunk();
+
+  G_OBJECT_CLASS(gaflight_stream_chunk_parent_class)->finalize(object);
+}
+
+static void
+gaflight_stream_chunk_set_property(GObject *object,
+                                   guint prop_id,
+                                   const GValue *value,
+                                   GParamSpec *pspec)
+{
+  auto priv = GAFLIGHT_STREAM_CHUNK_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_CHUNK:
+    priv->chunk =
+      *static_cast<arrow::flight::FlightStreamChunk *>(
+        g_value_get_pointer(value));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gaflight_stream_chunk_init(GAFlightStreamChunk *object)
+{
+}
+
+static void
+gaflight_stream_chunk_class_init(GAFlightStreamChunkClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize = gaflight_stream_chunk_finalize;
+  gobject_class->set_property = gaflight_stream_chunk_set_property;
+
+  GParamSpec *spec;
+  spec = g_param_spec_pointer("chunk",
+                              "Stream chunk",
+                              "The raw arrow::flight::FlightStreamChunk *",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_CHUNK, spec);
+}
+
+/**
+ * gaflight_stream_chunk_get_data:
+ * @chunk: A #GAFlightStreamChunk.
+ *
+ * Returns: (transfer full): The data of the chunk.
+ *
+ * Since: 6.0.0
+ */
+GArrowRecordBatch *
+gaflight_stream_chunk_get_data(GAFlightStreamChunk *chunk)
+{
+  auto flight_chunk = gaflight_stream_chunk_get_raw(chunk);
+  return garrow_record_batch_new_raw(&(flight_chunk->data));
+}
+
+/**
+ * gaflight_stream_chunk_get_metadata:
+ * @chunk: A #GAFlightStreamChunk.
+ *
+ * Returns: (nullable) (transfer full): The metadata of the chunk.
+ *
+ * The metadata may be NULL.
+ *
+ * Since: 6.0.0
+ */
+GArrowBuffer *
+gaflight_stream_chunk_get_metadata(GAFlightStreamChunk *chunk)
+{
+  auto flight_chunk = gaflight_stream_chunk_get_raw(chunk);
+  if (flight_chunk->app_metadata) {
+    return garrow_buffer_new_raw(&(flight_chunk->app_metadata));
+  } else {
+    return NULL;
+  }
+}
+
+
+typedef struct GAFlightRecordBatchReaderPrivate_ {
+  arrow::flight::MetadataRecordBatchReader *reader;
+} GAFlightRecordBatchReaderPrivate;
+
+enum {
+  PROP_READER = 1,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GAFlightRecordBatchReader,
+                           gaflight_record_batch_reader,
+                           G_TYPE_OBJECT)
+
+#define GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(obj)     \
+  static_cast<GAFlightRecordBatchReaderPrivate *>(        \
+    gaflight_record_batch_reader_get_instance_private(    \
+      GAFLIGHT_RECORD_BATCH_READER(obj)))
+
+static void
+gaflight_record_batch_reader_finalize(GObject *object)
+{
+  auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(object);
+
+  delete priv->reader;
+
+  G_OBJECT_CLASS(gaflight_record_batch_reader_parent_class)->finalize(object);
+}
+
+static void
+gaflight_record_batch_reader_set_property(GObject *object,
+                                          guint prop_id,
+                                          const GValue *value,
+                                          GParamSpec *pspec)
+{
+  auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_READER:
+    priv->reader =
+      static_cast<arrow::flight::MetadataRecordBatchReader *>(
+        g_value_get_pointer(value));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+gaflight_record_batch_reader_init(GAFlightRecordBatchReader *object)
+{
+}
+
+static void
+gaflight_record_batch_reader_class_init(GAFlightRecordBatchReaderClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize = gaflight_record_batch_reader_finalize;
+  gobject_class->set_property = gaflight_record_batch_reader_set_property;
+
+  GParamSpec *spec;
+  spec = g_param_spec_pointer("reader",
+                              "Reader",
+                              "The raw arrow::flight::MetadataRecordBatchReader *",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_READER, spec);
+}
+
+/**
+ * gaflight_record_batch_reader_read_next:
+ * @reader: A #GAFlightRecordBatchReader.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): The next chunk on success, %NULL on end
+ *   of stream, %NULL on error.
+ *
+ * Since: 6.0.0
+ */
+GAFlightStreamChunk *
+gaflight_record_batch_reader_read_next(GAFlightRecordBatchReader *reader,
+                                       GError **error)
+{
+  auto flight_reader = gaflight_record_batch_reader_get_raw(reader);
+  arrow::flight::FlightStreamChunk flight_chunk;
+  auto status = flight_reader->Next(&flight_chunk);
+  if (garrow::check(error, status, "[flight-record-batch-reader][read-next]")) {
+    if (flight_chunk.data) {
+      return gaflight_stream_chunk_new_raw(&flight_chunk);
+    } else {
+      return NULL;
+    }
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * gaflight_record_batch_reader_read_all:
+ * @reader: A #GAFlightRecordBatchReader.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): The all data on success, %NULL on error.
+ * + * Since: 6.0.0 + */ +GArrowTable * +gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, + GError **error) +{ + auto flight_reader = gaflight_record_batch_reader_get_raw(reader); + std::shared_ptr arrow_table; + auto status = flight_reader->ReadAll(&arrow_table); + if (garrow::check(error, status, "[flight-record-batch-reader][read-all]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + + +G_END_DECLS + + +GAFlightCriteria * +gaflight_criteria_new_raw(const arrow::flight::Criteria *flight_criteria) +{ + auto criteria = g_object_new(GAFLIGHT_TYPE_CRITERIA, NULL); + auto priv = GAFLIGHT_CRITERIA_GET_PRIVATE(criteria); + priv->criteria = *flight_criteria; + priv->expression = g_bytes_new(priv->criteria.expression.data(), + priv->criteria.expression.size()); + return GAFLIGHT_CRITERIA(criteria); +} + +arrow::flight::Criteria * +gaflight_criteria_get_raw(GAFlightCriteria *criteria) +{ + auto priv = GAFLIGHT_CRITERIA_GET_PRIVATE(criteria); + return &(priv->criteria); +} + +arrow::flight::Location * +gaflight_location_get_raw(GAFlightLocation *location) +{ + auto priv = GAFLIGHT_LOCATION_GET_PRIVATE(location); + return &(priv->location); +} + +GAFlightDescriptor * +gaflight_descriptor_new_raw( + const arrow::flight::FlightDescriptor *flight_descriptor) +{ + GType gtype = GAFLIGHT_TYPE_DESCRIPTOR; + switch (flight_descriptor->type) { + case arrow::flight::FlightDescriptor::DescriptorType::PATH: + gtype = GAFLIGHT_TYPE_PATH_DESCRIPTOR; + break; + case arrow::flight::FlightDescriptor::DescriptorType::CMD: + gtype = GAFLIGHT_TYPE_COMMAND_DESCRIPTOR; + break; + default: + break; + } + return GAFLIGHT_DESCRIPTOR(g_object_new(gtype, + "descriptor", flight_descriptor, + NULL)); +} + +arrow::flight::FlightDescriptor * +gaflight_descriptor_get_raw(GAFlightDescriptor *descriptor) +{ + auto priv = GAFLIGHT_DESCRIPTOR_GET_PRIVATE(descriptor); + return &(priv->descriptor); +} + +GAFlightTicket * +gaflight_ticket_new_raw(const 
arrow::flight::Ticket *flight_ticket) +{ + auto ticket = g_object_new(GAFLIGHT_TYPE_TICKET, NULL); + auto priv = GAFLIGHT_TICKET_GET_PRIVATE(ticket); + priv->ticket = *flight_ticket; + priv->data = g_bytes_new(priv->ticket.ticket.data(), + priv->ticket.ticket.size()); + return GAFLIGHT_TICKET(ticket); +} + +arrow::flight::Ticket * +gaflight_ticket_get_raw(GAFlightTicket *ticket) +{ + auto priv = GAFLIGHT_TICKET_GET_PRIVATE(ticket); + return &(priv->ticket); +} + +GAFlightEndpoint * +gaflight_endpoint_new_raw(const arrow::flight::FlightEndpoint *flight_endpoint, + GAFlightTicket *ticket) +{ + auto endpoint = GAFLIGHT_ENDPOINT(g_object_new(GAFLIGHT_TYPE_ENDPOINT, + NULL)); + auto priv = GAFLIGHT_ENDPOINT_GET_PRIVATE(endpoint); + if (ticket) { + priv->ticket = ticket; + g_object_ref(priv->ticket); + priv->endpoint.ticket = *gaflight_ticket_get_raw(priv->ticket); + } else { + auto data = g_bytes_new(flight_endpoint->ticket.ticket.data(), + flight_endpoint->ticket.ticket.length()); + auto ticket = gaflight_ticket_new(data); + g_bytes_unref(data); + priv->ticket = ticket; + priv->endpoint.ticket.ticket = flight_endpoint->ticket.ticket; + } + if (flight_endpoint) { + priv->endpoint.locations = flight_endpoint->locations; + } + return endpoint; +} + +arrow::flight::FlightEndpoint * +gaflight_endpoint_get_raw(GAFlightEndpoint *endpoint) +{ + auto priv = GAFLIGHT_ENDPOINT_GET_PRIVATE(endpoint); + return &(priv->endpoint); +} + +GAFlightInfo * +gaflight_info_new_raw(arrow::flight::FlightInfo *flight_info) +{ + return GAFLIGHT_INFO(g_object_new(GAFLIGHT_TYPE_INFO, + "info", flight_info, + NULL)); +} + +arrow::flight::FlightInfo * +gaflight_info_get_raw(GAFlightInfo *info) +{ + auto priv = GAFLIGHT_INFO_GET_PRIVATE(info); + return &(priv->info); +} + +GAFlightStreamChunk * +gaflight_stream_chunk_new_raw(arrow::flight::FlightStreamChunk *flight_chunk) +{ + return GAFLIGHT_STREAM_CHUNK( + g_object_new(GAFLIGHT_TYPE_STREAM_CHUNK, + "chunk", flight_chunk, + NULL)); +} + 
+arrow::flight::FlightStreamChunk * +gaflight_stream_chunk_get_raw(GAFlightStreamChunk *chunk) +{ + auto priv = GAFLIGHT_STREAM_CHUNK_GET_PRIVATE(chunk); + return &(priv->chunk); +} + +arrow::flight::MetadataRecordBatchReader * +gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader) +{ + auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(reader); + return priv->reader; +} diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h new file mode 100644 index 00000000000..368fb665b47 --- /dev/null +++ b/c_glib/arrow-flight-glib/common.h @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +G_BEGIN_DECLS + + +#define GAFLIGHT_TYPE_CRITERIA (gaflight_criteria_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightCriteria, + gaflight_criteria, + GAFLIGHT, + CRITERIA, + GObject) +struct _GAFlightCriteriaClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightCriteria * +gaflight_criteria_new(GBytes *expression); + + +#define GAFLIGHT_TYPE_LOCATION (gaflight_location_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightLocation, + gaflight_location, + GAFLIGHT, + LOCATION, + GObject) +struct _GAFlightLocationClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightLocation * +gaflight_location_new(const gchar *uri, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +gchar * +gaflight_location_to_string(GAFlightLocation *location); + +GARROW_AVAILABLE_IN_5_0 +gchar * +gaflight_location_get_scheme(GAFlightLocation *location); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gaflight_location_equal(GAFlightLocation *location, + GAFlightLocation *other_location); + + +#define GAFLIGHT_TYPE_DESCRIPTOR (gaflight_descriptor_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightDescriptor, + gaflight_descriptor, + GAFLIGHT, + DESCRIPTOR, + GObject) +struct _GAFlightDescriptorClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +gchar * +gaflight_descriptor_to_string(GAFlightDescriptor *descriptor); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gaflight_descriptor_equal(GAFlightDescriptor *descriptor, + GAFlightDescriptor *other_descriptor); + + +#define GAFLIGHT_TYPE_PATH_DESCRIPTOR (gaflight_path_descriptor_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightPathDescriptor, + gaflight_path_descriptor, + GAFLIGHT, + PATH_DESCRIPTOR, + GAFlightDescriptor) +struct _GAFlightPathDescriptorClass +{ + GAFlightDescriptorClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightPathDescriptor * +gaflight_path_descriptor_new(const gchar **paths, + gsize n_paths); + +GARROW_AVAILABLE_IN_5_0 +gchar ** 
+gaflight_path_descriptor_get_paths(GAFlightPathDescriptor *descriptor); + + +#define GAFLIGHT_TYPE_COMMAND_DESCRIPTOR (gaflight_command_descriptor_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightCommandDescriptor, + gaflight_command_descriptor, + GAFLIGHT, + COMMAND_DESCRIPTOR, + GAFlightDescriptor) +struct _GAFlightCommandDescriptorClass +{ + GAFlightDescriptorClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightCommandDescriptor * +gaflight_command_descriptor_new(const gchar *command); + +GARROW_AVAILABLE_IN_5_0 +gchar * +gaflight_command_descriptor_get_command(GAFlightCommandDescriptor *descriptor); + + +#define GAFLIGHT_TYPE_TICKET (gaflight_ticket_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightTicket, + gaflight_ticket, + GAFLIGHT, + TICKET, + GObject) +struct _GAFlightTicketClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightTicket * +gaflight_ticket_new(GBytes *data); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gaflight_ticket_equal(GAFlightTicket *ticket, + GAFlightTicket *other_ticket); + + +#define GAFLIGHT_TYPE_ENDPOINT (gaflight_endpoint_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightEndpoint, + gaflight_endpoint, + GAFLIGHT, + ENDPOINT, + GObject) +struct _GAFlightEndpointClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightEndpoint * +gaflight_endpoint_new(GAFlightTicket *ticket, + GList *locations); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gaflight_endpoint_equal(GAFlightEndpoint *endpoint, + GAFlightEndpoint *other_endpoint); + +GARROW_AVAILABLE_IN_5_0 +GList * +gaflight_endpoint_get_locations(GAFlightEndpoint *endpoint); + + +#define GAFLIGHT_TYPE_INFO (gaflight_info_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightInfo, + gaflight_info, + GAFLIGHT, + INFO, + GObject) +struct _GAFlightInfoClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightInfo * +gaflight_info_new(GArrowSchema *schema, + GAFlightDescriptor *descriptor, + GList *endpoints, + gint64 total_records, + gint64 
total_bytes, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +gboolean +gaflight_info_equal(GAFlightInfo *info, + GAFlightInfo *other_info); + +GARROW_AVAILABLE_IN_5_0 +GArrowSchema * +gaflight_info_get_schema(GAFlightInfo *info, + GArrowReadOptions *options, + GError **error); +GARROW_AVAILABLE_IN_5_0 +GAFlightDescriptor * +gaflight_info_get_descriptor(GAFlightInfo *info); +GARROW_AVAILABLE_IN_5_0 +GList * +gaflight_info_get_endpoints(GAFlightInfo *info); +GARROW_AVAILABLE_IN_5_0 +gint64 +gaflight_info_get_total_records(GAFlightInfo *info); +GARROW_AVAILABLE_IN_5_0 +gint64 +gaflight_info_get_total_bytes(GAFlightInfo *info); + + +#define GAFLIGHT_TYPE_STREAM_CHUNK (gaflight_stream_chunk_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightStreamChunk, + gaflight_stream_chunk, + GAFLIGHT, + STREAM_CHUNK, + GObject) +struct _GAFlightStreamChunkClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GArrowRecordBatch * +gaflight_stream_chunk_get_data(GAFlightStreamChunk *chunk); +GARROW_AVAILABLE_IN_6_0 +GArrowBuffer * +gaflight_stream_chunk_get_metadata(GAFlightStreamChunk *chunk); + + +#define GAFLIGHT_TYPE_RECORD_BATCH_READER \ + (gaflight_record_batch_reader_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightRecordBatchReader, + gaflight_record_batch_reader, + GAFLIGHT, + RECORD_BATCH_READER, + GObject) +struct _GAFlightRecordBatchReaderClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GAFlightStreamChunk * +gaflight_record_batch_reader_read_next(GAFlightRecordBatchReader *reader, + GError **error); + +GARROW_AVAILABLE_IN_6_0 +GArrowTable * +gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, + GError **error); + + +G_END_DECLS diff --git a/c_glib/arrow-flight-glib/common.hpp b/c_glib/arrow-flight-glib/common.hpp new file mode 100644 index 00000000000..d23f7c8867f --- /dev/null +++ b/c_glib/arrow-flight-glib/common.hpp @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + + +GAFlightCriteria * +gaflight_criteria_new_raw(const arrow::flight::Criteria *flight_criteria); +arrow::flight::Criteria * +gaflight_criteria_get_raw(GAFlightCriteria *criteria); + +arrow::flight::Location * +gaflight_location_get_raw(GAFlightLocation *location); + +GAFlightDescriptor * +gaflight_descriptor_new_raw( + const arrow::flight::FlightDescriptor *flight_descriptor); +arrow::flight::FlightDescriptor * +gaflight_descriptor_get_raw(GAFlightDescriptor *descriptor); + +GAFlightTicket * +gaflight_ticket_new_raw(const arrow::flight::Ticket *flight_ticket); +arrow::flight::Ticket * +gaflight_ticket_get_raw(GAFlightTicket *ticket); + +GAFlightEndpoint * +gaflight_endpoint_new_raw(const arrow::flight::FlightEndpoint *flight_endpoint, + GAFlightTicket *ticket); +arrow::flight::FlightEndpoint * +gaflight_endpoint_get_raw(GAFlightEndpoint *endpoint); + +GAFlightInfo * +gaflight_info_new_raw(arrow::flight::FlightInfo *flight_info); +arrow::flight::FlightInfo * +gaflight_info_get_raw(GAFlightInfo *info); + +GAFlightStreamChunk * +gaflight_stream_chunk_new_raw(arrow::flight::FlightStreamChunk *flight_chunk); +arrow::flight::FlightStreamChunk * +gaflight_stream_chunk_get_raw(GAFlightStreamChunk 
*chunk); + +arrow::flight::MetadataRecordBatchReader * +gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader); diff --git a/c_glib/arrow-flight-glib/meson.build b/c_glib/arrow-flight-glib/meson.build new file mode 100644 index 00000000000..c17415fee3d --- /dev/null +++ b/c_glib/arrow-flight-glib/meson.build @@ -0,0 +1,82 @@ +# -*- indent-tabs-mode: nil -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +sources = files( + 'client.cpp', + 'common.cpp', + 'server.cpp', +) + +c_headers = files( + 'arrow-flight-glib.h', + 'client.h', + 'common.h', + 'server.h', +) + +cpp_headers = files( + 'arrow-flight-glib.hpp', + 'client.hpp', + 'common.hpp', + 'server.hpp', +) + +headers = c_headers + cpp_headers +install_headers(headers, subdir: 'arrow-flight-glib') + +dependencies = [ + arrow_flight, + arrow_glib, +] +libarrow_flight_glib = library('arrow-flight-glib', + sources: sources, + install: true, + dependencies: dependencies, + include_directories: base_include_directories, + soversion: so_version, + version: library_version) +arrow_flight_glib = declare_dependency(link_with: libarrow_flight_glib, + include_directories: base_include_directories, + dependencies: dependencies) + +pkgconfig.generate(libarrow_flight_glib, + filebase: 'arrow-flight-glib', + name: 'Apache Arrow Flight GLib', + description: 'C API for Apache Arrow Flight based on GLib', + version: version, + requires: ['arrow-glib', 'arrow-flight']) + +if have_gi + gnome.generate_gir(libarrow_flight_glib, + dependencies: declare_dependency(sources: arrow_glib_gir), + sources: sources + c_headers, + namespace: 'ArrowFlight', + nsversion: api_version, + identifier_prefix: 'GAFlight', + symbol_prefix: 'gaflight', + export_packages: 'arrow-flight-glib', + includes: [ + 'Arrow-1.0', + ], + install: true, + extra_args: [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', + ]) +endif diff --git a/c_glib/arrow-flight-glib/server.cpp b/c_glib/arrow-flight-glib/server.cpp new file mode 100644 index 00000000000..e283b6d2688 --- /dev/null +++ b/c_glib/arrow-flight-glib/server.cpp @@ -0,0 +1,724 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include + +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: server + * @section_id: server + * @title: Server related classes + * @include: arrow-flight-glib/arrow-flight-glib.h + * + * #GAFlightDataStream is a class for producing a sequence of IPC + * payloads to be sent in `FlightData` protobuf messages. Generally, + * this is not used directly. Generally, #GAFlightRecordBatchStream is + * used instead. + * + * #GAFlightRecordBatchStream is a class for producing a sequence of + * IPC payloads to be sent in `FlightData` protobuf messages by + * #GArrowRecordBatchReader. + * + * #GAFlightServerOptions is a class for options of each server. + * + * #GAFlightServerCallContext is a class for context of each server call. + * + * #GAFlightServer is a class to develop an Apache Arrow Flight server.
+ * + * Since: 5.0.0 + */ + + +typedef struct GAFlightDataStreamPrivate_ { + arrow::flight::FlightDataStream *stream; +} GAFlightDataStreamPrivate; + +enum { + PROP_STREAM = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightDataStream, + gaflight_data_stream, + G_TYPE_OBJECT) + +#define GAFLIGHT_DATA_STREAM_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_data_stream_get_instance_private( \ + GAFLIGHT_DATA_STREAM(obj))) + +static void +gaflight_data_stream_finalize(GObject *object) +{ + auto priv = GAFLIGHT_DATA_STREAM_GET_PRIVATE(object); + + delete priv->stream; + + G_OBJECT_CLASS(gaflight_data_stream_parent_class)->finalize(object); +} + +static void +gaflight_data_stream_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DATA_STREAM_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_STREAM: + priv->stream = static_cast( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_data_stream_init(GAFlightDataStream *object) +{ +} + +static void +gaflight_data_stream_class_init(GAFlightDataStreamClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_data_stream_finalize; + gobject_class->set_property = gaflight_data_stream_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("stream", + "Stream", + "The raw arrow::flight::FlightDataStream *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_STREAM, spec); +} + + +typedef struct GAFlightRecordBatchStreamPrivate_ { + GArrowRecordBatchReader *reader; +} GAFlightRecordBatchStreamPrivate; + +enum { + PROP_READER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightRecordBatchStream, + gaflight_record_batch_stream, + GAFLIGHT_TYPE_DATA_STREAM) + +#define GAFLIGHT_RECORD_BATCH_STREAM_GET_PRIVATE(obj) \ + static_cast( \ + 
gaflight_record_batch_stream_get_instance_private( \ + GAFLIGHT_RECORD_BATCH_STREAM(obj))) + +static void +gaflight_record_batch_stream_dispose(GObject *object) +{ + auto priv = GAFLIGHT_RECORD_BATCH_STREAM_GET_PRIVATE(object); + + if (priv->reader) { + g_object_unref(priv->reader); + priv->reader = NULL; + } + + G_OBJECT_CLASS(gaflight_record_batch_stream_parent_class)->dispose(object); +} + +static void +gaflight_record_batch_stream_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_RECORD_BATCH_STREAM_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_READER: + priv->reader = GARROW_RECORD_BATCH_READER(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_record_batch_stream_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_RECORD_BATCH_STREAM_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_READER: + g_value_set_object(value, priv->reader); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_record_batch_stream_init(GAFlightRecordBatchStream *object) +{ +} + +static void +gaflight_record_batch_stream_class_init(GAFlightRecordBatchStreamClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gaflight_record_batch_stream_dispose; + gobject_class->set_property = gaflight_record_batch_stream_set_property; + gobject_class->get_property = gaflight_record_batch_stream_get_property; + + GParamSpec *spec; + /** + * GAFlightRecordBatchStream:reader: + * + * The reader that produces record batches. 
+ * + * Since: 6.0.0 + */ + spec = g_param_spec_object("reader", + "Reader", + "The reader that produces record batches", + GARROW_TYPE_RECORD_BATCH_READER, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_READER, spec); +} + +/** + * gaflight_record_batch_stream_new: + * @reader: A #GArrowRecordBatchReader to be read. + * @options: (nullable): A #GArrowWriteOptions for writing record batches to + * a client. + * + * Returns: The newly created #GAFlightRecordBatchStream. + * + * Since: 6.0.0 + */ +GAFlightRecordBatchStream * +gaflight_record_batch_stream_new(GArrowRecordBatchReader *reader, + GArrowWriteOptions *options) +{ + auto arrow_reader = garrow_record_batch_reader_get_raw(reader); + auto arrow_options_default = arrow::ipc::IpcWriteOptions::Defaults(); + arrow::ipc::IpcWriteOptions *arrow_options = NULL; + if (options) { + arrow_options = garrow_write_options_get_raw(options); + } else { + arrow_options = &arrow_options_default; + } + auto stream = arrow::internal::make_unique< + arrow::flight::RecordBatchStream>(arrow_reader, *arrow_options); + return static_cast( + g_object_new(GAFLIGHT_TYPE_RECORD_BATCH_STREAM, + "stream", stream.release(), + "reader", reader, + NULL)); +} + + +typedef struct GAFlightServerOptionsPrivate_ { + arrow::flight::FlightServerOptions options; + GAFlightLocation *location; +} GAFlightServerOptionsPrivate; + +enum { + PROP_LOCATION = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightServerOptions, + gaflight_server_options, + G_TYPE_OBJECT) + +#define GAFLIGHT_SERVER_OPTIONS_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_server_options_get_instance_private( \ + GAFLIGHT_SERVER_OPTIONS(obj))) + +static void +gaflight_server_options_dispose(GObject *object) +{ + auto priv = GAFLIGHT_SERVER_OPTIONS_GET_PRIVATE(object); + + if (priv->location) { + g_object_unref(priv->location); + priv->location = NULL; + } + + 
G_OBJECT_CLASS(gaflight_server_options_parent_class)->dispose(object); +} + +static void +gaflight_server_options_finalize(GObject *object) +{ + auto priv = GAFLIGHT_SERVER_OPTIONS_GET_PRIVATE(object); + + priv->options.~FlightServerOptions(); + + G_OBJECT_CLASS(gaflight_server_options_parent_class)->finalize(object); +} + +static void +gaflight_server_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_SERVER_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_LOCATION: + { + priv->location = GAFLIGHT_LOCATION(g_value_dup_object(value)); + auto flight_location = gaflight_location_get_raw(priv->location); + new(&(priv->options)) arrow::flight::FlightServerOptions(*flight_location); + } + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_server_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_SERVER_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_LOCATION: + g_value_set_object(value, priv->location); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_server_options_init(GAFlightServerOptions *object) +{ +} + +static void +gaflight_server_options_class_init(GAFlightServerOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gaflight_server_options_dispose; + gobject_class->finalize = gaflight_server_options_finalize; + gobject_class->set_property = gaflight_server_options_set_property; + gobject_class->get_property = gaflight_server_options_get_property; + + GParamSpec *spec; + spec = g_param_spec_object("location", + "Location", + "The location to be listened", + GAFLIGHT_TYPE_LOCATION, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_LOCATION, 
spec); +} + +/** + * gaflight_server_options_new: + * @location: A #GAFlightLocation to be listened. + * + * Returns: The newly created options for a server. + * + * Since: 5.0.0 + */ +GAFlightServerOptions * +gaflight_server_options_new(GAFlightLocation *location) +{ + return static_cast( + g_object_new(GAFLIGHT_TYPE_SERVER_OPTIONS, + "location", location, + NULL)); +} + + +typedef struct GAFlightServerCallContextPrivate_ { + arrow::flight::ServerCallContext *call_context; +} GAFlightServerCallContextPrivate; + +enum { + PROP_CALL_CONTEXT = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightServerCallContext, + gaflight_server_call_context, + G_TYPE_OBJECT) + +#define GAFLIGHT_SERVER_CALL_CONTEXT_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_server_call_context_get_instance_private( \ + GAFLIGHT_SERVER_CALL_CONTEXT(obj))) + +static void +gaflight_server_call_context_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_SERVER_CALL_CONTEXT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CALL_CONTEXT: + priv->call_context = + static_cast( + g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_server_call_context_init(GAFlightServerCallContext *object) +{ +} + +static void +gaflight_server_call_context_class_init(GAFlightServerCallContextClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->set_property = gaflight_server_call_context_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("call-context", + "Call context", + "The raw arrow::flight::ServerCallContext", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CALL_CONTEXT, spec); +} + + +G_END_DECLS +namespace gaflight { + class DataStream : public arrow::flight::FlightDataStream { + public: + DataStream(GAFlightDataStream *gastream) : + 
arrow::flight::FlightDataStream(), + gastream_(gastream) { + } + + ~DataStream() override { + g_object_unref(gastream_); + } + + std::shared_ptr schema() override { + auto stream = gaflight_data_stream_get_raw(gastream_); + return stream->schema(); + } + + arrow::Status GetSchemaPayload( + arrow::flight::FlightPayload *payload) override { + auto stream = gaflight_data_stream_get_raw(gastream_); + return stream->GetSchemaPayload(payload); + } + + arrow::Status Next(arrow::flight::FlightPayload *payload) override { + auto stream = gaflight_data_stream_get_raw(gastream_); + return stream->Next(payload); + } + + private: + GAFlightDataStream *gastream_; + }; + + class Server : public arrow::flight::FlightServerBase { + public: + Server(GAFlightServer *gaserver) : gaserver_(gaserver) { + } + + arrow::Status + ListFlights( + const arrow::flight::ServerCallContext &context, + const arrow::flight::Criteria *criteria, + std::unique_ptr *listing) override { + auto gacontext = gaflight_server_call_context_new_raw(&context); + GAFlightCriteria *gacriteria = NULL; + if (criteria) { + gacriteria = gaflight_criteria_new_raw(criteria); + } + GError *gerror = NULL; + auto gaflights = gaflight_server_list_flights(gaserver_, + gacontext, + gacriteria, + &gerror); + if (gacriteria) { + g_object_unref(gacriteria); + } + g_object_unref(gacontext); + if (gerror) { + return garrow_error_to_status(gerror, + arrow::StatusCode::UnknownError, + "[flight-server][list-flights]"); + } + std::vector flights; + for (auto node = gaflights; node; node = node->next) { + auto gaflight = GAFLIGHT_INFO(node->data); + flights.push_back(*gaflight_info_get_raw(gaflight)); + g_object_unref(gaflight); + } + g_list_free(gaflights); + *listing = arrow::internal::make_unique< + arrow::flight::SimpleFlightListing>(flights); + return arrow::Status::OK(); + } + + arrow::Status DoGet( + const arrow::flight::ServerCallContext &context, + const arrow::flight::Ticket &ticket, + std::unique_ptr *stream) override { + 
auto gacontext = gaflight_server_call_context_new_raw(&context); + auto gaticket = gaflight_ticket_new_raw(&ticket); + GError *gerror = NULL; + auto gastream = gaflight_server_do_get(gaserver_, + gacontext, + gaticket, + &gerror); + g_object_unref(gaticket); + g_object_unref(gacontext); + if (gerror) { + return garrow_error_to_status(gerror, + arrow::StatusCode::UnknownError, + "[flight-server][do-get]"); + } + *stream = arrow::internal::make_unique(gastream); + return arrow::Status::OK(); + } + + private: + GAFlightServer *gaserver_; + }; +}; +G_BEGIN_DECLS + +typedef struct GAFlightServerPrivate_ { + gaflight::Server server; +} GAFlightServerPrivate; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightServer, + gaflight_server, + G_TYPE_OBJECT) + +#define GAFLIGHT_SERVER_GET_PRIVATE(obj) \ + static_cast( \ + gaflight_server_get_instance_private( \ + GAFLIGHT_SERVER(obj))) + +static void +gaflight_server_finalize(GObject *object) +{ + auto priv = GAFLIGHT_SERVER_GET_PRIVATE(object); + + priv->server.~Server(); + + G_OBJECT_CLASS(gaflight_server_parent_class)->finalize(object); +} + +static void +gaflight_server_init(GAFlightServer *object) +{ + auto priv = GAFLIGHT_SERVER_GET_PRIVATE(object); + new(&(priv->server)) gaflight::Server(object); +} + +static void +gaflight_server_class_init(GAFlightServerClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_server_finalize; +} + +/** + * gaflight_server_listen: + * @server: A #GAFlightServer. + * @options: A #GAFlightServerOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 5.0.0 + */ +gboolean +gaflight_server_listen(GAFlightServer *server, + GAFlightServerOptions *options, + GError **error) +{ + auto flight_server = gaflight_server_get_raw(server); + const auto flight_options = gaflight_server_options_get_raw(options); + return garrow::check(error, + flight_server->Init(*flight_options), + "[flight-server][listen]"); +} + +/** + * gaflight_server_get_port: + * @server: A #GAFlightServer. + * + * Returns: The port number the server is listening on. + * + * Since: 5.0.0 + */ +gint +gaflight_server_get_port(GAFlightServer *server) +{ + const auto flight_server = gaflight_server_get_raw(server); + return flight_server->port(); +} + +/** + * gaflight_server_shutdown: + * @server: A #GAFlightServer. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Shuts down the server. This function can be called from a signal + * handler or another thread. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 5.0.0 + */ +gboolean +gaflight_server_shutdown(GAFlightServer *server, + GError **error) +{ + auto flight_server = gaflight_server_get_raw(server); + return garrow::check(error, + flight_server->Shutdown(), + "[flight-server][shutdown]"); +} + +/** + * gaflight_server_list_flights: + * @server: A #GAFlightServer. + * @context: A #GAFlightServerCallContext. + * @criteria: (nullable): A #GAFlightCriteria. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (element-type GAFlightInfo) (transfer full): + * #GList of #GAFlightInfo on success, %NULL on error.
+ * + * Since: 5.0.0 + */ +GList * +gaflight_server_list_flights(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightCriteria *criteria, + GError **error) +{ + auto klass = GAFLIGHT_SERVER_GET_CLASS(server); + if (!(klass && klass->list_flights)) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_NOT_IMPLEMENTED, + "not implemented"); + return NULL; + } + return (*(klass->list_flights))(server, context, criteria, error); +} + +/** + * gaflight_server_do_get: + * @server: A #GAFlightServer. + * @context: A #GAFlightServerCallContext. + * @ticket: A #GAFlightTicket. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): #GAFlightDataStream on success, %NULL on error. + * + * Since: 6.0.0 + */ +GAFlightDataStream * +gaflight_server_do_get(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightTicket *ticket, + GError **error) +{ + auto klass = GAFLIGHT_SERVER_GET_CLASS(server); + if (!(klass && klass->do_get)) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_NOT_IMPLEMENTED, + "not implemented"); + return NULL; + } + return (*(klass->do_get))(server, context, ticket, error); +} + + +G_END_DECLS + + +arrow::flight::FlightDataStream * +gaflight_data_stream_get_raw(GAFlightDataStream *stream) +{ + auto priv = GAFLIGHT_DATA_STREAM_GET_PRIVATE(stream); + return priv->stream; +} + +arrow::flight::FlightServerOptions * +gaflight_server_options_get_raw(GAFlightServerOptions *options) +{ + auto priv = GAFLIGHT_SERVER_OPTIONS_GET_PRIVATE(options); + return &(priv->options); +} + +GAFlightServerCallContext * +gaflight_server_call_context_new_raw( + const arrow::flight::ServerCallContext *call_context) +{ + return GAFLIGHT_SERVER_CALL_CONTEXT( + g_object_new(GAFLIGHT_TYPE_SERVER_CALL_CONTEXT, + "call-context", call_context, + NULL)); +} + +arrow::flight::FlightServerBase * +gaflight_server_get_raw(GAFlightServer *server) +{ + auto priv = GAFLIGHT_SERVER_GET_PRIVATE(server); + return 
&(priv->server); +} diff --git a/c_glib/arrow-flight-glib/server.h b/c_glib/arrow-flight-glib/server.h new file mode 100644 index 00000000000..107fe44bf77 --- /dev/null +++ b/c_glib/arrow-flight-glib/server.h @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +G_BEGIN_DECLS + + +#define GAFLIGHT_TYPE_DATA_STREAM \ + (gaflight_data_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightDataStream, + gaflight_data_stream, + GAFLIGHT, + DATA_STREAM, + GObject) +struct _GAFlightDataStreamClass +{ + GObjectClass parent_class; +}; + + +#define GAFLIGHT_TYPE_RECORD_BATCH_STREAM \ + (gaflight_record_batch_stream_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightRecordBatchStream, + gaflight_record_batch_stream, + GAFLIGHT, + RECORD_BATCH_STREAM, + GAFlightDataStream) +struct _GAFlightRecordBatchStreamClass +{ + GAFlightDataStreamClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +GAFlightRecordBatchStream * +gaflight_record_batch_stream_new(GArrowRecordBatchReader *reader, + GArrowWriteOptions *options); + + +#define GAFLIGHT_TYPE_SERVER_OPTIONS (gaflight_server_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightServerOptions, + gaflight_server_options, + GAFLIGHT, + SERVER_OPTIONS, + GObject) +struct _GAFlightServerOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GAFlightServerOptions * +gaflight_server_options_new(GAFlightLocation *location); + + +#define GAFLIGHT_TYPE_SERVER_CALL_CONTEXT \ + (gaflight_server_call_context_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightServerCallContext, + gaflight_server_call_context, + GAFLIGHT, + SERVER_CALL_CONTEXT, + GObject) +struct _GAFlightServerCallContextClass +{ + GObjectClass parent_class; +}; + + +#define GAFLIGHT_TYPE_SERVER (gaflight_server_get_type()) +G_DECLARE_DERIVABLE_TYPE(GAFlightServer, + gaflight_server, + GAFLIGHT, + SERVER, + GObject) +/** + * GAFlightServerClass: + * @list_flights: A virtual function to implement `ListFlights` API. + * @do_get: A virtual function to implement `DoGet` API. 
+ * + * Since: 5.0.0 + */ +struct _GAFlightServerClass +{ + GObjectClass parent_class; + + GList *(*list_flights)(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightCriteria *criteria, + GError **error); + GAFlightDataStream *(*do_get)(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightTicket *ticket, + GError **error); +}; + +GARROW_AVAILABLE_IN_5_0 +gboolean +gaflight_server_listen(GAFlightServer *server, + GAFlightServerOptions *options, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gint +gaflight_server_get_port(GAFlightServer *server); +GARROW_AVAILABLE_IN_5_0 +gboolean +gaflight_server_shutdown(GAFlightServer *server, + GError **error); +GARROW_AVAILABLE_IN_5_0 +gboolean +gaflight_server_wait(GAFlightServer *server, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +GList * +gaflight_server_list_flights(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightCriteria *criteria, + GError **error); +GARROW_AVAILABLE_IN_6_0 +GAFlightDataStream * +gaflight_server_do_get(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightTicket *ticket, + GError **error); + +G_END_DECLS diff --git a/c_glib/arrow-flight-glib/server.hpp b/c_glib/arrow-flight-glib/server.hpp new file mode 100644 index 00000000000..f7f2a7aba1b --- /dev/null +++ b/c_glib/arrow-flight-glib/server.hpp @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + + +arrow::flight::FlightDataStream * +gaflight_data_stream_get_raw(GAFlightDataStream *stream); + +arrow::flight::FlightServerOptions * +gaflight_server_options_get_raw(GAFlightServerOptions *options); + +GAFlightServerCallContext * +gaflight_server_call_context_new_raw( + const arrow::flight::ServerCallContext *flight_context); + +arrow::flight::FlightServerBase * +gaflight_server_get_raw(GAFlightServer *server); diff --git a/c_glib/arrow-glib/array-builder.cpp b/c_glib/arrow-glib/array-builder.cpp index c9ac8f5755c..c5ae035a7bb 100644 --- a/c_glib/arrow-glib/array-builder.cpp +++ b/c_glib/arrow-glib/array-builder.cpp @@ -6142,9 +6142,9 @@ garrow_array_builder_new_raw(arrow::ArrayBuilder *arrow_builder, break; case arrow::Type::type::DICTIONARY: { - const auto& dict_type = - arrow::internal::checked_cast(*arrow_builder->type()); - switch (dict_type.value_type()->id()) { + auto dict_type = + std::static_pointer_cast(arrow_builder->type()); + switch (dict_type->value_type()->id()) { case arrow::Type::type::BINARY: type = GARROW_TYPE_BINARY_DICTIONARY_ARRAY_BUILDER; break; diff --git a/c_glib/arrow-glib/arrow-glib.h b/c_glib/arrow-glib/arrow-glib.h index 74d9f9209ed..e25044ec9f0 100644 --- a/c_glib/arrow-glib/arrow-glib.h +++ b/c_glib/arrow-glib/arrow-glib.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/c_glib/arrow-glib/arrow-glib.hpp b/c_glib/arrow-glib/arrow-glib.hpp index 4382328f1bd..6dc6d43f2f9 100644 --- a/c_glib/arrow-glib/arrow-glib.hpp +++ 
b/c_glib/arrow-glib/arrow-glib.hpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp index f2a924ee45c..1eb65b88964 100644 --- a/c_glib/arrow-glib/basic-array.cpp +++ b/c_glib/arrow-glib/basic-array.cpp @@ -167,6 +167,178 @@ G_BEGIN_DECLS * extension types. */ +typedef struct GArrowEqualOptionsPrivate_ { + gboolean approx; + arrow::EqualOptions options; +} GArrowEqualOptionsPrivate; + +enum { + PROP_APPROX = 1, + PROP_NANS_EQUAL, + PROP_ABSOLUTE_TOLERANCE, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowEqualOptions, + garrow_equal_options, + G_TYPE_OBJECT) + +#define GARROW_EQUAL_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_equal_options_get_instance_private( \ + GARROW_EQUAL_OPTIONS(object))) + +static void +garrow_equal_options_finalize(GObject *object) +{ + auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(object); + priv->options.~EqualOptions(); + G_OBJECT_CLASS(garrow_equal_options_parent_class)->finalize(object); +} + +static void +garrow_equal_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_APPROX: + priv->approx = g_value_get_boolean(value); + break; + case PROP_NANS_EQUAL: + priv->options = priv->options.nans_equal(g_value_get_boolean(value)); + break; + case PROP_ABSOLUTE_TOLERANCE: + priv->options = priv->options.atol(g_value_get_double(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_equal_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_APPROX: + g_value_set_boolean(value, priv->approx); + break; + case PROP_NANS_EQUAL: + g_value_set_boolean(value, priv->options.nans_equal()); + 
break; + case PROP_ABSOLUTE_TOLERANCE: + g_value_set_double(value, priv->options.atol()); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_equal_options_init(GArrowEqualOptions *object) +{ + auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(object); + priv->approx = FALSE; + new(&priv->options) arrow::EqualOptions; + priv->options = arrow::EqualOptions::Defaults(); +} + +static void +garrow_equal_options_class_init(GArrowEqualOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->finalize = garrow_equal_options_finalize; + gobject_class->set_property = garrow_equal_options_set_property; + gobject_class->get_property = garrow_equal_options_get_property; + + auto options = arrow::EqualOptions::Defaults(); + GParamSpec *spec; + /** + * GArrowEqualOptions:approx: + * + * Whether or not approximate comparison is used. + * + * Since: 5.0.0 + */ + spec = g_param_spec_boolean("approx", + "Approx", + "Whether or not approximate comparison is used", + FALSE, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_APPROX, spec); + + /** + * GArrowEqualOptions:nans-equal: + * + * Whether or not NaNs are considered equal. + * + * Since: 5.0.0 + */ + spec = g_param_spec_boolean("nans-equal", + "NaNs equal", + "Whether or not NaNs are considered equal", + options.nans_equal(), + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_NANS_EQUAL, spec); + + /** + * GArrowEqualOptions:absolute-tolerance: + * + * The absolute tolerance for approximate comparison of + * floating-point values. 
+ * + * Since: 5.0.0 + */ + spec = g_param_spec_double("absolute-tolerance", + "Absolute tolerance", + "The absolute tolerance for approximate comparison " + "of floating-point values", + -G_MAXDOUBLE, + G_MAXDOUBLE, + options.atol(), + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_ABSOLUTE_TOLERANCE, spec); +} + +/** + * garrow_equal_options_new: + * + * Returns: A newly created #GArrowEqualOptions. + * + * Since: 5.0.0 + */ +GArrowEqualOptions * +garrow_equal_options_new(void) +{ + auto equal_options = g_object_new(GARROW_TYPE_EQUAL_OPTIONS, NULL); + return GARROW_EQUAL_OPTIONS(equal_options); +} + +/** + * garrow_equal_options_is_approx: + * @options: A #GArrowEqualOptions. + * + * Returns: %TRUE if approximate comparison is used, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +garrow_equal_options_is_approx(GArrowEqualOptions *options) +{ + auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(options); + return priv->approx; +} + + typedef struct GArrowArrayPrivate_ { std::shared_ptr array; GArrowDataType *value_data_type; @@ -396,10 +568,39 @@ garrow_array_class_init(GArrowArrayClass *klass) */ gboolean garrow_array_equal(GArrowArray *array, GArrowArray *other_array) +{ + return garrow_array_equal_options(array, other_array, NULL); +} + +/** + * garrow_array_equal_options: + * @array: A #GArrowArray. + * @other_array: A #GArrowArray to be compared. + * @options: (nullable): A #GArrowEqualOptions to custom how to compare. + * + * Returns: %TRUE if both of them have the same data, %FALSE + * otherwise. 
+ * + * Since: 5.0.0 + */ +gboolean +garrow_array_equal_options(GArrowArray *array, + GArrowArray *other_array, + GArrowEqualOptions *options) { const auto arrow_array = garrow_array_get_raw(array); const auto arrow_other_array = garrow_array_get_raw(other_array); - return arrow_array->Equals(arrow_other_array); + if (options) { + auto is_approx = garrow_equal_options_is_approx(options); + const auto arrow_options = garrow_equal_options_get_raw(options); + if (is_approx) { + return arrow_array->ApproxEquals(arrow_other_array, *arrow_options); + } else { + return arrow_array->Equals(arrow_other_array, *arrow_options); + } + } else { + return arrow_array->Equals(arrow_other_array); + } } /** @@ -429,6 +630,7 @@ garrow_array_equal_approx(GArrowArray *array, GArrowArray *other_array) * @end_index: The end index of @array to be used. The end index of * @other_array is "@other_start_index + (@end_index - * @start_index)". + * @options: (nullable): A #GArrowEqualOptions to custom how to compare. * * Returns: %TRUE if both of them have the same data in the range, * %FALSE otherwise. 
@@ -440,14 +642,24 @@ garrow_array_equal_range(GArrowArray *array, gint64 start_index, GArrowArray *other_array, gint64 other_start_index, - gint64 end_index) + gint64 end_index, + GArrowEqualOptions *options) { const auto arrow_array = garrow_array_get_raw(array); const auto arrow_other_array = garrow_array_get_raw(other_array); - return arrow_array->RangeEquals(*arrow_other_array, - start_index, - end_index, - other_start_index); + if (options) { + const auto arrow_options = garrow_equal_options_get_raw(options); + return arrow_array->RangeEquals(arrow_other_array, + start_index, + end_index, + other_start_index, + *arrow_options); + } else { + return arrow_array->RangeEquals(arrow_other_array, + start_index, + end_index, + other_start_index); + } } /** @@ -2848,6 +3060,13 @@ garrow_extension_array_get_storage(GArrowExtensionArray *array) G_END_DECLS +arrow::EqualOptions * +garrow_equal_options_get_raw(GArrowEqualOptions *equal_options) +{ + auto priv = GARROW_EQUAL_OPTIONS_GET_PRIVATE(equal_options); + return &(priv->options); +} + GArrowArray * garrow_array_new_raw(std::shared_ptr *arrow_array) { diff --git a/c_glib/arrow-glib/basic-array.h b/c_glib/arrow-glib/basic-array.h index 9835db5e67a..b4b3de15217 100644 --- a/c_glib/arrow-glib/basic-array.h +++ b/c_glib/arrow-glib/basic-array.h @@ -24,6 +24,25 @@ G_BEGIN_DECLS +#define GARROW_TYPE_EQUAL_OPTIONS (garrow_equal_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowEqualOptions, + garrow_equal_options, + GARROW, + EQUAL_OPTIONS, + GObject) +struct _GArrowEqualOptionsClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowEqualOptions * +garrow_equal_options_new(void); +GARROW_AVAILABLE_IN_5_0 +gboolean +garrow_equal_options_is_approx(GArrowEqualOptions *options); + + #define GARROW_TYPE_ARRAY (garrow_array_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowArray, garrow_array, @@ -37,13 +56,18 @@ struct _GArrowArrayClass gboolean garrow_array_equal (GArrowArray *array, GArrowArray *other_array); 
+GARROW_AVAILABLE_IN_5_0 +gboolean garrow_array_equal_options(GArrowArray *array, + GArrowArray *other_array, + GArrowEqualOptions *options); gboolean garrow_array_equal_approx(GArrowArray *array, GArrowArray *other_array); gboolean garrow_array_equal_range (GArrowArray *array, gint64 start_index, GArrowArray *other_array, gint64 other_start_index, - gint64 end_index); + gint64 end_index, + GArrowEqualOptions *options); gboolean garrow_array_is_null (GArrowArray *array, gint64 i); diff --git a/c_glib/arrow-glib/basic-array.hpp b/c_glib/arrow-glib/basic-array.hpp index effebb01a6f..3ef1c196976 100644 --- a/c_glib/arrow-glib/basic-array.hpp +++ b/c_glib/arrow-glib/basic-array.hpp @@ -23,6 +23,9 @@ #include +arrow::EqualOptions * +garrow_equal_options_get_raw(GArrowEqualOptions *equal_options); + GArrowArray * garrow_array_new_raw(std::shared_ptr *arrow_array); GArrowArray * diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index 1f2082712da..d7e3ca85f38 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1925,6 +1925,9 @@ garrow_data_type_new_raw(std::shared_ptr *arrow_data_type) case arrow::Type::type::DICTIONARY: type = GARROW_TYPE_DICTIONARY_DATA_TYPE; break; + case arrow::Type::type::MAP: + type = GARROW_TYPE_MAP_DATA_TYPE; + break; case arrow::Type::type::DECIMAL128: type = GARROW_TYPE_DECIMAL128_DATA_TYPE; break; diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 275e406be79..e845b1d80cc 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -52,27 +52,6 @@ garrow_numeric_array_sum(GArrowArrayType array, } } -template -GArrowBooleanArray * -garrow_numeric_array_compare(GArrowArrayType array, - VALUE value, - GArrowCompareOptions *options, - GError **error, - const gchar *tag) -{ - auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); - auto arrow_options = garrow_compare_options_get_raw(options); - auto 
arrow_compared_datum = arrow::compute::Compare(arrow_array, - arrow::Datum(value), - *arrow_options); - if (garrow::check(error, arrow_compared_datum, tag)) { - auto arrow_compared_array = (*arrow_compared_datum).make_array(); - return GARROW_BOOLEAN_ARRAY(garrow_array_new_raw(&arrow_compared_array)); - } else { - return NULL; - } -} - template auto garrow_take(arrow::Datum arrow_values, @@ -130,8 +109,9 @@ G_BEGIN_DECLS * #GArrowCastOptions is a class to customize the `cast` function and * garrow_array_cast(). * - * #GArrowCountOptions is a class to customize the `count` function and - * garrow_array_count(). + * #GArrowScalarAggregateOptions is a class to customize the scalar + * aggregate functions such as `count` function and convenient + * functions of them such as garrow_array_count(). * * #GArrowFilterOptions is a class to customize the `filter` function and * garrow_array_filter() family. @@ -139,9 +119,6 @@ G_BEGIN_DECLS * #GArrowTakeOptions is a class to customize the `take` function and * garrow_array_take() family. * - * #GArrowCompareOptions is a class to customize the `equal` function - * family and garrow_int8_array_compare() family. - * * #GArrowArraySortOptions is a class to customize the * `array_sort_indices` function. * @@ -525,7 +502,7 @@ garrow_cast_options_class_init(GArrowCastOptionsClass *klass) /** * GArrowCastOptions:to-data-type: * - * The GArrowDataType being casted to. + * The #GArrowDataType being casted to. 
* * Since: 1.0.0 */ @@ -636,60 +613,65 @@ garrow_cast_options_new(void) } -typedef struct GArrowCountOptionsPrivate_ { - arrow::compute::CountOptions options; -} GArrowCountOptionsPrivate; +typedef struct GArrowScalarAggregateOptionsPrivate_ { + arrow::compute::ScalarAggregateOptions options; +} GArrowScalarAggregateOptionsPrivate; enum { - PROP_MODE = 1, + PROP_SKIP_NULLS = 1, + PROP_MIN_COUNT, }; static arrow::compute::FunctionOptions * -garrow_count_options_get_raw_function_options(GArrowFunctionOptions *options) +garrow_scalar_aggregate_options_get_raw_function_options( + GArrowFunctionOptions *options) { - return garrow_count_options_get_raw(GARROW_COUNT_OPTIONS(options)); + return garrow_scalar_aggregate_options_get_raw( + GARROW_SCALAR_AGGREGATE_OPTIONS(options)); } static void -garrow_count_options_function_options_interface_init( +garrow_scalar_aggregate_options_function_options_interface_init( GArrowFunctionOptionsInterface *iface) { - iface->get_raw = garrow_count_options_get_raw_function_options; + iface->get_raw = garrow_scalar_aggregate_options_get_raw_function_options; } -G_DEFINE_TYPE_WITH_CODE(GArrowCountOptions, - garrow_count_options, +G_DEFINE_TYPE_WITH_CODE(GArrowScalarAggregateOptions, + garrow_scalar_aggregate_options, G_TYPE_OBJECT, - G_ADD_PRIVATE(GArrowCountOptions) + G_ADD_PRIVATE(GArrowScalarAggregateOptions) G_IMPLEMENT_INTERFACE( GARROW_TYPE_FUNCTION_OPTIONS, - garrow_count_options_function_options_interface_init)) + garrow_scalar_aggregate_options_function_options_interface_init)) -#define GARROW_COUNT_OPTIONS_GET_PRIVATE(object) \ - static_cast( \ - garrow_count_options_get_instance_private( \ - GARROW_COUNT_OPTIONS(object))) +#define GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + garrow_scalar_aggregate_options_get_instance_private( \ + GARROW_SCALAR_AGGREGATE_OPTIONS(object))) static void -garrow_count_options_finalize(GObject *object) +garrow_scalar_aggregate_options_finalize(GObject *object) { - auto 
priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); - priv->options.~CountOptions(); - G_OBJECT_CLASS(garrow_count_options_parent_class)->finalize(object); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object); + priv->options.~ScalarAggregateOptions(); + G_OBJECT_CLASS(garrow_scalar_aggregate_options_parent_class)->finalize(object); } static void -garrow_count_options_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) +garrow_scalar_aggregate_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_MODE: - priv->options.count_mode = - static_cast(g_value_get_enum(value)); + case PROP_SKIP_NULLS: + priv->options.skip_nulls = g_value_get_boolean(value); + break; + case PROP_MIN_COUNT: + priv->options.min_count = g_value_get_uint(value); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -698,16 +680,19 @@ garrow_count_options_set_property(GObject *object, } static void -garrow_count_options_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) +garrow_scalar_aggregate_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object); switch (prop_id) { - case PROP_MODE: - g_value_set_enum(value, priv->options.count_mode); + case PROP_SKIP_NULLS: + g_value_set_boolean(value, priv->options.skip_nulls); + break; + case PROP_MIN_COUNT: + g_value_set_uint(value, priv->options.min_count); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -716,51 +701,69 @@ garrow_count_options_get_property(GObject *object, } static void -garrow_count_options_init(GArrowCountOptions *object) 
+garrow_scalar_aggregate_options_init(GArrowScalarAggregateOptions *object) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(object); - new(&priv->options) arrow::compute::CountOptions( - arrow::compute::CountOptions::COUNT_NON_NULL); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(object); + new(&priv->options) arrow::compute::ScalarAggregateOptions(); } static void -garrow_count_options_class_init(GArrowCountOptionsClass *klass) +garrow_scalar_aggregate_options_class_init( + GArrowScalarAggregateOptionsClass *klass) { auto gobject_class = G_OBJECT_CLASS(klass); - gobject_class->finalize = garrow_count_options_finalize; - gobject_class->set_property = garrow_count_options_set_property; - gobject_class->get_property = garrow_count_options_get_property; + gobject_class->finalize = garrow_scalar_aggregate_options_finalize; + gobject_class->set_property = garrow_scalar_aggregate_options_set_property; + gobject_class->get_property = garrow_scalar_aggregate_options_get_property; + + auto options = arrow::compute::ScalarAggregateOptions::Defaults(); GParamSpec *spec; /** - * GArrowCountOptions:mode: + * GArrowScalarAggregateOptions:skip-nulls: * - * How to count values. + * Whether NULLs are skipped or not. * - * Since: 0.13.0 + * Since: 5.0.0 + */ + spec = g_param_spec_boolean("skip-nulls", + "Skip NULLs", + "Whether NULLs are skipped or not", + options.skip_nulls, + static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SKIP_NULLS, spec); + + /** + * GArrowScalarAggregateOptions:min-count: + * + * The minimum required number of values. 
+ * + * Since: 5.0.0 */ - spec = g_param_spec_enum("mode", - "Mode", - "How to count values", - GARROW_TYPE_COUNT_MODE, - GARROW_COUNT_ALL, + spec = g_param_spec_uint("min-count", + "Min count", + "The minimum required number of values", + 0, + G_MAXUINT, + options.min_count, static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_MODE, spec); + g_object_class_install_property(gobject_class, PROP_MIN_COUNT, spec); } /** - * garrow_count_options_new: + * garrow_scalar_aggregate_options_new: * - * Returns: A newly created #GArrowCountOptions. + * Returns: A newly created #GArrowScalarAggregateOptions. * - * Since: 0.13.0 + * Since: 5.0.0 */ -GArrowCountOptions * -garrow_count_options_new(void) +GArrowScalarAggregateOptions * +garrow_scalar_aggregate_options_new(void) { - auto count_options = g_object_new(GARROW_TYPE_COUNT_OPTIONS, NULL); - return GARROW_COUNT_OPTIONS(count_options); + auto scalar_aggregate_options = + g_object_new(GARROW_TYPE_SCALAR_AGGREGATE_OPTIONS, NULL); + return GARROW_SCALAR_AGGREGATE_OPTIONS(scalar_aggregate_options); } @@ -863,14 +866,14 @@ garrow_filter_options_class_init(GArrowFilterOptionsClass *klass) GParamSpec *spec; /** - * GArrowFilterOptions:null_selection_behavior: + * GArrowFilterOptions:null-selection-behavior: * * How to handle filtered values. 
* * Since: 0.17.0 */ - spec = g_param_spec_enum("null_selection_behavior", - "Null selection behavior", + spec = g_param_spec_enum("null-selection-behavior", + "NULL selection behavior", "How to handle filtered values", GARROW_TYPE_FILTER_NULL_SELECTION_BEHAVIOR, static_cast( @@ -962,133 +965,6 @@ garrow_take_options_new(void) } -typedef struct GArrowCompareOptionsPrivate_ { - arrow::compute::CompareOptions options; -} GArrowCompareOptionsPrivate; - -enum { - PROP_OPERATOR = 1, -}; - -static arrow::compute::FunctionOptions * -garrow_compare_options_get_raw_function_options(GArrowFunctionOptions *options) -{ - return garrow_compare_options_get_raw(GARROW_COMPARE_OPTIONS(options)); -} - -static void -garrow_compare_options_function_options_interface_init( - GArrowFunctionOptionsInterface *iface) -{ - iface->get_raw = garrow_compare_options_get_raw_function_options; -} - -G_DEFINE_TYPE_WITH_CODE(GArrowCompareOptions, - garrow_compare_options, - G_TYPE_OBJECT, - G_ADD_PRIVATE(GArrowCompareOptions) - G_IMPLEMENT_INTERFACE( - GARROW_TYPE_FUNCTION_OPTIONS, - garrow_compare_options_function_options_interface_init)) - -#define GARROW_COMPARE_OPTIONS_GET_PRIVATE(object) \ - static_cast( \ - garrow_compare_options_get_instance_private( \ - GARROW_COMPARE_OPTIONS(object))) - -static void -garrow_compare_options_finalize(GObject *object) -{ - auto priv = GARROW_COMPARE_OPTIONS_GET_PRIVATE(object); - priv->options.~CompareOptions(); - G_OBJECT_CLASS(garrow_compare_options_parent_class)->finalize(object); -} - -static void -garrow_compare_options_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GARROW_COMPARE_OPTIONS_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_OPERATOR: - priv->options.op = - static_cast(g_value_get_enum(value)); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -garrow_compare_options_get_property(GObject *object, - guint prop_id, 
- GValue *value, - GParamSpec *pspec) -{ - auto priv = GARROW_COMPARE_OPTIONS_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_OPERATOR: - g_value_set_enum(value, priv->options.op); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -garrow_compare_options_init(GArrowCompareOptions *object) -{ - auto priv = GARROW_COMPARE_OPTIONS_GET_PRIVATE(object); - new(&priv->options) arrow::compute::CompareOptions(arrow::compute::EQUAL); -} - -static void -garrow_compare_options_class_init(GArrowCompareOptionsClass *klass) -{ - auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = garrow_compare_options_finalize; - gobject_class->set_property = garrow_compare_options_set_property; - gobject_class->get_property = garrow_compare_options_get_property; - - GParamSpec *spec; - /** - * GArrowCompareOptions:operator: - * - * How to compare the value. - * - * Since: 0.14.0 - */ - spec = g_param_spec_enum("operator", - "Operator", - "How to compare the value", - GARROW_TYPE_COMPARE_OPERATOR, - 0, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, PROP_OPERATOR, spec); -} - -/** - * garrow_compare_options_new: - * - * Returns: A newly created #GArrowCompareOptions. - * - * Since: 0.14.0 - */ -GArrowCompareOptions * -garrow_compare_options_new(void) -{ - auto compare_options = g_object_new(GARROW_TYPE_COMPARE_OPTIONS, NULL); - return GARROW_COMPARE_OPTIONS(compare_options); -} - - typedef struct GArrowArraySortOptionsPrivate_ { arrow::compute::ArraySortOptions options; } GArrowArraySortOptionsPrivate; @@ -1682,7 +1558,7 @@ garrow_array_dictionary_encode(GArrowArray *array, /** * garrow_array_count: * @array: A #GArrowArray. - * @options: (nullable): A #GArrowCountOptions. + * @options: (nullable): A #GArrowScalarAggregateOptions. * @error: (nullable): Return location for a #GError or %NULL. * * Returns: The number of target values on success. 
If an error is occurred, @@ -1692,14 +1568,14 @@ garrow_array_dictionary_encode(GArrowArray *array, */ gint64 garrow_array_count(GArrowArray *array, - GArrowCountOptions *options, + GArrowScalarAggregateOptions *options, GError **error) { auto arrow_array = garrow_array_get_raw(array); auto arrow_array_raw = arrow_array.get(); arrow::Result arrow_counted_datum; if (options) { - auto arrow_options = garrow_count_options_get_raw(options); + auto arrow_options = garrow_scalar_aggregate_options_get_raw(options); arrow_counted_datum = arrow::compute::Count(*arrow_array_raw, *arrow_options); } else { @@ -2305,267 +2181,6 @@ garrow_record_batch_take(GArrowRecordBatch *record_batch, "[record-batch][take]"); } - -/** - * garrow_int8_array_compare: - * @array: A #GArrowInt8Array. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. - * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_int8_array_compare(GArrowInt8Array *array, - gint8 value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[int8-array][compare]"); -} - -/** - * garrow_uint8_array_compare: - * @array: A #GArrowUInt8Array. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. 
- * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_uint8_array_compare(GArrowUInt8Array *array, - guint8 value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[uint8-array][compare]"); -} - -/** - * garrow_int16_array_compare: - * @array: A #GArrowInt16Array. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. - * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_int16_array_compare(GArrowInt16Array *array, - gint16 value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[int16-array][compare]"); -} - -/** - * garrow_uint16_array_compare: - * @array: A #GArrowUInt16Array. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. - * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_uint16_array_compare(GArrowUInt16Array *array, - guint16 value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[uint16-array][compare]"); -} - -/** - * garrow_int32_array_compare: - * @array: A #GArrowUInt32Array. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. 
- * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_int32_array_compare(GArrowInt32Array *array, - gint32 value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[int32-array][compare]"); -} - -/** - * garrow_uint32_array_compare: - * @array: A #GArrowUInt32Array. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. - * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_uint32_array_compare(GArrowUInt32Array *array, - guint32 value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[uint32-array][compare]"); -} - -/** - * garrow_int64_array_compare: - * @array: A #GArrowInt64Array. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. - * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_int64_array_compare(GArrowInt64Array *array, - gint64 value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[int64-array][compare]"); -} - -/** - * garrow_uint64_array_compare: - * @array: A #GArrowUInt64Array. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. 
- * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_uint64_array_compare(GArrowUInt64Array *array, - guint64 value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[uint64-array][compare]"); -} - -/** - * garrow_float_array_compare: - * @array: A #GArrowFloatArray. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. - * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_float_array_compare(GArrowFloatArray *array, - gfloat value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[float-array][compare]"); -} - -/** - * garrow_double_array_compare: - * @array: A #GArrowDoubleArray. - * @value: The value to compare. - * @options: A #GArrowCompareOptions. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (nullable) (transfer full): The #GArrowBooleanArray as - * the result compared a numeric array with a scalar on success, - * %NULL on error. - * - * Since: 0.14.0 - */ -GArrowBooleanArray * -garrow_double_array_compare(GArrowDoubleArray *array, - gdouble value, - GArrowCompareOptions *options, - GError **error) -{ - return garrow_numeric_array_compare(array, - value, - options, - error, - "[double-array][compare]"); -} - /** * garrow_array_filter: * @array: A #GArrowArray. 
@@ -3059,20 +2674,23 @@ garrow_cast_options_get_raw(GArrowCastOptions *cast_options) return &(priv->options); } -GArrowCountOptions * -garrow_count_options_new_raw(arrow::compute::CountOptions *arrow_count_options) +GArrowScalarAggregateOptions * +garrow_scalar_aggregate_options_new_raw( + arrow::compute::ScalarAggregateOptions *arrow_scalar_aggregate_options) { - auto count_options = - g_object_new(GARROW_TYPE_COUNT_OPTIONS, - "mode", arrow_count_options->count_mode, + auto scalar_aggregate_options = + g_object_new(GARROW_TYPE_SCALAR_AGGREGATE_OPTIONS, + "skip-nulls", arrow_scalar_aggregate_options->skip_nulls, + "min-count", arrow_scalar_aggregate_options->min_count, NULL); - return GARROW_COUNT_OPTIONS(count_options); + return GARROW_SCALAR_AGGREGATE_OPTIONS(scalar_aggregate_options); } -arrow::compute::CountOptions * -garrow_count_options_get_raw(GArrowCountOptions *count_options) +arrow::compute::ScalarAggregateOptions * +garrow_scalar_aggregate_options_get_raw( + GArrowScalarAggregateOptions *scalar_aggregate_options) { - auto priv = GARROW_COUNT_OPTIONS_GET_PRIVATE(count_options); + auto priv = GARROW_SCALAR_AGGREGATE_OPTIONS_GET_PRIVATE(scalar_aggregate_options); return &(priv->options); } @@ -3090,13 +2708,6 @@ garrow_take_options_get_raw(GArrowTakeOptions *take_options) return &(priv->options); } -arrow::compute::CompareOptions * -garrow_compare_options_get_raw(GArrowCompareOptions *compare_options) -{ - auto priv = GARROW_COMPARE_OPTIONS_GET_PRIVATE(compare_options); - return &(priv->options); -} - arrow::compute::ArraySortOptions * garrow_array_sort_options_get_raw(GArrowArraySortOptions *array_sort_options) { diff --git a/c_glib/arrow-glib/compute.h b/c_glib/arrow-glib/compute.h index 63ba6e0eae5..1163983644c 100644 --- a/c_glib/arrow-glib/compute.h +++ b/c_glib/arrow-glib/compute.h @@ -83,32 +83,20 @@ struct _GArrowCastOptionsClass GArrowCastOptions *garrow_cast_options_new(void); -/** - * GArrowCountMode: - * @GARROW_COUNT_ALL: Count all non-null 
values. - * @GARROW_COUNT_NULL: Count all null values. - * - * They are corresponding to `arrow::compute::CountOptions::Mode` values. - */ -typedef enum { - GARROW_COUNT_ALL, - GARROW_COUNT_NULL, -} GArrowCountMode; - -#define GARROW_TYPE_COUNT_OPTIONS (garrow_count_options_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowCountOptions, - garrow_count_options, +#define GARROW_TYPE_SCALAR_AGGREGATE_OPTIONS (garrow_scalar_aggregate_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowScalarAggregateOptions, + garrow_scalar_aggregate_options, GARROW, - COUNT_OPTIONS, + SCALAR_AGGREGATE_OPTIONS, GObject) -struct _GArrowCountOptionsClass +struct _GArrowScalarAggregateOptionsClass { GObjectClass parent_class; }; -GARROW_AVAILABLE_IN_0_13 -GArrowCountOptions * -garrow_count_options_new(void); +GARROW_AVAILABLE_IN_5_0 +GArrowScalarAggregateOptions * +garrow_scalar_aggregate_options_new(void); /** @@ -158,42 +146,6 @@ GArrowTakeOptions * garrow_take_options_new(void); -/** - * GArrowCompareOperator: - * @GARROW_COMPARE_EQUAL: Equal operator. - * @GARROW_COMPARE_NOT_EQUAL: Not equal operator. - * @GARROW_COMPARE_GREATER: Greater operator. - * @GARROW_COMPARE_GREATER_EQUAL: Greater equal operator. - * @GARROW_COMPARE_LESS: Less operator. - * @GARROW_COMPARE_LESS_EQUAL: Less equal operator. - * - * They are corresponding to `arrow::compute::CompareOperator` values. 
- */ -typedef enum { - GARROW_COMPARE_EQUAL, - GARROW_COMPARE_NOT_EQUAL, - GARROW_COMPARE_GREATER, - GARROW_COMPARE_GREATER_EQUAL, - GARROW_COMPARE_LESS, - GARROW_COMPARE_LESS_EQUAL -} GArrowCompareOperator; - -#define GARROW_TYPE_COMPARE_OPTIONS (garrow_compare_options_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowCompareOptions, - garrow_compare_options, - GARROW, - COMPARE_OPTIONS, - GObject) -struct _GArrowCompareOptionsClass -{ - GObjectClass parent_class; -}; - -GARROW_AVAILABLE_IN_0_14 -GArrowCompareOptions * -garrow_compare_options_new(void); - - /** * GArrowSortOrder: * @GARROW_SORT_ORDER_ASCENDING: Sort in ascending order. @@ -290,7 +242,7 @@ GArrowDictionaryArray *garrow_array_dictionary_encode(GArrowArray *array, GError **error); GARROW_AVAILABLE_IN_0_13 gint64 garrow_array_count(GArrowArray *array, - GArrowCountOptions *options, + GArrowScalarAggregateOptions *options, GError **error); GARROW_AVAILABLE_IN_0_13 GArrowStructArray *garrow_array_count_values(GArrowArray *array, @@ -387,66 +339,6 @@ garrow_record_batch_take(GArrowRecordBatch *record_batch, GArrowArray *indices, GArrowTakeOptions *options, GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_int8_array_compare(GArrowInt8Array *array, - gint8 value, - GArrowCompareOptions *options, - GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_uint8_array_compare(GArrowUInt8Array *array, - guint8 value, - GArrowCompareOptions *options, - GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_int16_array_compare(GArrowInt16Array *array, - gint16 value, - GArrowCompareOptions *options, - GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_uint16_array_compare(GArrowUInt16Array *array, - guint16 value, - GArrowCompareOptions *options, - GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_int32_array_compare(GArrowInt32Array *array, - gint32 value, - GArrowCompareOptions *options, - GError **error); 
-GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_uint32_array_compare(GArrowUInt32Array *array, - guint32 value, - GArrowCompareOptions *options, - GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_int64_array_compare(GArrowInt64Array *array, - gint64 value, - GArrowCompareOptions *options, - GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_uint64_array_compare(GArrowUInt64Array *array, - guint64 value, - GArrowCompareOptions *options, - GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_float_array_compare(GArrowFloatArray *array, - gfloat value, - GArrowCompareOptions *options, - GError **error); -GARROW_AVAILABLE_IN_0_14 -GArrowBooleanArray * -garrow_double_array_compare(GArrowDoubleArray *array, - gdouble value, - GArrowCompareOptions *options, - GError **error); GARROW_AVAILABLE_IN_0_15 GArrowArray * garrow_array_filter(GArrowArray *array, diff --git a/c_glib/arrow-glib/compute.hpp b/c_glib/arrow-glib/compute.hpp index 1bc6fefdd40..8089a1d3364 100644 --- a/c_glib/arrow-glib/compute.hpp +++ b/c_glib/arrow-glib/compute.hpp @@ -46,10 +46,12 @@ garrow_function_get_raw(GArrowFunction *function); GArrowCastOptions *garrow_cast_options_new_raw(arrow::compute::CastOptions *arrow_cast_options); arrow::compute::CastOptions *garrow_cast_options_get_raw(GArrowCastOptions *cast_options); -GArrowCountOptions * -garrow_count_options_new_raw(arrow::compute::CountOptions *arrow_count_options); -arrow::compute::CountOptions * -garrow_count_options_get_raw(GArrowCountOptions *count_options); +GArrowScalarAggregateOptions * +garrow_scalar_aggregate_options_new_raw( + arrow::compute::ScalarAggregateOptions *arrow_scalar_aggregate_options); +arrow::compute::ScalarAggregateOptions * +garrow_scalar_aggregate_options_get_raw( + GArrowScalarAggregateOptions *scalar_aggregate_options); arrow::compute::FilterOptions * garrow_filter_options_get_raw(GArrowFilterOptions *filter_options); @@ -57,9 +59,6 @@ 
garrow_filter_options_get_raw(GArrowFilterOptions *filter_options); arrow::compute::TakeOptions * garrow_take_options_get_raw(GArrowTakeOptions *take_options); -arrow::compute::CompareOptions * -garrow_compare_options_get_raw(GArrowCompareOptions *compare_options); - arrow::compute::ArraySortOptions * garrow_array_sort_options_get_raw(GArrowArraySortOptions *array_sort_options); diff --git a/c_glib/arrow-glib/datum.cpp b/c_glib/arrow-glib/datum.cpp index 781dc086e46..66993d6c229 100644 --- a/c_glib/arrow-glib/datum.cpp +++ b/c_glib/arrow-glib/datum.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include G_BEGIN_DECLS @@ -143,6 +144,37 @@ garrow_datum_is_array_like(GArrowDatum *datum) return arrow_datum.is_arraylike(); } +/** + * garrow_datum_is_scalar: + * @datum: A #GArrowDatum. + * + * Returns: %TRUE if the datum holds a #GArrowScalar, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +garrow_datum_is_scalar(GArrowDatum *datum) +{ + const auto &arrow_datum = garrow_datum_get_raw(datum); + return arrow_datum.is_scalar(); +} + +/** + * garrow_datum_is_value: + * @datum: A #GArrowDatum. + * + * Returns: %TRUE if the datum holds a #GArrowArray, #GArrowChunkedArray or + * #GArrowScalar, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +garrow_datum_is_value(GArrowDatum *datum) +{ + const auto &arrow_datum = garrow_datum_get_raw(datum); + return arrow_datum.is_value(); +} + /** * garrow_datum_equal: * @datum: A #GArrowDatum. 
@@ -286,6 +318,109 @@ garrow_array_datum_new(GArrowArray *value) } +typedef struct GArrowScalarDatumPrivate_ { + GArrowScalar *value; +} GArrowScalarDatumPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowScalarDatum, + garrow_scalar_datum, + GARROW_TYPE_DATUM) + +#define GARROW_SCALAR_DATUM_GET_PRIVATE(obj) \ + static_cast( \ + garrow_scalar_datum_get_instance_private( \ + GARROW_SCALAR_DATUM(obj))) + +static void +garrow_scalar_datum_dispose(GObject *object) +{ + auto priv = GARROW_SCALAR_DATUM_GET_PRIVATE(object); + + if (priv->value) { + g_object_unref(priv->value); + priv->value = NULL; + } + + G_OBJECT_CLASS(garrow_scalar_datum_parent_class)->dispose(object); +} + +static void +garrow_scalar_datum_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_SCALAR_DATUM_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_VALUE: + priv->value = GARROW_SCALAR(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_scalar_datum_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_SCALAR_DATUM_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_VALUE: + g_value_set_object(value, priv->value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_scalar_datum_init(GArrowScalarDatum *object) +{ +} + +static void +garrow_scalar_datum_class_init(GArrowScalarDatumClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_scalar_datum_dispose; + gobject_class->set_property = garrow_scalar_datum_set_property; + gobject_class->get_property = garrow_scalar_datum_get_property; + + GParamSpec *spec; + spec = g_param_spec_object("value", + "Value", + "The scalar held by this datum", + GARROW_TYPE_SCALAR, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + 
g_object_class_install_property(gobject_class, PROP_VALUE, spec); +} + +/** + * garrow_scalar_datum_new: + * @value: A #GArrowScalar. + * + * Returns: A newly created #GArrowScalarDatum. + * + * Since: 5.0.0 + */ +GArrowScalarDatum * +garrow_scalar_datum_new(GArrowScalar *value) +{ + auto arrow_value = garrow_scalar_get_raw(value); + arrow::Datum arrow_datum(arrow_value); + return garrow_scalar_datum_new_raw(&arrow_datum, value); +} + + typedef struct GArrowChunkedArrayDatumPrivate_ { GArrowChunkedArray *value; } GArrowChunkedArrayDatumPrivate; @@ -608,6 +743,12 @@ GArrowDatum * garrow_datum_new_raw(arrow::Datum *arrow_datum) { switch (arrow_datum->kind()) { + case arrow::Datum::SCALAR: + { + auto arrow_scalar = arrow_datum->scalar(); + auto scalar = garrow_scalar_new_raw(&arrow_scalar); + return GARROW_DATUM(garrow_scalar_datum_new_raw(arrow_datum, scalar)); + } case arrow::Datum::ARRAY: { auto arrow_array = arrow_datum->make_array(); @@ -642,6 +783,16 @@ garrow_datum_new_raw(arrow::Datum *arrow_datum) } } +GArrowScalarDatum * +garrow_scalar_datum_new_raw(arrow::Datum *arrow_datum, + GArrowScalar *value) +{ + return GARROW_SCALAR_DATUM(g_object_new(GARROW_TYPE_SCALAR_DATUM, + "datum", arrow_datum, + "value", value, + NULL)); +} + GArrowArrayDatum * garrow_array_datum_new_raw(arrow::Datum *arrow_datum, GArrowArray *value) diff --git a/c_glib/arrow-glib/datum.h b/c_glib/arrow-glib/datum.h index 9b1544f3271..bc7dda36911 100644 --- a/c_glib/arrow-glib/datum.h +++ b/c_glib/arrow-glib/datum.h @@ -22,6 +22,7 @@ #include #include #include +#include #include G_BEGIN_DECLS @@ -41,10 +42,12 @@ GARROW_AVAILABLE_IN_1_0 gboolean garrow_datum_is_array(GArrowDatum *datum); GARROW_AVAILABLE_IN_1_0 gboolean garrow_datum_is_array_like(GArrowDatum *datum); -/* -GARROW_AVAILABLE_IN_1_0 +GARROW_AVAILABLE_IN_5_0 gboolean garrow_datum_is_scalar(GArrowDatum *datum); -GARROW_AVAILABLE_IN_1_0 +GARROW_AVAILABLE_IN_5_0 +gboolean garrow_datum_is_value(GArrowDatum *datum); +/* 
+GARROW_AVAILABLE_IN_5_0 gboolean garrow_datum_is_collection(GArrowDatum *datum); */ GARROW_AVAILABLE_IN_1_0 @@ -54,9 +57,20 @@ GARROW_AVAILABLE_IN_1_0 gchar *garrow_datum_to_string(GArrowDatum *datum); /* GARROW_TYPE_NONE_DATUM */ -/* GARROW_TYPE_SCALAR_DATUM */ -/* GARROW_TYPE_INT8_SCALAR_DATUM */ -/* ... */ + +#define GARROW_TYPE_SCALAR_DATUM (garrow_scalar_datum_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowScalarDatum, + garrow_scalar_datum, + GARROW, + SCALAR_DATUM, + GArrowDatum) +struct _GArrowScalarDatumClass +{ + GArrowDatumClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowScalarDatum *garrow_scalar_datum_new(GArrowScalar *value); #define GARROW_TYPE_ARRAY_DATUM (garrow_array_datum_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowArrayDatum, diff --git a/c_glib/arrow-glib/datum.hpp b/c_glib/arrow-glib/datum.hpp index 673501f89ed..d1acfc58c93 100644 --- a/c_glib/arrow-glib/datum.hpp +++ b/c_glib/arrow-glib/datum.hpp @@ -28,6 +28,9 @@ garrow_datum_get_raw(GArrowDatum *datum); GArrowDatum * garrow_datum_new_raw(arrow::Datum *arrow_datum); +GArrowScalarDatum * +garrow_scalar_datum_new_raw(arrow::Datum *arrow_datum, + GArrowScalar *value); GArrowArrayDatum * garrow_array_datum_new_raw(arrow::Datum *arrow_datum, GArrowArray *value); diff --git a/c_glib/arrow-glib/decimal.cpp b/c_glib/arrow-glib/decimal.cpp index cf0a08a3d7c..497d76fcfaa 100644 --- a/c_glib/arrow-glib/decimal.cpp +++ b/c_glib/arrow-glib/decimal.cpp @@ -177,7 +177,7 @@ garrow_decimal_to_bytes(typename DecimalConverter::GArrowType *decimal) { DecimalConverter converter; const auto arrow_decimal = converter.get_raw(decimal); - uint8_t data[DecimalConverter::ArrowType::bit_width / 8]; + uint8_t data[DecimalConverter::ArrowType::kBitWidth / 8]; arrow_decimal->ToBytes(data); return g_bytes_new(data, sizeof(data)); } diff --git a/c_glib/arrow-glib/error.cpp b/c_glib/arrow-glib/error.cpp index 9502d114e88..ac61ddc499a 100644 --- a/c_glib/arrow-glib/error.cpp +++ b/c_glib/arrow-glib/error.cpp @@ -135,8 
+135,8 @@ garrow_error_to_status(GError *error, message << context << ": " << g_quark_to_string(error->domain); message << "(" << error->code << "): "; message << error->message; - g_error_free(error); auto code = garrow_error_to_status_code(error, default_code); + g_error_free(error); return arrow::Status(code, message.str()); } diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build index dbfea52a847..d0479634d6d 100644 --- a/c_glib/arrow-glib/meson.build +++ b/c_glib/arrow-glib/meson.build @@ -31,6 +31,7 @@ sources = files( 'error.cpp', 'field.cpp', 'record-batch.cpp', + 'scalar.cpp', 'schema.cpp', 'table.cpp', 'table-builder.cpp', @@ -88,6 +89,7 @@ c_headers = files( 'field.h', 'gobject-type.h', 'record-batch.h', + 'scalar.h', 'schema.h', 'table.h', 'table-builder.h', @@ -144,6 +146,7 @@ cpp_headers = files( 'error.hpp', 'field.hpp', 'record-batch.hpp', + 'scalar.hpp', 'schema.hpp', 'table.hpp', 'table-builder.hpp', diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp index 762d0c30faf..ca580e8dcf3 100644 --- a/c_glib/arrow-glib/reader.cpp +++ b/c_glib/arrow-glib/reader.cpp @@ -144,6 +144,42 @@ garrow_record_batch_reader_class_init(GArrowRecordBatchReaderClass *klass) g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_READER, spec); } +/** + * garrow_record_batch_reader_new: + * @record_batches: (element-type GArrowRecordBatch): + * A list of #GArrowRecordBatch. + * @schema: (nullable): A #GArrowSchema to confirm to. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: The schema in the stream on success, %NULL on error. 
+ * + * Since: 6.0.0 + */ +GArrowRecordBatchReader * +garrow_record_batch_reader_new(GList *record_batches, + GArrowSchema *schema, + GError **error) +{ + std::vector> arrow_record_batches; + for (auto node = record_batches; node; node = node->next) { + auto record_batch = GARROW_RECORD_BATCH(node->data); + arrow_record_batches.push_back(garrow_record_batch_get_raw(record_batch)); + } + std::shared_ptr arrow_schema; + if (schema) { + arrow_schema = garrow_schema_get_raw(schema); + } + auto arrow_reader_result = + arrow::RecordBatchReader::Make(arrow_record_batches, arrow_schema); + if (garrow::check(error, + arrow_reader_result, + "[record-batch-stream-reader][new]")) { + return garrow_record_batch_reader_new_raw(&*arrow_reader_result); + } else { + return NULL; + } +} + /** * garrow_record_batch_reader_get_schema: * @reader: A #GArrowRecordBatchReader. @@ -231,6 +267,33 @@ garrow_record_batch_reader_read_next(GArrowRecordBatchReader *reader, } } +/** + * garrow_record_batch_reader_read_all: + * @reader: A #GArrowRecordBatchReader. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): + * The all record batches in the stream as #GArrowTable. 
+ * + * Since: 6.0.0 + */ +GArrowTable * +garrow_record_batch_reader_read_all(GArrowRecordBatchReader *reader, + GError **error) +{ + auto arrow_reader = garrow_record_batch_reader_get_raw(reader); + std::shared_ptr arrow_table; + auto status = arrow_reader->ReadAll(&arrow_table); + + if (garrow::check(error, + status, + "[record-batch-reader][read-all]")) { + return garrow_table_new_raw(&arrow_table); + } else { + return NULL; + } +} + G_DEFINE_TYPE(GArrowTableBatchReader, garrow_table_batch_reader, @@ -2077,13 +2140,13 @@ garrow_json_reader_read(GArrowJSONReader *reader, G_END_DECLS GArrowRecordBatchReader * -garrow_record_batch_reader_new_raw(std::shared_ptr *arrow_reader) +garrow_record_batch_reader_new_raw( + std::shared_ptr *arrow_reader) { - auto reader = - GARROW_RECORD_BATCH_READER(g_object_new(GARROW_TYPE_RECORD_BATCH_READER, - "record-batch-reader", arrow_reader, - NULL)); - return reader; + return GARROW_RECORD_BATCH_READER( + g_object_new(GARROW_TYPE_RECORD_BATCH_READER, + "record-batch-reader", arrow_reader, + NULL)); } std::shared_ptr diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h index 2628a7292ee..563b0cf227a 100644 --- a/c_glib/arrow-glib/reader.h +++ b/c_glib/arrow-glib/reader.h @@ -41,6 +41,12 @@ struct _GArrowRecordBatchReaderClass GObjectClass parent_class; }; +GARROW_AVAILABLE_IN_6_0 +GArrowRecordBatchReader * +garrow_record_batch_reader_new(GList *record_batches, + GArrowSchema *schema, + GError **error); + GArrowSchema *garrow_record_batch_reader_get_schema( GArrowRecordBatchReader *reader); #ifndef GARROW_DISABLE_DEPRECATED @@ -58,7 +64,10 @@ GArrowRecordBatch *garrow_record_batch_reader_read_next_record_batch( GArrowRecordBatch *garrow_record_batch_reader_read_next( GArrowRecordBatchReader *reader, GError **error); - +GARROW_AVAILABLE_IN_6_0 +GArrowTable * +garrow_record_batch_reader_read_all(GArrowRecordBatchReader *reader, + GError **error); #define GARROW_TYPE_TABLE_BATCH_READER 
(garrow_table_batch_reader_get_type()) G_DECLARE_DERIVABLE_TYPE(GArrowTableBatchReader, diff --git a/c_glib/arrow-glib/scalar.cpp b/c_glib/arrow-glib/scalar.cpp new file mode 100644 index 00000000000..847b48620bd --- /dev/null +++ b/c_glib/arrow-glib/scalar.cpp @@ -0,0 +1,2405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: scalar + * @section_id: scalar-classes + * @title: Scalar classes + * @include: arrow-glib/arrow-glib.h + * + * #GArrowScalar is a base class for all scalar classes such as + * #GArrowBooleanScalar. + * + * #GArrowNullScalar is a class for a null scalar. + * + * #GArrowBooleanScalar is a class for a boolean scalar. + * + * #GArrowInt8Scalar is a class for a 8-bit integer scalar. + * + * #GArrowInt16Scalar is a class for a 16-bit integer scalar. + * + * #GArrowInt32Scalar is a class for a 32-bit integer scalar. + * + * #GArrowInt64Scalar is a class for a 64-bit integer scalar. + * + * #GArrowUInt8Scalar is a class for a 8-bit unsigned integer scalar. + * + * #GArrowUInt16Scalar is a class for a 16-bit unsigned integer scalar. 
+ * + * #GArrowUInt32Scalar is a class for a 32-bit unsigned integer scalar. + * + * #GArrowUInt64Scalar is a class for a 64-bit unsigned integer scalar. + * + * #GArrowFloatScalar is a class for a 32-bit floating point scalar. + * + * #GArrowDoubleScalar is a class for a 64-bit floating point scalar. + * + * #GArrowBaseBinaryScalar is a base class for all binary and string + * scalar classes such as #GArrowBinaryScalar. + * + * #GArrowBinaryScalar is a class for a binary scalar. + * + * #GArrowStringScalar is a class for an UTF-8 encoded string scalar. + * + * #GArrowLargeBinaryScalar is a class for a 64-bit offsets binary + * scalar. + * + * #GArrowLargeStringScalar is a class for a 64-bit offsets UTF-8 + * encoded string scalar. + * + * #GArrowFixedSizeBinaryScalar is a class for a fixed-size binary + * scalar. + * + * #GArrowDate32Scalar is a class for the number of days since UNIX + * epoch in a 32-bit signed integer scalar. + * + * #GArrowDate64Scalar is a class for the number of milliseconds + * since UNIX epoch in a 64-bit signed integer scalar. + * + * #GArrowTime32Scalar is a class for the number of seconds or + * milliseconds since midnight in a 32-bit signed integer scalar. + * + * #GArrowTime64Scalar is a class for the number of microseconds or + * nanoseconds since midnight in a 64-bit signed integer scalar. + * + * #GArrowTimestampScalar is a class for the number of + * seconds/milliseconds/microseconds/nanoseconds since UNIX epoch in + * a 64-bit signed integer scalar. + * + * #GArrowDecimal128Scalar is a class for a 128-bit decimal scalar. + * + * #GArrowDecimal256Scalar is a class for a 256-bit decimal scalar. + * + * #GArrowBaseListScalar is a base class for all list scalar classes + * such as #GArrowListScalar. + * + * #GArrowListScalar is a class for a list scalar. + * + * #GArrowLargeListScalar is a class for a large list scalar. + * + * #GArrowMapScalar is a class for a map list scalar. 
+ * + * #GArrowStructScalar is a class for a struct list scalar. + * + * #GArrowUnionScalar is a base class for all union scalar classes + * such as #GArrowSparseUnionScalar. + * + * #GArrowSparseUnionScalar is a class for a sparse union scalar. + * + * #GArrowDenseUnionScalar is a class for a dense union scalar. + * + * #GArrowExtensionScalar is a base class for user-defined extension + * scalar. + */ + +typedef struct GArrowScalarPrivate_ { + std::shared_ptr scalar; + GArrowDataType *data_type; +} GArrowScalarPrivate; + +enum { + PROP_SCALAR = 1, + PROP_DATA_TYPE, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowScalar, + garrow_scalar, + G_TYPE_OBJECT) + +#define GARROW_SCALAR_GET_PRIVATE(obj) \ + static_cast( \ + garrow_scalar_get_instance_private( \ + GARROW_SCALAR(obj))) + +static void +garrow_scalar_dispose(GObject *object) +{ + auto priv = GARROW_SCALAR_GET_PRIVATE(object); + + if (priv->data_type) { + g_object_unref(priv->data_type); + priv->data_type = NULL; + } + + G_OBJECT_CLASS(garrow_scalar_parent_class)->dispose(object); +} + +static void +garrow_scalar_finalize(GObject *object) +{ + auto priv = GARROW_SCALAR_GET_PRIVATE(object); + + priv->scalar.~shared_ptr(); + + G_OBJECT_CLASS(garrow_scalar_parent_class)->finalize(object); +} + +static void +garrow_scalar_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_SCALAR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_SCALAR: + priv->scalar = + *static_cast *>(g_value_get_pointer(value)); + break; + case PROP_DATA_TYPE: + priv->data_type = GARROW_DATA_TYPE(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_scalar_init(GArrowScalar *object) +{ + auto priv = GARROW_SCALAR_GET_PRIVATE(object); + new(&priv->scalar) std::shared_ptr; +} + +static void +garrow_scalar_class_init(GArrowScalarClass *klass) +{ + auto gobject_class = 
G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_scalar_dispose; + gobject_class->finalize = garrow_scalar_finalize; + gobject_class->set_property = garrow_scalar_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer("scalar", + "Scalar", + "The raw std::shared *", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_SCALAR, spec); + + /** + * GArrowScalar:data-type: + * + * The data type of the scalar. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("data-type", + "Data type", + "The data type of the scalar", + GARROW_TYPE_DATA_TYPE, + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATA_TYPE, spec); +} + +/** + * garrow_scalar_parse: + * @data_type: A #GArrowDataType for the parsed scalar. + * @data: (array length=size): Data to be parsed. + * @size: The number of bytes of the data. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): + * A newly created #GArrowScalar if the data is parsed successfully, + * %NULL otherwise. + * + * Since: 5.0.0 + */ +GArrowScalar * +garrow_scalar_parse(GArrowDataType *data_type, + const guint8 *data, + gsize size, + GError **error) +{ + const auto arrow_data_type = garrow_data_type_get_raw(data_type); + auto arrow_data = + arrow::util::string_view(reinterpret_cast(data), + size); + auto arrow_scalar_result = arrow::Scalar::Parse(arrow_data_type, arrow_data); + if (garrow::check(error, arrow_scalar_result, "[scalar][parse]")) { + auto arrow_scalar = *arrow_scalar_result; + return garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + NULL); + } else { + return NULL; + } +} + +/** + * garrow_scalar_get_data_type: + * @scalar: A #GArrowScalar. + * + * Returns: (transfer none): The #GArrowDataType for the scalar. 
+ * + * Since: 5.0.0 + */ +GArrowDataType * +garrow_scalar_get_data_type(GArrowScalar *scalar) +{ + auto priv = GARROW_SCALAR_GET_PRIVATE(scalar); + if (!priv->data_type) { + priv->data_type = garrow_data_type_new_raw(&(priv->scalar->type)); + } + return priv->data_type; +} + +/** + * garrow_scalar_is_valid: + * @scalar: A #GArrowScalar. + * + * Returns: %TRUE if the scalar is valid, %FALSE otherwise. + * + * Since: 5.0.0 + */ +gboolean +garrow_scalar_is_valid(GArrowScalar *scalar) +{ + const auto arrow_scalar = garrow_scalar_get_raw(scalar); + return arrow_scalar->is_valid; +} + +/** + * garrow_scalar_equal: + * @scalar: A #GArrowScalar. + * @other_scalar: A #GArrowScalar to be compared. + * + * Returns: %TRUE if both of them have the same data, %FALSE + * otherwise. + * + * Since: 5.0.0 + */ +gboolean +garrow_scalar_equal(GArrowScalar *scalar, + GArrowScalar *other_scalar) +{ + return garrow_scalar_equal_options(scalar, other_scalar, NULL); +} + +/** + * garrow_scalar_equal_options: + * @scalar: A #GArrowScalar. + * @other_scalar: A #GArrowScalar to be compared. + * @options: (nullable): A #GArrowEqualOptions. + * + * Returns: %TRUE if both of them have the same data, %FALSE + * otherwise. + * + * Since: 5.0.0 + */ +gboolean +garrow_scalar_equal_options(GArrowScalar *scalar, + GArrowScalar *other_scalar, + GArrowEqualOptions *options) +{ + const auto arrow_scalar = garrow_scalar_get_raw(scalar); + const auto arrow_other_scalar = garrow_scalar_get_raw(other_scalar); + if (options) { + auto is_approx = garrow_equal_options_is_approx(options); + const auto arrow_options = garrow_equal_options_get_raw(options); + if (is_approx) { + return arrow_scalar->ApproxEquals(*arrow_other_scalar, *arrow_options); + } else { + return arrow_scalar->Equals(arrow_other_scalar, *arrow_options); + } + } else { + return arrow_scalar->Equals(arrow_other_scalar); + } +} + +/** + * garrow_scalar_to_string: + * @scalar: A #GArrowScalar. 
+ * + * Returns: The string representation of the scalar. + * + * It should be freed with g_free() when no longer needed. + * + * Since: 5.0.0 + */ +gchar * +garrow_scalar_to_string(GArrowScalar *scalar) +{ + const auto arrow_scalar = garrow_scalar_get_raw(scalar); + return g_strdup(arrow_scalar->ToString().c_str()); +} + +/** + * garrow_scalar_cast: + * @scalar: A #GArrowScalar. + * @data_type: A #GArrowDataType of the casted scalar. + * @options: (nullable): A #GArrowCastOptions. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): + * A newly created casted scalar on success, %NULL on error. + * + * Since: 5.0.0 + */ +GArrowScalar * +garrow_scalar_cast(GArrowScalar *scalar, + GArrowDataType *data_type, + GArrowCastOptions *options, + GError **error) +{ + const auto arrow_scalar = garrow_scalar_get_raw(scalar); + const auto arrow_data_type = garrow_data_type_get_raw(data_type); + auto arrow_casted_scalar_result = arrow_scalar->CastTo(arrow_data_type); + if (garrow::check(error, arrow_casted_scalar_result, "[scalar][cast]")) { + auto arrow_casted_scalar = *arrow_casted_scalar_result; + return garrow_scalar_new_raw(&arrow_casted_scalar, + "scalar", &arrow_casted_scalar, + "data-type", data_type, + NULL); + } else { + return NULL; + } +} + + +G_DEFINE_TYPE(GArrowNullScalar, + garrow_null_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_null_scalar_init(GArrowNullScalar *object) +{ +} + +static void +garrow_null_scalar_class_init(GArrowNullScalarClass *klass) +{ +} + +/** + * garrow_null_scalar_new: + * + * Returns: A newly created #GArrowNullScalar. 
+ * + * Since: 5.0.0 + */ +GArrowNullScalar * +garrow_null_scalar_new(void) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared()); + return GARROW_NULL_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + + +G_DEFINE_TYPE(GArrowBooleanScalar, + garrow_boolean_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_boolean_scalar_init(GArrowBooleanScalar *object) +{ +} + +static void +garrow_boolean_scalar_class_init(GArrowBooleanScalarClass *klass) +{ +} + +/** + * garrow_boolean_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowBooleanScalar. + * + * Since: 5.0.0 + */ +GArrowBooleanScalar * +garrow_boolean_scalar_new(gboolean value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_BOOLEAN_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_boolean_scalar_get_value: + * @scalar: A #GArrowBooleanScalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +gboolean +garrow_boolean_scalar_get_value(GArrowBooleanScalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowInt8Scalar, + garrow_int8_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_int8_scalar_init(GArrowInt8Scalar *object) +{ +} + +static void +garrow_int8_scalar_class_init(GArrowInt8ScalarClass *klass) +{ +} + +/** + * garrow_int8_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowInt8Scalar. + * + * Since: 5.0.0 + */ +GArrowInt8Scalar * +garrow_int8_scalar_new(gint8 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_INT8_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_int8_scalar_get_value: + * @scalar: A #GArrowInt8Scalar. + * + * Returns: The value of this scalar. 
+ * + * Since: 5.0.0 + */ +gint8 +garrow_int8_scalar_get_value(GArrowInt8Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowInt16Scalar, + garrow_int16_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_int16_scalar_init(GArrowInt16Scalar *object) +{ +} + +static void +garrow_int16_scalar_class_init(GArrowInt16ScalarClass *klass) +{ +} + +/** + * garrow_int16_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowInt16Scalar. + * + * Since: 5.0.0 + */ +GArrowInt16Scalar * +garrow_int16_scalar_new(gint16 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_INT16_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_int16_scalar_get_value: + * @scalar: A #GArrowInt16Scalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +gint16 +garrow_int16_scalar_get_value(GArrowInt16Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowInt32Scalar, + garrow_int32_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_int32_scalar_init(GArrowInt32Scalar *object) +{ +} + +static void +garrow_int32_scalar_class_init(GArrowInt32ScalarClass *klass) +{ +} + +/** + * garrow_int32_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowInt32Scalar. + * + * Since: 5.0.0 + */ +GArrowInt32Scalar * +garrow_int32_scalar_new(gint32 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_INT32_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_int32_scalar_get_value: + * @scalar: A #GArrowInt32Scalar. + * + * Returns: The value of this scalar. 
+ * + * Since: 5.0.0 + */ +gint32 +garrow_int32_scalar_get_value(GArrowInt32Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowInt64Scalar, + garrow_int64_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_int64_scalar_init(GArrowInt64Scalar *object) +{ +} + +static void +garrow_int64_scalar_class_init(GArrowInt64ScalarClass *klass) +{ +} + +/** + * garrow_int64_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowInt64Scalar. + * + * Since: 5.0.0 + */ +GArrowInt64Scalar * +garrow_int64_scalar_new(gint64 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_INT64_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_int64_scalar_get_value: + * @scalar: A #GArrowInt64Scalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +gint64 +garrow_int64_scalar_get_value(GArrowInt64Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowUInt8Scalar, + garrow_uint8_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_uint8_scalar_init(GArrowUInt8Scalar *object) +{ +} + +static void +garrow_uint8_scalar_class_init(GArrowUInt8ScalarClass *klass) +{ +} + +/** + * garrow_uint8_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowUInt8Scalar. + * + * Since: 5.0.0 + */ +GArrowUInt8Scalar * +garrow_uint8_scalar_new(guint8 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_UINT8_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_uint8_scalar_get_value: + * @scalar: A #GArrowUInt8Scalar. + * + * Returns: The value of this scalar. 
+ * + * Since: 5.0.0 + */ +guint8 +garrow_uint8_scalar_get_value(GArrowUInt8Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowUInt16Scalar, + garrow_uint16_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_uint16_scalar_init(GArrowUInt16Scalar *object) +{ +} + +static void +garrow_uint16_scalar_class_init(GArrowUInt16ScalarClass *klass) +{ +} + +/** + * garrow_uint16_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowUInt16Scalar. + * + * Since: 5.0.0 + */ +GArrowUInt16Scalar * +garrow_uint16_scalar_new(guint16 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_UINT16_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_uint16_scalar_get_value: + * @scalar: A #GArrowUInt16Scalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +guint16 +garrow_uint16_scalar_get_value(GArrowUInt16Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowUInt32Scalar, + garrow_uint32_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_uint32_scalar_init(GArrowUInt32Scalar *object) +{ +} + +static void +garrow_uint32_scalar_class_init(GArrowUInt32ScalarClass *klass) +{ +} + +/** + * garrow_uint32_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowUInt32Scalar. + * + * Since: 5.0.0 + */ +GArrowUInt32Scalar * +garrow_uint32_scalar_new(guint32 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_UINT32_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_uint32_scalar_get_value: + * @scalar: A #GArrowUInt32Scalar. + * + * Returns: The value of this scalar. 
+ * + * Since: 5.0.0 + */ +guint32 +garrow_uint32_scalar_get_value(GArrowUInt32Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowUInt64Scalar, + garrow_uint64_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_uint64_scalar_init(GArrowUInt64Scalar *object) +{ +} + +static void +garrow_uint64_scalar_class_init(GArrowUInt64ScalarClass *klass) +{ +} + +/** + * garrow_uint64_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowUInt64Scalar. + * + * Since: 5.0.0 + */ +GArrowUInt64Scalar * +garrow_uint64_scalar_new(guint64 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_UINT64_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_uint64_scalar_get_value: + * @scalar: A #GArrowUInt64Scalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +guint64 +garrow_uint64_scalar_get_value(GArrowUInt64Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowFloatScalar, + garrow_float_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_float_scalar_init(GArrowFloatScalar *object) +{ +} + +static void +garrow_float_scalar_class_init(GArrowFloatScalarClass *klass) +{ +} + +/** + * garrow_float_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowFloatScalar. + * + * Since: 5.0.0 + */ +GArrowFloatScalar * +garrow_float_scalar_new(gfloat value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_FLOAT_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_float_scalar_get_value: + * @scalar: A #GArrowFloatScalar. + * + * Returns: The value of this scalar. 
+ * + * Since: 5.0.0 + */ +gfloat +garrow_float_scalar_get_value(GArrowFloatScalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowDoubleScalar, + garrow_double_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_double_scalar_init(GArrowDoubleScalar *object) +{ +} + +static void +garrow_double_scalar_class_init(GArrowDoubleScalarClass *klass) +{ +} + +/** + * garrow_double_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowDoubleScalar. + * + * Since: 5.0.0 + */ +GArrowDoubleScalar * +garrow_double_scalar_new(gdouble value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_DOUBLE_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_double_scalar_get_value: + * @scalar: A #GArrowDoubleScalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +gdouble +garrow_double_scalar_get_value(GArrowDoubleScalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +typedef struct GArrowBaseBinaryScalarPrivate_ { + GArrowBuffer *value; +} GArrowBaseBinaryScalarPrivate; + +enum { + PROP_VALUE = 1, +}; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowBaseBinaryScalar, + garrow_base_binary_scalar, + GARROW_TYPE_SCALAR) + +#define GARROW_BASE_BINARY_SCALAR_GET_PRIVATE(obj) \ + static_cast( \ + garrow_base_binary_scalar_get_instance_private( \ + GARROW_BASE_BINARY_SCALAR(obj))) + +static void +garrow_base_binary_scalar_dispose(GObject *object) +{ + auto priv = GARROW_BASE_BINARY_SCALAR_GET_PRIVATE(object); + + if (priv->value) { + g_object_unref(priv->value); + priv->value = NULL; + } + + G_OBJECT_CLASS(garrow_base_binary_scalar_parent_class)->dispose(object); +} + +static void +garrow_base_binary_scalar_set_property(GObject *object, + guint 
prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_BASE_BINARY_SCALAR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_VALUE: + priv->value = GARROW_BUFFER(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_base_binary_scalar_init(GArrowBaseBinaryScalar *object) +{ +} + +static void +garrow_base_binary_scalar_class_init(GArrowBaseBinaryScalarClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = garrow_base_binary_scalar_dispose; + gobject_class->set_property = garrow_base_binary_scalar_set_property; + + GParamSpec *spec; + /** + * GArrowBaseBinaryScalar:value: + * + * The value of the scalar. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("value", + "Value", + "The value of the scalar", + GARROW_TYPE_BUFFER, + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_VALUE, spec); +} + +G_END_DECLS +template +GArrowScalar * +garrow_base_binary_scalar_new(GArrowBuffer *value) +{ + auto arrow_value = garrow_buffer_get_raw(value); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(arrow_value)); + return garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "value", value, + NULL); +} +G_BEGIN_DECLS + +/** + * garrow_base_binary_scalar_get_value: + * @scalar: A #GArrowBaseBinaryScalar. + * + * Returns: (transfer none): The value of this scalar. 
+ * + * Since: 5.0.0 + */ +GArrowBuffer * +garrow_base_binary_scalar_get_value(GArrowBaseBinaryScalar *scalar) +{ + auto priv = GARROW_BASE_BINARY_SCALAR_GET_PRIVATE(scalar); + if (!priv->value) { + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + priv->value = garrow_buffer_new_raw(&(arrow_scalar->value)); + } + return priv->value; +} + + +G_DEFINE_TYPE(GArrowBinaryScalar, + garrow_binary_scalar, + GARROW_TYPE_BASE_BINARY_SCALAR) + +static void +garrow_binary_scalar_init(GArrowBinaryScalar *object) +{ +} + +static void +garrow_binary_scalar_class_init(GArrowBinaryScalarClass *klass) +{ +} + +/** + * garrow_binary_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowBinaryScalar. + * + * Since: 5.0.0 + */ +GArrowBinaryScalar * +garrow_binary_scalar_new(GArrowBuffer *value) +{ + return GARROW_BINARY_SCALAR( + garrow_base_binary_scalar_new(value)); +} + + +G_DEFINE_TYPE(GArrowStringScalar, + garrow_string_scalar, + GARROW_TYPE_BASE_BINARY_SCALAR) + +static void +garrow_string_scalar_init(GArrowStringScalar *object) +{ +} + +static void +garrow_string_scalar_class_init(GArrowStringScalarClass *klass) +{ +} + +/** + * garrow_string_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowStringScalar. + * + * Since: 5.0.0 + */ +GArrowStringScalar * +garrow_string_scalar_new(GArrowBuffer *value) +{ + return GARROW_STRING_SCALAR( + garrow_base_binary_scalar_new(value)); +} + + +G_DEFINE_TYPE(GArrowLargeBinaryScalar, + garrow_large_binary_scalar, + GARROW_TYPE_BASE_BINARY_SCALAR) + +static void +garrow_large_binary_scalar_init(GArrowLargeBinaryScalar *object) +{ +} + +static void +garrow_large_binary_scalar_class_init(GArrowLargeBinaryScalarClass *klass) +{ +} + +/** + * garrow_large_binary_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowLargeBinaryScalar. 
+ * + * Since: 5.0.0 + */ +GArrowLargeBinaryScalar * +garrow_large_binary_scalar_new(GArrowBuffer *value) +{ + return GARROW_LARGE_BINARY_SCALAR( + garrow_base_binary_scalar_new(value)); +} + + +G_DEFINE_TYPE(GArrowLargeStringScalar, + garrow_large_string_scalar, + GARROW_TYPE_BASE_BINARY_SCALAR) + +static void +garrow_large_string_scalar_init(GArrowLargeStringScalar *object) +{ +} + +static void +garrow_large_string_scalar_class_init(GArrowLargeStringScalarClass *klass) +{ +} + +/** + * garrow_large_string_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowLargeStringScalar. + * + * Since: 5.0.0 + */ +GArrowLargeStringScalar * +garrow_large_string_scalar_new(GArrowBuffer *value) +{ + return GARROW_LARGE_STRING_SCALAR( + garrow_base_binary_scalar_new(value)); +} + + +G_DEFINE_TYPE(GArrowFixedSizeBinaryScalar, + garrow_fixed_size_binary_scalar, + GARROW_TYPE_BASE_BINARY_SCALAR) + +static void +garrow_fixed_size_binary_scalar_init(GArrowFixedSizeBinaryScalar *object) +{ +} + +static void +garrow_fixed_size_binary_scalar_class_init( + GArrowFixedSizeBinaryScalarClass *klass) +{ +} + +/** + * garrow_fixed_size_binary_scalar_new: + * @data_type: A #GArrowFixedSizeBinaryDataType for this scalar. + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowFixedSizeBinaryScalar. 
+ * + * Since: 5.0.0 + */ +GArrowFixedSizeBinaryScalar * +garrow_fixed_size_binary_scalar_new(GArrowFixedSizeBinaryDataType *data_type, + GArrowBuffer *value) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_value = garrow_buffer_get_raw(value); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared( + arrow_value, arrow_data_type)); + return GARROW_FIXED_SIZE_BINARY_SCALAR( + garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + "value", value, + NULL)); +} + + +G_DEFINE_TYPE(GArrowDate32Scalar, + garrow_date32_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_date32_scalar_init(GArrowDate32Scalar *object) +{ +} + +static void +garrow_date32_scalar_class_init(GArrowDate32ScalarClass *klass) +{ +} + +/** + * garrow_date32_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowDate32Scalar. + * + * Since: 5.0.0 + */ +GArrowDate32Scalar * +garrow_date32_scalar_new(gint32 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_DATE32_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_date32_scalar_get_value: + * @scalar: A #GArrowDate32Scalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +gint32 +garrow_date32_scalar_get_value(GArrowDate32Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowDate64Scalar, + garrow_date64_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_date64_scalar_init(GArrowDate64Scalar *object) +{ +} + +static void +garrow_date64_scalar_class_init(GArrowDate64ScalarClass *klass) +{ +} + +/** + * garrow_date64_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowDate64Scalar. 
+ * + * Since: 5.0.0 + */ +GArrowDate64Scalar * +garrow_date64_scalar_new(gint64 value) +{ + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value)); + return GARROW_DATE64_SCALAR(garrow_scalar_new_raw(&arrow_scalar)); +} + +/** + * garrow_date64_scalar_get_value: + * @scalar: A #GArrowDate64Scalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +gint64 +garrow_date64_scalar_get_value(GArrowDate64Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowTime32Scalar, + garrow_time32_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_time32_scalar_init(GArrowTime32Scalar *object) +{ +} + +static void +garrow_time32_scalar_class_init(GArrowTime32ScalarClass *klass) +{ +} + +/** + * garrow_time32_scalar_new: + * @data_type: A #GArrowTime32DataType for this scalar. + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowTime32Scalar. + * + * Since: 5.0.0 + */ +GArrowTime32Scalar * +garrow_time32_scalar_new(GArrowTime32DataType *data_type, + gint32 value) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value, arrow_data_type)); + return GARROW_TIME32_SCALAR( + garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + NULL)); +} + +/** + * garrow_time32_scalar_get_value: + * @scalar: A #GArrowTime32Scalar. + * + * Returns: The value of this scalar. 
+ * + * Since: 5.0.0 + */ +gint32 +garrow_time32_scalar_get_value(GArrowTime32Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowTime64Scalar, + garrow_time64_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_time64_scalar_init(GArrowTime64Scalar *object) +{ +} + +static void +garrow_time64_scalar_class_init(GArrowTime64ScalarClass *klass) +{ +} + +/** + * garrow_time64_scalar_new: + * @data_type: A #GArrowTime64DataType for this scalar. + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowTime64Scalar. + * + * Since: 5.0.0 + */ +GArrowTime64Scalar * +garrow_time64_scalar_new(GArrowTime64DataType *data_type, + gint64 value) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value, arrow_data_type)); + return GARROW_TIME64_SCALAR( + garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + NULL)); +} + +/** + * garrow_time64_scalar_get_value: + * @scalar: A #GArrowTime64Scalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +gint64 +garrow_time64_scalar_get_value(GArrowTime64Scalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +G_DEFINE_TYPE(GArrowTimestampScalar, + garrow_timestamp_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_timestamp_scalar_init(GArrowTimestampScalar *object) +{ +} + +static void +garrow_timestamp_scalar_class_init(GArrowTimestampScalarClass *klass) +{ +} + +/** + * garrow_timestamp_scalar_new: + * @data_type: A #GArrowTimestampDataType for this scalar. + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowTimestampScalar. 
+ * + * Since: 5.0.0 + */ +GArrowTimestampScalar * +garrow_timestamp_scalar_new(GArrowTimestampDataType *data_type, + gint64 value) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(value, arrow_data_type)); + return GARROW_TIMESTAMP_SCALAR( + garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + NULL)); +} + +/** + * garrow_timestamp_scalar_get_value: + * @scalar: A #GArrowTimestampScalar. + * + * Returns: The value of this scalar. + * + * Since: 5.0.0 + */ +gint64 +garrow_timestamp_scalar_get_value(GArrowTimestampScalar *scalar) +{ + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->value; +} + + +typedef struct GArrowDecimal128ScalarPrivate_ { + GArrowDecimal128 *value; +} GArrowDecimal128ScalarPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowDecimal128Scalar, + garrow_decimal128_scalar, + GARROW_TYPE_SCALAR) + +#define GARROW_DECIMAL128_SCALAR_GET_PRIVATE(obj) \ + static_cast( \ + garrow_decimal128_scalar_get_instance_private( \ + GARROW_DECIMAL128_SCALAR(obj))) + +static void +garrow_decimal128_scalar_dispose(GObject *object) +{ + auto priv = GARROW_DECIMAL128_SCALAR_GET_PRIVATE(object); + + if (priv->value) { + g_object_unref(priv->value); + priv->value = NULL; + } + + G_OBJECT_CLASS(garrow_decimal128_scalar_parent_class)->dispose(object); +} + +static void +garrow_decimal128_scalar_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_DECIMAL128_SCALAR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_VALUE: + priv->value = GARROW_DECIMAL128(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_decimal128_scalar_init(GArrowDecimal128Scalar *object) +{ +} + +static void 
+garrow_decimal128_scalar_class_init(GArrowDecimal128ScalarClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_decimal128_scalar_dispose; + gobject_class->set_property = garrow_decimal128_scalar_set_property; + + GParamSpec *spec; + /** + * GArrowDecimal128Scalar:value: + * + * The value of the scalar. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("value", + "Value", + "The value of the scalar", + garrow_decimal128_get_type(), + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_VALUE, spec); +} + +/** + * garrow_decimal128_scalar_new: + * @data_type: A #GArrowDecimal128DataType for this scalar. + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowDecimal128Scalar. + * + * Since: 5.0.0 + */ +GArrowDecimal128Scalar * +garrow_decimal128_scalar_new(GArrowDecimal128DataType *data_type, + GArrowDecimal128 *value) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_value = garrow_decimal128_get_raw(value); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(*arrow_value, arrow_data_type)); + return GARROW_DECIMAL128_SCALAR( + garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + "value", value, + NULL)); +} + +/** + * garrow_decimal128_scalar_get_value: + * @scalar: A #GArrowDecimal128Scalar. + * + * Returns: (transfer none): The value of this scalar. 
+ * + * Since: 5.0.0 + */ +GArrowDecimal128 * +garrow_decimal128_scalar_get_value(GArrowDecimal128Scalar *scalar) +{ + auto priv = GARROW_DECIMAL128_SCALAR_GET_PRIVATE(scalar); + if (!priv->value) { + auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + auto arrow_value = std::make_shared(arrow_scalar->value); + priv->value = garrow_decimal128_new_raw(&arrow_value); + } + return priv->value; +} + + +typedef struct GArrowDecimal256ScalarPrivate_ { + GArrowDecimal256 *value; +} GArrowDecimal256ScalarPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowDecimal256Scalar, + garrow_decimal256_scalar, + GARROW_TYPE_SCALAR) + +#define GARROW_DECIMAL256_SCALAR_GET_PRIVATE(obj) \ + static_cast( \ + garrow_decimal256_scalar_get_instance_private( \ + GARROW_DECIMAL256_SCALAR(obj))) + +static void +garrow_decimal256_scalar_dispose(GObject *object) +{ + auto priv = GARROW_DECIMAL256_SCALAR_GET_PRIVATE(object); + + if (priv->value) { + g_object_unref(priv->value); + priv->value = NULL; + } + + G_OBJECT_CLASS(garrow_decimal256_scalar_parent_class)->dispose(object); +} + +static void +garrow_decimal256_scalar_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_DECIMAL256_SCALAR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_VALUE: + priv->value = GARROW_DECIMAL256(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_decimal256_scalar_init(GArrowDecimal256Scalar *object) +{ +} + +static void +garrow_decimal256_scalar_class_init(GArrowDecimal256ScalarClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_decimal256_scalar_dispose; + gobject_class->set_property = garrow_decimal256_scalar_set_property; + + GParamSpec *spec; + /** + * GArrowDecimal256Scalar:value: + * + * The value of the scalar. 
+ * + * Since: 5.0.0 + */ + spec = g_param_spec_object("value", + "Value", + "The value of the scalar", + garrow_decimal256_get_type(), + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_VALUE, spec); +} + +/** + * garrow_decimal256_scalar_new: + * @data_type: A #GArrowDecimal256DataType for this scalar. + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowDecimal256Scalar. + * + * Since: 5.0.0 + */ +GArrowDecimal256Scalar * +garrow_decimal256_scalar_new(GArrowDecimal256DataType *data_type, + GArrowDecimal256 *value) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + auto arrow_value = garrow_decimal256_get_raw(value); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(*arrow_value, arrow_data_type)); + return GARROW_DECIMAL256_SCALAR(garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + "value", value, + NULL)); +} + +/** + * garrow_decimal256_scalar_get_value: + * @scalar: A #GArrowDecimal256Scalar. + * + * Returns: (transfer none): The value of this scalar. 
+ * + * Since: 5.0.0 + */ +GArrowDecimal256 * +garrow_decimal256_scalar_get_value(GArrowDecimal256Scalar *scalar) +{ + auto priv = GARROW_DECIMAL256_SCALAR_GET_PRIVATE(scalar); + if (!priv->value) { + auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + auto arrow_value = std::make_shared(arrow_scalar->value); + priv->value = garrow_decimal256_new_raw(&arrow_value); + } + return priv->value; +} + + +typedef struct GArrowBaseListScalarPrivate_ { + GArrowArray *value; +} GArrowBaseListScalarPrivate; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowBaseListScalar, + garrow_base_list_scalar, + GARROW_TYPE_SCALAR) + +#define GARROW_BASE_LIST_SCALAR_GET_PRIVATE(obj) \ + static_cast( \ + garrow_base_list_scalar_get_instance_private( \ + GARROW_BASE_LIST_SCALAR(obj))) + +static void +garrow_base_list_scalar_dispose(GObject *object) +{ + auto priv = GARROW_BASE_LIST_SCALAR_GET_PRIVATE(object); + + if (priv->value) { + g_object_unref(priv->value); + priv->value = NULL; + } + + G_OBJECT_CLASS(garrow_base_list_scalar_parent_class)->dispose(object); +} + +static void +garrow_base_list_scalar_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_BASE_LIST_SCALAR_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_VALUE: + priv->value = GARROW_ARRAY(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_base_list_scalar_init(GArrowBaseListScalar *object) +{ +} + +static void +garrow_base_list_scalar_class_init(GArrowBaseListScalarClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = garrow_base_list_scalar_dispose; + gobject_class->set_property = garrow_base_list_scalar_set_property; + + GParamSpec *spec; + /** + * GArrowBaseListScalar:value: + * + * The value of the scalar. 
+ * + * Since: 5.0.0 + */ + spec = g_param_spec_object("value", + "Value", + "The value of the scalar", + GARROW_TYPE_ARRAY, + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_VALUE, spec); +} + +G_END_DECLS +template +GArrowScalar * +garrow_base_list_scalar_new(GArrowArray *value) +{ + auto arrow_value = garrow_array_get_raw(value); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(arrow_value)); + auto data_type = garrow_array_get_value_data_type(value); + auto scalar = garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + "value", value, + NULL); + g_object_unref(data_type); + return scalar; +} +G_BEGIN_DECLS + +/** + * garrow_base_list_scalar_get_value: + * @scalar: A #GArrowBaseListScalar. + * + * Returns: (transfer none): The value of this scalar. + * + * Since: 5.0.0 + */ +GArrowArray * +garrow_base_list_scalar_get_value(GArrowBaseListScalar *scalar) +{ + auto priv = GARROW_BASE_LIST_SCALAR_GET_PRIVATE(scalar); + if (!priv->value) { + const auto arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + priv->value = garrow_array_new_raw(&(arrow_scalar->value)); + } + return priv->value; +} + + +G_DEFINE_TYPE(GArrowListScalar, + garrow_list_scalar, + GARROW_TYPE_BASE_LIST_SCALAR) + +static void +garrow_list_scalar_init(GArrowListScalar *object) +{ +} + +static void +garrow_list_scalar_class_init(GArrowListScalarClass *klass) +{ +} + +/** + * garrow_list_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowListScalar. 
+ * + * Since: 5.0.0 + */ +GArrowListScalar * +garrow_list_scalar_new(GArrowListArray *value) +{ + return GARROW_LIST_SCALAR( + garrow_base_list_scalar_new(GARROW_ARRAY(value))); +} + + +G_DEFINE_TYPE(GArrowLargeListScalar, + garrow_large_list_scalar, + GARROW_TYPE_BASE_LIST_SCALAR) + +static void +garrow_large_list_scalar_init(GArrowLargeListScalar *object) +{ +} + +static void +garrow_large_list_scalar_class_init(GArrowLargeListScalarClass *klass) +{ +} + +/** + * garrow_large_list_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowLargeListScalar. + * + * Since: 5.0.0 + */ +GArrowLargeListScalar * +garrow_large_list_scalar_new(GArrowLargeListArray *value) +{ + return GARROW_LARGE_LIST_SCALAR( + garrow_base_list_scalar_new(GARROW_ARRAY(value))); +} + + +G_DEFINE_TYPE(GArrowMapScalar, + garrow_map_scalar, + GARROW_TYPE_BASE_LIST_SCALAR) + +static void +garrow_map_scalar_init(GArrowMapScalar *object) +{ +} + +static void +garrow_map_scalar_class_init(GArrowMapScalarClass *klass) +{ +} + +/** + * garrow_map_scalar_new: + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowMapScalar. 
 + * + * Since: 5.0.0 + */ +GArrowMapScalar * +garrow_map_scalar_new(GArrowStructArray *value) +{ + return GARROW_MAP_SCALAR( + garrow_base_list_scalar_new(GARROW_ARRAY(value))); +} + + +typedef struct GArrowStructScalarPrivate_ { + GList *value; +} GArrowStructScalarPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GArrowStructScalar, + garrow_struct_scalar, + GARROW_TYPE_SCALAR) + +#define GARROW_STRUCT_SCALAR_GET_PRIVATE(obj) \ + static_cast( \ + garrow_struct_scalar_get_instance_private( \ + GARROW_STRUCT_SCALAR(obj))) + +static void +garrow_struct_scalar_dispose(GObject *object) +{ + auto priv = GARROW_STRUCT_SCALAR_GET_PRIVATE(object); + + if (priv->value) { + g_list_free_full(priv->value, g_object_unref); + priv->value = NULL; + } + + G_OBJECT_CLASS(garrow_struct_scalar_parent_class)->dispose(object); +} + +static void +garrow_struct_scalar_init(GArrowStructScalar *object) +{ +} + +static void +garrow_struct_scalar_class_init(GArrowStructScalarClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = garrow_struct_scalar_dispose; +} + +/** + * garrow_struct_scalar_new: + * @data_type: A #GArrowStructDataType for this scalar. + * @value: (element-type GArrowScalar): The value of this scalar. + * + * Returns: A newly created #GArrowStructScalar. 
+ * + * Since: 5.0.0 + */ +GArrowStructScalar * +garrow_struct_scalar_new(GArrowStructDataType *data_type, + GList *value) +{ + auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type)); + std::vector> arrow_value; + for (GList *node = value; node; node = node->next) { + auto field = GARROW_SCALAR(node->data); + auto arrow_field = garrow_scalar_get_raw(field); + arrow_value.push_back(arrow_field); + } + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(arrow_value, arrow_data_type)); + auto scalar = + GARROW_STRUCT_SCALAR( + garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + NULL)); + auto priv = GARROW_STRUCT_SCALAR_GET_PRIVATE(scalar); + priv->value = g_list_copy_deep(value, + reinterpret_cast(g_object_ref), + NULL); + return scalar; +} + +/** + * garrow_struct_scalar_get_value: + * @scalar: A #GArrowStructScalar. + * + * Returns: (element-type GArrowScalar) (transfer none): + * The value of this scalar. + * + * Since: 5.0.0 + */ +GList * +garrow_struct_scalar_get_value(GArrowStructScalar *scalar) +{ + auto priv = GARROW_STRUCT_SCALAR_GET_PRIVATE(scalar); + return priv->value; +} + + +typedef struct GArrowUnionScalarPrivate_ { + GArrowScalar *value; +} GArrowUnionScalarPrivate; + +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowUnionScalar, + garrow_union_scalar, + GARROW_TYPE_SCALAR) + +#define GARROW_UNION_SCALAR_GET_PRIVATE(obj) \ + static_cast( \ + garrow_union_scalar_get_instance_private( \ + GARROW_UNION_SCALAR(obj))) + +static void +garrow_union_scalar_dispose(GObject *object) +{ + auto priv = GARROW_UNION_SCALAR_GET_PRIVATE(object); + + if (priv->value) { + g_object_unref(priv->value); + priv->value = NULL; + } + + G_OBJECT_CLASS(garrow_union_scalar_parent_class)->dispose(object); +} + +static void +garrow_union_scalar_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GARROW_UNION_SCALAR_GET_PRIVATE(object); + + switch (prop_id) 
{ + case PROP_VALUE: + priv->value = GARROW_SCALAR(g_value_dup_object(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +garrow_union_scalar_init(GArrowUnionScalar *object) +{ +} + +static void +garrow_union_scalar_class_init(GArrowUnionScalarClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + gobject_class->dispose = garrow_union_scalar_dispose; + gobject_class->set_property = garrow_union_scalar_set_property; + + GParamSpec *spec; + /** + * GArrowUnionScalar:value: + * + * The value of the scalar. + * + * Since: 5.0.0 + */ + spec = g_param_spec_object("value", + "Value", + "The value of the scalar", + GARROW_TYPE_SCALAR, + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_VALUE, spec); +} + +G_END_DECLS +template +GArrowScalar * +garrow_union_scalar_new(GArrowDataType *data_type, + gint8 type_code, + GArrowScalar *value) +{ + auto arrow_data_type = garrow_data_type_get_raw(data_type); + auto arrow_value = garrow_scalar_get_raw(value); + auto arrow_scalar = + std::static_pointer_cast( + std::make_shared(arrow_value, type_code, + arrow_data_type)); + auto scalar = garrow_scalar_new_raw(&arrow_scalar, + "scalar", &arrow_scalar, + "data-type", data_type, + "value", value, + NULL); + return scalar; +} +G_BEGIN_DECLS + +/** + * garrow_union_scalar_get_type_code: + * @scalar: A #GArrowUnionScalar. + * + * Returns: The type code of this scalar. + * + * Since: 6.0.0 + */ +gint8 +garrow_union_scalar_get_type_code(GArrowUnionScalar *scalar) +{ + const auto &arrow_scalar = + std::static_pointer_cast( + garrow_scalar_get_raw(GARROW_SCALAR(scalar))); + return arrow_scalar->type_code; +} + +/** + * garrow_union_scalar_get_value: + * @scalar: A #GArrowUnionScalar. + * + * Returns: (transfer none): The value of this scalar. 
+ * + * Since: 5.0.0 + */ +GArrowScalar * +garrow_union_scalar_get_value(GArrowUnionScalar *scalar) +{ + auto priv = GARROW_UNION_SCALAR_GET_PRIVATE(scalar); + return priv->value; +} + + +G_DEFINE_TYPE(GArrowSparseUnionScalar, + garrow_sparse_union_scalar, + GARROW_TYPE_UNION_SCALAR) + +static void +garrow_sparse_union_scalar_init(GArrowSparseUnionScalar *object) +{ +} + +static void +garrow_sparse_union_scalar_class_init(GArrowSparseUnionScalarClass *klass) +{ +} + +/** + * garrow_sparse_union_scalar_new: + * @data_type: A #GArrowSparseUnionDataType for this scalar. + * @type_code: The type code of this scalar. + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowSparseUnionScalar. + * + * Since: 5.0.0 + */ +GArrowSparseUnionScalar * +garrow_sparse_union_scalar_new(GArrowSparseUnionDataType *data_type, + gint8 type_code, + GArrowScalar *value) +{ + return GARROW_SPARSE_UNION_SCALAR( + garrow_union_scalar_new( + GARROW_DATA_TYPE(data_type), type_code, value)); +} + + +G_DEFINE_TYPE(GArrowDenseUnionScalar, + garrow_dense_union_scalar, + GARROW_TYPE_UNION_SCALAR) + +static void +garrow_dense_union_scalar_init(GArrowDenseUnionScalar *object) +{ +} + +static void +garrow_dense_union_scalar_class_init(GArrowDenseUnionScalarClass *klass) +{ +} + +/** + * garrow_dense_union_scalar_new: + * @data_type: A #GArrowDenseUnionDataType for this scalar. + * @type_code: The type code of this scalar. + * @value: The value of this scalar. + * + * Returns: A newly created #GArrowDenseUnionScalar. 
+ * + * Since: 5.0.0 + */ +GArrowDenseUnionScalar * +garrow_dense_union_scalar_new(GArrowDenseUnionDataType *data_type, + gint8 type_code, + GArrowScalar *value) +{ + return GARROW_DENSE_UNION_SCALAR( + garrow_union_scalar_new( + GARROW_DATA_TYPE(data_type), type_code, value)); +} + + +G_DEFINE_TYPE(GArrowExtensionScalar, + garrow_extension_scalar, + GARROW_TYPE_SCALAR) + +static void +garrow_extension_scalar_init(GArrowExtensionScalar *object) +{ +} + +static void +garrow_extension_scalar_class_init(GArrowExtensionScalarClass *klass) +{ +} + + +G_END_DECLS + +GArrowScalar * +garrow_scalar_new_raw(std::shared_ptr *arrow_scalar) +{ + return garrow_scalar_new_raw(arrow_scalar, + "scalar", arrow_scalar, + NULL); +} + +GArrowScalar * +garrow_scalar_new_raw(std::shared_ptr *arrow_scalar, + const gchar *first_property_name, + ...) +{ + va_list args; + va_start(args, first_property_name); + auto array = garrow_scalar_new_raw_valist(arrow_scalar, + first_property_name, + args); + va_end(args); + return array; +} + +GArrowScalar * +garrow_scalar_new_raw_valist(std::shared_ptr *arrow_scalar, + const gchar *first_property_name, + va_list args) +{ + GType type; + GArrowScalar *scalar; + + switch ((*arrow_scalar)->type->id()) { + case arrow::Type::type::NA: + type = GARROW_TYPE_NULL_SCALAR; + break; + case arrow::Type::type::BOOL: + type = GARROW_TYPE_BOOLEAN_SCALAR; + break; + case arrow::Type::type::INT8: + type = GARROW_TYPE_INT8_SCALAR; + break; + case arrow::Type::type::INT16: + type = GARROW_TYPE_INT16_SCALAR; + break; + case arrow::Type::type::INT32: + type = GARROW_TYPE_INT32_SCALAR; + break; + case arrow::Type::type::INT64: + type = GARROW_TYPE_INT64_SCALAR; + break; + case arrow::Type::type::UINT8: + type = GARROW_TYPE_UINT8_SCALAR; + break; + case arrow::Type::type::UINT16: + type = GARROW_TYPE_UINT16_SCALAR; + break; + case arrow::Type::type::UINT32: + type = GARROW_TYPE_UINT32_SCALAR; + break; + case arrow::Type::type::UINT64: + type = GARROW_TYPE_UINT64_SCALAR; + 
break; + case arrow::Type::type::FLOAT: + type = GARROW_TYPE_FLOAT_SCALAR; + break; + case arrow::Type::type::DOUBLE: + type = GARROW_TYPE_DOUBLE_SCALAR; + break; + case arrow::Type::type::BINARY: + type = GARROW_TYPE_BINARY_SCALAR; + break; + case arrow::Type::type::STRING: + type = GARROW_TYPE_STRING_SCALAR; + break; + case arrow::Type::type::LARGE_BINARY: + type = GARROW_TYPE_LARGE_BINARY_SCALAR; + break; + case arrow::Type::type::LARGE_STRING: + type = GARROW_TYPE_LARGE_STRING_SCALAR; + break; + case arrow::Type::type::FIXED_SIZE_BINARY: + type = GARROW_TYPE_FIXED_SIZE_BINARY_SCALAR; + break; + case arrow::Type::type::DATE32: + type = GARROW_TYPE_DATE32_SCALAR; + break; + case arrow::Type::type::DATE64: + type = GARROW_TYPE_DATE64_SCALAR; + break; + case arrow::Type::type::TIME32: + type = GARROW_TYPE_TIME32_SCALAR; + break; + case arrow::Type::type::TIME64: + type = GARROW_TYPE_TIME64_SCALAR; + break; + case arrow::Type::type::TIMESTAMP: + type = GARROW_TYPE_TIMESTAMP_SCALAR; + break; + case arrow::Type::type::DECIMAL128: + type = GARROW_TYPE_DECIMAL128_SCALAR; + break; + case arrow::Type::type::DECIMAL256: + type = GARROW_TYPE_DECIMAL256_SCALAR; + break; + case arrow::Type::type::LIST: + type = GARROW_TYPE_LIST_SCALAR; + break; + case arrow::Type::type::LARGE_LIST: + type = GARROW_TYPE_LARGE_LIST_SCALAR; + break; +/* + case arrow::Type::type::FIXED_SIZE_LIST: + type = GARROW_TYPE_FIXED_SIZE_LIST_SCALAR; + break; +*/ + case arrow::Type::type::MAP: + type = GARROW_TYPE_MAP_SCALAR; + break; + case arrow::Type::type::STRUCT: + type = GARROW_TYPE_STRUCT_SCALAR; + break; + case arrow::Type::type::SPARSE_UNION: + type = GARROW_TYPE_SPARSE_UNION_SCALAR; + break; + case arrow::Type::type::DENSE_UNION: + type = GARROW_TYPE_DENSE_UNION_SCALAR; + break; + case arrow::Type::type::EXTENSION: + type = GARROW_TYPE_EXTENSION_SCALAR; + break; + default: + type = GARROW_TYPE_SCALAR; + break; + } + scalar = GARROW_SCALAR(g_object_new_valist(type, + first_property_name, + args)); 
+ return scalar; +} + +std::shared_ptr +garrow_scalar_get_raw(GArrowScalar *scalar) +{ + auto priv = GARROW_SCALAR_GET_PRIVATE(scalar); + return priv->scalar; +} diff --git a/c_glib/arrow-glib/scalar.h b/c_glib/arrow-glib/scalar.h new file mode 100644 index 00000000000..a110d1c5ef6 --- /dev/null +++ b/c_glib/arrow-glib/scalar.h @@ -0,0 +1,683 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +G_BEGIN_DECLS + +typedef struct _GArrowCastOptions GArrowCastOptions; + +#define GARROW_TYPE_SCALAR (garrow_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowScalar, + garrow_scalar, + GARROW, + SCALAR, + GObject) +struct _GArrowScalarClass +{ + GObjectClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowScalar * +garrow_scalar_parse(GArrowDataType *data_type, + const guint8 *data, + gsize size, + GError **error); + +GARROW_AVAILABLE_IN_5_0 +GArrowDataType * +garrow_scalar_get_data_type(GArrowScalar *scalar); +GARROW_AVAILABLE_IN_5_0 +gboolean +garrow_scalar_is_valid(GArrowScalar *scalar); +GARROW_AVAILABLE_IN_5_0 +gboolean +garrow_scalar_equal(GArrowScalar *scalar, + GArrowScalar *other_scalar); +GARROW_AVAILABLE_IN_5_0 +gboolean +garrow_scalar_equal_options(GArrowScalar *scalar, + GArrowScalar *other_scalar, + GArrowEqualOptions *options); +GARROW_AVAILABLE_IN_5_0 +gchar * +garrow_scalar_to_string(GArrowScalar *scalar); + +GARROW_AVAILABLE_IN_5_0 +GArrowScalar * +garrow_scalar_cast(GArrowScalar *scalar, + GArrowDataType *data_type, + GArrowCastOptions *options, + GError **error); + + +#define GARROW_TYPE_NULL_SCALAR (garrow_null_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowNullScalar, + garrow_null_scalar, + GARROW, + NULL_SCALAR, + GArrowScalar) +struct _GArrowNullScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowNullScalar * +garrow_null_scalar_new(void); + + +#define GARROW_TYPE_BOOLEAN_SCALAR (garrow_boolean_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowBooleanScalar, + garrow_boolean_scalar, + GARROW, + BOOLEAN_SCALAR, + GArrowScalar) +struct _GArrowBooleanScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowBooleanScalar * +garrow_boolean_scalar_new(gboolean value); +GARROW_AVAILABLE_IN_5_0 +gboolean +garrow_boolean_scalar_get_value(GArrowBooleanScalar *scalar); + + +#define GARROW_TYPE_INT8_SCALAR (garrow_int8_scalar_get_type()) 
+G_DECLARE_DERIVABLE_TYPE(GArrowInt8Scalar, + garrow_int8_scalar, + GARROW, + INT8_SCALAR, + GArrowScalar) +struct _GArrowInt8ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowInt8Scalar * +garrow_int8_scalar_new(gint8 value); +GARROW_AVAILABLE_IN_5_0 +gint8 +garrow_int8_scalar_get_value(GArrowInt8Scalar *scalar); + + +#define GARROW_TYPE_INT16_SCALAR (garrow_int16_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowInt16Scalar, + garrow_int16_scalar, + GARROW, + INT16_SCALAR, + GArrowScalar) +struct _GArrowInt16ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowInt16Scalar * +garrow_int16_scalar_new(gint16 value); +GARROW_AVAILABLE_IN_5_0 +gint16 +garrow_int16_scalar_get_value(GArrowInt16Scalar *scalar); + + +#define GARROW_TYPE_INT32_SCALAR (garrow_int32_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowInt32Scalar, + garrow_int32_scalar, + GARROW, + INT32_SCALAR, + GArrowScalar) +struct _GArrowInt32ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowInt32Scalar * +garrow_int32_scalar_new(gint32 value); +GARROW_AVAILABLE_IN_5_0 +gint32 +garrow_int32_scalar_get_value(GArrowInt32Scalar *scalar); + + +#define GARROW_TYPE_INT64_SCALAR (garrow_int64_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowInt64Scalar, + garrow_int64_scalar, + GARROW, + INT64_SCALAR, + GArrowScalar) +struct _GArrowInt64ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowInt64Scalar * +garrow_int64_scalar_new(gint64 value); +GARROW_AVAILABLE_IN_5_0 +gint64 +garrow_int64_scalar_get_value(GArrowInt64Scalar *scalar); + + +#define GARROW_TYPE_UINT8_SCALAR (garrow_uint8_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowUInt8Scalar, + garrow_uint8_scalar, + GARROW, + UINT8_SCALAR, + GArrowScalar) +struct _GArrowUInt8ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowUInt8Scalar * +garrow_uint8_scalar_new(guint8 value); 
+GARROW_AVAILABLE_IN_5_0 +guint8 +garrow_uint8_scalar_get_value(GArrowUInt8Scalar *scalar); + + +#define GARROW_TYPE_UINT16_SCALAR (garrow_uint16_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowUInt16Scalar, + garrow_uint16_scalar, + GARROW, + UINT16_SCALAR, + GArrowScalar) +struct _GArrowUInt16ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowUInt16Scalar * +garrow_uint16_scalar_new(guint16 value); +GARROW_AVAILABLE_IN_5_0 +guint16 +garrow_uint16_scalar_get_value(GArrowUInt16Scalar *scalar); + + +#define GARROW_TYPE_UINT32_SCALAR (garrow_uint32_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowUInt32Scalar, + garrow_uint32_scalar, + GARROW, + UINT32_SCALAR, + GArrowScalar) +struct _GArrowUInt32ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowUInt32Scalar * +garrow_uint32_scalar_new(guint32 value); +GARROW_AVAILABLE_IN_5_0 +guint32 +garrow_uint32_scalar_get_value(GArrowUInt32Scalar *scalar); + + +#define GARROW_TYPE_UINT64_SCALAR (garrow_uint64_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowUInt64Scalar, + garrow_uint64_scalar, + GARROW, + UINT64_SCALAR, + GArrowScalar) +struct _GArrowUInt64ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowUInt64Scalar * +garrow_uint64_scalar_new(guint64 value); +GARROW_AVAILABLE_IN_5_0 +guint64 +garrow_uint64_scalar_get_value(GArrowUInt64Scalar *scalar); + + +#define GARROW_TYPE_FLOAT_SCALAR (garrow_float_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowFloatScalar, + garrow_float_scalar, + GARROW, + FLOAT_SCALAR, + GArrowScalar) +struct _GArrowFloatScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowFloatScalar * +garrow_float_scalar_new(gfloat value); +GARROW_AVAILABLE_IN_5_0 +gfloat +garrow_float_scalar_get_value(GArrowFloatScalar *scalar); + + +#define GARROW_TYPE_DOUBLE_SCALAR (garrow_double_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDoubleScalar, + 
garrow_double_scalar, + GARROW, + DOUBLE_SCALAR, + GArrowScalar) +struct _GArrowDoubleScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowDoubleScalar * +garrow_double_scalar_new(gdouble value); +GARROW_AVAILABLE_IN_5_0 +gdouble +garrow_double_scalar_get_value(GArrowDoubleScalar *scalar); + + +#define GARROW_TYPE_BASE_BINARY_SCALAR (garrow_base_binary_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowBaseBinaryScalar, + garrow_base_binary_scalar, + GARROW, + BASE_BINARY_SCALAR, + GArrowScalar) +struct _GArrowBaseBinaryScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowBuffer * +garrow_base_binary_scalar_get_value(GArrowBaseBinaryScalar *scalar); + + +#define GARROW_TYPE_BINARY_SCALAR (garrow_binary_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowBinaryScalar, + garrow_binary_scalar, + GARROW, + BINARY_SCALAR, + GArrowBaseBinaryScalar) +struct _GArrowBinaryScalarClass +{ + GArrowBaseBinaryScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowBinaryScalar * +garrow_binary_scalar_new(GArrowBuffer *value); + + +#define GARROW_TYPE_STRING_SCALAR (garrow_string_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowStringScalar, + garrow_string_scalar, + GARROW, + STRING_SCALAR, + GArrowBaseBinaryScalar) +struct _GArrowStringScalarClass +{ + GArrowBaseBinaryScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowStringScalar * +garrow_string_scalar_new(GArrowBuffer *value); + + +#define GARROW_TYPE_LARGE_BINARY_SCALAR (garrow_large_binary_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowLargeBinaryScalar, + garrow_large_binary_scalar, + GARROW, + LARGE_BINARY_SCALAR, + GArrowBaseBinaryScalar) +struct _GArrowLargeBinaryScalarClass +{ + GArrowBaseBinaryScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowLargeBinaryScalar * +garrow_large_binary_scalar_new(GArrowBuffer *value); + + +#define GARROW_TYPE_LARGE_STRING_SCALAR (garrow_large_string_scalar_get_type()) 
+G_DECLARE_DERIVABLE_TYPE(GArrowLargeStringScalar, + garrow_large_string_scalar, + GARROW, + LARGE_STRING_SCALAR, + GArrowBaseBinaryScalar) +struct _GArrowLargeStringScalarClass +{ + GArrowBaseBinaryScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowLargeStringScalar * +garrow_large_string_scalar_new(GArrowBuffer *value); + + +#define GARROW_TYPE_FIXED_SIZE_BINARY_SCALAR \ + (garrow_fixed_size_binary_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowFixedSizeBinaryScalar, + garrow_fixed_size_binary_scalar, + GARROW, + FIXED_SIZE_BINARY_SCALAR, + GArrowBaseBinaryScalar) +struct _GArrowFixedSizeBinaryScalarClass +{ + GArrowBaseBinaryScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowFixedSizeBinaryScalar * +garrow_fixed_size_binary_scalar_new(GArrowFixedSizeBinaryDataType *data_type, + GArrowBuffer *value); + + +#define GARROW_TYPE_DATE32_SCALAR (garrow_date32_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDate32Scalar, + garrow_date32_scalar, + GARROW, + DATE32_SCALAR, + GArrowScalar) +struct _GArrowDate32ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowDate32Scalar * +garrow_date32_scalar_new(gint32 value); +GARROW_AVAILABLE_IN_5_0 +gint32 +garrow_date32_scalar_get_value(GArrowDate32Scalar *scalar); + + +#define GARROW_TYPE_DATE64_SCALAR (garrow_date64_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDate64Scalar, + garrow_date64_scalar, + GARROW, + DATE64_SCALAR, + GArrowScalar) +struct _GArrowDate64ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowDate64Scalar * +garrow_date64_scalar_new(gint64 value); +GARROW_AVAILABLE_IN_5_0 +gint64 +garrow_date64_scalar_get_value(GArrowDate64Scalar *scalar); + + +#define GARROW_TYPE_TIME32_SCALAR (garrow_time32_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowTime32Scalar, + garrow_time32_scalar, + GARROW, + TIME32_SCALAR, + GArrowScalar) +struct _GArrowTime32ScalarClass +{ + GArrowScalarClass parent_class; +}; + 
+GARROW_AVAILABLE_IN_5_0 +GArrowTime32Scalar * +garrow_time32_scalar_new(GArrowTime32DataType *data_type, + gint32 value); +GARROW_AVAILABLE_IN_5_0 +gint32 +garrow_time32_scalar_get_value(GArrowTime32Scalar *scalar); + + +#define GARROW_TYPE_TIME64_SCALAR (garrow_time64_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowTime64Scalar, + garrow_time64_scalar, + GARROW, + TIME64_SCALAR, + GArrowScalar) +struct _GArrowTime64ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowTime64Scalar * +garrow_time64_scalar_new(GArrowTime64DataType *data_type, + gint64 value); +GARROW_AVAILABLE_IN_5_0 +gint64 +garrow_time64_scalar_get_value(GArrowTime64Scalar *scalar); + + +#define GARROW_TYPE_TIMESTAMP_SCALAR (garrow_timestamp_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowTimestampScalar, + garrow_timestamp_scalar, + GARROW, + TIMESTAMP_SCALAR, + GArrowScalar) +struct _GArrowTimestampScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowTimestampScalar * +garrow_timestamp_scalar_new(GArrowTimestampDataType *data_type, + gint64 value); +GARROW_AVAILABLE_IN_5_0 +gint64 +garrow_timestamp_scalar_get_value(GArrowTimestampScalar *scalar); + + +#define GARROW_TYPE_DECIMAL128_SCALAR (garrow_decimal128_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDecimal128Scalar, + garrow_decimal128_scalar, + GARROW, + DECIMAL128_SCALAR, + GArrowScalar) +struct _GArrowDecimal128ScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowDecimal128Scalar * +garrow_decimal128_scalar_new(GArrowDecimal128DataType *data_type, + GArrowDecimal128 *value); +GARROW_AVAILABLE_IN_5_0 +GArrowDecimal128 * +garrow_decimal128_scalar_get_value(GArrowDecimal128Scalar *scalar); + + +#define GARROW_TYPE_DECIMAL256_SCALAR (garrow_decimal256_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDecimal256Scalar, + garrow_decimal256_scalar, + GARROW, + DECIMAL256_SCALAR, + GArrowScalar) +struct _GArrowDecimal256ScalarClass +{ 
+ GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowDecimal256Scalar * +garrow_decimal256_scalar_new(GArrowDecimal256DataType *data_type, + GArrowDecimal256 *value); +GARROW_AVAILABLE_IN_5_0 +GArrowDecimal256 * +garrow_decimal256_scalar_get_value(GArrowDecimal256Scalar *scalar); + + +#define GARROW_TYPE_BASE_LIST_SCALAR (garrow_base_list_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowBaseListScalar, + garrow_base_list_scalar, + GARROW, + BASE_LIST_SCALAR, + GArrowScalar) +struct _GArrowBaseListScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowArray * +garrow_base_list_scalar_get_value(GArrowBaseListScalar *scalar); + +#define GARROW_TYPE_LIST_SCALAR (garrow_list_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowListScalar, + garrow_list_scalar, + GARROW, + LIST_SCALAR, + GArrowBaseListScalar) +struct _GArrowListScalarClass +{ + GArrowBaseListScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowListScalar * +garrow_list_scalar_new(GArrowListArray *value); + + +#define GARROW_TYPE_LARGE_LIST_SCALAR (garrow_large_list_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowLargeListScalar, + garrow_large_list_scalar, + GARROW, + LARGE_LIST_SCALAR, + GArrowBaseListScalar) +struct _GArrowLargeListScalarClass +{ + GArrowBaseListScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowLargeListScalar * +garrow_large_list_scalar_new(GArrowLargeListArray *value); + + +#define GARROW_TYPE_MAP_SCALAR (garrow_map_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowMapScalar, + garrow_map_scalar, + GARROW, + MAP_SCALAR, + GArrowBaseListScalar) +struct _GArrowMapScalarClass +{ + GArrowBaseListScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowMapScalar * +garrow_map_scalar_new(GArrowStructArray *value); + + +#define GARROW_TYPE_STRUCT_SCALAR (garrow_struct_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowStructScalar, + garrow_struct_scalar, + GARROW, + STRUCT_SCALAR, + GArrowScalar) +struct 
_GArrowStructScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowStructScalar * +garrow_struct_scalar_new(GArrowStructDataType *data_type, + GList *value); +GARROW_AVAILABLE_IN_5_0 +GList * +garrow_struct_scalar_get_value(GArrowStructScalar *scalar); + + +#define GARROW_TYPE_UNION_SCALAR (garrow_union_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowUnionScalar, + garrow_union_scalar, + GARROW, + UNION_SCALAR, + GArrowScalar) +struct _GArrowUnionScalarClass +{ + GArrowScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_6_0 +gint8 +garrow_union_scalar_get_type_code(GArrowUnionScalar *scalar); +GARROW_AVAILABLE_IN_5_0 +GArrowScalar * +garrow_union_scalar_get_value(GArrowUnionScalar *scalar); + + +#define GARROW_TYPE_SPARSE_UNION_SCALAR (garrow_sparse_union_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowSparseUnionScalar, + garrow_sparse_union_scalar, + GARROW, + SPARSE_UNION_SCALAR, + GArrowUnionScalar) +struct _GArrowSparseUnionScalarClass +{ + GArrowUnionScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowSparseUnionScalar * +garrow_sparse_union_scalar_new(GArrowSparseUnionDataType *data_type, + gint8 type_code, + GArrowScalar *value); + + +#define GARROW_TYPE_DENSE_UNION_SCALAR (garrow_dense_union_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowDenseUnionScalar, + garrow_dense_union_scalar, + GARROW, + DENSE_UNION_SCALAR, + GArrowUnionScalar) +struct _GArrowDenseUnionScalarClass +{ + GArrowUnionScalarClass parent_class; +}; + +GARROW_AVAILABLE_IN_5_0 +GArrowDenseUnionScalar * +garrow_dense_union_scalar_new(GArrowDenseUnionDataType *data_type, + gint8 type_code, + GArrowScalar *value); + + +#define GARROW_TYPE_EXTENSION_SCALAR (garrow_extension_scalar_get_type()) +G_DECLARE_DERIVABLE_TYPE(GArrowExtensionScalar, + garrow_extension_scalar, + GARROW, + EXTENSION_SCALAR, + GArrowScalar) +struct _GArrowExtensionScalarClass +{ + GArrowScalarClass parent_class; +}; + +G_END_DECLS diff --git 
a/c_glib/arrow-glib/scalar.hpp b/c_glib/arrow-glib/scalar.hpp new file mode 100644 index 00000000000..46ac73e21e8 --- /dev/null +++ b/c_glib/arrow-glib/scalar.hpp @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include + +GArrowScalar * +garrow_scalar_new_raw(std::shared_ptr *arrow_scalar); +GArrowScalar * +garrow_scalar_new_raw(std::shared_ptr *arrow_scalar, + const gchar *first_property_name, + ...); +GArrowScalar * +garrow_scalar_new_raw_valist(std::shared_ptr *arrow_scalar, + const gchar *first_property_name, + va_list args); +std::shared_ptr +garrow_scalar_get_raw(GArrowScalar *scalar); diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in index 5a74566fd4a..193853602ff 100644 --- a/c_glib/arrow-glib/version.h.in +++ b/c_glib/arrow-glib/version.h.in @@ -110,6 +110,24 @@ # define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor) #endif +/** + * GARROW_VERSION_6_0: + * + * You can use this macro value for compile time API version check. + * + * Since: 6.0.0 + */ +#define GARROW_VERSION_6_0 G_ENCODE_VERSION(6, 0) + +/** + * GARROW_VERSION_5_0: + * + * You can use this macro value for compile time API version check. 
+ * + * Since: 5.0.0 + */ +#define GARROW_VERSION_5_0 G_ENCODE_VERSION(5, 0) + /** * GARROW_VERSION_4_0: * @@ -256,6 +274,34 @@ #define GARROW_AVAILABLE_IN_ALL +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_6_0 +# define GARROW_DEPRECATED_IN_6_0 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_6_0_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_6_0 +# define GARROW_DEPRECATED_IN_6_0_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_6_0 +# define GARROW_AVAILABLE_IN_6_0 GARROW_UNAVAILABLE(6, 0) +#else +# define GARROW_AVAILABLE_IN_6_0 +#endif + +#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_5_0 +# define GARROW_DEPRECATED_IN_5_0 GARROW_DEPRECATED +# define GARROW_DEPRECATED_IN_5_0_FOR(function) GARROW_DEPRECATED_FOR(function) +#else +# define GARROW_DEPRECATED_IN_5_0 +# define GARROW_DEPRECATED_IN_5_0_FOR(function) +#endif + +#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_5_0 +# define GARROW_AVAILABLE_IN_5_0 GARROW_UNAVAILABLE(5, 0) +#else +# define GARROW_AVAILABLE_IN_5_0 +#endif + #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_4_0 # define GARROW_DEPRECATED_IN_4_0 GARROW_DEPRECATED # define GARROW_DEPRECATED_IN_4_0_FOR(function) GARROW_DEPRECATED_FOR(function) diff --git a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml index 92ae0405dac..3e8da5bd9d1 100644 --- a/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml +++ b/c_glib/doc/arrow-dataset-glib/arrow-dataset-glib-docs.xml @@ -36,13 +36,21 @@ - - Read - + + Data + + Dataset + + Dataset factory + + + Scan Fragment + File format + @@ -58,9 +66,17 @@ Index of deprecated API - - Index of new symbols in 1.0.0 - + + Index of new symbols in 5.0.0 + + + + Index of new symbols in 4.0.0 + + + + Index of new symbols in 3.0.0 + diff --git a/c_glib/doc/arrow-dataset-glib/meson.build b/c_glib/doc/arrow-dataset-glib/meson.build index 1cb2f9e99c8..ca037b7e36a 100644 ---
a/c_glib/doc/arrow-dataset-glib/meson.build +++ b/c_glib/doc/arrow-dataset-glib/meson.build @@ -70,7 +70,7 @@ gnome.gtkdoc(package_id, ], mkdb_args: [ '--output-format=xml', - '--name-space=gad', + '--name-space=gadataset', '--source-suffixes=c,cpp,h', ], fixxref_args: [ diff --git a/c_glib/doc/arrow-flight-glib/arrow-flight-glib-docs.xml b/c_glib/doc/arrow-flight-glib/arrow-flight-glib-docs.xml new file mode 100644 index 00000000000..397a8bec0d0 --- /dev/null +++ b/c_glib/doc/arrow-flight-glib/arrow-flight-glib-docs.xml @@ -0,0 +1,67 @@ + + + + + %gtkdocentities; +]> + + + &package_name; Reference Manual + + for &package_string;. + + + + + + RPC + + + + + + + Object Hierarchy + + + + API Index + + + + Index of deprecated API + + + + Index of new symbols in 6.0.0 + + + + Index of new symbols in 5.0.0 + + + + diff --git a/c_glib/doc/arrow-flight-glib/entities.xml.in b/c_glib/doc/arrow-flight-glib/entities.xml.in new file mode 100644 index 00000000000..aa5addb4e84 --- /dev/null +++ b/c_glib/doc/arrow-flight-glib/entities.xml.in @@ -0,0 +1,24 @@ + + + + + + + diff --git a/c_glib/doc/arrow-flight-glib/meson.build b/c_glib/doc/arrow-flight-glib/meson.build new file mode 100644 index 00000000000..7ae38e4f5e4 --- /dev/null +++ b/c_glib/doc/arrow-flight-glib/meson.build @@ -0,0 +1,83 @@ +# -*- indent-tabs-mode: nil -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +package_id = 'arrow-flight-glib' +package_name = 'Apache Arrow Flight GLib' +entities_conf = configuration_data() +entities_conf.set('PACKAGE', package_id) +entities_conf.set('PACKAGE_BUGREPORT', + 'https://issues.apache.org/jira/browse/ARROW') +entities_conf.set('PACKAGE_NAME', package_name) +entities_conf.set('PACKAGE_STRING', + ' '.join([package_id, version])) +entities_conf.set('PACKAGE_URL', 'https://arrow.apache.org/') +entities_conf.set('PACKAGE_VERSION', version) +configure_file(input: 'entities.xml.in', + output: 'entities.xml', + configuration: entities_conf) + +private_headers = [ +] + +content_files = [ +] + +html_images = [ +] + +glib_prefix = dependency('glib-2.0').get_pkgconfig_variable('prefix') +glib_doc_path = join_paths(glib_prefix, 'share', 'gtk-doc', 'html') +arrow_glib_doc_path = join_paths(data_dir, + 'gtk-doc', + 'html', + 'arrow-glib') +doc_path = join_paths(data_dir, 'gtk-doc', 'html', package_id) + +source_directories = [ + join_paths(meson.source_root(), package_id), + join_paths(meson.build_root(), package_id), +] +dependencies = [ + arrow_glib, + arrow_flight_glib, +] +ignore_headers = [] +gnome.gtkdoc(package_id, + main_xml: package_id + '-docs.xml', + src_dir: source_directories, + dependencies: dependencies, + ignore_headers: ignore_headers, + gobject_typesfile: package_id + '.types', + scan_args: [ + '--rebuild-types', + '--deprecated-guards=GARROW_DISABLE_DEPRECATED', + ], + mkdb_args: [ + '--output-format=xml', + '--name-space=gaflight', + '--source-suffixes=c,cpp,h', + ], + fixxref_args: [ + '--html-dir=' + doc_path, 
'--extra-dir=' + join_paths(glib_doc_path, 'glib'), + '--extra-dir=' + join_paths(glib_doc_path, 'gobject'), + '--extra-dir=' + arrow_glib_doc_path, + ], + html_assets: html_images, + install: true) diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml index 9198b6a13a6..4c061c06c40 100644 --- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml +++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml @@ -55,6 +55,10 @@ Value + + Scalar + + Type @@ -179,6 +183,14 @@ Index of deprecated API + + Index of new symbols in 6.0.0 + + + + Index of new symbols in 5.0.0 + + Index of new symbols in 4.0.0 diff --git a/c_glib/meson.build b/c_glib/meson.build index 4ac407e97d2..0e090c97968 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -23,7 +23,7 @@ project('arrow-glib', 'c', 'cpp', 'cpp_std=c++11', ]) -version = '4.0.0-SNAPSHOT' +version = '6.0.0-SNAPSHOT' if version.endswith('-SNAPSHOT') version_numbers = version.split('-')[0].split('.') version_tag = version.split('-')[1] @@ -75,6 +75,7 @@ if arrow_cpp_build_lib_dir == '' have_arrow_orc = dependency('arrow-orc', required: false).found() arrow_cuda = dependency('arrow-cuda', required: false) arrow_dataset = dependency('arrow-dataset', required: false) + arrow_flight = dependency('arrow-flight', required: false) gandiva = dependency('gandiva', required: false) parquet = dependency('parquet', required: false) plasma = dependency('plasma', required: false) @@ -105,6 +106,9 @@ main(void) arrow_dataset = cpp_compiler.find_library('arrow_dataset', dirs: [arrow_cpp_build_lib_dir], required: false) + arrow_flight = cpp_compiler.find_library('arrow_flight', + dirs: [arrow_cpp_build_lib_dir], + required: false) gandiva = cpp_compiler.find_library('gandiva', dirs: [arrow_cpp_build_lib_dir], required: false) @@ -137,6 +141,9 @@ endif if arrow_dataset.found() subdir('arrow-dataset-glib') endif +if arrow_flight.found() + subdir('arrow-flight-glib') +endif if gandiva.found() subdir('gandiva-glib') 
endif @@ -153,6 +160,9 @@ if get_option('gtk_doc') if arrow_dataset.found() subdir('doc/arrow-dataset-glib') endif + if arrow_flight.found() + subdir('doc/arrow-flight-glib') + endif if gandiva.found() subdir('doc/gandiva-glib') endif diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp index 2a5ccf98bd1..26476f4d6b5 100644 --- a/c_glib/plasma-glib/client.cpp +++ b/c_glib/plasma-glib/client.cpp @@ -265,6 +265,10 @@ gplasma_client_create_options_new(void) return GPLASMA_CLIENT_CREATE_OPTIONS(options); } +#if !GLIB_CHECK_VERSION(2, 68, 0) +# define g_memdup2(memory, byte_size) g_memdup(memory, byte_size) +#endif + /** * gplasma_client_create_options_set_metadata: * @options: A #GPlasmaClientCreateOptions. @@ -282,7 +286,7 @@ gplasma_client_create_options_set_metadata(GPlasmaClientCreateOptions *options, if (priv->metadata) { g_free(priv->metadata); } - priv->metadata = static_cast(g_memdup(metadata, size)); + priv->metadata = static_cast(g_memdup2(metadata, size)); priv->metadata_size = size; } diff --git a/c_glib/test/dataset/test-file-system-dataset-factory.rb b/c_glib/test/dataset/test-file-system-dataset-factory.rb new file mode 100644 index 00000000000..9ef629c222e --- /dev/null +++ b/c_glib/test/dataset/test-file-system-dataset-factory.rb @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetFileSystemDatasetFactory < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + @path = File.join(@dir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, @path) + yield + end + end + + def test_file_system + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system = Arrow::LocalFileSystem.new + factory.add_path(File.expand_path(@path)) + dataset = factory.finish + assert_equal(@table, dataset.to_table) + end + + def test_file_system_uri + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(@path) + dataset = factory.finish + assert_equal(@table, dataset.to_table) + end +end diff --git a/c_glib/test/dataset/test-file-system-dataset.rb b/c_glib/test/dataset/test-file-system-dataset.rb new file mode 100644 index 00000000000..6d6ec3b18c6 --- /dev/null +++ b/c_glib/test/dataset/test-file-system-dataset.rb @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDatasetFileSystemDataset < Test::Unit::TestCase + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + @dir = tmpdir + format = ArrowDataset::IPCFileFormat.new + factory = ArrowDataset::FileSystemDatasetFactory.new(format) + factory.file_system = Arrow::LocalFileSystem.new + @dataset = factory.finish + yield + end + end + + def test_type_name + assert_equal("filesystem", @dataset.type_name) + end +end diff --git a/c_glib/test/dataset/test-in-memory-scan-task.rb b/c_glib/test/dataset/test-in-memory-scan-task.rb deleted file mode 100644 index 06e3d0d2424..00000000000 --- a/c_glib/test/dataset/test-in-memory-scan-task.rb +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -class TestDatasetInMemoryScanTask < Test::Unit::TestCase - include Helper::Buildable - - def setup - omit("Arrow Dataset is required") unless defined?(ArrowDataset) - fields = [ - Arrow::Field.new("visible", Arrow::BooleanDataType.new), - Arrow::Field.new("point", Arrow::Int32DataType.new), - ] - @schema = Arrow::Schema.new(fields) - @record_batches = [ - [ - build_boolean_array([true, false, true]), - build_int32_array([1, 2, 3]), - ], - [ - build_boolean_array([false, true, false, true]), - build_int32_array([-1, -2, -3, -4]), - ] - ].collect do |columns| - Arrow::RecordBatch.new(@schema, columns[0].length, columns) - end - - @scan_options = ArrowDataset::ScanOptions.new(@schema) - - @fragment = ArrowDataset::InMemoryFragment.new(@schema, - @record_batches) - - @scan_task = ArrowDataset::InMemoryScanTask.new(@record_batches, - @scan_options, - @fragment) - end - - def test_scan_options - assert_equal(@scan_options, @scan_task.options) - end - - def test_execute - assert_equal(@record_batches, - @scan_task.execute.to_list) - end -end diff --git a/c_glib/test/dataset/test-scan-options.rb b/c_glib/test/dataset/test-scan-options.rb deleted file mode 100644 index 0536b2a7cca..00000000000 --- a/c_glib/test/dataset/test-scan-options.rb +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -class TestDatasetScanOptions < Test::Unit::TestCase - def setup - omit("Arrow Dataset is required") unless defined?(ArrowDataset) - @schema = Arrow::Schema.new([]) - @scan_options = ArrowDataset::ScanOptions.new(@schema) - end - - def test_schema - assert_equal(@schema, - @scan_options.schema) - end - - def test_batch_size - assert_equal(1<<20, - @scan_options.batch_size) - @scan_options.batch_size = 42 - assert_equal(42, - @scan_options.batch_size) - end - - def test_use_threads - assert do - not @scan_options.use_threads? - end - @scan_options.use_threads = true - assert do - @scan_options.use_threads? - end - end -end diff --git a/c_glib/test/dataset/test-scanner.rb b/c_glib/test/dataset/test-scanner.rb new file mode 100644 index 00000000000..f7702d4905f --- /dev/null +++ b/c_glib/test/dataset/test-scanner.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestDatasetScanner < Test::Unit::TestCase + include Helper::Buildable + include Helper::Writable + + def setup + omit("Arrow Dataset is required") unless defined?(ArrowDataset) + Dir.mktmpdir do |tmpdir| + path = File.join(tmpdir, "table.arrow") + @table = build_table(visible: [ + build_boolean_array([true, false, true]), + build_boolean_array([false, true, false, true]), + ], + point: [ + build_int32_array([1, 2, 3]), + build_int32_array([-1, -2, -3, -4]), + ]) + @format = ArrowDataset::IPCFileFormat.new + write_table(@table, path) + factory = ArrowDataset::FileSystemDatasetFactory.new(@format) + factory.file_system_uri = build_file_uri(path) + @dataset = factory.finish + builder = @dataset.begin_scan + @scanner = builder.finish + yield + end + end + + def test_to_table + assert_equal(@table, @scanner.to_table) + end +end diff --git a/c_glib/test/flight/test-client.rb b/c_glib/test/flight/test-client.rb new file mode 100644 index 00000000000..f6660a4ca49 --- /dev/null +++ b/c_glib/test/flight/test-client.rb @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestFlightClient < Test::Unit::TestCase + include Helper::Omittable + + def setup + @server = nil + omit("Arrow Flight is required") unless defined?(ArrowFlight) + omit("Unstable on Windows") if Gem.win_platform? + require_gi_bindings(3, 4, 7) + @server = Helper::FlightServer.new + host = "127.0.0.1" + location = ArrowFlight::Location.new("grpc://#{host}:0") + options = ArrowFlight::ServerOptions.new(location) + @server.listen(options) + @location = ArrowFlight::Location.new("grpc://#{host}:#{@server.port}") + end + + def teardown + return if @server.nil? + @server.shutdown + end + + def test_list_flights + client = ArrowFlight::Client.new(@location) + generator = Helper::FlightInfoGenerator.new + assert_equal([generator.page_view], + client.list_flights) + end + + sub_test_case("#do_get") do + def test_success + client = ArrowFlight::Client.new(@location) + info = client.list_flights.first + endpoint = info.endpoints.first + generator = Helper::FlightInfoGenerator.new + reader = client.do_get(endpoint.ticket) + assert_equal(generator.page_view_table, + reader.read_all) + end + + def test_error + client = ArrowFlight::Client.new(@location) + assert_raise(Arrow::Error::Invalid) do + client.do_get(ArrowFlight::Ticket.new("invalid")) + end + end + end +end diff --git a/c_glib/test/flight/test-command-descriptor.rb b/c_glib/test/flight/test-command-descriptor.rb new file mode 100644 index 00000000000..316973287f0 --- /dev/null +++ b/c_glib/test/flight/test-command-descriptor.rb @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFlightCommandDescriptor < Test::Unit::TestCase + def setup + omit("Arrow Flight is required") unless defined?(ArrowFlight) + end + + def test_to_s + descriptor = ArrowFlight::CommandDescriptor.new("command") + assert_equal("FlightDescriptor", + descriptor.to_s) + end + + def test_command + command = "command" + descriptor = ArrowFlight::CommandDescriptor.new(command) + assert_equal(command, descriptor.command) + end + + sub_test_case("#==") do + def test_true + descriptor1 = ArrowFlight::CommandDescriptor.new("command") + descriptor2 = ArrowFlight::CommandDescriptor.new("command") + assert do + descriptor1 == descriptor2 + end + end + + def test_false + descriptor1 = ArrowFlight::CommandDescriptor.new("command1") + descriptor2 = ArrowFlight::CommandDescriptor.new("command2") + assert do + not (descriptor1 == descriptor2) + end + end + end +end diff --git a/c_glib/test/flight/test-criteria.rb b/c_glib/test/flight/test-criteria.rb new file mode 100644 index 00000000000..d5f60a8953d --- /dev/null +++ b/c_glib/test/flight/test-criteria.rb @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFlightCriteria < Test::Unit::TestCase + def setup + omit("Arrow Flight is required") unless defined?(ArrowFlight) + end + + def test_expression + expression = "expression" + criteria = ArrowFlight::Criteria.new(expression) + assert_equal(expression, + criteria.expression.to_s) + end +end diff --git a/c_glib/test/flight/test-endpoint.rb b/c_glib/test/flight/test-endpoint.rb new file mode 100644 index 00000000000..06cddf0019b --- /dev/null +++ b/c_glib/test/flight/test-endpoint.rb @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestFlightEndpoint < Test::Unit::TestCase + def setup + omit("Arrow Flight is required") unless defined?(ArrowFlight) + end + + def test_ticket + ticket = ArrowFlight::Ticket.new("data") + locations = [ + ArrowFlight::Location.new("grpc://127.0.0.1:2929"), + ArrowFlight::Location.new("grpc+tcp://127.0.0.1:12929"), + ] + endpoint = ArrowFlight::Endpoint.new(ticket, locations) + assert_equal(ticket, + endpoint.ticket) + end + + def test_locations + ticket = ArrowFlight::Ticket.new("data") + locations = [ + ArrowFlight::Location.new("grpc://127.0.0.1:2929"), + ArrowFlight::Location.new("grpc+tcp://127.0.0.1:12929"), + ] + endpoint = ArrowFlight::Endpoint.new(ticket, locations) + assert_equal(locations, + endpoint.locations) + end + + sub_test_case("#==") do + def test_true + ticket = ArrowFlight::Ticket.new("data") + location = ArrowFlight::Location.new("grpc://127.0.0.1:2929") + endpoint1 = ArrowFlight::Endpoint.new(ticket, [location]) + endpoint2 = ArrowFlight::Endpoint.new(ticket, [location]) + assert do + endpoint1 == endpoint2 + end + end + + def test_false + ticket = ArrowFlight::Ticket.new("data") + location1 = ArrowFlight::Location.new("grpc://127.0.0.1:2929") + location2 = ArrowFlight::Location.new("grpc://127.0.0.1:1129") + endpoint1 = ArrowFlight::Endpoint.new(ticket, [location1]) + endpoint2 = ArrowFlight::Endpoint.new(ticket, [location2]) + assert do + not (endpoint1 == endpoint2) + end + end + end +end diff --git a/c_glib/test/flight/test-info.rb b/c_glib/test/flight/test-info.rb new file mode 100644 index 00000000000..5bf0fbfad88 --- /dev/null +++ b/c_glib/test/flight/test-info.rb @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFlightInfo < Test::Unit::TestCase + include Helper::Writable + + def setup + omit("Arrow Flight is required") unless defined?(ArrowFlight) + @generator = Helper::FlightInfoGenerator.new + end + + sub_test_case("#get_schema") do + def test_with_options + info = @generator.page_view + table = @generator.page_view_table + options = Arrow::ReadOptions.new + assert_equal(table.schema, + info.get_schema(options)) + end + + def test_without_options + info = @generator.page_view + table = @generator.page_view_table + assert_equal(table.schema, + info.get_schema) + end + end + + def test_descriptor + info = @generator.page_view + assert_equal(@generator.page_view_descriptor, + info.descriptor) + end + + def test_endpoints + info = @generator.page_view + assert_equal(@generator.page_view_endpoints, + info.endpoints) + end + + def test_total_records + info = @generator.page_view + table = @generator.page_view_table + assert_equal(table.n_rows, + info.total_records) + end + + def test_total_bytes + info = @generator.page_view + table = @generator.page_view_table + output = Arrow::ResizableBuffer.new(0) + write_table(table, output, type: :stream) + assert_equal(output.size, + info.total_bytes) + end + + def test_equal + info1 = @generator.page_view + info2 = @generator.page_view + assert do + info1 == info2 + end + end +end diff --git a/c_glib/test/flight/test-location.rb 
b/c_glib/test/flight/test-location.rb new file mode 100644 index 00000000000..5b167932218 --- /dev/null +++ b/c_glib/test/flight/test-location.rb @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFlightLocation < Test::Unit::TestCase + def setup + omit("Arrow Flight is required") unless defined?(ArrowFlight) + end + + def test_to_s + location = ArrowFlight::Location.new("grpc://127.0.0.1:2929") + assert_equal("grpc://127.0.0.1:2929", location.to_s) + end + + def test_scheme + location = ArrowFlight::Location.new("grpc://127.0.0.1:2929") + assert_equal("grpc", location.scheme) + end + + def test_equal + location1 = ArrowFlight::Location.new("grpc://127.0.0.1:2929") + location2 = ArrowFlight::Location.new("grpc://127.0.0.1:2929") + assert do + location1 == location2 + end + end +end diff --git a/c_glib/test/flight/test-path-descriptor.rb b/c_glib/test/flight/test-path-descriptor.rb new file mode 100644 index 00000000000..441fc7bb043 --- /dev/null +++ b/c_glib/test/flight/test-path-descriptor.rb @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFlightPathDescriptor < Test::Unit::TestCase + def setup + omit("Arrow Flight is required") unless defined?(ArrowFlight) + end + + def test_to_s + descriptor = ArrowFlight::PathDescriptor.new(["a", "b", "c"]) + assert_equal("FlightDescriptor", + descriptor.to_s) + end + + def test_paths + paths = ["a", "b", "c"] + descriptor = ArrowFlight::PathDescriptor.new(paths) + assert_equal(paths, descriptor.paths) + end + + sub_test_case("#==") do + def test_true + descriptor1 = ArrowFlight::PathDescriptor.new(["a", "b", "c"]) + descriptor2 = ArrowFlight::PathDescriptor.new(["a", "b", "c"]) + assert do + descriptor1 == descriptor2 + end + end + + def test_false + descriptor1 = ArrowFlight::PathDescriptor.new(["a", "b", "c"]) + descriptor2 = ArrowFlight::PathDescriptor.new(["A", "B", "C"]) + assert do + not (descriptor1 == descriptor2) + end + end + end +end diff --git a/c_glib/test/flight/test-server-options.rb b/c_glib/test/flight/test-server-options.rb new file mode 100644 index 00000000000..93a90297ea2 --- /dev/null +++ b/c_glib/test/flight/test-server-options.rb @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFlightServerOptions < Test::Unit::TestCase + def setup + omit("Arrow Flight is required") unless defined?(ArrowFlight) + end + + def test_location + location = ArrowFlight::Location.new("grpc://127.0.0.1:0") + options = ArrowFlight::ServerOptions.new(location) + assert_equal(location, options.location) + end +end diff --git a/c_glib/test/flight/test-stream-reader.rb b/c_glib/test/flight/test-stream-reader.rb new file mode 100644 index 00000000000..f2e6229b0b3 --- /dev/null +++ b/c_glib/test/flight/test-stream-reader.rb @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFlightStreamReader < Test::Unit::TestCase + include Helper::Omittable + + def setup + @server = nil + omit("Arrow Flight is required") unless defined?(ArrowFlight) + omit("Unstable on Windows") if Gem.win_platform? + require_gi_bindings(3, 4, 5) + @server = Helper::FlightServer.new + host = "127.0.0.1" + location = ArrowFlight::Location.new("grpc://#{host}:0") + options = ArrowFlight::ServerOptions.new(location) + @server.listen(options) + location = ArrowFlight::Location.new("grpc://#{host}:#{@server.port}") + client = ArrowFlight::Client.new(location) + @generator = Helper::FlightInfoGenerator.new + @reader = client.do_get(@generator.page_view_ticket) + end + + def teardown + return if @server.nil? + @server.shutdown + end + + def test_read_next + chunks = [] + loop do + chunk = @reader.read_next + break if chunk.nil? + chunks << chunk + end + chunks_content = chunks.collect do |chunk| + [ + chunk.data, + chunk.metadata&.data&.to_s, + ] + end + table_batch_reader = Arrow::TableBatchReader.new(@generator.page_view_table) + assert_equal([ + [ + table_batch_reader.read_next, + nil, + ], + ], + chunks_content) + end + + def test_read_all + assert_equal(@generator.page_view_table, + @reader.read_all) + end +end diff --git a/c_glib/test/flight/test-ticket.rb b/c_glib/test/flight/test-ticket.rb new file mode 100644 index 00000000000..976089762f0 --- /dev/null +++ b/c_glib/test/flight/test-ticket.rb @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFlightTicket < Test::Unit::TestCase + def setup + omit("Arrow Flight is required") unless defined?(ArrowFlight) + end + + def test_data + data = "data" + ticket = ArrowFlight::Ticket.new(data) + assert_equal(data, + ticket.data.to_s) + end + + sub_test_case("#==") do + def test_true + ticket1 = ArrowFlight::Ticket.new("data") + ticket2 = ArrowFlight::Ticket.new("data") + assert do + ticket1 == ticket2 + end + end + + def test_false + ticket1 = ArrowFlight::Ticket.new("data1") + ticket2 = ArrowFlight::Ticket.new("data2") + assert do + not (ticket1 == ticket2) + end + end + end +end diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb index 3528c2fbdc7..356fa651c6a 100644 --- a/c_glib/test/helper/buildable.rb +++ b/c_glib/test/helper/buildable.rb @@ -136,11 +136,7 @@ def build_list_array(value_data_type, values_list, field_name: "value") data_type = Arrow::ListDataType.new(value_field) builder = Arrow::ListArrayBuilder.new(data_type) values_list.each do |values| - if values.nil? - builder.append_null - else - append_to_builder(builder, values) - end + append_to_builder(builder, values) end builder.finish end @@ -150,11 +146,16 @@ def build_large_list_array(value_data_type, values_list, field_name: "value") data_type = Arrow::LargeListDataType.new(value_field) builder = Arrow::LargeListArrayBuilder.new(data_type) values_list.each do |values| - if values.nil? 
- builder.append_null - else - append_to_builder(builder, values) - end + append_to_builder(builder, values) + end + builder.finish + end + + def build_map_array(key_data_type, item_data_type, maps) + data_type = Arrow::MapDataType.new(key_data_type, item_data_type) + builder = Arrow::MapArrayBuilder.new(data_type) + maps.each do |map| + append_to_builder(builder, map) end builder.finish end @@ -163,11 +164,7 @@ def build_struct_array(fields, structs) data_type = Arrow::StructDataType.new(fields) builder = Arrow::StructArrayBuilder.new(data_type) structs.each do |struct| - if struct.nil? - builder.append_null - else - append_to_builder(builder, struct) - end + append_to_builder(builder, struct) end builder.finish end @@ -178,6 +175,14 @@ def append_to_builder(builder, value) else data_type = builder.value_data_type case data_type + when Arrow::MapDataType + builder.append_value + key_builder = builder.key_builder + item_builder = builder.item_builder + value.each do |k, v| + append_to_builder(key_builder, k) + append_to_builder(item_builder, v) + end when Arrow::ListDataType, Arrow::LargeListDataType builder.append_value value_builder = builder.value_builder @@ -200,7 +205,15 @@ def append_to_builder(builder, value) def build_table(columns) fields = [] chunked_arrays = [] - columns.each do |name, chunked_array| + columns.each do |name, data| + case data + when Arrow::Array + chunked_array = Arrow::ChunkedArray.new([data]) + when Array + chunked_array = Arrow::ChunkedArray.new(data) + else + chunked_array = data + end fields << Arrow::Field.new(name, chunked_array.value_data_type) chunked_arrays << chunked_array end @@ -217,6 +230,15 @@ def build_record_batch(columns) Arrow::RecordBatch.new(schema, n_rows, columns.values) end + def build_file_uri(path) + absolute_path = File.expand_path(path) + if absolute_path.start_with?("/") + "file://#{absolute_path}" + else + "file:///#{absolute_path}" + end + end + private def build_array(builder, values) values.each do 
|value| diff --git a/c_glib/test/helper/data-type.rb b/c_glib/test/helper/data-type.rb index b8224409873..bbe6866f5b9 100644 --- a/c_glib/test/helper/data-type.rb +++ b/c_glib/test/helper/data-type.rb @@ -52,6 +52,22 @@ def int64_data_type Arrow::Int64DataType.new end + def uint8_data_type + Arrow::UInt8DataType.new + end + + def uint16_data_type + Arrow::UInt16DataType.new + end + + def uint32_data_type + Arrow::UInt32DataType.new + end + + def uint64_data_type + Arrow::UInt64DataType.new + end + def string_data_type Arrow::StringDataType.new end diff --git a/c_glib/test/helper/flight-info-generator.rb b/c_glib/test/helper/flight-info-generator.rb new file mode 100644 index 00000000000..c57530879cb --- /dev/null +++ b/c_glib/test/helper/flight-info-generator.rb @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +require_relative "buildable" +require_relative "data-type" +require_relative "writable" + +module Helper + class FlightInfoGenerator + include Buildable + include DataType + include Writable + + def page_view_table + build_table("count" => build_uint64_array([1, 2, 3]), + "private" => build_boolean_array([true, false, true])) + end + + def page_view_descriptor + ArrowFlight::PathDescriptor.new(["page-view"]) + end + + def page_view_ticket + ArrowFlight::Ticket.new("page-view") + end + + def page_view_endpoints + locations = [ + ArrowFlight::Location.new("grpc+tcp://127.0.0.1:10000"), + ArrowFlight::Location.new("grpc+tcp://127.0.0.1:10001"), + ] + [ + ArrowFlight::Endpoint.new(page_view_ticket, locations), + ] + end + + def page_view + table = page_view_table + descriptor = page_view_descriptor + endpoints = page_view_endpoints + output = Arrow::ResizableBuffer.new(0) + write_table(table, output, type: :stream) + ArrowFlight::Info.new(table.schema, + descriptor, + endpoints, + table.n_rows, + output.size) + end + end +end diff --git a/c_glib/test/helper/flight-server.rb b/c_glib/test/helper/flight-server.rb new file mode 100644 index 00000000000..89fd13b4211 --- /dev/null +++ b/c_glib/test/helper/flight-server.rb @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +require_relative "flight-info-generator" + +module Helper + class FlightServer < ArrowFlight::Server + type_register + + private + def virtual_do_list_flights(context, criteria) + generator = FlightInfoGenerator.new + [generator.page_view] + end + + def virtual_do_do_get(context, ticket) + generator = FlightInfoGenerator.new + unless ticket == generator.page_view_ticket + raise Arrow::Error::Invalid.new("invalid ticket") + end + table = generator.page_view_table + reader = Arrow::TableBatchReader.new(table) + ArrowFlight::RecordBatchStream.new(reader) + end + end +end diff --git a/c_glib/test/helper/writable.rb b/c_glib/test/helper/writable.rb new file mode 100644 index 00000000000..1c8db756c38 --- /dev/null +++ b/c_glib/test/helper/writable.rb @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +module Helper + module Writable + def write_table(table, output, type: :file) + if output.is_a?(Arrow::Buffer) + output_stream = Arrow::BufferOutputStream.new(output) + else + output_stream = Arrow::FileOutputStream.new(output, false) + end + begin + if type == :file + writer_class = Arrow::RecordBatchFileWriter + else + writer_class = Arrow::RecordBatchStreamWriter + end + writer = writer_class.new(output_stream, table.schema) + begin + writer.write_table(table) + ensure + writer.close + end + ensure + output_stream.close + end + end + end +end diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb index 7911cf44b6e..abae4e722c5 100755 --- a/c_glib/test/run-test.rb +++ b/c_glib/test/run-test.rb @@ -39,6 +39,10 @@ def initialize(data) @data = data end end + + class BooleanScalar + alias_method :value, :value? + end end begin @@ -51,6 +55,19 @@ def initialize(data) rescue GObjectIntrospection::RepositoryError::TypelibNotFound end +begin + class ArrowFlightLoader < GI::Loader + def should_unlock_gvl?(info, klass) + true + end + end + flight_module = Module.new + ArrowFlightLoader.load("ArrowFlight", flight_module) + ArrowFlight = flight_module + GObjectIntrospection::Loader.start_callback_dispatch_thread +rescue GObjectIntrospection::RepositoryError::TypelibNotFound +end + begin Gandiva = GI.load("Gandiva") rescue GObjectIntrospection::RepositoryError::TypelibNotFound @@ -74,7 +91,12 @@ def initialize(data) require_relative "helper/buildable" require_relative "helper/data-type" require_relative "helper/fixture" +if defined?(ArrowFlight) + require_relative "helper/flight-info-generator" + require_relative "helper/flight-server" +end require_relative "helper/omittable" require_relative "helper/plasma-store" +require_relative "helper/writable" exit(Test::Unit::AutoRunner.run(true, test_dir.to_s)) diff --git a/c_glib/test/run-test.sh b/c_glib/test/run-test.sh index 2120aa9f8f0..7e0901df5b5 100755 --- a/c_glib/test/run-test.sh +++ b/c_glib/test/run-test.sh @@ 
-20,7 +20,7 @@ test_dir="$(cd $(dirname $0); pwd)" build_dir="$(cd .; pwd)" -modules="arrow-glib arrow-cuda-glib arrow-dataset-glib gandiva-glib parquet-glib plasma-glib" +modules="arrow-glib arrow-cuda-glib arrow-dataset-glib arrow-flight-glib gandiva-glib parquet-glib plasma-glib" for module in ${modules}; do module_build_dir="${build_dir}/${module}" diff --git a/c_glib/test/test-array-datum.rb b/c_glib/test/test-array-datum.rb index f4bc9be7f14..623e5589ce4 100644 --- a/c_glib/test/test-array-datum.rb +++ b/c_glib/test/test-array-datum.rb @@ -35,6 +35,18 @@ def test_array_like? end end + def test_scalar? + assert do + not @datum.scalar? + end + end + + def test_value? + assert do + @datum.value? + end + end + sub_test_case("==") do def test_true assert_equal(Arrow::ArrayDatum.new(@array), diff --git a/c_glib/test/test-binary-scalar.rb b/c_glib/test/test-binary-scalar.rb new file mode 100644 index 00000000000..4efc50da080 --- /dev/null +++ b/c_glib/test/test-binary-scalar.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestBinaryScalar < Test::Unit::TestCase + def setup + @buffer = Arrow::Buffer.new("\x03\x01\x02") + @scalar = Arrow::BinaryScalar.new(@buffer) + end + + def test_data_type + assert_equal(Arrow::BinaryDataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? + end + end + + def test_equal + assert_equal(Arrow::BinaryScalar.new(@buffer), + @scalar) + end + + def test_to_s + assert_equal("\x03\x01\x02", @scalar.to_s) + end + + def test_value + assert_equal(@buffer, + @scalar.value) + end +end diff --git a/c_glib/test/test-boolean-scalar.rb b/c_glib/test/test-boolean-scalar.rb new file mode 100644 index 00000000000..f8913d6a7e4 --- /dev/null +++ b/c_glib/test/test-boolean-scalar.rb @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestBooleanScalar < Test::Unit::TestCase + def setup + @scalar = Arrow::BooleanScalar.new(true) + end + + def test_parse + assert_equal(@scalar, + Arrow::Scalar.parse(Arrow::BooleanDataType.new, + "true")) + end + + def test_data_type + assert_equal(Arrow::BooleanDataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::BooleanScalar.new(true), + @scalar) + end + + def test_to_s + assert_equal("true", @scalar.to_s) + end + + def test_value + assert_equal(true, @scalar.value) + end +end diff --git a/c_glib/test/test-compare.rb b/c_glib/test/test-compare.rb deleted file mode 100644 index 2ffe39839df..00000000000 --- a/c_glib/test/test-compare.rb +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -class TestCompare < Test::Unit::TestCase - include Helper::Buildable - - def setup - @options = Arrow::CompareOptions.new - end - - sub_test_case("CompareOptions") do - def test_default_operator - assert_equal(Arrow::CompareOperator::EQUAL, - @options.operator) - end - end - - sub_test_case("operator") do - def test_equal - @options.operator = :equal - assert_equal(build_boolean_array([true, nil, false]), - build_int32_array([1, nil, 3]).compare(1, @options)) - end - - def test_not_equal - @options.operator = :not_equal - assert_equal(build_boolean_array([false, nil, true]), - build_int32_array([1, nil, 3]).compare(1, @options)) - end - - def test_greater - @options.operator = :greater - assert_equal(build_boolean_array([false, nil, true]), - build_int32_array([1, nil, 3]).compare(1, @options)) - end - - def test_greater_equal - @options.operator = :greater_equal - assert_equal(build_boolean_array([true, nil, true]), - build_int32_array([1, nil, 3]).compare(1, @options)) - end - - def test_less - @options.operator = :less - assert_equal(build_boolean_array([false, nil, false]), - build_int32_array([1, nil, 3]).compare(1, @options)) - end - - def test_less_equal - @options.operator = :less_equal - assert_equal(build_boolean_array([true, nil, false]), - build_int32_array([1, nil, 3]).compare(1, @options)) - end - end -end diff --git a/c_glib/test/test-count.rb b/c_glib/test/test-count.rb index 36390f880aa..39b6f06c4e6 100644 --- a/c_glib/test/test-count.rb +++ b/c_glib/test/test-count.rb @@ -19,27 +19,14 @@ class TestCount < Test::Unit::TestCase include Helper::Buildable include Helper::Omittable - sub_test_case("CountOptions") do - def test_default_mode - assert_equal(Arrow::CountMode::ALL, - Arrow::CountOptions.new.mode) - end - end - - sub_test_case("mode") do + sub_test_case("skip_nulls") do def test_default assert_equal(2, build_int32_array([1, nil, 3]).count) end - def test_all - options = Arrow::CountOptions.new - options.mode = :all - assert_equal(2, 
build_int32_array([1, nil, 3]).count(options)) - end - - def test_null - options = Arrow::CountOptions.new - options.mode = :null + def test_false + options = Arrow::ScalarAggregateOptions.new + options.skip_nulls = false assert_equal(1, build_int32_array([1, nil, 3]).count(options)) end end diff --git a/c_glib/test/test-date32-scalar.rb b/c_glib/test/test-date32-scalar.rb new file mode 100644 index 00000000000..ae41ebf72f5 --- /dev/null +++ b/c_glib/test/test-date32-scalar.rb @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDate32Scalar < Test::Unit::TestCase + def setup + @value = 17406 # 2017-08-28 + @scalar = Arrow::Date32Scalar.new(@value) + end + + def test_data_type + assert_equal(Arrow::Date32DataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::Date32Scalar.new(@value), + @scalar) + end + + def test_to_s + assert_equal("2017-08-28", @scalar.to_s) + end + + def test_value + assert_equal(@value, @scalar.value) + end +end diff --git a/c_glib/test/test-date64-scalar.rb b/c_glib/test/test-date64-scalar.rb new file mode 100644 index 00000000000..ce39d3c2d74 --- /dev/null +++ b/c_glib/test/test-date64-scalar.rb @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDate64Scalar < Test::Unit::TestCase + def setup + @value = 1503878400000 # 2017-08-28T00:00:00Z + @scalar = Arrow::Date64Scalar.new(@value) + end + + def test_data_type + assert_equal(Arrow::Date64DataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::Date64Scalar.new(@value), + @scalar) + end + + def test_to_s + assert_equal("2017-08-28", @scalar.to_s) + end + + def test_value + assert_equal(@value, @scalar.value) + end +end diff --git a/c_glib/test/test-decimal128-scalar.rb b/c_glib/test/test-decimal128-scalar.rb new file mode 100644 index 00000000000..380623a6701 --- /dev/null +++ b/c_glib/test/test-decimal128-scalar.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDecimal128Scalar < Test::Unit::TestCase + def setup + @data_type = Arrow::Decimal128DataType.new(8, 2) + @value = Arrow::Decimal128.new("23423445") + @scalar = Arrow::Decimal128Scalar.new(@data_type, @value) + end + + def test_data_type + assert_equal(@data_type, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::Decimal128Scalar.new(@data_type, @value), + @scalar) + end + + def test_to_s + assert_equal("234234.45", @scalar.to_s) + end + + def test_value + assert_equal(@value, @scalar.value) + end +end diff --git a/c_glib/test/test-decimal256-scalar.rb b/c_glib/test/test-decimal256-scalar.rb new file mode 100644 index 00000000000..2c419940df7 --- /dev/null +++ b/c_glib/test/test-decimal256-scalar.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDecimal256Scalar < Test::Unit::TestCase + def setup + @data_type = Arrow::Decimal256DataType.new(8, 2) + @value = Arrow::Decimal256.new("23423445") + @scalar = Arrow::Decimal256Scalar.new(@data_type, @value) + end + + def test_data_type + assert_equal(@data_type, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::Decimal256Scalar.new(@data_type, @value), + @scalar) + end + + def test_to_s + assert_equal("234234.45", @scalar.to_s) + end + + def test_value + assert_equal(@value, @scalar.value) + end +end diff --git a/c_glib/test/test-dense-union-scalar.rb b/c_glib/test/test-dense-union-scalar.rb new file mode 100644 index 00000000000..ec2053b3fe9 --- /dev/null +++ b/c_glib/test/test-dense-union-scalar.rb @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDenseUnionScalar < Test::Unit::TestCase + def setup + fields = [ + Arrow::Field.new("number", Arrow::Int8DataType.new), + Arrow::Field.new("text", Arrow::StringDataType.new), + ] + @data_type = Arrow::DenseUnionDataType.new(fields, [2, 9]) + @type_code = 2 + @value = Arrow::Int8Scalar.new(-29) + @scalar = Arrow::DenseUnionScalar.new(@data_type, @type_code, @value) + end + + def test_type_code + assert_equal(@type_code, + @scalar.type_code) + end + + def test_data_type + assert_equal(@data_type, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::DenseUnionScalar.new(@data_type, @type_code, @value), + @scalar) + end + + def test_to_s + assert_equal("...", @scalar.to_s) + end + + def test_value + assert_equal(@value, @scalar.value) + end +end diff --git a/c_glib/test/test-double-scalar.rb b/c_glib/test/test-double-scalar.rb new file mode 100644 index 00000000000..eea673b41e5 --- /dev/null +++ b/c_glib/test/test-double-scalar.rb @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDoubleScalar < Test::Unit::TestCase + def setup + @scalar = Arrow::DoubleScalar.new(1.1) + end + + def test_data_type + assert_equal(Arrow::DoubleDataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + options = Arrow::EqualOptions.new + options.approx = true + assert do + @scalar.equal_options(Arrow::DoubleScalar.new(1.1), options) + end + end + + def test_to_s + assert_equal("1.1", @scalar.to_s) + end + + def test_value + assert_in_delta(1.1, @scalar.value) + end +end diff --git a/c_glib/test/test-equal-options.rb b/c_glib/test/test-equal-options.rb new file mode 100644 index 00000000000..4ea1979a76b --- /dev/null +++ b/c_glib/test/test-equal-options.rb @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestEqualOptions < Test::Unit::TestCase + include Helper::Buildable + + sub_test_case("approx") do + def setup + @options = Arrow::EqualOptions.new + end + + def test_accessor + assert do + not @options.approx? + end + @options.approx = true + assert do + @options.approx? + end + end + + def test_compare + array1 = build_float_array([0.01]) + array2 = build_float_array([0.010001]) + @options.approx = true + assert do + array1.equal_options(array2, @options) + end + end + end + + sub_test_case("nans-equal") do + def setup + @options = Arrow::EqualOptions.new + end + + def test_accessor + assert do + not @options.nans_equal? 
+ end + @options.nans_equal = true + assert do + @options.nans_equal? + end + end + + def test_compare + array1 = build_float_array([0.1, Float::NAN, 0.2]) + array2 = build_float_array([0.1, Float::NAN, 0.2]) + @options.nans_equal = true + assert do + array1.equal_options(array2, @options) + end + end + end + + sub_test_case("absolute-tolerance") do + def setup + @options = Arrow::EqualOptions.new + end + + def test_accessor + assert do + @options.absolute_tolerance < 0.001 + end + @options.absolute_tolerance = 0.001 + assert do + @options.absolute_tolerance >= 0.001 + end + end + + def test_compare + array1 = build_float_array([0.01]) + array2 = build_float_array([0.0109]) + @options.approx = true + @options.absolute_tolerance = 0.001 + assert do + array1.equal_options(array2, @options) + end + end + end +end diff --git a/c_glib/test/test-fixed-size-binary-scalar.rb b/c_glib/test/test-fixed-size-binary-scalar.rb new file mode 100644 index 00000000000..1a6f0703594 --- /dev/null +++ b/c_glib/test/test-fixed-size-binary-scalar.rb @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestFixedSizeBinaryScalar < Test::Unit::TestCase + def setup + @data_type = Arrow::FixedSizeBinaryDataType.new(3) + @buffer = Arrow::Buffer.new("\x03\x01\x02") + @scalar = Arrow::FixedSizeBinaryScalar.new(@data_type, @buffer) + end + + def test_data_type + assert_equal(@data_type, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? + end + end + + def test_equal + assert_equal(Arrow::FixedSizeBinaryScalar.new(@data_type, @buffer), + @scalar) + end + + def test_to_s + assert_equal("\x03\x01\x02", @scalar.to_s) + end + + def test_value + assert_equal(@buffer, + @scalar.value) + end +end diff --git a/c_glib/test/test-float-scalar.rb b/c_glib/test/test-float-scalar.rb new file mode 100644 index 00000000000..1b830408cbb --- /dev/null +++ b/c_glib/test/test-float-scalar.rb @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestFloatScalar < Test::Unit::TestCase + def setup + @scalar = Arrow::FloatScalar.new(1.1) + end + + def test_data_type + assert_equal(Arrow::FloatDataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + options = Arrow::EqualOptions.new + options.approx = true + assert do + @scalar.equal_options(Arrow::FloatScalar.new(1.1), options) + end + end + + def test_to_s + assert_equal("1.1", @scalar.to_s) + end + + def test_value + assert_in_delta(1.1, @scalar.value) + end +end diff --git a/c_glib/test/test-function.rb b/c_glib/test/test-function.rb index 8530ea5c153..390bed5cc94 100644 --- a/c_glib/test/test-function.rb +++ b/c_glib/test/test-function.rb @@ -50,6 +50,25 @@ def test_chunked_array or_function.execute(args).value) end + def test_input_scalar + add_function = Arrow::Function.find("add") + args = [ + Arrow::ArrayDatum.new(build_int8_array([1, 2, 3])), + Arrow::ScalarDatum.new(Arrow::Int8Scalar.new(5)), + ] + assert_equal(build_int8_array([6, 7, 8]), + add_function.execute(args).value) + end + + def test_output_scalar + sum_function = Arrow::Function.find("sum") + args = [ + Arrow::ArrayDatum.new(build_int8_array([1, 2, 3])), + ] + assert_equal(Arrow::Int64Scalar.new(6), + sum_function.execute(args).value) + end + def test_options cast_function = Arrow::Function.find("cast") args = [ diff --git a/c_glib/test/test-int16-scalar.rb b/c_glib/test/test-int16-scalar.rb new file mode 100644 index 00000000000..1a792714079 --- /dev/null +++ b/c_glib/test/test-int16-scalar.rb @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestInt16Scalar < Test::Unit::TestCase + def setup + @scalar = Arrow::Int16Scalar.new(-(2 ** 15)) + end + + def test_data_type + assert_equal(Arrow::Int16DataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? + end + end + + def test_equal + assert_equal(Arrow::Int16Scalar.new(-(2 ** 15)), + @scalar) + end + + def test_to_s + assert_equal((-(2 ** 15)).to_s, @scalar.to_s) + end + + def test_value + assert_equal(-(2 ** 15), @scalar.value) + end +end diff --git a/c_glib/test/test-int32-scalar.rb b/c_glib/test/test-int32-scalar.rb new file mode 100644 index 00000000000..eba554845c7 --- /dev/null +++ b/c_glib/test/test-int32-scalar.rb @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +class TestInt32Scalar < Test::Unit::TestCase + def setup + @scalar = Arrow::Int32Scalar.new(-(2 ** 31)) + end + + def test_data_type + assert_equal(Arrow::Int32DataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? + end + end + + def test_equal + assert_equal(Arrow::Int32Scalar.new(-(2 ** 31)), + @scalar) + end + + def test_to_s + assert_equal((-(2 ** 31)).to_s, @scalar.to_s) + end + + def test_value + assert_equal(-(2 ** 31), @scalar.value) + end +end diff --git a/c_glib/test/test-int64-scalar.rb b/c_glib/test/test-int64-scalar.rb new file mode 100644 index 00000000000..bfa7b4529e8 --- /dev/null +++ b/c_glib/test/test-int64-scalar.rb @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestInt64Scalar < Test::Unit::TestCase + def setup + @scalar = Arrow::Int64Scalar.new(-(2 ** 63)) + end + + def test_data_type + assert_equal(Arrow::Int64DataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::Int64Scalar.new(-(2 ** 63)), + @scalar) + end + + def test_to_s + assert_equal((-(2 ** 63)).to_s, @scalar.to_s) + end + + def test_value + assert_equal(-(2 ** 63), @scalar.value) + end +end diff --git a/c_glib/test/test-int8-scalar.rb b/c_glib/test/test-int8-scalar.rb new file mode 100644 index 00000000000..214c5907375 --- /dev/null +++ b/c_glib/test/test-int8-scalar.rb @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestInt8Scalar < Test::Unit::TestCase + def setup + @scalar = Arrow::Int8Scalar.new(-128) + end + + def test_data_type + assert_equal(Arrow::Int8DataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::Int8Scalar.new(-128), + @scalar) + end + + def test_to_s + assert_equal("-128", @scalar.to_s) + end + + def test_value + assert_equal(-128, @scalar.value) + end +end diff --git a/c_glib/test/test-large-binary-scalar.rb b/c_glib/test/test-large-binary-scalar.rb new file mode 100644 index 00000000000..a6bc4addb10 --- /dev/null +++ b/c_glib/test/test-large-binary-scalar.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestLargeBinaryScalar < Test::Unit::TestCase + def setup + @buffer = Arrow::Buffer.new("\x03\x01\x02") + @scalar = Arrow::LargeBinaryScalar.new(@buffer) + end + + def test_data_type + assert_equal(Arrow::LargeBinaryDataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::LargeBinaryScalar.new(@buffer), + @scalar) + end + + def test_to_s + assert_equal("...", @scalar.to_s) + end + + def test_value + assert_equal(@buffer, + @scalar.value) + end +end diff --git a/c_glib/test/test-large-string-scalar.rb b/c_glib/test/test-large-string-scalar.rb new file mode 100644 index 00000000000..13e28f647ac --- /dev/null +++ b/c_glib/test/test-large-string-scalar.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestLargeStringScalar < Test::Unit::TestCase + def setup + @buffer = Arrow::Buffer.new("Hello") + @scalar = Arrow::LargeStringScalar.new(@buffer) + end + + def test_data_type + assert_equal(Arrow::LargeStringDataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::LargeStringScalar.new(@buffer), + @scalar) + end + + def test_to_s + assert_equal("...", @scalar.to_s) + end + + def test_value + assert_equal(@buffer, + @scalar.value) + end +end diff --git a/c_glib/test/test-list-scalar.rb b/c_glib/test/test-list-scalar.rb new file mode 100644 index 00000000000..3fda3f25bbb --- /dev/null +++ b/c_glib/test/test-list-scalar.rb @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestListScalar < Test::Unit::TestCase + include Helper::Buildable + + def setup + @value = build_list_array(Arrow::Int8DataType.new, + [[1, 2, 3]]) + @scalar = Arrow::ListScalar.new(@value) + end + + def test_data_type + assert_equal(@value.value_data_type, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::ListScalar.new(@value), + @scalar) + end + + def test_to_s + assert_equal("...", @scalar.to_s) + end + + def test_value + assert_equal(@value, @scalar.value) + end +end diff --git a/c_glib/test/test-map-scalar.rb b/c_glib/test/test-map-scalar.rb new file mode 100644 index 00000000000..9c6eb69e0a8 --- /dev/null +++ b/c_glib/test/test-map-scalar.rb @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestMapScalar < Test::Unit::TestCase + include Helper::Buildable + + def setup + @value = build_struct_array([ + Arrow::Field.new("key", + Arrow::StringDataType.new, + false), + Arrow::Field.new("value", + Arrow::Int8DataType.new), + ], + [ + { + "key" => "hello", + "value" => 1, + }, + { + "key" => "world", + "value" => 2, + }, + ]) + @scalar = Arrow::MapScalar.new(@value) + end + + def test_data_type + assert_equal(@value.value_data_type, + @scalar.data_type) + end + + def test_valid? + assert do + @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::MapScalar.new(@value), + @scalar) + end + + def test_to_s + assert_equal("...", @scalar.to_s) + end + + def test_value + assert_equal(@value, @scalar.value) + end +end diff --git a/c_glib/test/test-null-scalar.rb b/c_glib/test/test-null-scalar.rb new file mode 100644 index 00000000000..07b887040fb --- /dev/null +++ b/c_glib/test/test-null-scalar.rb @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestNullScalar < Test::Unit::TestCase + def setup + @scalar = Arrow::NullScalar.new + end + + def test_data_type + assert_equal(Arrow::NullDataType.new, + @scalar.data_type) + end + + def test_valid? + assert do + not @scalar.valid? 
+ end + end + + def test_equal + assert_equal(Arrow::NullScalar.new, + @scalar) + end + + def test_to_s + assert_equal("null", @scalar.to_s) + end +end diff --git a/c_glib/test/test-orc-file-reader.rb b/c_glib/test/test-orc-file-reader.rb index cd57cee4de6..38900cf12f3 100644 --- a/c_glib/test/test-orc-file-reader.rb +++ b/c_glib/test/test-orc-file-reader.rb @@ -40,7 +40,7 @@ def test_read_type string1: string middle: struct>> list: list> -map: list>> +map: map> SCHEMA end @@ -80,21 +80,6 @@ def build_middle_array(middles) build_struct_array(middle_fields, middles) end - def key_value_fields - [ - Arrow::Field.new("key", Arrow::StringDataType.new), - Arrow::Field.new("value", item_data_type), - ] - end - - def key_value_data_type - Arrow::StructDataType.new(key_value_fields) - end - - def build_key_value_array(key_value_array) - build_list_array(key_value_data_type, key_value_array, field_name: "item") - end - def middle_array build_middle_array([ { @@ -154,26 +139,21 @@ def list_array end def map_array - build_key_value_array([ - [ - ], - [ - { - "key" => "chani", - "value" => { - "int1" => 5, - "string1" => "chani", - }, - }, - { - "key" => "mauddib", - "value" => { - "int1" => 1, - "string1" => "mauddib", - }, - }, - ], - ]) + build_map_array(Arrow::StringDataType.new, + item_data_type, + [ + {}, + { + "chani" => { + "int1" => 5, + "string1" => "chani", + }, + "mauddib" => { + "int1" => 1, + "string1" => "mauddib", + }, + }, + ]) end def all_columns diff --git a/c_glib/test/test-record-batch-reader.rb b/c_glib/test/test-record-batch-reader.rb new file mode 100644 index 00000000000..a41da65fd76 --- /dev/null +++ b/c_glib/test/test-record-batch-reader.rb @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestRecordBatchReader =1.5.4 +boost-cpp>=1.68.0 +brotli +bzip2 +c-ares +cmake +gflags +glog +gmock>=1.10.0 +grpc-cpp>=1.27.3 +gtest=1.10.0 +libprotobuf +libutf8proc +lz4-c +make +ninja +pkg-config +python +rapidjson +re2 +snappy +thrift-cpp>=0.11.0 +zlib +zstd diff --git a/ci/conda_env_cpp.yml b/ci/conda_env_cpp.yml deleted file mode 100644 index 390eb7dcdd5..00000000000 --- a/ci/conda_env_cpp.yml +++ /dev/null @@ -1,42 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -aws-sdk-cpp -benchmark=1.5.2 -boost-cpp>=1.68.0 -brotli -bzip2 -c-ares -cmake -gflags -glog -gmock>=1.10.0 -grpc-cpp>=1.27.3 -gtest=1.10.0 -libprotobuf -libutf8proc -lz4-c -make -ninja -pkg-config -python -rapidjson -re2 -snappy -thrift-cpp>=0.11.0 -zlib -zstd diff --git a/ci/conda_env_gandiva.yml b/ci/conda_env_gandiva.txt similarity index 100% rename from ci/conda_env_gandiva.yml rename to ci/conda_env_gandiva.txt diff --git a/ci/conda_env_gandiva_win.yml b/ci/conda_env_gandiva_win.txt similarity index 100% rename from ci/conda_env_gandiva_win.yml rename to ci/conda_env_gandiva_win.txt diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.txt similarity index 100% rename from ci/conda_env_python.yml rename to ci/conda_env_python.txt diff --git a/ci/conda_env_r.yml b/ci/conda_env_r.txt similarity index 100% rename from ci/conda_env_r.yml rename to ci/conda_env_r.txt diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt new file mode 100644 index 00000000000..49388e2b437 --- /dev/null +++ b/ci/conda_env_sphinx.txt @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Requirements for building the documentation +breathe +doxygen +ipython +# Pinned per ARROW-9693 +sphinx=3.1.2 +pydata-sphinx-theme diff --git a/ci/conda_env_sphinx.yml b/ci/conda_env_sphinx.yml deleted file mode 100644 index 8654d231065..00000000000 --- a/ci/conda_env_sphinx.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# Requirements for building the documentation -breathe -doxygen -ipython -# Pinned per ARROW-9693 -sphinx=3.1.2 -sphinx_rtd_theme diff --git a/ci/conda_env_unix.yml b/ci/conda_env_unix.txt similarity index 100% rename from ci/conda_env_unix.yml rename to ci/conda_env_unix.txt diff --git a/ci/detect-changes.py b/ci/detect-changes.py index c32f6e040dd..14e71ed48ce 100644 --- a/ci/detect-changes.py +++ b/ci/detect-changes.py @@ -140,7 +140,7 @@ def list_github_actions_affected_files(): LANGUAGE_TOPICS = ['c_glib', 'cpp', 'docs', 'go', 'java', 'js', 'python', - 'r', 'ruby', 'rust', 'csharp'] + 'r', 'ruby', 'csharp'] ALL_TOPICS = LANGUAGE_TOPICS + ['integration', 'dev'] @@ -161,7 +161,7 @@ def list_github_actions_affected_files(): } COMPONENTS = {'cpp', 'java', 'c_glib', 'r', 'ruby', 'integration', 'js', - 'rust', 'csharp', 'go', 'docs', 'python', 'dev'} + 'csharp', 'go', 'docs', 'python', 'dev'} def get_affected_topics(affected_files): @@ -298,7 +298,6 @@ def test_get_affected_topics(): 'python': True, 'r': True, 'ruby': True, - 'rust': False, 'csharp': False, 'integration': True, 'dev': False @@ -315,7 +314,6 @@ def test_get_affected_topics(): 'python': True, 'r': True, 'ruby': True, - 'rust': True, 'csharp': True, 'integration': True, 'dev': False @@ -332,7 +330,6 @@ def test_get_affected_topics(): 'python': True, 'r': True, 'ruby': True, - 'rust': True, 'csharp': True, 'integration': True, 'dev': True, diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index 1a5b87ef729..ff31930c06c 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -20,15 +20,14 @@ ARG arch FROM ${repo}:${arch}-conda # install the required conda packages into the test environment -COPY ci/conda_env_cpp.yml \ - ci/conda_env_gandiva.yml \ +COPY ci/conda_env_cpp.txt \ + ci/conda_env_gandiva.txt \ /arrow/ci/ RUN conda install \ - --file arrow/ci/conda_env_cpp.yml \ - --file arrow/ci/conda_env_gandiva.yml \ + --file arrow/ci/conda_env_cpp.txt 
\ + --file arrow/ci/conda_env_gandiva.txt \ compilers \ doxygen \ - gdb \ valgrind && \ conda clean --all diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index 1f2c9ac5da2..8a7dd48b947 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -26,10 +26,9 @@ ARG jdk=8 ARG go=1.15 # Install Archery and integration dependencies -COPY ci/conda_env_archery.yml /arrow/ci/ +COPY ci/conda_env_archery.txt /arrow/ci/ RUN conda install -q \ - --file arrow/ci/conda_env_cpp.yml \ - --file arrow/ci/conda_env_archery.yml \ + --file arrow/ci/conda_env_archery.txt \ numpy \ compilers \ maven=${maven} \ diff --git a/ci/docker/conda-python-kartothek.dockerfile b/ci/docker/conda-python-kartothek.dockerfile index b1c1ed860a9..d523161822c 100644 --- a/ci/docker/conda-python-kartothek.dockerfile +++ b/ci/docker/conda-python-kartothek.dockerfile @@ -38,9 +38,7 @@ RUN conda install -c conda-forge -q \ storefact \ toolz \ urlquote \ - zstandard \ - # temporary pin for numpy (see https://issues.apache.org/jira/browse/ARROW-11472) - numpy=1.19 && \ + zstandard && \ conda clean --all ARG kartothek=latest diff --git a/ci/docker/conda-python-turbodbc.dockerfile b/ci/docker/conda-python-turbodbc.dockerfile index ff7fdf6e1d0..e748604dee3 100644 --- a/ci/docker/conda-python-turbodbc.dockerfile +++ b/ci/docker/conda-python-turbodbc.dockerfile @@ -30,7 +30,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \ rm -rf /var/lib/apt/lists/* # install turbodbc dependencies from conda-forge -RUN conda install -c conda-forge -q\ +RUN conda install -c conda-forge -q \ pybind11 \ pytest-cov \ mock \ diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index a7e76974825..ab3f77be1b6 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -21,9 +21,9 @@ FROM ${repo}:${arch}-conda-cpp # install python specific packages ARG python=3.6 -COPY ci/conda_env_python.yml 
/arrow/ci/ +COPY ci/conda_env_python.txt /arrow/ci/ RUN conda install -q \ - --file arrow/ci/conda_env_python.yml \ + --file arrow/ci/conda_env_python.txt \ $([ "$python" == "3.6" -o "$python" == "3.7" ] && echo "pickle5") \ python=${python} \ nomkl && \ diff --git a/ci/docker/conda.dockerfile b/ci/docker/conda.dockerfile index 94de009904a..2e773b5437e 100644 --- a/ci/docker/conda.dockerfile +++ b/ci/docker/conda.dockerfile @@ -25,7 +25,7 @@ ARG prefix=/opt/conda # install build essentials RUN export DEBIAN_FRONTEND=noninteractive && \ apt-get update -y -q && \ - apt-get install -y -q wget tzdata libc6-dbg \ + apt-get install -y -q wget tzdata libc6-dbg gdb \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -38,8 +38,8 @@ RUN /arrow/ci/scripts/install_conda.sh ${arch} linux latest ${prefix} RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest ${prefix} # create a conda environment -ADD ci/conda_env_unix.yml /arrow/ci/ -RUN conda create -n arrow --file arrow/ci/conda_env_unix.yml git && \ +ADD ci/conda_env_unix.txt /arrow/ci/ +RUN conda create -n arrow --file arrow/ci/conda_env_unix.txt git && \ conda clean --all # activate the created environment by default diff --git a/ci/docker/debian-10-cpp.dockerfile b/ci/docker/debian-10-cpp.dockerfile index 83f8ce529cb..d99a2c161bd 100644 --- a/ci/docker/debian-10-cpp.dockerfile +++ b/ci/docker/debian-10-cpp.dockerfile @@ -60,7 +60,6 @@ RUN apt-get update -y -q && \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ - libzstd-dev \ llvm-${llvm}-dev \ make \ ninja-build \ @@ -77,8 +76,8 @@ COPY ci/scripts/install_minio.sh \ RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local ENV ARROW_BUILD_TESTS=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_DATASET=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_FLIGHT=ON \ ARROW_GANDIVA=ON \ ARROW_HOME=/usr/local \ @@ -101,4 +100,5 @@ ENV ARROW_BUILD_TESTS=ON \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PATH=/usr/lib/ccache/:$PATH \ - Protobuf_SOURCE=BUNDLED + 
Protobuf_SOURCE=BUNDLED \ + zstd_SOURCE=BUNDLED diff --git a/ci/docker/java-jni-manylinux-201x.dockerfile b/ci/docker/java-jni-manylinux-201x.dockerfile new file mode 100644 index 00000000000..021dab686f3 --- /dev/null +++ b/ci/docker/java-jni-manylinux-201x.dockerfile @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG base +FROM ${base} + +# Install the libaries required by the Gandiva to run +RUN vcpkg install --clean-after-build \ + llvm \ + boost-system \ + boost-date-time \ + boost-regex \ + boost-predef \ + boost-algorithm \ + boost-locale \ + boost-format \ + boost-variant \ + boost-multiprecision + +# Install Java +ARG java=1.8.0 +RUN yum install -y java-$java-openjdk-devel && yum clean all +ENV JAVA_HOME=/usr/lib/jvm/java-$java-openjdk/ diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 604a05afb07..8f124a77658 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -18,9 +18,10 @@ ARG base FROM ${base} -ARG r=3.6 +ARG r=4.1 ARG jdk=8 +# See R install instructions at https://cloud.r-project.org/bin/linux/ubuntu/ RUN apt-get update -y && \ apt-get install -y \ dirmngr \ @@ -29,8 +30,8 @@ RUN apt-get update -y && \ apt-key adv \ --keyserver keyserver.ubuntu.com \ --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ - add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran35/' && \ - apt-get install -y \ + add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran40/' && \ + apt-get install -y --no-install-recommends \ autoconf-archive \ automake \ curl \ @@ -43,12 +44,14 @@ RUN apt-get update -y && \ libgirepository1.0-dev \ libglib2.0-doc \ libharfbuzz-dev \ + libtiff-dev \ libtool \ libxml2-dev \ ninja-build \ nvidia-cuda-toolkit \ openjdk-${jdk}-jdk-headless \ pandoc \ + r-recommended=${r}* \ r-base=${r}* \ rsync \ ruby-dev \ @@ -72,13 +75,14 @@ RUN wget -q -O - https://deb.nodesource.com/setup_${node}.x | bash - && \ rm -rf /var/lib/apt/lists/* && \ npm install -g yarn -# Sphinx is pinned because of ARROW-9693 +# ARROW-13353: breathe >= 4.29.1 tries to parse template arguments, +# but Sphinx can't parse constructs like `typename...`. 
RUN pip install \ meson \ - breathe \ + breathe==4.29.0 \ ipython \ sphinx \ - sphinx_rtd_theme + pydata-sphinx-theme COPY c_glib/Gemfile /arrow/c_glib/ RUN gem install --no-document bundler && \ @@ -96,8 +100,11 @@ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ R -e "install.packages('pkgdown')" -ENV ARROW_PYTHON=ON \ +ENV ARROW_FLIGHT=ON \ + ARROW_PYTHON=ON \ + ARROW_S3=ON \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ ARROW_BUILD_UTILITIES=OFF \ ARROW_USE_GLOG=OFF \ + CMAKE_UNITY_BUILD=ON \ diff --git a/ci/docker/linux-apt-lint.dockerfile b/ci/docker/linux-apt-lint.dockerfile index 4be9fcc7f62..04646585322 100644 --- a/ci/docker/linux-apt-lint.dockerfile +++ b/ci/docker/linux-apt-lint.dockerfile @@ -35,9 +35,48 @@ RUN apt-get update && \ python3-dev \ python3-pip \ ruby \ + apt-transport-https \ + software-properties-common \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +ARG r=4.1 +RUN apt-key adv \ + --keyserver keyserver.ubuntu.com \ + --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ + # NOTE: R 3.5 and 3.6 are available in the repos with -cran35 suffix + # for trusty, xenial, bionic, and eoan (as of May 2020) + # -cran40 has 4.0 versions for bionic and focal + # R 3.2, 3.3, 3.4 are available without the suffix but only for trusty and xenial + # TODO: make sure OS version and R version are valid together and conditionally set repo suffix + # This is a hack to turn 3.6 into 35, and 4.0/4.1 into 40: + add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran'$(echo "${r}" | tr -d . 
| tr 6 5 | tr 1 0)'/' && \ + apt-get install -y \ + r-base=${r}* \ + r-recommended=${r}* \ + libxml2-dev + +# Ensure parallel R package installation, set CRAN repo mirror, +# and use pre-built binaries where possible +COPY ci/etc/rprofile /arrow/ci/etc/ +RUN cat /arrow/ci/etc/rprofile >> $(R RHOME)/etc/Rprofile.site +# Also ensure parallel compilation of C/C++ code +RUN echo "MAKEFLAGS=-j$(R -s -e 'cat(parallel::detectCores())')" >> $(R RHOME)/etc/Makeconf + + +COPY ci/scripts/r_deps.sh /arrow/ci/scripts/ +COPY r/DESCRIPTION /arrow/r/ +# We need to install Arrow's dependencies in order for lintr's namespace searching to work. +# This could be removed if lintr no longer loads the dependency namespaces (see issues/PRs below) +RUN /arrow/ci/scripts/r_deps.sh /arrow +# This fork has a number of changes that have PRs and Issues to resolve upstream: +# https://github.com/jimhester/lintr/pull/843 +# https://github.com/jimhester/lintr/pull/841 +# https://github.com/jimhester/lintr/pull/845 +# https://github.com/jimhester/lintr/issues/842 +# https://github.com/jimhester/lintr/issues/846 +RUN R -e "remotes::install_github('jonkeane/lintr@arrow-branch')" + # Docker linter COPY --from=hadolint /bin/hadolint /usr/bin/hadolint @@ -45,25 +84,12 @@ COPY --from=hadolint /bin/hadolint /usr/bin/hadolint COPY ci/scripts/install_iwyu.sh /arrow/ci/scripts/ RUN arrow/ci/scripts/install_iwyu.sh /tmp/iwyu /usr/local ${clang_tools} -# Rust linter -ARG rust=nightly-2019-09-25 -RUN curl https://sh.rustup.rs -sSf | \ - sh -s -- --default-toolchain stable -y -ENV PATH /root/.cargo/bin:$PATH -RUN rustup install ${rust} && \ - rustup default ${rust} && \ - rustup component add rustfmt - # Use python3 by default in scripts RUN ln -s /usr/bin/python3 /usr/local/bin/python && \ ln -s /usr/bin/pip3 /usr/local/bin/pip -COPY dev/archery/requirements.txt \ - dev/archery/requirements-lint.txt \ - /arrow/dev/archery/ -RUN pip install \ - -r arrow/dev/archery/requirements.txt \ - -r 
arrow/dev/archery/requirements-lint.txt +COPY dev/archery/setup.py /arrow/dev/archery/ +RUN pip install -e arrow/dev/archery[lint] ENV LC_ALL=C.UTF-8 \ LANG=C.UTF-8 diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index f47044e334b..97029ce62ad 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -19,6 +19,9 @@ ARG base FROM ${base} ARG arch +ARG tz="UTC" +ENV TZ=${tz} + # Build R # [1] https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-18-04 # [2] https://linuxize.com/post/how-to-install-r-on-ubuntu-18-04/#installing-r-packages-from-cran @@ -36,10 +39,11 @@ RUN apt-get update -y && \ # -cran40 has 4.0 versions for bionic and focal # R 3.2, 3.3, 3.4 are available without the suffix but only for trusty and xenial # TODO: make sure OS version and R version are valid together and conditionally set repo suffix - # This is a hack to turn 3.6 into 35 and 4.0 into 40: - add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran'$(echo "${r}" | tr -d . | tr 6 5)'/' && \ + # This is a hack to turn 3.6 into 35, and 4.0/4.1 into 40: + add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran'$(echo "${r}" | tr -d . 
| tr 6 5 | tr 1 0)'/' && \ apt-get install -y \ r-base=${r}* \ + r-recommended=${r}* \ # system libs needed by core R packages libxml2-dev \ libgit2-dev \ @@ -60,6 +64,16 @@ RUN apt-get update -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +ARG gcc_version="" +RUN if [ "${gcc_version}" != "" ]; then \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${gcc_version} 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${gcc_version} 100 && \ + update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30 && \ + update-alternatives --set cc /usr/bin/gcc && \ + update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30 && \ + update-alternatives --set c++ /usr/bin/g++; \ + fi + # Ensure parallel R package installation, set CRAN repo mirror, # and use pre-built binaries where possible COPY ci/etc/rprofile /arrow/ci/etc/ diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile index ac414829d42..a501d69955c 100644 --- a/ci/docker/linux-r.dockerfile +++ b/ci/docker/linux-r.dockerfile @@ -30,6 +30,9 @@ ENV ARROW_R_DEV=${r_dev} ARG devtoolset_version=-1 ENV DEVTOOLSET_VERSION=${devtoolset_version} +ARG tz="UTC" +ENV TZ=${tz} + # Make sure R is on the path for the R-hub devel versions (where RPREFIX is set in its dockerfile) ENV PATH "${RPREFIX}/bin:${PATH}" diff --git a/ci/docker/python-wheel-manylinux-201x.dockerfile b/ci/docker/python-wheel-manylinux-201x.dockerfile index 19246a46764..ae1b0a7767c 100644 --- a/ci/docker/python-wheel-manylinux-201x.dockerfile +++ b/ci/docker/python-wheel-manylinux-201x.dockerfile @@ -58,7 +58,9 @@ RUN git clone https://github.com/microsoft/vcpkg /opt/vcpkg && \ ln -s /opt/vcpkg/vcpkg /usr/bin/vcpkg # Patch ports files as needed -COPY ci/vcpkg arrow/ci/vcpkg +COPY ci/vcpkg/*.patch \ + ci/vcpkg/*linux*.cmake \ + arrow/ci/vcpkg/ RUN cd /opt/vcpkg && git apply --ignore-whitespace /arrow/ci/vcpkg/ports.patch ARG build_type=release diff --git 
a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2017.dockerfile index 0f66a20396e..ebf51d75d29 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2017.dockerfile @@ -27,14 +27,19 @@ RUN choco install --no-progress -r -y cmake --installargs 'ADD_CMAKE_TO_PATH=Sys RUN setx path "%path%;C:\Program Files\Git\usr\bin" # Install vcpkg +# +# Compiling vcpkg itself from a git tag doesn't work anymore since vcpkg has +# started to ship precompiled binaries for the vcpkg-tool. ARG vcpkg RUN git clone https://github.com/Microsoft/vcpkg && \ - git -C vcpkg checkout %vcpkg% && \ - vcpkg\bootstrap-vcpkg.bat -disableMetrics -win64 && \ - setx PATH "%PATH%;C:\vcpkg" + vcpkg\bootstrap-vcpkg.bat -disableMetrics && \ + setx PATH "%PATH%;C:\vcpkg" && \ + git -C vcpkg checkout %vcpkg% # Patch ports files as needed -COPY ci/vcpkg arrow/ci/vcpkg +COPY ci/vcpkg/*.patch \ + ci/vcpkg/*windows*.cmake \ + arrow/ci/vcpkg/ RUN cd vcpkg && git apply --ignore-whitespace C:/arrow/ci/vcpkg/ports.patch # Configure vcpkg and install dependencies @@ -42,12 +47,12 @@ RUN cd vcpkg && git apply --ignore-whitespace C:/arrow/ci/vcpkg/ports.patch # statements but bash notation in ENV statements # VCPKG_FORCE_SYSTEM_BINARIES=1 spare around ~750MB of image size if the system # cmake's and ninja's versions are recent enough -COPY ci/vcpkg arrow/ci/vcpkg ARG build_type=release ENV CMAKE_BUILD_TYPE=${build_type} \ VCPKG_OVERLAY_TRIPLETS=C:\\arrow\\ci\\vcpkg \ VCPKG_DEFAULT_TRIPLET=x64-windows-static-md-${build_type} \ VCPKG_FEATURE_FLAGS=-manifests + RUN vcpkg install --clean-after-build \ abseil \ aws-sdk-cpp[config,cognito-identity,core,identity-management,s3,sts,transfer] \ diff --git a/ci/docker/ubuntu-18.04-cpp.dockerfile b/ci/docker/ubuntu-18.04-cpp.dockerfile index 4b855b52610..0c05ac4ee6b 100644 --- a/ci/docker/ubuntu-18.04-cpp.dockerfile +++ b/ci/docker/ubuntu-18.04-cpp.dockerfile @@ -69,6 +69,7 @@ RUN apt-get update 
-y -q && \ libboost-system-dev \ libbrotli-dev \ libbz2-dev \ + libc-ares-dev \ libcurl4-openssl-dev \ libgflags-dev \ libgoogle-glog-dev \ @@ -78,8 +79,6 @@ RUN apt-get update -y -q && \ libre2-dev \ libsnappy-dev \ libssl-dev \ - libutf8proc-dev \ - libzstd-dev \ ninja-build \ pkg-config \ protobuf-compiler \ @@ -96,10 +95,11 @@ RUN apt-get update -y -q && \ # - libgtest-dev only provide sources # - libprotobuf-dev only provide sources # - thrift is too old +# - utf8proc is too old(v2.1.0) # - s3 tests would require boost-asio that is included since Boost 1.66.0 ENV ARROW_BUILD_TESTS=ON \ - ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_DATASET=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_FLIGHT=OFF \ ARROW_GANDIVA=ON \ ARROW_HDFS=ON \ @@ -122,7 +122,9 @@ ENV ARROW_BUILD_TESTS=ON \ AWSSDK_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ - PARQUET_BUILD_EXECUTABLES=ON \ PARQUET_BUILD_EXAMPLES=ON \ + PARQUET_BUILD_EXECUTABLES=ON \ PATH=/usr/lib/ccache/:$PATH \ - Thrift_SOURCE=BUNDLED + Thrift_SOURCE=BUNDLED \ + utf8proc_SOURCE=BUNDLED \ + zstd_SOURCE=BUNDLED diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 3a37ace1381..c2a468d9e35 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -71,8 +71,9 @@ RUN apt-get update -y -q && \ libboost-system-dev \ libbrotli-dev \ libbz2-dev \ - libgflags-dev \ + libc-ares-dev \ libcurl4-openssl-dev \ + libgflags-dev \ libgoogle-glog-dev \ liblz4-dev \ libprotobuf-dev \ @@ -126,10 +127,13 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_SNAPPY=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ + ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ + gRPC_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PARQUET_BUILD_EXAMPLES=ON \ PARQUET_BUILD_EXECUTABLES=ON \ PATH=/usr/lib/ccache/:$PATH \ + Protobuf_SOURCE=BUNDLED \ PYTHON=python3 diff --git a/ci/docker/ubuntu-20.10-cpp.dockerfile 
b/ci/docker/ubuntu-20.10-cpp.dockerfile new file mode 100644 index 00000000000..6cefecfd678 --- /dev/null +++ b/ci/docker/ubuntu-20.10-cpp.dockerfile @@ -0,0 +1,138 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG base=amd64/ubuntu:20.10 +FROM ${base} +ARG arch + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN echo "debconf debconf/frontend select Noninteractive" | \ + debconf-set-selections + +# Installs LLVM toolchain, for Gandiva and testing other compilers +# +# Note that this is installed before the base packages to improve iteration +# while debugging package list with docker build. 
+ARG clang_tools +ARG llvm +RUN if [ "${llvm}" -gt "10" ]; then \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + wget && \ + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ + echo "deb https://apt.llvm.org/groovy/ llvm-toolchain-groovy-${llvm} main" > \ + /etc/apt/sources.list.d/llvm.list && \ + if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -gt 10 ]; then \ + echo "deb https://apt.llvm.org/groovy/ llvm-toolchain-groovy-${clang_tools} main" > \ + /etc/apt/sources.list.d/clang-tools.list; \ + fi \ + fi && \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + clang-${clang_tools} \ + clang-${llvm} \ + clang-format-${clang_tools} \ + clang-tidy-${clang_tools} \ + llvm-${llvm}-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Installs C++ toolchain and dependencies +RUN apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + autoconf \ + ca-certificates \ + ccache \ + cmake \ + g++ \ + gcc \ + gdb \ + git \ + libbenchmark-dev \ + libboost-filesystem-dev \ + libboost-system-dev \ + libbrotli-dev \ + libbz2-dev \ + libc-ares-dev \ + libcurl4-openssl-dev \ + libgflags-dev \ + libgoogle-glog-dev \ + libgrpc++-dev \ + liblz4-dev \ + libprotobuf-dev \ + libprotoc-dev \ + libre2-dev \ + libsnappy-dev \ + libssl-dev \ + libthrift-dev \ + libutf8proc-dev \ + libzstd-dev \ + make \ + ninja-build \ + pkg-config \ + protobuf-compiler \ + protobuf-compiler-grpc \ + rapidjson-dev \ + tzdata \ + wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +COPY ci/scripts/install_minio.sh \ + /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local + +# Prioritize system packages and local installation +# The following dependencies will be downloaded due to missing/invalid packages +# provided by the distribution: +# - libc-ares-dev does not install CMake config files 
+# - flatbuffer is not packaged +# - libgtest-dev only provide sources +# - libprotobuf-dev only provide sources +ENV ARROW_BUILD_TESTS=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ + ARROW_DATASET=ON \ + ARROW_FLIGHT=OFF \ + ARROW_GANDIVA=ON \ + ARROW_HDFS=ON \ + ARROW_HOME=/usr/local \ + ARROW_INSTALL_NAME_RPATH=OFF \ + ARROW_NO_DEPRECATED_API=ON \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_PLASMA=ON \ + ARROW_S3=ON \ + ARROW_USE_ASAN=OFF \ + ARROW_USE_CCACHE=ON \ + ARROW_USE_UBSAN=OFF \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ + GTest_SOURCE=BUNDLED \ + ORC_SOURCE=BUNDLED \ + PARQUET_BUILD_EXAMPLES=ON \ + PARQUET_BUILD_EXECUTABLES=ON \ + PATH=/usr/lib/ccache/:$PATH \ + PYTHON=python3 diff --git a/ci/docker/ubuntu-21.04-cpp.dockerfile b/ci/docker/ubuntu-21.04-cpp.dockerfile new file mode 100644 index 00000000000..18c377811bc --- /dev/null +++ b/ci/docker/ubuntu-21.04-cpp.dockerfile @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG base=amd64/ubuntu:20.04 +FROM ${base} +ARG arch + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN echo "debconf debconf/frontend select Noninteractive" | \ + debconf-set-selections + +# Installs LLVM toolchain, for Gandiva and testing other compilers +# +# Note that this is installed before the base packages to improve iteration +# while debugging package list with docker build. +ARG clang_tools +ARG llvm +RUN if [ "${llvm}" -gt "10" ]; then \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + apt-transport-https \ + ca-certificates \ + gnupg \ + wget && \ + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ + echo "deb https://apt.llvm.org/hirsute/ llvm-toolchain-hirsute-${llvm} main" > \ + /etc/apt/sources.list.d/llvm.list && \ + if [ "${clang_tools}" != "${llvm}" -a "${clang_tools}" -gt 10 ]; then \ + echo "deb https://apt.llvm.org/hirsute/ llvm-toolchain-hirsute-${clang_tools} main" > \ + /etc/apt/sources.list.d/clang-tools.list; \ + fi \ + fi && \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + clang-${clang_tools} \ + clang-${llvm} \ + clang-format-${clang_tools} \ + clang-tidy-${clang_tools} \ + llvm-${llvm}-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Installs C++ toolchain and dependencies +RUN apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + autoconf \ + ca-certificates \ + ccache \ + cmake \ + gdb \ + git \ + libbenchmark-dev \ + libboost-filesystem-dev \ + libboost-system-dev \ + libbrotli-dev \ + libbz2-dev \ + libc-ares-dev \ + libcurl4-openssl-dev \ + libgflags-dev \ + libgoogle-glog-dev \ + libgrpc++-dev \ + liblz4-dev \ + libprotobuf-dev \ + libprotoc-dev \ + libre2-dev \ + libsnappy-dev \ + libssl-dev \ + libthrift-dev \ + libutf8proc-dev \ + libzstd-dev \ + make \ + ninja-build \ + pkg-config \ + protobuf-compiler \ + protobuf-compiler-grpc \ + rapidjson-dev \ + tzdata \ + wget && \ + apt-get clean && \ + 
rm -rf /var/lib/apt/lists* + +COPY ci/scripts/install_minio.sh \ + /arrow/ci/scripts/ +RUN /arrow/ci/scripts/install_minio.sh ${arch} linux latest /usr/local + +# Prioritize system packages and local installation +# The following dependencies will be downloaded due to missing/invalid packages +# provided by the distribution: +# - libc-ares-dev does not install CMake config files +# - flatbuffer is not packaged +# - libgtest-dev only provide sources +# - libprotobuf-dev only provide sources +ENV ARROW_BUILD_TESTS=ON \ + ARROW_DEPENDENCY_SOURCE=SYSTEM \ + ARROW_DATASET=ON \ + ARROW_FLIGHT=OFF \ + ARROW_GANDIVA=ON \ + ARROW_HDFS=ON \ + ARROW_HOME=/usr/local \ + ARROW_INSTALL_NAME_RPATH=OFF \ + ARROW_NO_DEPRECATED_API=ON \ + ARROW_ORC=ON \ + ARROW_PARQUET=ON \ + ARROW_PLASMA=ON \ + ARROW_S3=ON \ + ARROW_USE_ASAN=OFF \ + ARROW_USE_CCACHE=ON \ + ARROW_USE_UBSAN=OFF \ + ARROW_WITH_BROTLI=ON \ + ARROW_WITH_BZ2=ON \ + ARROW_WITH_LZ4=ON \ + ARROW_WITH_SNAPPY=ON \ + ARROW_WITH_ZLIB=ON \ + ARROW_WITH_ZSTD=ON \ + AWSSDK_SOURCE=BUNDLED \ + GTest_SOURCE=BUNDLED \ + ORC_SOURCE=BUNDLED \ + PARQUET_BUILD_EXAMPLES=ON \ + PARQUET_BUILD_EXECUTABLES=ON \ + PATH=/usr/lib/ccache/:$PATH \ + PYTHON=python3 + +ARG gcc_version="" +RUN if [ "${gcc_version}" = "" ]; then \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + g++ \ + gcc; \ + else \ + if [ "${gcc_version}" -gt "10" ]; then \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends software-properties-common && \ + add-apt-repository ppa:ubuntu-toolchain-r/volatile; \ + fi; \ + apt-get update -y -q && \ + apt-get install -y -q --no-install-recommends \ + g++-${gcc_version} \ + gcc-${gcc_version} && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${gcc_version} 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${gcc_version} 100 && \ + update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \ + update-alternatives --set cc /usr/bin/gcc && \ 
+ update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 && \ + update-alternatives --set c++ /usr/bin/g++; \ + fi \ No newline at end of file diff --git a/ci/etc/rprofile b/ci/etc/rprofile index 524eb50aee0..229a0101a25 100644 --- a/ci/etc/rprofile +++ b/ci/etc/rprofile @@ -1,51 +1,53 @@ -.pick_cran <- function() { - # Return a CRAN repo URL, preferring RSPM binaries if available for this OS - rspm_template <- "https://packagemanager.rstudio.com/cran/__linux__/%s/latest" - supported_os <- c("xenial", "bionic", "centos7", "opensuse42", "opensuse15") - - if (nzchar(Sys.which("lsb_release"))) { - os <- tolower(system("lsb_release -cs", intern = TRUE)) - if (os %in% supported_os) { - return(sprintf(rspm_template, os)) + local({ + .pick_cran <- function() { + # Return a CRAN repo URL, preferring RSPM binaries if available for this OS + rspm_template <- "https://packagemanager.rstudio.com/cran/__linux__/%s/latest" + supported_os <- c("focal", "xenial", "bionic", "centos7", "centos8", "opensuse42", "opensuse15", "opensuse152") + + if (nzchar(Sys.which("lsb_release"))) { + os <- tolower(system("lsb_release -cs", intern = TRUE)) + if (os %in% supported_os) { + return(sprintf(rspm_template, os)) + } } - } - if (file.exists("/etc/os-release")) { - os_release <- readLines("/etc/os-release") - vals <- sub("^.*=(.*)$", "\\1", os_release) - os <- intersect(vals, supported_os) - if (length(os)) { - # e.g. "bionic" - return(sprintf(rspm_template, os)) - } else { - names(vals) <- sub("^(.*)=.*$", "\\1", os_release) - if (vals["ID"] == "opensuse") { - version <- sub('^"?([0-9]+).*"?.*$', "\\1", vals["VERSION_ID"]) - os <- paste0("opensuse", version) - if (os %in% supported_os) { - return(sprintf(rspm_template, os)) + if (file.exists("/etc/os-release")) { + os_release <- readLines("/etc/os-release") + vals <- sub("^.*=(.*)$", "\\1", os_release) + os <- intersect(vals, supported_os) + if (length(os)) { + # e.g. 
"bionic" + return(sprintf(rspm_template, os)) + } else { + names(vals) <- sub("^(.*)=.*$", "\\1", os_release) + if (vals["ID"] == "opensuse") { + version <- sub('^"?([0-9]+).*"?.*$', "\\1", vals["VERSION_ID"]) + os <- paste0("opensuse", version) + if (os %in% supported_os) { + return(sprintf(rspm_template, os)) + } } } } - } - if (file.exists("/etc/system-release")) { - # Something like "CentOS Linux release 7.7.1908 (Core)" - system_release <- tolower(utils::head(readLines("/etc/system-release"), 1)) - # Extract from that the distro and the major version number - os <- sub("^([a-z]+) .* ([0-9]+).*$", "\\1\\2", system_release) - if (os %in% supported_os) { - return(sprintf(rspm_template, os)) + if (file.exists("/etc/system-release")) { + # Something like "CentOS Linux release 7.7.1908 (Core)" + system_release <- tolower(utils::head(readLines("/etc/system-release"), 1)) + # Extract from that the distro and the major version number + os <- sub("^([a-z]+) .* ([0-9]+).*$", "\\1\\2", system_release) + if (os %in% supported_os) { + return(sprintf(rspm_template, os)) + } } + + return("https://cloud.r-project.org") } - - return("https://cloud.r-project.org") -} - -options( - Ncpus = parallel::detectCores(), - repos = tryCatch(.pick_cran(), error = function(e) "https://cloud.r-project.org"), - HTTPUserAgent = sprintf( - 'R/%s R (%s)', - getRversion(), - paste(getRversion(), R.version$platform, R.version$arch, R.version$os) + + options( + Ncpus = parallel::detectCores(), + repos = tryCatch(.pick_cran(), error = function(e) "https://cloud.r-project.org"), + HTTPUserAgent = sprintf( + 'R/%s R (%s)', + getRversion(), + paste(getRversion(), R.version$platform, R.version$arch, R.version$os) + ) ) -) +}) diff --git a/ci/etc/valgrind-cran.supp b/ci/etc/valgrind-cran.supp new file mode 100644 index 00000000000..4d292202608 --- /dev/null +++ b/ci/etc/valgrind-cran.supp @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +{ + # `testthat::skip()`s cause a valgrind error that does not show up on CRAN. + + Memcheck:Cond + fun:gregexpr_Regexc + fun:do_regexpr + fun:bcEval + fun:Rf_eval + fun:R_execClosure + fun:Rf_applyClosure + fun:bcEval + fun:Rf_eval + fun:forcePromise + fun:FORCE_PROMISE + fun:getvar + fun:bcEval +} diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index c5b55eef42a..56d70d83daf 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=3.0.0.9000 +pkgver=5.0.0.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 8a1e4f32f3a..46845d0e623 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -41,6 +41,11 @@ if [ "${ARROW_USE_CCACHE}" == "ON" ]; then ccache -s fi +if [ "${ARROW_USE_TSAN}" == "ON" ] && [ ! 
-x "${ASAN_SYMBOLIZER_PATH}" ]; then + echo -e "Invalid value for \$ASAN_SYMBOLIZER_PATH: ${ASAN_SYMBOLIZER_PATH}" + exit 1 +fi + mkdir -p ${build_dir} pushd ${build_dir} @@ -59,6 +64,7 @@ cmake -G "${CMAKE_GENERATOR:-Ninja}" \ -DARROW_CUDA=${ARROW_CUDA:-OFF} \ -DARROW_CXXFLAGS=${ARROW_CXXFLAGS:-} \ -DARROW_DATASET=${ARROW_DATASET:-ON} \ + -DARROW_ENGINE=${ARROW_ENGINE:-ON} \ -DARROW_DEPENDENCY_SOURCE=${ARROW_DEPENDENCY_SOURCE:-AUTO} \ -DARROW_EXTRA_ERROR_CONTEXT=${ARROW_EXTRA_ERROR_CONTEXT:-OFF} \ -DARROW_ENABLE_TIMING_TESTS=${ARROW_ENABLE_TIMING_TESTS:-ON} \ diff --git a/ci/scripts/docs_build.sh b/ci/scripts/docs_build.sh index a0d926a335e..e6ee768ee87 100755 --- a/ci/scripts/docs_build.sh +++ b/ci/scripts/docs_build.sh @@ -27,8 +27,10 @@ export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 export CFLAGS="-DARROW_NO_DEPRECATED_API" export CXXFLAGS="-DARROW_NO_DEPRECATED_API" -# Prose and Python -sphinx-build -b html ${arrow_dir}/docs/source ${build_dir} +ncpus=$(python3 -c "import os; print(os.cpu_count())") + +# Sphinx docs +sphinx-build -b html -j ${ncpus} ${arrow_dir}/docs/source ${build_dir} # C++ - original doxygen # rsync -a ${arrow_dir}/cpp/apidoc/ ${build_dir}/cpp diff --git a/ci/scripts/install_python.sh b/ci/scripts/install_python.sh new file mode 100755 index 00000000000..088da817676 --- /dev/null +++ b/ci/scripts/install_python.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -eu + +declare -A platforms +platforms=([windows]=Windows + [macos]=MacOSX + [linux]=Linux) + +declare -A versions +versions=([3.6]=3.6.8 + [3.7]=3.7.9 + [3.8]=3.8.10 + [3.9]=3.9.6) + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +elif [[ -z ${platforms[$1]} ]]; then + echo "Unexpected platform: ${1}" + exit 1 +fi + +platform=${platforms[$1]} +version=$2 +full_version=${versions[$2]} + +if [ $platform = "MacOSX" ]; then + echo "Downloading Python installer..." + if [ "$(uname -m)" = "arm64" ]; then + fname="python-${full_version}-macos11.pkg" + else + fname="python-${full_version}-macosx10.9.pkg" + fi + wget "https://www.python.org/ftp/python/${full_version}/${fname}" + + echo "Installing Python..." + installer -pkg $fname -target / + rm $fname + + echo "Installing Pip..." 
+ python="/Library/Frameworks/Python.framework/Versions/${version}/bin/python${version}" + pip="${python} -m pip" + + $python -m ensurepip + $pip install -U pip setuptools virtualenv +else + echo "Unsupported platform: $platform" +fi diff --git a/ci/scripts/install_turbodbc.sh b/ci/scripts/install_turbodbc.sh index a71520bebf4..3e644a3e27a 100755 --- a/ci/scripts/install_turbodbc.sh +++ b/ci/scripts/install_turbodbc.sh @@ -35,3 +35,9 @@ elif [ "${turbodbc}" = "latest" ]; then else git -C "${target}" checkout ${turbodbc}; fi + +pushd ${target} +wget -q https://github.com/pybind/pybind11/archive/v2.6.2.tar.gz +tar xvf v2.6.2.tar.gz +mv pybind11-2.6.2 pybind11 +popd diff --git a/ci/scripts/install_vcpkg.sh b/ci/scripts/install_vcpkg.sh new file mode 100755 index 00000000000..fe99a7fea2f --- /dev/null +++ b/ci/scripts/install_vcpkg.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -e + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +vcpkg_version=$1 +vcpkg_destination=$2 +vcpkg_patch=$(realpath $(dirname "${0}")/../vcpkg/ports.patch) + +git clone --depth 1 --branch ${vcpkg_version} https://github.com/microsoft/vcpkg ${vcpkg_destination} + +pushd ${vcpkg_destination} + +./bootstrap-vcpkg.sh -useSystemBinaries -disableMetrics +git apply --ignore-whitespace ${vcpkg_patch} +echo "Patch successfully applied!" + +popd diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index 5d2e71916ed..453561d3171 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -20,14 +20,17 @@ set -ex arrow_dir=${1} -source_dir=${1}/cpp -build_dir=${2}/cpp - gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration pip install -e $arrow_dir/dev/archery -archery integration --with-all --run-flight \ +# Rust can be enabled by exporting ARCHERY_INTEGRATION_WITH_RUST=1 +archery integration \ + --run-flight \ + --with-cpp=1 \ + --with-java=1 \ + --with-js=1 \ + --with-go=1 \ --gold-dirs=$gold_dir/0.14.1 \ --gold-dirs=$gold_dir/0.17.1 \ --gold-dirs=$gold_dir/1.0.0-bigendian \ diff --git a/ci/scripts/integration_turbodbc.sh b/ci/scripts/integration_turbodbc.sh index f56074358a6..f0fafd51228 100755 --- a/ci/scripts/integration_turbodbc.sh +++ b/ci/scripts/integration_turbodbc.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
-set -e +set -ex source_dir=${1} build_dir=${2}/turbodbc @@ -31,7 +31,7 @@ mkdir -p ${build_dir} pushd ${build_dir} cmake -DCMAKE_INSTALL_PREFIX=${ARROW_HOME} \ - -DCMAKE_CXX_FLAGS=${CXXFLAGS} \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS}" \ -DPYTHON_EXECUTABLE=$(which python) \ -GNinja \ ${source_dir} @@ -39,7 +39,7 @@ ninja install # TODO(ARROW-5074) export LD_LIBRARY_PATH="${ARROW_HOME}/lib:${LD_LIBRARY_PATH}" -export ODBCSYSINI="${source_dir}/travis/odbc/" +export ODBCSYSINI="${source_dir}/earthly/odbc/" service postgresql start ctest --output-on-failure diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 54cddb50372..5ef150fd1e7 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -25,59 +25,50 @@ with_docs=${3:-false} if [[ "$(uname -s)" == "Linux" ]] && [[ "$(uname -m)" == "s390x" ]]; then # Since some files for s390_64 are not available at maven central, - # download pre-build files from bintray and install them explicitly + # download pre-build files from Artifactory and install them explicitly mvn_install="mvn install:install-file" wget="wget" - bintray_base_url="https://dl.bintray.com/apache/arrow" + artifactory_base_url="https://apache.jfrog.io/artifactory/arrow" - bintray_dir="flatc-binary" - group="com.github.icexelloss" - artifact="flatc-linux-s390_64" - ver="1.9.0" - extension="exe" - target=${artifact}-${ver}.${extension} - ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} - ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dpackaging=${extension} -Dfile=$(pwd)/${target} - - bintray_dir="protoc-binary" + artifactory_dir="protoc-binary" group="com.google.protobuf" artifact="protoc" ver="3.7.1" classifier="linux-s390_64" extension="exe" target=${artifact}-${ver}-${classifier}.${extension} - ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/${target} ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} 
-Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} # protoc requires libprotoc.so.18 libprotobuf.so.18 - ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/libprotoc.so.18 - ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/libprotobuf.so.18 + ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/libprotoc.so.18 + ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/libprotobuf.so.18 mkdir -p ${ARROW_HOME}/lib cp lib*.so.18 ${ARROW_HOME}/lib export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${ARROW_HOME}/lib - bintray_dir="protoc-gen-grpc-java-binary" + artifactory_dir="protoc-gen-grpc-java-binary" group="io.grpc" artifact="protoc-gen-grpc-java" ver="1.30.2" classifier="linux-s390_64" extension="exe" target=${artifact}-${ver}-${classifier}.${extension} - ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/${target} ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} - bintray_dir="netty-binary" + artifactory_dir="netty-binary" group="io.netty" artifact="netty-transport-native-unix-common" ver="4.1.48.Final" classifier="linux-s390_64" extension="jar" target=${artifact}-${ver}-${classifier}.${extension} - ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/${target} ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} -Dfile=$(pwd)/${target} artifact="netty-transport-native-epoll" extension="jar" target=${artifact}-${ver}-${classifier}.${extension} - ${wget} ${bintray_base_url}/${bintray_dir}/${ver}/${target} + ${wget} ${artifactory_base_url}/${artifactory_dir}/${ver}/${target} ${mvn_install} -DgroupId=${group} -DartifactId=${artifact} -Dversion=${ver} -Dclassifier=${classifier} -Dpackaging=${extension} 
-Dfile=$(pwd)/${target} fi @@ -104,7 +95,8 @@ if [ "${ARROW_PLASMA}" = "ON" ]; then fi if [ "${with_docs}" == "true" ]; then - ${mvn} -Dcheckstyle.skip=true install site + # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 + ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false install site fi popd diff --git a/ci/scripts/java_jni_build.sh b/ci/scripts/java_jni_build.sh new file mode 100755 index 00000000000..b4ae48f3d9a --- /dev/null +++ b/ci/scripts/java_jni_build.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +arrow_dir=${1} +cpp_lib_dir=${2} +java_dist_dir=${3} + +export ARROW_TEST_DATA=${arrow_dir}/testing/data + +pushd ${arrow_dir}/java + +# build the entire project +mvn clean install -P arrow-jni -Darrow.cpp.build.dir=$cpp_lib_dir + +# copy all jars and pom files to the distribution folder +find . -name "*.jar" -exec echo {} \; -exec cp {} $java_dist_dir \; +find . 
-name "*.pom" -exec echo {} \; -exec cp {} $java_dist_dir \; + +popd diff --git a/ci/scripts/java_jni_macos_build.sh b/ci/scripts/java_jni_macos_build.sh new file mode 100755 index 00000000000..5c11ee97584 --- /dev/null +++ b/ci/scripts/java_jni_macos_build.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +arrow_dir=${1} +build_dir=${2} +# The directory where the final binaries will be stored when scripts finish +dist_dir=${3} + +echo "=== Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf ${build_dir} + +echo "=== Building Arrow C++ libraries ===" +: ${ARROW_BUILD_TESTS:=ON} +: ${ARROW_DATASET:=ON} +: ${ARROW_FILESYSTEM:=ON} +: ${ARROW_GANDIVA_JAVA:=ON} +: ${ARROW_GANDIVA:=ON} +: ${ARROW_ORC:=ON} +: ${ARROW_PARQUET:=ON} +: ${ARROW_PLASMA_JAVA_CLIENT:=ON} +: ${ARROW_PLASMA:=ON} +: ${ARROW_PYTHON:=OFF} +: ${CMAKE_BUILD_TYPE:=Release} +: ${CMAKE_UNITY_BUILD:=ON} + +export ARROW_TEST_DATA="${arrow_dir}/testing/data" +export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" +export AWS_EC2_METADATA_DISABLED=TRUE + +mkdir -p "${build_dir}" +pushd "${build_dir}" + +cmake \ + -DARROW_BOOST_USE_SHARED=OFF \ + -DARROW_BROTLI_USE_SHARED=OFF \ + -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS} \ + -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_BZ2_USE_SHARED=OFF \ + -DARROW_DATASET=${ARROW_DATASET} \ + -DARROW_FILESYSTEM=${ARROW_FILESYSTEM} \ + -DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA} \ + -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \ + -DARROW_GANDIVA=${ARROW_GANDIVA} \ + -DARROW_GFLAGS_USE_SHARED=OFF \ + -DARROW_GRPC_USE_SHARED=OFF \ + -DARROW_JNI=ON \ + -DARROW_LZ4_USE_SHARED=OFF \ + -DARROW_OPENSSL_USE_SHARED=OFF \ + -DARROW_ORC=${ARROW_ORC} \ + -DARROW_PARQUET=${ARROW_PARQUET} \ + -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT} \ + -DARROW_PLASMA=${ARROW_PLASMA} \ + -DARROW_PROTOBUF_USE_SHARED=OFF \ + -DARROW_PYTHON=${ARROW_PYTHON} \ + -DARROW_SNAPPY_USE_SHARED=OFF \ + -DARROW_THRIFT_USE_SHARED=OFF \ + -DARROW_UTF8PROC_USE_SHARED=OFF \ + -DARROW_ZSTD_USE_SHARED=OFF \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${build_dir} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ + -DPARQUET_BUILD_EXAMPLES=OFF \ + -DPARQUET_BUILD_EXECUTABLES=OFF \ + 
-DPARQUET_REQUIRE_ENCRYPTION=OFF \ + ${arrow_dir}/cpp +cmake --build . --target install + +if [ "${ARROW_BUILD_TESTS}" == "ON" ]; then + ctest +fi + +popd + +echo "=== Copying libraries to the distribution folder ===" +mkdir -p "${dist_dir}" +cp -L ${build_dir}/lib/libgandiva_jni.dylib ${dist_dir} +cp -L ${build_dir}/lib/libarrow_dataset_jni.dylib ${dist_dir} +cp -L ${build_dir}/lib/libarrow_orc_jni.dylib ${dist_dir} + +echo "=== Checking shared dependencies for libraries ===" + +pushd ${dist_dir} +archery linking check-dependencies \ + --allow libarrow_dataset_jni \ + --allow libarrow_orc_jni \ + --allow libc++ \ + --allow libgandiva_jni \ + --allow libncurses \ + --allow libSystem \ + --allow libz \ + libgandiva_jni.dylib \ + libarrow_dataset_jni.dylib \ + libarrow_orc_jni.dylib +popd diff --git a/ci/scripts/java_jni_manylinux_build.sh b/ci/scripts/java_jni_manylinux_build.sh new file mode 100755 index 00000000000..4d01c1c30d1 --- /dev/null +++ b/ci/scripts/java_jni_manylinux_build.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +arrow_dir=${1} +build_dir=${2} +# The directory where the final binaries will be stored when scripts finish +dist_dir=${3} + +echo "=== Clear output directories and leftovers ===" +# Clear output directories and leftovers +rm -rf ${build_dir} + +echo "=== Building Arrow C++ libraries ===" +: ${ARROW_DATASET:=ON} +: ${ARROW_GANDIVA:=ON} +: ${ARROW_GANDIVA_JAVA:=ON} +: ${ARROW_FILESYSTEM:=ON} +: ${ARROW_JEMALLOC:=ON} +: ${ARROW_RPATH_ORIGIN:=ON} +: ${ARROW_ORC:=ON} +: ${ARROW_PARQUET:=ON} +: ${ARROW_PLASMA:=ON} +: ${ARROW_PLASMA_JAVA_CLIENT:=ON} +: ${ARROW_PYTHON:=OFF} +: ${ARROW_BUILD_TESTS:=ON} +: ${CMAKE_BUILD_TYPE:=Release} +: ${CMAKE_UNITY_BUILD:=ON} +: ${VCPKG_FEATURE_FLAGS:=-manifests} +: ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}} +: ${GANDIVA_CXX_FLAGS:=-isystem;/opt/rh/devtoolset-9/root/usr/include/c++/9;-isystem;/opt/rh/devtoolset-9/root/usr/include/c++/9/x86_64-redhat-linux;-isystem;-lpthread} + +export ARROW_TEST_DATA="${arrow_dir}/testing/data" +export PARQUET_TEST_DATA="${arrow_dir}/cpp/submodules/parquet-testing/data" +export AWS_EC2_METADATA_DISABLED=TRUE + +mkdir -p "${build_dir}" +pushd "${build_dir}" + +cmake \ + -DARROW_BOOST_USE_SHARED=OFF \ + -DARROW_BROTLI_USE_SHARED=OFF \ + -DARROW_BUILD_SHARED=ON \ + -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS} \ + -DARROW_BUILD_UTILITIES=OFF \ + -DARROW_BZ2_USE_SHARED=OFF \ + -DARROW_DATASET=${ARROW_DATASET} \ + -DARROW_DEPENDENCY_SOURCE="VCPKG" \ + -DARROW_FILESYSTEM=${ARROW_FILESYSTEM} \ + -DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA} \ + -DARROW_GANDIVA_PC_CXX_FLAGS=${GANDIVA_CXX_FLAGS} \ + -DARROW_GANDIVA=${ARROW_GANDIVA} \ + -DARROW_GRPC_USE_SHARED=OFF \ + -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ + -DARROW_JNI=ON \ + -DARROW_LZ4_USE_SHARED=OFF \ + -DARROW_OPENSSL_USE_SHARED=OFF \ + -DARROW_ORC=${ARROW_ORC} \ + -DARROW_PARQUET=${ARROW_PARQUET} \ + -DARROW_PLASMA_JAVA_CLIENT=${ARROW_PLASMA_JAVA_CLIENT} \ + -DARROW_PLASMA=${ARROW_PLASMA} \ + 
-DARROW_PROTOBUF_USE_SHARED=OFF \ + -DARROW_PYTHON=${ARROW_PYTHON} \ + -DARROW_RPATH_ORIGIN=${ARROW_RPATH_ORIGIN} \ + -DARROW_SNAPPY_USE_SHARED=OFF \ + -DARROW_THRIFT_USE_SHARED=OFF \ + -DARROW_UTF8PROC_USE_SHARED=OFF \ + -DARROW_ZSTD_USE_SHARED=OFF \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_INSTALL_PREFIX=${build_dir} \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ + -DPARQUET_BUILD_EXAMPLES=OFF \ + -DPARQUET_BUILD_EXECUTABLES=OFF \ + -DPARQUET_REQUIRE_ENCRYPTION=OFF \ + -DPythonInterp_FIND_VERSION_MAJOR=3 \ + -DPythonInterp_FIND_VERSION=ON \ + -DVCPKG_MANIFEST_MODE=OFF \ + -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ + -GNinja \ + ${arrow_dir}/cpp +ninja install + +if [ $ARROW_BUILD_TESTS = "ON" ]; then + CTEST_OUTPUT_ON_FAILURE=1 ninja test +fi + +popd + +echo "=== Copying libraries to the distribution folder ===" +mkdir -p "${dist_dir}" +cp -L ${build_dir}/lib/libgandiva_jni.so ${dist_dir} +cp -L ${build_dir}/lib/libarrow_dataset_jni.so ${dist_dir} +cp -L ${build_dir}/lib/libarrow_orc_jni.so ${dist_dir} + +echo "=== Checking shared dependencies for libraries ===" + +pushd ${dist_dir} +archery linking check-dependencies \ + --allow ld-linux-x86-64 \ + --allow libc \ + --allow libdl \ + --allow libgcc_s \ + --allow libm \ + --allow libpthread \ + --allow librt \ + --allow libstdc++ \ + --allow libz \ + --allow linux-vdso \ + libgandiva_jni.so \ + libarrow_dataset_jni.so \ + libarrow_orc_jni.so +popd diff --git a/ci/scripts/msys2_setup.sh b/ci/scripts/msys2_setup.sh index cb6ca30a64e..991d98bbb4a 100755 --- a/ci/scripts/msys2_setup.sh +++ b/ci/scripts/msys2_setup.sh @@ -35,6 +35,7 @@ case "${target}" in packages+=(${MINGW_PACKAGE_PREFIX}-grpc) packages+=(${MINGW_PACKAGE_PREFIX}-gtest) packages+=(${MINGW_PACKAGE_PREFIX}-libutf8proc) + packages+=(${MINGW_PACKAGE_PREFIX}-libxml2) packages+=(${MINGW_PACKAGE_PREFIX}-llvm) packages+=(${MINGW_PACKAGE_PREFIX}-lz4) packages+=(${MINGW_PACKAGE_PREFIX}-ninja) diff --git 
a/ci/scripts/msys2_system_upgrade.sh b/ci/scripts/msys2_system_upgrade.sh index aecd3089332..646428fbb7a 100755 --- a/ci/scripts/msys2_system_upgrade.sh +++ b/ci/scripts/msys2_system_upgrade.sh @@ -19,26 +19,6 @@ set -eux -# https://www.msys2.org/news/#2020-06-29-new-packagers -msys2_repo_base_url=https://repo.msys2.org/msys -# Mirror -msys2_repo_base_url=https://sourceforge.net/projects/msys2/files/REPOS/MSYS2 -msys2_keyring_pkg=msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz -for suffix in "" ".sig"; do - curl \ - --location \ - --remote-name \ - --show-error \ - --silent \ - ${msys2_repo_base_url}/x86_64/${msys2_keyring_pkg}${suffix} -done -pacman-key --verify ${msys2_keyring_pkg}.sig -pacman \ - --noconfirm \ - --upgrade \ - ${msys2_keyring_pkg} - - pacman \ --noconfirm \ --refresh \ diff --git a/ci/scripts/python_sdist_test.sh b/ci/scripts/python_sdist_test.sh index 1388ca09e43..3dd7d7ddd5b 100755 --- a/ci/scripts/python_sdist_test.sh +++ b/ci/scripts/python_sdist_test.sh @@ -42,10 +42,16 @@ export PYARROW_WITH_DATASET=${ARROW_DATASET:-OFF} # unset ARROW_HOME # apt purge -y pkg-config +# ARROW-12619 +if command -v git &> /dev/null; then + echo "Git exists, remove it from PATH before executing this script." 
+ exit 1 +fi + if [ -n "${PYARROW_VERSION:-}" ]; then sdist="${arrow_dir}/python/dist/pyarrow-${PYARROW_VERSION}.tar.gz" else - sdist=$(ls "${arrow_dir}/python/dist/pyarrow-*.tar.gz" | sort -r | head -n1) + sdist=$(ls ${arrow_dir}/python/dist/pyarrow-*.tar.gz | sort -r | head -n1) fi ${PYTHON:-python} -m pip install ${sdist} diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 80a9cdef4a3..6e05af89a19 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -29,4 +29,4 @@ export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} # Enable some checks inside Python itself export PYTHONDEVMODE=1 -pytest -r s ${PYTEST_ARGS} --pyargs pyarrow +pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 7a021f70f74..82e0339c9d0 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -19,8 +19,9 @@ set -ex -source_dir=${1} -build_dir=${2} +arch=${1} +source_dir=${2} +build_dir=${3} echo "=== (${PYTHON_VERSION}) Clear output directories and leftovers ===" # Clear output directories and leftovers @@ -31,11 +32,32 @@ rm -rf ${source_dir}/python/repaired_wheels rm -rf ${source_dir}/python/pyarrow/*.so rm -rf ${source_dir}/python/pyarrow/*.so.* -echo "=== (${PYTHON_VERSION}) Set OSX SDK and C flags ===" -# Arrow is 64-bit-only at the moment -export CFLAGS="-fPIC -arch x86_64 ${CFLAGS//-arch i386/}" -export CXXFLAGS="-fPIC -arch x86_64 ${CXXFLAGS//-arch i386} -std=c++11" -export SDKROOT="$(xcrun --show-sdk-path)" +echo "=== (${PYTHON_VERSION}) Set SDK, C++ and Wheel flags ===" +export _PYTHON_HOST_PLATFORM="macosx-${MACOSX_DEPLOYMENT_TARGET}-${arch}" +export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-10.9} +export SDKROOT=${SDKROOT:-$(xcrun --sdk macosx --show-sdk-path)} + +if [ $arch = "arm64" ]; then + export CMAKE_OSX_ARCHITECTURES="arm64" +elif [ $arch = "x86_64" ]; then + export 
CMAKE_OSX_ARCHITECTURES="x86_64" +elif [ $arch = "universal2" ]; then + export CMAKE_OSX_ARCHITECTURES="x86_64;arm64" +else + echo "Unexpected architecture: $arch" + exit 1 +fi + +echo "=== (${PYTHON_VERSION}) Install Python build dependencies ===" +export PIP_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') +export PIP_TARGET_PLATFORM="macosx_${MACOSX_DEPLOYMENT_TARGET//./_}_${arch}" + +pip install \ + --only-binary=:all: \ + --target $PIP_SITE_PACKAGES \ + --platform $PIP_TARGET_PLATFORM \ + -r ${source_dir}/python/requirements-wheel-build.txt +pip install "delocate>=0.9" echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_DATASET:=ON} @@ -48,6 +70,7 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_PARQUET:=ON} : ${ARROW_PLASMA:=ON} : ${ARROW_S3:=ON} +: ${ARROW_SIMD_LEVEL:="SSE4_2"} : ${ARROW_TENSORFLOW:=ON} : ${ARROW_WITH_BROTLI:=ON} : ${ARROW_WITH_BZ2:=ON} @@ -57,30 +80,35 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_WITH_ZSTD:=ON} : ${CMAKE_BUILD_TYPE:=release} : ${CMAKE_GENERATOR:=Ninja} +: ${CMAKE_UNITY_BUILD:=ON} : ${VCPKG_FEATURE_FLAGS:=-manifests} : ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-osx-static-${CMAKE_BUILD_TYPE}}} mkdir -p ${build_dir}/build pushd ${build_dir}/build + cmake \ -DARROW_BUILD_SHARED=ON \ + -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 \ + -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} \ -DARROW_BUILD_STATIC=OFF \ -DARROW_BUILD_TESTS=OFF \ -DARROW_DATASET=${ARROW_DATASET} \ -DARROW_DEPENDENCY_SOURCE="VCPKG" \ -DARROW_DEPENDENCY_USE_SHARED=OFF \ - -DARROW_FLIGHT==${ARROW_FLIGHT} \ + -DARROW_FLIGHT=${ARROW_FLIGHT} \ -DARROW_GANDIVA=${ARROW_GANDIVA} \ -DARROW_HDFS=${ARROW_HDFS} \ -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ -DARROW_ORC=${ARROW_ORC} \ - -DARROW_PACKAGE_KIND="manylinux${MANYLINUX_VERSION}" \ + -DARROW_PACKAGE_KIND="python-wheel-macos" \ -DARROW_PARQUET=${ARROW_PARQUET} \ 
-DARROW_PLASMA=${ARROW_PLASMA} \ -DARROW_PYTHON=ON \ -DARROW_RPATH_ORIGIN=ON \ -DARROW_S3=${ARROW_S3} \ + -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL} \ -DARROW_TENSORFLOW=${ARROW_TENSORFLOW} \ -DARROW_USE_CCACHE=ON \ -DARROW_WITH_BROTLI=${ARROW_WITH_BROTLI} \ @@ -92,7 +120,7 @@ cmake \ -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_INSTALL_PREFIX=${build_dir}/install \ - -DCMAKE_UNITY_BUILD=ON \ + -DCMAKE_UNITY_BUILD=${CMAKE_UNITY_BUILD} \ -DOPENSSL_USE_STATIC_LIBS=ON \ -DVCPKG_MANIFEST_MODE=OFF \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} \ @@ -101,9 +129,6 @@ cmake \ cmake --build . --target install popd -# Check that we don't expose any unwanted symbols -# check_arrow_visibility - echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} export PYARROW_BUNDLE_ARROW_CPP=1 @@ -117,8 +142,11 @@ export PYARROW_WITH_ORC=${ARROW_ORC} export PYARROW_WITH_PARQUET=${ARROW_PARQUET} export PYARROW_WITH_PLASMA=${ARROW_PLASMA} export PYARROW_WITH_S3=${ARROW_S3} +export PYARROW_CMAKE_OPTIONS="-DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} -DARROW_SIMD_LEVEL=${ARROW_SIMD_LEVEL}" # PyArrow build configuration export PKG_CONFIG_PATH=/usr/lib/pkgconfig:${build_dir}/install/lib/pkgconfig +# Set PyArrow version explicitly +export SETUPTOOLS_SCM_PRETEND_VERSION=${PYARROW_VERSION} pushd ${source_dir}/python python setup.py bdist_wheel @@ -127,7 +155,11 @@ popd echo "=== (${PYTHON_VERSION}) Show dynamic libraries the wheel depend on ===" deps=$(delocate-listdeps ${source_dir}/python/dist/*.whl) -if echo $deps | grep -v "^@rpath/lib\(arrow\|gandiva\|parquet\|plasma\)"; then +if echo $deps | grep -v "^pyarrow/lib\(arrow\|gandiva\|parquet\|plasma\)"; then echo "There are non-bundled shared library dependencies." 
exit 1 fi + +# Move the verified wheels +mkdir -p ${source_dir}/python/repaired_wheels +mv ${source_dir}/python/dist/*.whl ${source_dir}/python/repaired_wheels/ diff --git a/ci/scripts/python_wheel_macos_test.sh b/ci/scripts/python_wheel_macos_test.sh deleted file mode 100755 index 6ac8576d484..00000000000 --- a/ci/scripts/python_wheel_macos_test.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -ex - -source_dir=${1} - -: ${ARROW_S3:=ON} - -export PYARROW_TEST_CYTHON=OFF -export PYARROW_TEST_DATASET=ON -export PYARROW_TEST_GANDIVA=OFF -export PYARROW_TEST_HDFS=ON -export PYARROW_TEST_ORC=ON -export PYARROW_TEST_PANDAS=ON -export PYARROW_TEST_PARQUET=ON -export PYARROW_TEST_PLASMA=ON -export PYARROW_TEST_S3=${ARROW_S3} -export PYARROW_TEST_TENSORFLOW=ON -export PYARROW_TEST_FLIGHT=ON - -export ARROW_TEST_DATA=${source_dir}/testing/data -export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data - -# Install the built wheels -pip install ${source_dir}/python/dist/*.whl - -# Test that the modules are importable -python -c " -import pyarrow -import pyarrow._hdfs -import pyarrow.csv -import pyarrow.dataset -import pyarrow.flight -import pyarrow.fs -import pyarrow.json -import pyarrow.orc -import pyarrow.parquet -import pyarrow.plasma -" - -if [ "${PYARROW_TEST_S3}" == "ON" ]; then - python -c "import pyarrow._s3fs" -fi - -# Install testing dependencies -pip install -r ${source_dir}/python/requirements-wheel-test.txt - -# Execute unittest -pytest -r s --pyargs pyarrow diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 83aa623b49b..312e1c3b9b7 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -87,7 +87,7 @@ cmake \ -DARROW_JEMALLOC=${ARROW_JEMALLOC} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC} \ -DARROW_ORC=${ARROW_ORC} \ - -DARROW_PACKAGE_KIND="manylinux${MANYLINUX_VERSION}" \ + -DARROW_PACKAGE_KIND="python-wheel-manylinux${MANYLINUX_VERSION}" \ -DARROW_PARQUET=${ARROW_PARQUET} \ -DARROW_PLASMA=${ARROW_PLASMA} \ -DARROW_PYTHON=ON \ diff --git a/ci/scripts/python_wheel_manylinux_test.sh b/ci/scripts/python_wheel_manylinux_test.sh deleted file mode 100755 index 21987748f73..00000000000 --- a/ci/scripts/python_wheel_manylinux_test.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) 
under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -x -set -o pipefail - -case $# in - 1) KIND="$1" - case $KIND in - imports|unittests) ;; - *) echo "Invalid argument: '${KIND}', valid options are 'imports', 'unittests'" - exit 1 - ;; - esac - ;; - *) echo "Usage: $0 imports|unittests" - exit 1 - ;; -esac - -export PYARROW_TEST_CYTHON=OFF -export PYARROW_TEST_DATASET=ON -export PYARROW_TEST_GANDIVA=OFF -export PYARROW_TEST_HDFS=ON -export PYARROW_TEST_ORC=ON -export PYARROW_TEST_PANDAS=ON -export PYARROW_TEST_PARQUET=ON -export PYARROW_TEST_PLASMA=ON -export PYARROW_TEST_S3=ON -export PYARROW_TEST_TENSORFLOW=ON -export PYARROW_TEST_FLIGHT=ON - -export ARROW_TEST_DATA=/arrow/testing/data -export PARQUET_TEST_DATA=/arrow/submodules/parquet-testing/data - -# Install the built wheels -pip install /arrow/python/repaired_wheels/*.whl - -if [ "${KIND}" == "imports" ]; then - # Test that the modules are importable - python -c " -import pyarrow -import pyarrow._hdfs -import pyarrow._s3fs -import pyarrow.csv -import pyarrow.dataset -import pyarrow.flight -import pyarrow.fs -import pyarrow.json -import pyarrow.orc -import pyarrow.parquet -import pyarrow.plasma" -elif [ "${KIND}" == "unittests" ]; then - # Execute unittest, test dependencies must be installed - pytest -r s --pyargs 
pyarrow -fi diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh new file mode 100755 index 00000000000..50d3ccb4f8e --- /dev/null +++ b/ci/scripts/python_wheel_unix_test.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -e +set -x +set -o pipefail + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +source_dir=${1} + +: ${ARROW_FLIGHT:=ON} +: ${ARROW_S3:=ON} +: ${CHECK_IMPORTS:=ON} +: ${CHECK_UNITTESTS:=ON} +: ${INSTALL_PYARROW:=ON} + +export PYARROW_TEST_CYTHON=OFF +export PYARROW_TEST_DATASET=ON +export PYARROW_TEST_FLIGHT=${ARROW_FLIGHT} +export PYARROW_TEST_GANDIVA=OFF +export PYARROW_TEST_HDFS=ON +export PYARROW_TEST_ORC=ON +export PYARROW_TEST_PANDAS=ON +export PYARROW_TEST_PARQUET=ON +export PYARROW_TEST_PLASMA=ON +export PYARROW_TEST_S3=${ARROW_S3} +export PYARROW_TEST_TENSORFLOW=ON + +export ARROW_TEST_DATA=${source_dir}/testing/data +export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data + +if [ "${INSTALL_PYARROW}" == "ON" ]; then + # Install the built wheels + pip install ${source_dir}/python/repaired_wheels/*.whl +fi + +if [ "${CHECK_IMPORTS}" == "ON" ]; then + # Test that the modules are importable + python -c " +import pyarrow +import pyarrow._hdfs +import pyarrow.csv +import pyarrow.dataset +import pyarrow.fs +import pyarrow.json +import pyarrow.orc +import pyarrow.parquet +import pyarrow.plasma +" + if [ "${PYARROW_TEST_S3}" == "ON" ]; then + python -c "import pyarrow._s3fs" + fi + if [ "${PYARROW_TEST_FLIGHT}" == "ON" ]; then + python -c "import pyarrow.flight" + fi +fi + +if [ "${CHECK_UNITTESTS}" == "ON" ]; then + # Install testing dependencies + pip install -U -r ${source_dir}/python/requirements-wheel-test.txt + # Execute unittest, test dependencies must be installed + python -c 'import pyarrow; pyarrow.create_library_symlinks()' + pytest -r s --pyargs pyarrow +fi diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 18c1b657b21..23be7f512d6 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -64,7 +64,7 @@ cmake ^ -DARROW_HDFS=%ARROW_HDFS% ^ -DARROW_MIMALLOC=%ARROW_MIMALLOC% ^ -DARROW_ORC=%ARROW_ORC% ^ - 
-DARROW_PACKAGE_KIND="wheel-windows" ^ + -DARROW_PACKAGE_KIND="python-wheel-windows" ^ -DARROW_PARQUET=%ARROW_PARQUET% ^ -DARROW_PYTHON=ON ^ -DARROW_S3=%ARROW_S3% ^ diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index 8352e586226..586fd58f651 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -19,6 +19,7 @@ set PYARROW_TEST_CYTHON=OFF set PYARROW_TEST_DATASET=ON +set PYARROW_TEST_FLIGHT=ON set PYARROW_TEST_GANDIVA=OFF set PYARROW_TEST_HDFS=ON set PYARROW_TEST_ORC=OFF @@ -27,7 +28,6 @@ set PYARROW_TEST_PARQUET=ON set PYARROW_TEST_PLASMA=OFF set PYARROW_TEST_S3=OFF set PYARROW_TEST_TENSORFLOW=ON -set PYARROW_TEST_FLIGHT=ON set ARROW_TEST_DATA=C:\arrow\testing\data set PARQUET_TEST_DATA=C:\arrow\submodules\parquet-testing\data diff --git a/ci/scripts/r_deps.sh b/ci/scripts/r_deps.sh index 7e9d2eac7a9..243a7efc9cf 100755 --- a/ci/scripts/r_deps.sh +++ b/ci/scripts/r_deps.sh @@ -26,6 +26,15 @@ pushd ${source_dir} # Install R package dependencies ${R_BIN} -e "install.packages('remotes'); remotes::install_cran(c('glue', 'rcmdcheck', 'sys'))" + +if [ ${R_BIN} = "RDsan" ]; then + # To prevent the build from timing out, let's prune some optional deps (and their possible version requirements) + ${R_BIN} -e 'd <- read.dcf("DESCRIPTION") + to_prune <- c("duckdb", "DBI", "dbplyr", "decor", "knitr", "rmarkdown", "pkgload", "reticulate") + pattern <- paste0("\\n?", to_prune, " (\\\\(.*\\\\))?,?", collapse = "|") + d[,"Suggests"] <- gsub(pattern, "", d[,"Suggests"]) + write.dcf(d, "DESCRIPTION")' +fi ${R_BIN} -e "remotes::install_deps(dependencies = TRUE)" popd diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 3e553fe9edd..2b9bc03bea0 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -37,6 +37,9 @@ if [ "$RHUB_PLATFORM" = "linux-x86_64-fedora-clang" ]; then dnf install -y libcxx-devel sed -i.bak -E -e 
's/(CXX1?1? =.*)/\1 -stdlib=libc++/g' $(${R_BIN} RHOME)/etc/Makeconf rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak + + sed -i.bak -E -e 's/(CXXFLAGS = )(.*)/\1 -g -O3 -Wall -pedantic -frtti -fPIC/' $(${R_BIN} RHOME)/etc/Makeconf + rm -rf $(${R_BIN} RHOME)/etc/Makeconf.bak fi # Special hacking to try to reproduce quirks on centos using non-default build diff --git a/ci/scripts/r_revdepcheck.sh b/ci/scripts/r_revdepcheck.sh new file mode 100755 index 00000000000..79ace9ca09d --- /dev/null +++ b/ci/scripts/r_revdepcheck.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +: ${R_BIN:=R} + +source_dir=${1}/r + +# cpp building dependencies +apt install -y cmake + +# system dependencies needed for arrow's reverse dependencies +apt install -y libxml2-dev \ + libfontconfig1-dev \ + libcairo2-dev \ + libglpk-dev \ + libmysqlclient-dev \ + unixodbc-dev \ + libpq-dev \ + coinor-libsymphony-dev \ + coinor-libcgl-dev \ + coinor-symphony \ + libzmq3-dev \ + libudunits2-dev \ + libgdal-dev \ + libgeos-dev \ + libproj-dev + +pushd ${source_dir} + +printenv + +: ${TEST_R_WITH_ARROW:=TRUE} +export TEST_R_WITH_ARROW=$TEST_R_WITH_ARROW + +# By default, aws-sdk tries to contact a non-existing local ip host +# to retrieve metadata. Disable this so that S3FileSystem tests run faster. +export AWS_EC2_METADATA_DISABLED=TRUE + +# Set crancache dir so we can cache it +export CRANCACHE_DIR="/arrow/.crancache" + +SCRIPT=" + # We can't use RSPM binaries because we need source packages + options('repos' = c(CRAN = 'https://packagemanager.rstudio.com/all/latest')) + remotes::install_github('r-lib/revdepcheck') + + # zoo is needed by RcisTarget tests, though only listed in enhances so not installed by revdepcheck + install.packages('zoo') + + # actually run revdepcheck + revdepcheck::revdep_check( + quiet = FALSE, + timeout = as.difftime(120, units = 'mins'), + num_workers = 1, + env = c( + ARROW_R_DEV = '$ARROW_R_DEV', + LIBARROW_DOWNLOAD = TRUE, + LIBARROW_MINIMAL = FALSE, + revdepcheck::revdep_env_vars() + )) + revdepcheck::revdep_report(all = TRUE) + + # Go through the summary and fail if any of the statuses include - + summary <- revdepcheck::revdep_summary() + failed <- lapply(summary, function(check) grepl('-', check[['status']])) + + if (any(unlist(failed))) { + quit(status = 1) + } + " + +echo "$SCRIPT" | ${R_BIN} --no-save + +popd diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index 89963eb2dd8..61d0755878f 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -22,12 +22,23 @@ set -ex source_dir=${1}/r 
-${R_BIN} CMD INSTALL ${source_dir} -pushd ${source_dir}/tests +pushd ${source_dir} + +# Unity builds were causing the CI job to run out of memory +export CMAKE_UNITY_BUILD=OFF +# Make installation verbose so that the CI job doesn't time out due to silence +export ARROW_R_DEV=TRUE +${R_BIN} CMD INSTALL . +# But unset the env var so that it doesn't cause us to run extra dev tests +unset ARROW_R_DEV export TEST_R_WITH_ARROW=TRUE export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" + +pushd tests ${R_BIN} < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } +popd +${R_BIN} -e 'library(arrow); testthat::test_examples(".")' >> testthat.out 2>&1 || { cat testthat.out; exit 1; } cat testthat.out if grep -q "runtime error" testthat.out; then diff --git a/ci/scripts/r_valgrind.sh b/ci/scripts/r_valgrind.sh new file mode 100755 index 00000000000..ae61d076655 --- /dev/null +++ b/ci/scripts/r_valgrind.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set -ex + +: ${R_BIN:=RDvalgrind} + +source_dir=${1}/r + +export CMAKE_BUILD_TYPE=RelWithDebInfo + +${R_BIN} CMD INSTALL ${source_dir} +pushd ${source_dir}/tests + +export TEST_R_WITH_ARROW=TRUE + +# to generate suppression files run: +# ${R_BIN} --vanilla -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --gen-suppressions=all --log-file=memcheck.log" -f testtthat.supp +${R_BIN} --vanilla -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --suppressions=/${1}/ci/etc/valgrind-cran.supp" -f testthat.R |& tee testthat.out + +# valgrind --error-exitcode=1 should return an erroring exit code that we can catch, +# but R eats that and returns 0, so we need to look at the output and make sure that +# we have 0 errors instead. +if [ $(grep -c "ERROR SUMMARY: 0 errors" testthat.out) != 1 ]; then + cat testthat.out + echo "Found Valgrind errors" + exit 1 +fi + +# We might also considering using the greps that LibthGBM uses: +# https://github.com/microsoft/LightGBM/blob/fa6d356555f9ef888acf5f5e259dca958ca24f6d/.ci/test_r_package_valgrind.sh#L20-L85 + +popd diff --git a/ci/scripts/r_windows_build.sh b/ci/scripts/r_windows_build.sh index be03b75f5ad..20f824a9e01 100755 --- a/ci/scripts/r_windows_build.sh +++ b/ci/scripts/r_windows_build.sh @@ -26,16 +26,6 @@ export ARROW_HOME="$(cd "${ARROW_HOME}" && pwd)" if [ "$RTOOLS_VERSION" = "35" ]; then # Use rtools-backports if building with rtools35 curl https://raw.githubusercontent.com/r-windows/rtools-backports/master/pacman.conf > /etc/pacman.conf - # Update keys: https://www.msys2.org/news/#2020-06-29-new-packagers - msys2_repo_base_url=https://repo.msys2.org/msys - # Mirror - msys2_repo_base_url=https://sourceforge.net/projects/msys2/files/REPOS/MSYS2 - curl -OSsL "${msys2_repo_base_url}/x86_64/msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz" - pacman -U --noconfirm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz && rm msys2-keyring-r21.b39fb11-1-any.pkg.tar.xz - # Use sf.net instead of 
http://repo.msys2.org/ temporary. - sed -i -e "s,^Server = http://repo\.msys2\.org/msys,Server = ${msys2_repo_base_url},g" \ - /etc/pacman.conf - pacman --noconfirm -Scc pacman --noconfirm -Syy # lib-4.9.3 is for libraries compiled with gcc 4.9 (Rtools 3.5) RWINLIB_LIB_DIR="lib-4.9.3" @@ -48,6 +38,7 @@ else pacman --noconfirm -Syy RWINLIB_LIB_DIR="lib" + export MINGW_ARCH="mingw32 mingw64 ucrt64" fi cp $ARROW_HOME/ci/scripts/PKGBUILD . @@ -69,7 +60,7 @@ MSYS_LIB_DIR="/c/rtools40" ls $MSYS_LIB_DIR/mingw64/lib/ ls $MSYS_LIB_DIR/mingw32/lib/ -# Untar the two builds we made +# Untar the three builds we made ls *.xz | xargs -n 1 tar -xJf mkdir -p $DST_DIR # Grab the headers from one, either one is fine @@ -99,6 +90,14 @@ cp $MSYS_LIB_DIR/mingw32/lib/lib{thrift,snappy}.a $DST_DIR/${RWINLIB_LIB_DIR}/i3 cp $MSYS_LIB_DIR/mingw64/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64 cp $MSYS_LIB_DIR/mingw32/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/i386 +# Do the same also for ucrt64 +if [ "$RTOOLS_VERSION" != "35" ]; then +ls $MSYS_LIB_DIR/ucrt64/lib/ +mkdir -p $DST_DIR/lib/x64-ucrt +mv ucrt64/lib/*.a $DST_DIR/${RWINLIB_LIB_DIR}/x64-ucrt +cp $MSYS_LIB_DIR/ucrt64/lib/lib{zstd,lz4,crypto,utf8proc,re2,aws*}.a $DST_DIR/lib/x64-ucrt +fi + # Create build artifact zip -r ${DST_DIR}.zip $DST_DIR diff --git a/ci/scripts/rust_build.sh b/ci/scripts/rust_build.sh index 726ecd80f1a..3532ea3d5c6 100755 --- a/ci/scripts/rust_build.sh +++ b/ci/scripts/rust_build.sh @@ -17,13 +17,14 @@ # specific language governing permissions and limitations # under the License. -set -ex +set -e +arrow_dir=${1} source_dir=${1}/rust -# This file is used to build the rust binaries needed for the -# archery integration tests. Testing of the rust implementation -# in normal CI is handled by github workflows +# This file is used to build the rust binaries needed for the archery +# integration tests. 
Testing of the rust implementation in normal CI is handled +# by github workflows in the arrow-rs repository. # Disable full debug symbol generation to speed up CI build / reduce memory required export RUSTFLAGS="-C debuginfo=1" @@ -31,6 +32,22 @@ export RUSTFLAGS="-C debuginfo=1" export ARROW_TEST_DATA=${arrow_dir}/testing/data export PARQUET_TEST_DATA=${arrow_dir}/cpp/submodules/parquet-testing/data +if [ "${ARCHERY_INTEGRATION_WITH_RUST}" -eq "0" ]; then + echo "=====================================================================" + echo "Not building the Rust implementation." + echo "=====================================================================" + exit 0; +elif [ ! -d "${source_dir}" ]; then + echo "=====================================================================" + echo "The Rust source is missing. Please clone the arrow-rs repository" + echo "to arrow/rust before running the integration tests:" + echo " git clone https://github.com/apache/arrow-rs.git path/to/arrow/rust" + echo "=====================================================================" + exit 1; +fi + +set -x + # show activated toolchain rustup show @@ -39,7 +56,4 @@ pushd ${source_dir} # build only the integration testing binaries cargo build -p arrow-integration-testing -# Remove incremental build artifacts to save space -rm -rf target/debug/deps/ target/debug/build/ - popd diff --git a/ci/vcpkg/arm64-osx-static-debug.cmake b/ci/vcpkg/arm64-osx-static-debug.cmake new file mode 100644 index 00000000000..f511819a2ed --- /dev/null +++ b/ci/vcpkg/arm64-osx-static-debug.cmake @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(VCPKG_TARGET_ARCHITECTURE arm64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) + +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES arm64) +set(VCPKG_OSX_DEPLOYMENT_TARGET "11.0") + +set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/arm64-osx-static-release.cmake b/ci/vcpkg/arm64-osx-static-release.cmake new file mode 100644 index 00000000000..43d65efb265 --- /dev/null +++ b/ci/vcpkg/arm64-osx-static-release.cmake @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +set(VCPKG_TARGET_ARCHITECTURE arm64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) + +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES arm64) +set(VCPKG_OSX_DEPLOYMENT_TARGET "11.0") + +set(VCPKG_BUILD_TYPE release) diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 14b9678690e..7bcba49c194 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,5 +1,5 @@ diff --git a/ports/aws-c-common/portfile.cmake b/ports/aws-c-common/portfile.cmake -index f3704ef05..3af543058 100644 +index f3704ef05b..3af543058d 100644 --- a/ports/aws-c-common/portfile.cmake +++ b/ports/aws-c-common/portfile.cmake @@ -1,8 +1,8 @@ @@ -12,22 +12,22 @@ index f3704ef05..3af543058 100644 + SHA512 28256522ac6af544d7464e3e7dcd4dc802ae2b09728bf8f167f86a6487bb756d0cad5eb4a2480610b2967b9c24c4a7f70621894517aa2828ffdeb0479453803b HEAD_REF master PATCHES - disable-error-4068.patch # This patch fixes dependency port compilation failure + disable-error-4068.patch # This patch fixes dependency port compilation failure diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index 6e18aecd0..2ccecf33c 100644 +index be66d452be..a5ce325e9d 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -76,6 +76,8 @@ vcpkg_configure_cmake( +@@ -94,6 +94,8 @@ vcpkg_configure_cmake( -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON -DENABLE_DEBUG=ON -DCURL_CA_FALLBACK=ON + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none - ) - - vcpkg_install_cmake() + OPTIONS_DEBUG + ${EXTRA_ARGS_DEBUG} + OPTIONS_RELEASE diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 75dd13302..84345c7ca 100644 +index 75dd133027..84345c7caa 100644 --- a/ports/snappy/portfile.cmake +++ b/ports/snappy/portfile.cmake @@ -4,6 +4,7 @@ vcpkg_from_github( @@ -36,11 +36,11 @@ index 75dd13302..84345c7ca 100644 HEAD_REF master + PATCHES "snappy-disable-bmi.patch" ) - + vcpkg_configure_cmake( diff --git a/ports/snappy/snappy-disable-bmi.patch 
b/ports/snappy/snappy-disable-bmi.patch new file mode 100644 -index 000000000..2cbb1533a +index 0000000000..2cbb1533a8 --- /dev/null +++ b/ports/snappy/snappy-disable-bmi.patch @@ -0,0 +1,17 @@ diff --git a/ci/vcpkg/universal2-osx-static-debug.cmake b/ci/vcpkg/universal2-osx-static-debug.cmake new file mode 100644 index 00000000000..706ac47a72c --- /dev/null +++ b/ci/vcpkg/universal2-osx-static-debug.cmake @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(VCPKG_TARGET_ARCHITECTURE arm64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) + +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES "x86_64\;arm64") +set(VCPKG_OSX_DEPLOYMENT_TARGET "10.13") + +set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/universal2-osx-static-release.cmake b/ci/vcpkg/universal2-osx-static-release.cmake new file mode 100644 index 00000000000..8670690171e --- /dev/null +++ b/ci/vcpkg/universal2-osx-static-release.cmake @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(VCPKG_TARGET_ARCHITECTURE arm64) +set(VCPKG_CRT_LINKAGE dynamic) +set(VCPKG_LIBRARY_LINKAGE static) + +set(VCPKG_CMAKE_SYSTEM_NAME Darwin) +set(VCPKG_OSX_ARCHITECTURES "x86_64\;arm64") +set(VCPKG_OSX_DEPLOYMENT_TARGET "10.13") + +set(VCPKG_BUILD_TYPE release) diff --git a/cmake-format.py b/cmake-format.py index 0976642031f..3e77733f4d1 100644 --- a/cmake-format.py +++ b/cmake-format.py @@ -16,44 +16,61 @@ # under the License. # cmake-format configuration file -# Use run-cmake-format.py to reformat all cmake files in the source tree +# Use `archery lint --cmake-format --fix` to reformat all cmake files in the +# source tree -# How wide to allow formatted cmake files -line_width = 90 +# ----------------------------- +# Options affecting formatting. +# ----------------------------- +with section("format"): + # How wide to allow formatted cmake files + line_width = 90 -# How many spaces to tab for indent -tab_size = 2 + # How many spaces to tab for indent + tab_size = 2 -# If arglists are longer than this, break them always -max_subargs_per_line = 4 + # If a positional argument group contains more than this many arguments, + # then force it to a vertical layout. 
+ max_pargs_hwrap = 4 -# If true, separate flow control names from their parentheses with a space -separate_ctrl_name_with_space = False + # If the statement spelling length (including space and parenthesis) is + # smaller than this amount, then force reject nested layouts. + # This value only comes into play when considering whether or not to nest + # arguments below their parent. If the number of characters in the parent + # is less than this value, we will not nest. + min_prefix_chars = 32 -# If true, separate function names from parentheses with a space -separate_fn_name_with_space = False + # If true, separate flow control names from their parentheses with a space + separate_ctrl_name_with_space = False -# If a statement is wrapped to more than one line, than dangle the closing -# parenthesis on it's own line -dangle_parens = False + # If true, separate function names from parentheses with a space + separate_fn_name_with_space = False -# What style line endings to use in the output. -line_ending = 'unix' + # If a statement is wrapped to more than one line, than dangle the closing + # parenthesis on it's own line + dangle_parens = False -# Format command names consistently as 'lower' or 'upper' case -command_case = 'lower' + # What style line endings to use in the output. + line_ending = 'unix' -# Format keywords consistently as 'lower' or 'upper' case -keyword_case = 'unchanged' + # Format command names consistently as 'lower' or 'upper' case + command_case = 'lower' -# enable comment markup parsing and reflow -enable_markup = False + # Format keywords consistently as 'lower' or 'upper' case + keyword_case = 'unchanged' -# If comment markup is enabled, don't reflow the first comment block in -# eachlistfile. Use this to preserve formatting of your -# copyright/licensestatements. -first_comment_is_literal = False +# ------------------------------------------------ +# Options affecting comment reflow and formatting. 
+# ------------------------------------------------ +with section("markup"): + # enable comment markup parsing and reflow + enable_markup = False -# If comment markup is enabled, don't reflow any comment block which matchesthis -# (regex) pattern. Default is `None` (disabled). -literal_comment_pattern = None + # If comment markup is enabled, don't reflow the first comment block in + # eachlistfile. Use this to preserve formatting of your + # copyright/licensestatements. + first_comment_is_literal = True + + # If comment markup is enabled, don't reflow any comment block which + # matchesthis (regex) pattern. Default is `None` (disabled). + literal_comment_pattern = None diff --git a/cpp/Brewfile b/cpp/Brewfile index 7de6c7deabe..78ee5e64c8f 100644 --- a/cpp/Brewfile +++ b/cpp/Brewfile @@ -35,9 +35,6 @@ brew "openssl@1.1" brew "protobuf" brew "python" brew "rapidjson" -# grpc bundles re2 and causes a conflict when Homebrew tries to install it, -# so temporarily skip installing re2. See ARROW-9972. 
-# brew "re2" brew "snappy" brew "thrift" brew "wget" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1705e854fb1..2bcdc0de179 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -47,13 +47,15 @@ if(POLICY CMP0074) cmake_policy(SET CMP0074 NEW) endif() -set(ARROW_VERSION "4.0.0-SNAPSHOT") +set(ARROW_VERSION "6.0.0-SNAPSHOT") string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") # if no build build type is specified, default to release builds if(NOT DEFINED CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build.") + set(CMAKE_BUILD_TYPE + Release + CACHE STRING "Choose the type of build.") endif() string(TOLOWER ${CMAKE_BUILD_TYPE} LOWERCASE_BUILD_TYPE) string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) @@ -109,6 +111,7 @@ set(ARROW_CMAKE_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") set(ARROW_LLVM_VERSIONS + "12.0" "11.1" "11.0" "10" @@ -116,18 +119,15 @@ set(ARROW_LLVM_VERSIONS "8" "7") list(GET ARROW_LLVM_VERSIONS 0 ARROW_LLVM_VERSION_PRIMARY) -string(REGEX - REPLACE "^([0-9]+)(\\..+)?" "\\1" ARROW_LLVM_VERSION_PRIMARY_MAJOR - "${ARROW_LLVM_VERSION_PRIMARY}") +string(REGEX REPLACE "^([0-9]+)(\\..+)?" "\\1" ARROW_LLVM_VERSION_PRIMARY_MAJOR + "${ARROW_LLVM_VERSION_PRIMARY}") file(READ ${CMAKE_CURRENT_SOURCE_DIR}/../.env ARROW_ENV) string(REGEX MATCH "CLANG_TOOLS=[^\n]+" ARROW_ENV_CLANG_TOOLS_VERSION "${ARROW_ENV}") -string(REGEX - REPLACE "^CLANG_TOOLS=" "" ARROW_CLANG_TOOLS_VERSION - "${ARROW_ENV_CLANG_TOOLS_VERSION}") -string(REGEX - REPLACE "^([0-9]+)(\\..+)?" "\\1" ARROW_CLANG_TOOLS_VERSION_MAJOR - "${ARROW_CLANG_TOOLS_VERSION}") +string(REGEX REPLACE "^CLANG_TOOLS=" "" ARROW_CLANG_TOOLS_VERSION + "${ARROW_ENV_CLANG_TOOLS_VERSION}") +string(REGEX REPLACE "^([0-9]+)(\\..+)?" 
"\\1" ARROW_CLANG_TOOLS_VERSION_MAJOR + "${ARROW_CLANG_TOOLS_VERSION}") if(APPLE) find_program(BREW_BIN brew) @@ -162,7 +162,9 @@ endif() find_package(ClangTools) find_package(InferTools) -if("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND OR INFER_FOUND) +if("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" + OR CLANG_TIDY_FOUND + OR INFER_FOUND) # Generate a Clang compile_commands.json "compilation database" file for use # with various development tools, such as Vim's YouCompleteMe plugin. # See http://clang.llvm.org/docs/JSONCompilationDatabase.html @@ -225,7 +227,9 @@ if(NOT LINT_EXCLUSIONS_FILE) set(LINT_EXCLUSIONS_FILE ${BUILD_SUPPORT_DIR}/lint_exclusions.txt) endif() -find_program(CPPLINT_BIN NAMES cpplint cpplint.py HINTS ${BUILD_SUPPORT_DIR}) +find_program(CPPLINT_BIN + NAMES cpplint cpplint.py + HINTS ${BUILD_SUPPORT_DIR}) message(STATUS "Found cpplint executable at ${CPPLINT_BIN}") add_custom_target(lint @@ -270,7 +274,7 @@ if(${CLANG_FORMAT_FOUND}) endif() add_custom_target(lint_cpp_cli ${PYTHON_EXECUTABLE} ${BUILD_SUPPORT_DIR}/lint_cpp_cli.py - ${CMAKE_CURRENT_SOURCE_DIR}/src) + ${CMAKE_CURRENT_SOURCE_DIR}/src) if(ARROW_LINT_ONLY) message("ARROW_LINT_ONLY was specified, this is only a partial build directory") @@ -342,6 +346,10 @@ if(ARROW_CUDA set(ARROW_IPC ON) endif() +if(ARROW_ENGINE) + set(ARROW_COMPUTE ON) +endif() + if(ARROW_DATASET) set(ARROW_COMPUTE ON) set(ARROW_FILESYSTEM ON) @@ -445,6 +453,26 @@ endif() include(SetupCxxFlags) +# +# Linker flags +# + +# Localize thirdparty symbols using a linker version script. This hides them +# from the client application. The OS X linker does not support the +# version-script option. 
+if(CMAKE_VERSION VERSION_LESS 3.18) + if(APPLE OR WIN32) + set(CXX_LINKER_SUPPORTS_VERSION_SCRIPT FALSE) + else() + set(CXX_LINKER_SUPPORTS_VERSION_SCRIPT TRUE) + endif() +else() + include(CheckLinkerFlag) + check_linker_flag(CXX + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/src/arrow/symbols.map" + CXX_LINKER_SUPPORTS_VERSION_SCRIPT) +endif() + # # Build output directory # @@ -464,10 +492,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_BINARY_DIR}) if(NOT APPLE) set(MORE_ARGS "-T") endif() - execute_process(COMMAND ln - ${MORE_ARGS} - -sf - ${BUILD_OUTPUT_ROOT_DIRECTORY} + execute_process(COMMAND ln ${MORE_ARGS} -sf ${BUILD_OUTPUT_ROOT_DIRECTORY} ${CMAKE_CURRENT_BINARY_DIR}/build/latest) else() set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}/") @@ -502,6 +527,11 @@ endif() include(BuildUtils) enable_testing() +# For arrow.pc. Requires.private and Libs.private are used when +# "pkg-config --libs --static arrow" is used. +set(ARROW_PC_REQUIRES_PRIVATE) +set(ARROW_PC_LIBS_PRIVATE) + include(ThirdpartyToolchain) # Add common flags @@ -545,12 +575,9 @@ include_directories(src/generated) # if(PARQUET_BUILD_SHARED) set_target_properties(arrow_shared - PROPERTIES C_VISIBILITY_PRESET - hidden - CXX_VISIBILITY_PRESET - hidden - VISIBILITY_INLINES_HIDDEN - 1) + PROPERTIES C_VISIBILITY_PRESET hidden + CXX_VISIBILITY_PRESET hidden + VISIBILITY_INLINES_HIDDEN 1) endif() # @@ -594,7 +621,9 @@ endif(UNIX) # "make cscope" target # if(UNIX) - add_custom_target(cscope find ${CMAKE_CURRENT_SOURCE_DIR} + add_custom_target(cscope + find + ${CMAKE_CURRENT_SOURCE_DIR} (-name \\*.cc -or @@ -631,23 +660,14 @@ endif(UNIX) if(${INFER_FOUND}) # runs infer capture - add_custom_target(infer - ${BUILD_SUPPORT_DIR}/run-infer.sh - ${INFER_BIN} - ${CMAKE_BINARY_DIR}/compile_commands.json - 1) + add_custom_target(infer ${BUILD_SUPPORT_DIR}/run-infer.sh ${INFER_BIN} + ${CMAKE_BINARY_DIR}/compile_commands.json 1) # runs infer analyze - 
add_custom_target(infer-analyze - ${BUILD_SUPPORT_DIR}/run-infer.sh - ${INFER_BIN} - ${CMAKE_BINARY_DIR}/compile_commands.json - 2) + add_custom_target(infer-analyze ${BUILD_SUPPORT_DIR}/run-infer.sh ${INFER_BIN} + ${CMAKE_BINARY_DIR}/compile_commands.json 2) # runs infer report - add_custom_target(infer-report - ${BUILD_SUPPORT_DIR}/run-infer.sh - ${INFER_BIN} - ${CMAKE_BINARY_DIR}/compile_commands.json - 3) + add_custom_target(infer-report ${BUILD_SUPPORT_DIR}/run-infer.sh ${INFER_BIN} + ${CMAKE_BINARY_DIR}/compile_commands.json 3) endif() # @@ -716,7 +736,7 @@ if(ARROW_ORC) list(APPEND ARROW_STATIC_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF}) if(ORC_SOURCE STREQUAL "SYSTEM") list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::liborc - ${ARROW_PROTOBUF_LIBPROTOBUF}) + ${ARROW_PROTOBUF_LIBPROTOBUF}) endif() endif() @@ -860,8 +880,9 @@ endif() set(ARROW_SYSTEM_LINK_LIBS) -if(THREADS_FOUND) - list(APPEND ARROW_SYSTEM_LINK_LIBS Threads::Threads) +list(APPEND ARROW_SYSTEM_LINK_LIBS Threads::Threads) +if(CMAKE_THREAD_LIBS_INIT) + string(APPEND ARROW_PC_LIBS_PRIVATE " ${CMAKE_THREAD_LIBS_INIT}") endif() if(WIN32) @@ -911,8 +932,7 @@ endif() install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/../LICENSE.txt ${CMAKE_CURRENT_SOURCE_DIR}/../NOTICE.txt - ${CMAKE_CURRENT_SOURCE_DIR}/README.md - DESTINATION "${ARROW_DOC_DIR}") + ${CMAKE_CURRENT_SOURCE_DIR}/README.md DESTINATION "${ARROW_DOC_DIR}") # # Validate and print out Arrow configuration options diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index f6b782276e3..d8b0928ed3c 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -2170,6 +2170,7 @@ PREDEFINED = __attribute__(x)= \ __declspec(x)= \ PARQUET_EXPORT= \ ARROW_EXPORT= \ + ARROW_DS_EXPORT= \ ARROW_FLIGHT_EXPORT= \ ARROW_EXTERN_TEMPLATE= \ ARROW_DEPRECATED(x)= diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 2fd897b5d1d..cd8290d1bbb 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ 
b/cpp/cmake_modules/BuildUtils.cmake @@ -62,17 +62,16 @@ function(ADD_THIRDPARTY_LIB LIB_NAME) if(ARG_STATIC_LIB AND ARG_SHARED_LIB) set(AUG_LIB_NAME "${LIB_NAME}_static") add_library(${AUG_LIB_NAME} STATIC IMPORTED) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES IMPORTED_LOCATION + "${ARG_STATIC_LIB}") if(ARG_DEPS) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES INTERFACE_LINK_LIBRARIES "${ARG_DEPS}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES INTERFACE_LINK_LIBRARIES + "${ARG_DEPS}") endif() message(STATUS "Added static library dependency ${AUG_LIB_NAME}: ${ARG_STATIC_LIB}") if(ARG_INCLUDE_DIRECTORIES) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${ARG_INCLUDE_DIRECTORIES}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${ARG_INCLUDE_DIRECTORIES}") endif() set(AUG_LIB_NAME "${LIB_NAME}_shared") @@ -80,36 +79,34 @@ function(ADD_THIRDPARTY_LIB LIB_NAME) if(WIN32) # Mark the ".lib" location as part of a Windows DLL - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_IMPLIB "${ARG_SHARED_LIB}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES IMPORTED_IMPLIB + "${ARG_SHARED_LIB}") else() - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES IMPORTED_LOCATION + "${ARG_SHARED_LIB}") endif() if(ARG_DEPS) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES INTERFACE_LINK_LIBRARIES "${ARG_DEPS}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES INTERFACE_LINK_LIBRARIES + "${ARG_DEPS}") endif() message(STATUS "Added shared library dependency ${AUG_LIB_NAME}: ${ARG_SHARED_LIB}") if(ARG_INCLUDE_DIRECTORIES) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${ARG_INCLUDE_DIRECTORIES}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES 
INTERFACE_INCLUDE_DIRECTORIES + "${ARG_INCLUDE_DIRECTORIES}") endif() elseif(ARG_STATIC_LIB) set(AUG_LIB_NAME "${LIB_NAME}_static") add_library(${AUG_LIB_NAME} STATIC IMPORTED) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES IMPORTED_LOCATION + "${ARG_STATIC_LIB}") if(ARG_DEPS) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES INTERFACE_LINK_LIBRARIES "${ARG_DEPS}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES INTERFACE_LINK_LIBRARIES + "${ARG_DEPS}") endif() message(STATUS "Added static library dependency ${AUG_LIB_NAME}: ${ARG_STATIC_LIB}") if(ARG_INCLUDE_DIRECTORIES) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${ARG_INCLUDE_DIRECTORIES}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${ARG_INCLUDE_DIRECTORIES}") endif() elseif(ARG_SHARED_LIB) set(AUG_LIB_NAME "${LIB_NAME}_shared") @@ -117,21 +114,20 @@ function(ADD_THIRDPARTY_LIB LIB_NAME) if(WIN32) # Mark the ".lib" location as part of a Windows DLL - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_IMPLIB "${ARG_SHARED_LIB}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES IMPORTED_IMPLIB + "${ARG_SHARED_LIB}") else() - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES IMPORTED_LOCATION + "${ARG_SHARED_LIB}") endif() message(STATUS "Added shared library dependency ${AUG_LIB_NAME}: ${ARG_SHARED_LIB}") if(ARG_DEPS) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES INTERFACE_LINK_LIBRARIES "${ARG_DEPS}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES INTERFACE_LINK_LIBRARIES + "${ARG_DEPS}") endif() if(ARG_INCLUDE_DIRECTORIES) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${ARG_INCLUDE_DIRECTORIES}") + set_target_properties(${AUG_LIB_NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + 
"${ARG_INCLUDE_DIRECTORIES}") endif() else() message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") @@ -159,10 +155,9 @@ function(create_merged_static_lib output_target) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() - set( - output_lib_path - ${BUILD_OUTPUT_ROOT_DIRECTORY}${CMAKE_STATIC_LIBRARY_PREFIX}${ARG_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX} - ) + set(output_lib_path + ${BUILD_OUTPUT_ROOT_DIRECTORY}${CMAKE_STATIC_LIBRARY_PREFIX}${ARG_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX} + ) set(all_library_paths $) foreach(lib ${ARG_TO_MERGE}) @@ -170,13 +165,8 @@ function(create_merged_static_lib output_target) endforeach() if(APPLE) - set(BUNDLE_COMMAND - "libtool" - "-no_warning_for_no_symbols" - "-static" - "-o" - ${output_lib_path} - ${all_library_paths}) + set(BUNDLE_COMMAND "libtool" "-no_warning_for_no_symbols" "-static" "-o" + ${output_lib_path} ${all_library_paths}) elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|GNU|Intel)$") set(ar_script_path ${CMAKE_BINARY_DIR}/${ARG_NAME}.ar) @@ -188,7 +178,9 @@ function(create_merged_static_lib output_target) endforeach() file(APPEND ${ar_script_path}.in "SAVE\nEND\n") - file(GENERATE OUTPUT ${ar_script_path} INPUT ${ar_script_path}.in) + file(GENERATE + OUTPUT ${ar_script_path} + INPUT ${ar_script_path}.in) set(ar_tool ${CMAKE_AR}) if(CMAKE_INTERPROCEDURAL_OPTIMIZATION) @@ -218,9 +210,8 @@ function(create_merged_static_lib output_target) COMMENT "Bundling ${output_lib_path}" VERBATIM) - message( - STATUS "Creating bundled static library target ${output_target} at ${output_lib_path}" - ) + message(STATUS "Creating bundled static library target ${output_target} at ${output_lib_path}" + ) add_custom_target(${output_target} ALL DEPENDS ${output_lib_path}) add_dependencies(${output_target} ${ARG_ROOT} ${ARG_TO_MERGE}) @@ -355,7 +346,9 @@ function(ADD_ARROW_LIB LIB_NAME) endif() # On iOS, specifying -undefined conflicts with enabling bitcode - if(APPLE AND NOT IOS AND NOT 
DEFINED ENV{EMSCRIPTEN}) + if(APPLE + AND NOT IOS + AND NOT DEFINED ENV{EMSCRIPTEN}) # On OS X, you can avoid linking at library load time and instead # expecting that the symbols have been loaded separately. This happens # with libpython* where there can be conflicts between system Python and @@ -367,20 +360,13 @@ function(ADD_ARROW_LIB LIB_NAME) endif() set_target_properties(${LIB_NAME}_shared - PROPERTIES LIBRARY_OUTPUT_DIRECTORY - "${OUTPUT_PATH}" - RUNTIME_OUTPUT_DIRECTORY - "${OUTPUT_PATH}" - PDB_OUTPUT_DIRECTORY - "${OUTPUT_PATH}" - LINK_FLAGS - "${ARG_SHARED_LINK_FLAGS}" - OUTPUT_NAME - ${LIB_NAME} - VERSION - "${ARROW_FULL_SO_VERSION}" - SOVERSION - "${ARROW_SO_VERSION}") + PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${OUTPUT_PATH}" + RUNTIME_OUTPUT_DIRECTORY "${OUTPUT_PATH}" + PDB_OUTPUT_DIRECTORY "${OUTPUT_PATH}" + LINK_FLAGS "${ARG_SHARED_LINK_FLAGS}" + OUTPUT_NAME ${LIB_NAME} + VERSION "${ARROW_FULL_SO_VERSION}" + SOVERSION "${ARROW_SO_VERSION}") target_link_libraries(${LIB_NAME}_shared LINK_PUBLIC @@ -395,8 +381,8 @@ function(ADD_ARROW_LIB LIB_NAME) else() set(_lib_install_rpath "\$ORIGIN") endif() - set_target_properties(${LIB_NAME}_shared - PROPERTIES INSTALL_RPATH ${_lib_install_rpath}) + set_target_properties(${LIB_NAME}_shared PROPERTIES INSTALL_RPATH + ${_lib_install_rpath}) endif() if(APPLE) @@ -407,7 +393,7 @@ function(ADD_ARROW_LIB LIB_NAME) endif() set_target_properties(${LIB_NAME}_shared PROPERTIES BUILD_WITH_INSTALL_RPATH ON INSTALL_NAME_DIR - "${_lib_install_name}") + "${_lib_install_name}") endif() install(TARGETS ${LIB_NAME}_shared ${INSTALL_IS_OPTIONAL} @@ -415,7 +401,8 @@ function(ADD_ARROW_LIB LIB_NAME) RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + INCLUDES + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(BUILD_STATIC) @@ -451,8 +438,8 @@ function(ADD_ARROW_LIB LIB_NAME) endif() 
set_target_properties(${LIB_NAME}_static - PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${OUTPUT_PATH}" OUTPUT_NAME - ${LIB_NAME_STATIC}) + PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${OUTPUT_PATH}" + OUTPUT_NAME ${LIB_NAME_STATIC}) if(ARG_STATIC_INSTALL_INTERFACE_LIBS) target_link_libraries(${LIB_NAME}_static LINK_PUBLIC @@ -469,7 +456,8 @@ function(ADD_ARROW_LIB LIB_NAME) RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + INCLUDES + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(ARG_CMAKE_PACKAGE_NAME) @@ -488,9 +476,10 @@ function(ADD_ARROW_LIB LIB_NAME) set(CONFIG_VERSION_CMAKE "${ARG_CMAKE_PACKAGE_NAME}ConfigVersion.cmake") set(BUILT_CONFIG_VERSION_CMAKE "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_VERSION_CMAKE}") - write_basic_package_version_file("${BUILT_CONFIG_VERSION_CMAKE}" - VERSION ${${PROJECT_NAME}_VERSION} - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + "${BUILT_CONFIG_VERSION_CMAKE}" + VERSION ${${PROJECT_NAME}_VERSION} + COMPATIBILITY AnyNewerVersion) install(FILES "${BUILT_CONFIG_VERSION_CMAKE}" DESTINATION "${ARROW_CMAKE_INSTALL_DIR}") endif() @@ -501,7 +490,9 @@ function(ADD_ARROW_LIB LIB_NAME) # Modify variable in calling scope if(ARG_OUTPUTS) - set(${ARG_OUTPUTS} ${${ARG_OUTPUTS}} PARENT_SCOPE) + set(${ARG_OUTPUTS} + ${${ARG_OUTPUTS}} + PARENT_SCOPE) endif() endfunction() @@ -589,10 +580,8 @@ function(ADD_BENCHMARK REL_BENCHMARK_NAME) # installed there. 
if(NOT "$ENV{CONDA_PREFIX}" STREQUAL "" AND APPLE) set_target_properties(${BENCHMARK_NAME} - PROPERTIES BUILD_WITH_INSTALL_RPATH - TRUE - INSTALL_RPATH_USE_LINK_PATH - TRUE + PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH TRUE INSTALL_RPATH "$ENV{CONDA_PREFIX}/lib;${EXECUTABLE_OUTPUT_PATH}") endif() @@ -619,7 +608,9 @@ function(ADD_BENCHMARK REL_BENCHMARK_NAME) benchmark ${BENCHMARK_PATH} ${NO_COLOR}) - set_property(TEST ${BENCHMARK_NAME} APPEND PROPERTY LABELS ${ARG_LABELS}) + set_property(TEST ${BENCHMARK_NAME} + APPEND + PROPERTY LABELS ${ARG_LABELS}) endfunction() # @@ -699,10 +690,8 @@ function(ADD_TEST_CASE REL_TEST_NAME) # installed there. if(NOT "$ENV{CONDA_PREFIX}" STREQUAL "" AND APPLE) set_target_properties(${TEST_NAME} - PROPERTIES BUILD_WITH_INSTALL_RPATH - TRUE - INSTALL_RPATH_USE_LINK_PATH - TRUE + PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE + INSTALL_RPATH_USE_LINK_PATH TRUE INSTALL_RPATH "${EXECUTABLE_OUTPUT_PATH};$ENV{CONDA_PREFIX}/lib") endif() @@ -735,9 +724,10 @@ function(ADD_TEST_CASE REL_TEST_NAME) endif() if(ARROW_TEST_MEMCHECK AND NOT ARG_NO_VALGRIND) - add_test( - ${TEST_NAME} bash -c - "cd '${CMAKE_SOURCE_DIR}'; \ + add_test(${TEST_NAME} + bash + -c + "cd '${CMAKE_SOURCE_DIR}'; \ valgrind --suppressions=valgrind.supp --tool=memcheck --gen-suppressions=all \ --num-callers=500 --leak-check=full --leak-check-heuristics=stdstring \ --error-exitcode=1 ${TEST_PATH}") @@ -773,17 +763,16 @@ function(ADD_TEST_CASE REL_TEST_NAME) set(LABEL_TEST_NAME "test-${LABEL}") if(NOT TARGET ${LABEL_TEST_NAME}) add_custom_target(${LABEL_TEST_NAME} - ctest - -L - "${LABEL}" - --output-on-failure + ctest -L "${LABEL}" --output-on-failure USES_TERMINAL) endif() # ensure the test is (re)built before the LABEL test runs add_dependencies(${LABEL_TEST_NAME} ${TEST_NAME}) endforeach() - set_property(TEST ${TEST_NAME} APPEND PROPERTY LABELS ${LABELS}) + set_property(TEST ${TEST_NAME} + APPEND + PROPERTY LABELS ${LABELS}) endfunction() # @@ -896,8 
+885,8 @@ function(ADD_FUZZ_TARGET REL_FUZZING_NAME) add_executable(${FUZZING_NAME} "${REL_FUZZING_NAME}.cc") target_link_libraries(${FUZZING_NAME} ${LINK_LIBS}) target_compile_options(${FUZZING_NAME} PRIVATE ${FUZZ_LDFLAGS}) - set_target_properties(${FUZZING_NAME} - PROPERTIES LINK_FLAGS ${FUZZ_LDFLAGS} LABELS "fuzzing") + set_target_properties(${FUZZING_NAME} PROPERTIES LINK_FLAGS ${FUZZ_LDFLAGS} LABELS + "fuzzing") endfunction() function(ARROW_INSTALL_ALL_HEADERS PATH) diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 0e92811da8c..e2a85a4aa55 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -33,7 +33,9 @@ endfunction() function(list_join lst glue out) if("${${lst}}" STREQUAL "") - set(${out} "" PARENT_SCOPE) + set(${out} + "" + PARENT_SCOPE) return() endif() @@ -42,7 +44,9 @@ function(list_join lst glue out) foreach(item ${${lst}}) set(joined "${joined}${glue}${item}") endforeach() - set(${out} ${joined} PARENT_SCOPE) + set(${out} + ${joined} + PARENT_SCOPE) endfunction() macro(define_option name description default) @@ -61,7 +65,9 @@ macro(define_option_string name description default) check_description_length(${name} ${description}) list_join(description "\n" multiline_description) - set(${name} ${default} CACHE STRING "${multiline_description}") + set(${name} + ${default} + CACHE STRING "${multiline_description}") list(APPEND "ARROW_${ARROW_OPTION_CATEGORY}_OPTION_NAMES" ${name}) set("${name}_OPTION_DESCRIPTION" ${description}) @@ -181,8 +187,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_ONLY_LINT "Only define the lint and check-format targets" OFF) - define_option(ARROW_VERBOSE_LINT "If off, 'quiet' flags will be passed to linting tools" - OFF) + define_option(ARROW_VERBOSE_LINT + "If off, 'quiet' flags will be passed to linting tools" OFF) define_option(ARROW_GENERATE_COVERAGE "Build with C++ code coverage enabled" OFF) 
@@ -320,7 +326,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_LZ4_USE_SHARED "Rely on lz4 shared libraries where relevant" ${ARROW_DEPENDENCY_USE_SHARED}) - define_option(ARROW_OPENSSL_USE_SHARED "Rely on OpenSSL shared libraries where relevant" + define_option(ARROW_OPENSSL_USE_SHARED + "Rely on OpenSSL shared libraries where relevant" ${ARROW_DEPENDENCY_USE_SHARED}) define_option(ARROW_PROTOBUF_USE_SHARED @@ -363,14 +370,12 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") define_option(ARROW_WITH_ZLIB "Build with zlib compression" OFF) define_option(ARROW_WITH_ZSTD "Build with zstd compression" OFF) - define_option( - ARROW_WITH_UTF8PROC - "Build with support for Unicode properties using the utf8proc library;(only used if ARROW_COMPUTE is ON)" - ON) - define_option( - ARROW_WITH_RE2 - "Build with support for regular expressions using the re2 library;(only used if ARROW_COMPUTE or ARROW_GANDIVA is ON)" - ON) + define_option(ARROW_WITH_UTF8PROC + "Build with support for Unicode properties using the utf8proc library;(only used if ARROW_COMPUTE is ON or ARROW_GANDIVA is ON)" + ON) + define_option(ARROW_WITH_RE2 + "Build with support for regular expressions using the re2 library;(only used if ARROW_COMPUTE or ARROW_GANDIVA is ON)" + ON) #---------------------------------------------------------------------- if(MSVC_TOOLCHAIN) @@ -416,9 +421,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Depend only on Thirdparty headers to build libparquet.;\ Always OFF if building binaries" OFF) - define_option( - PARQUET_BUILD_EXECUTABLES - "Build the Parquet executable CLI tools. Requires static libraries to be built." OFF) + define_option(PARQUET_BUILD_EXECUTABLES + "Build the Parquet executable CLI tools. Requires static libraries to be built." + OFF) define_option(PARQUET_BUILD_EXAMPLES "Build the Parquet examples. Requires static libraries to be built." 
OFF) @@ -432,10 +437,9 @@ Always OFF if building binaries" OFF) define_option(ARROW_GANDIVA_JAVA "Build the Gandiva JNI wrappers" OFF) # ARROW-3860: Temporary workaround - define_option( - ARROW_GANDIVA_STATIC_LIBSTDCPP - "Include -static-libstdc++ -static-libgcc when linking with;Gandiva static libraries" - OFF) + define_option(ARROW_GANDIVA_STATIC_LIBSTDCPP + "Include -static-libstdc++ -static-libgcc when linking with;Gandiva static libraries" + OFF) define_option_string(ARROW_GANDIVA_PC_CXX_FLAGS "Compiler flags to append when pre-compiling Gandiva operations" @@ -450,7 +454,8 @@ Always OFF if building binaries" OFF) define_option(ARROW_OPTIONAL_INSTALL "If enabled install ONLY targets that have already been built. Please be;\ advised that if this is enabled 'install' will fail silently on components;\ -that have not been built" OFF) +that have not been built" + OFF) option(ARROW_BUILD_CONFIG_SUMMARY_JSON "Summarize build configuration in a JSON file" ON) @@ -465,9 +470,8 @@ macro(validate_config) set(value "${${name}}") if(possible_values) if(NOT "${value}" IN_LIST possible_values) - message( - FATAL_ERROR "Configuration option ${name} got invalid value '${value}'. " - "Allowed values: ${${name}_OPTION_ENUM}.") + message(FATAL_ERROR "Configuration option ${name} got invalid value '${value}'. 
" + "Allowed values: ${${name}_OPTION_ENUM}.") endif() endif() endforeach() @@ -486,8 +490,8 @@ macro(config_summary_message) message(STATUS " Source directory: ${CMAKE_CURRENT_SOURCE_DIR}") message(STATUS " Install prefix: ${CMAKE_INSTALL_PREFIX}") if(${CMAKE_EXPORT_COMPILE_COMMANDS}) - message( - STATUS " Compile commands: ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json") + message(STATUS " Compile commands: ${CMAKE_CURRENT_BINARY_DIR}/compile_commands.json" + ) endif() foreach(category ${ARROW_OPTION_CATEGORIES}) diff --git a/cpp/cmake_modules/FindArrow.cmake b/cpp/cmake_modules/FindArrow.cmake index 9c987665896..68024cc2760 100644 --- a/cpp/cmake_modules/FindArrow.cmake +++ b/cpp/cmake_modules/FindArrow.cmake @@ -50,11 +50,12 @@ set(ARROW_SEARCH_LIB_PATH_SUFFIXES) if(CMAKE_LIBRARY_ARCHITECTURE) list(APPEND ARROW_SEARCH_LIB_PATH_SUFFIXES "lib/${CMAKE_LIBRARY_ARCHITECTURE}") endif() -list(APPEND ARROW_SEARCH_LIB_PATH_SUFFIXES - "lib64" - "lib32" - "lib" - "bin") +list(APPEND + ARROW_SEARCH_LIB_PATH_SUFFIXES + "lib64" + "lib32" + "lib" + "bin") set(ARROW_CONFIG_SUFFIXES "_RELEASE" "_RELWITHDEBINFO" @@ -120,10 +121,9 @@ endfunction() # # -> ARROW_STATIC_LIBRARY_NAME=arrow.lib with MSVC on Windows # # -> ARROW_STATIC_LIBRARY_NAME=libarrow.dll.a with MinGW on Windows function(arrow_build_static_library_name output_variable base_name) - set( - ${output_variable} - "${CMAKE_STATIC_LIBRARY_PREFIX}${base_name}${ARROW_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - PARENT_SCOPE) + set(${output_variable} + "${CMAKE_STATIC_LIBRARY_PREFIX}${base_name}${ARROW_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) endfunction() # Internal function. 
@@ -138,9 +138,11 @@ endfunction() function(arrow_extract_macro_value output_variable macro_name header_content) string(REGEX MATCH "#define +${macro_name} +[^\r\n]+" macro_definition "${header_content}") - string(REGEX - REPLACE "^#define +${macro_name} +(.+)$" "\\1" macro_value "${macro_definition}") - set(${output_variable} "${macro_value}" PARENT_SCOPE) + string(REGEX REPLACE "^#define +${macro_name} +(.+)$" "\\1" macro_value + "${macro_definition}") + set(${output_variable} + "${macro_value}" + PARENT_SCOPE) endfunction() # Internal macro only for arrow_find_package. @@ -152,7 +154,9 @@ macro(arrow_find_package_home) PATH_SUFFIXES "include" NO_DEFAULT_PATH) set(include_dir "${${prefix}_include_dir}") - set(${prefix}_INCLUDE_DIR "${include_dir}" PARENT_SCOPE) + set(${prefix}_INCLUDE_DIR + "${include_dir}" + PARENT_SCOPE) if(MSVC_TOOLCHAIN) set(CMAKE_SHARED_LIBRARY_SUFFIXES_ORIGINAL ${CMAKE_FIND_LIBRARY_SUFFIXES}) @@ -169,13 +173,15 @@ macro(arrow_find_package_home) set(CMAKE_SHARED_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_ORIGINAL}) endif() set(shared_lib "${${prefix}_shared_lib}") - set(${prefix}_SHARED_LIB "${shared_lib}" PARENT_SCOPE) + set(${prefix}_SHARED_LIB + "${shared_lib}" + PARENT_SCOPE) if(shared_lib) add_library(${target_shared} SHARED IMPORTED) set_target_properties(${target_shared} PROPERTIES IMPORTED_LOCATION "${shared_lib}") if(include_dir) - set_target_properties(${target_shared} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${include_dir}") + set_target_properties(${target_shared} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${include_dir}") endif() find_library(${prefix}_import_lib NAMES "${import_lib_name}" @@ -183,7 +189,9 @@ macro(arrow_find_package_home) PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} NO_DEFAULT_PATH) set(import_lib "${${prefix}_import_lib}") - set(${prefix}_IMPORT_LIB "${import_lib}" PARENT_SCOPE) + set(${prefix}_IMPORT_LIB + "${import_lib}" + PARENT_SCOPE) if(import_lib) set_target_properties(${target_shared} 
PROPERTIES IMPORTED_IMPLIB "${import_lib}") endif() @@ -195,13 +203,15 @@ macro(arrow_find_package_home) PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES} NO_DEFAULT_PATH) set(static_lib "${${prefix}_static_lib}") - set(${prefix}_STATIC_LIB "${static_lib}" PARENT_SCOPE) + set(${prefix}_STATIC_LIB + "${static_lib}" + PARENT_SCOPE) if(static_lib) add_library(${target_static} STATIC IMPORTED) set_target_properties(${target_static} PROPERTIES IMPORTED_LOCATION "${static_lib}") if(include_dir) - set_target_properties(${target_static} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${include_dir}") + set_target_properties(${target_static} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + "${include_dir}") endif() endif() endmacro() @@ -212,7 +222,9 @@ endmacro() macro(arrow_find_package_cmake_package_configuration) find_package(${cmake_package_name} CONFIG) if(${cmake_package_name}_FOUND) - set(${prefix}_USE_CMAKE_PACKAGE_CONFIG TRUE PARENT_SCOPE) + set(${prefix}_USE_CMAKE_PACKAGE_CONFIG + TRUE + PARENT_SCOPE) if(TARGET ${target_shared}) foreach(suffix ${ARROW_CONFIG_SUFFIXES}) get_target_property(shared_lib ${target_shared} IMPORTED_LOCATION${suffix}) @@ -221,10 +233,11 @@ macro(arrow_find_package_cmake_package_configuration) # libarrow.so.100.0.0 -> libarrow.so # Because ARROW_HOME and pkg-config approaches don't add # shared library version. 
- string(REGEX - REPLACE "(${CMAKE_SHARED_LIBRARY_SUFFIX})[.0-9]+$" "\\1" shared_lib - "${shared_lib}") - set(${prefix}_SHARED_LIB "${shared_lib}" PARENT_SCOPE) + string(REGEX REPLACE "(${CMAKE_SHARED_LIBRARY_SUFFIX})[.0-9]+$" "\\1" + shared_lib "${shared_lib}") + set(${prefix}_SHARED_LIB + "${shared_lib}" + PARENT_SCOPE) break() endif() endforeach() @@ -233,7 +246,9 @@ macro(arrow_find_package_cmake_package_configuration) foreach(suffix ${ARROW_CONFIG_SUFFIXES}) get_target_property(static_lib ${target_static} IMPORTED_LOCATION${suffix}) if(static_lib) - set(${prefix}_STATIC_LIB "${static_lib}" PARENT_SCOPE) + set(${prefix}_STATIC_LIB + "${static_lib}" + PARENT_SCOPE) break() endif() endforeach() @@ -247,7 +262,9 @@ endmacro() macro(arrow_find_package_pkg_config) pkg_check_modules(${prefix}_PC ${pkg_config_name}) if(${prefix}_PC_FOUND) - set(${prefix}_USE_PKG_CONFIG TRUE PARENT_SCOPE) + set(${prefix}_USE_PKG_CONFIG + TRUE + PARENT_SCOPE) set(include_dir "${${prefix}_PC_INCLUDEDIR}") set(lib_dir "${${prefix}_PC_LIBDIR}") @@ -270,18 +287,21 @@ macro(arrow_find_package_pkg_config) rest_shared_lib_paths) endif() - set(${prefix}_VERSION "${${prefix}_PC_VERSION}" PARENT_SCOPE) - set(${prefix}_INCLUDE_DIR "${include_dir}" PARENT_SCOPE) - set(${prefix}_SHARED_LIB "${first_shared_lib_path}" PARENT_SCOPE) + set(${prefix}_VERSION + "${${prefix}_PC_VERSION}" + PARENT_SCOPE) + set(${prefix}_INCLUDE_DIR + "${include_dir}" + PARENT_SCOPE) + set(${prefix}_SHARED_LIB + "${first_shared_lib_path}" + PARENT_SCOPE) add_library(${target_shared} SHARED IMPORTED) set_target_properties(${target_shared} - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "${include_dir}" - INTERFACE_LINK_LIBRARIES - "${rest_shared_lib_paths}" - IMPORTED_LOCATION - "${first_shared_lib_path}") + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${include_dir}" + INTERFACE_LINK_LIBRARIES "${rest_shared_lib_paths}" + IMPORTED_LOCATION "${first_shared_lib_path}") get_target_property(shared_lib ${target_shared} 
IMPORTED_LOCATION) find_library(${prefix}_static_lib @@ -289,7 +309,9 @@ macro(arrow_find_package_pkg_config) PATHS "${lib_dir}" NO_DEFAULT_PATH) set(static_lib "${${prefix}_static_lib}") - set(${prefix}_STATIC_LIB "${static_lib}" PARENT_SCOPE) + set(${prefix}_STATIC_LIB + "${static_lib}" + PARENT_SCOPE) if(static_lib) add_library(${target_static} STATIC IMPORTED) set_target_properties(${target_static} @@ -315,7 +337,9 @@ function(arrow_find_package if(home) arrow_find_package_home() - set(${prefix}_FIND_APPROACH "HOME: ${home}" PARENT_SCOPE) + set(${prefix}_FIND_APPROACH + "HOME: ${home}" + PARENT_SCOPE) else() arrow_find_package_cmake_package_configuration() if(${cmake_package_name}_FOUND) @@ -324,7 +348,9 @@ function(arrow_find_package PARENT_SCOPE) else() arrow_find_package_pkg_config() - set(${prefix}_FIND_APPROACH "pkg-config: ${pkg_config_name}" PARENT_SCOPE) + set(${prefix}_FIND_APPROACH + "pkg-config: ${pkg_config_name}" + PARENT_SCOPE) endif() endif() @@ -336,7 +362,9 @@ function(arrow_find_package endif() endif() if(include_dir) - set(${prefix}_INCLUDE_DIR "${include_dir}" PARENT_SCOPE) + set(${prefix}_INCLUDE_DIR + "${include_dir}" + PARENT_SCOPE) endif() if(shared_lib) @@ -346,9 +374,13 @@ function(arrow_find_package else() set(lib_dir NOTFOUND) endif() - set(${prefix}_LIB_DIR "${lib_dir}" PARENT_SCOPE) + set(${prefix}_LIB_DIR + "${lib_dir}" + PARENT_SCOPE) # For backward compatibility - set(${prefix}_LIBS "${lib_dir}" PARENT_SCOPE) + set(${prefix}_LIBS + "${lib_dir}" + PARENT_SCOPE) endfunction() if(NOT "$ENV{ARROW_HOME}" STREQUAL "") @@ -384,9 +416,8 @@ if(ARROW_HOME) string(REGEX REPLACE "^\"(.+)\"$" "\\1" ARROW_SO_VERSION "${ARROW_SO_VERSION_QUOTED}") arrow_extract_macro_value(ARROW_FULL_SO_VERSION_QUOTED "ARROW_FULL_SO_VERSION" "${ARROW_CONFIG_H_CONTENT}") - string(REGEX - REPLACE "^\"(.+)\"$" "\\1" ARROW_FULL_SO_VERSION - "${ARROW_FULL_SO_VERSION_QUOTED}") + string(REGEX REPLACE "^\"(.+)\"$" "\\1" ARROW_FULL_SO_VERSION + 
"${ARROW_FULL_SO_VERSION_QUOTED}") endif() else() if(ARROW_USE_CMAKE_PACKAGE_CONFIG) @@ -416,16 +447,13 @@ mark_as_advanced(ARROW_ABI_VERSION ARROW_VERSION_MINOR ARROW_VERSION_PATCH) -find_package_handle_standard_args(Arrow REQUIRED_VARS - # The first required variable is shown - # in the found message. So this list is - # not sorted alphabetically. - ARROW_INCLUDE_DIR - ARROW_LIB_DIR - ARROW_FULL_SO_VERSION - ARROW_SO_VERSION - VERSION_VAR - ARROW_VERSION) +find_package_handle_standard_args( + Arrow + REQUIRED_VARS # The first required variable is shown + # in the found message. So this list is + # not sorted alphabetically. + ARROW_INCLUDE_DIR ARROW_LIB_DIR ARROW_FULL_SO_VERSION ARROW_SO_VERSION + VERSION_VAR ARROW_VERSION) set(ARROW_FOUND ${Arrow_FOUND}) if(Arrow_FOUND AND NOT Arrow_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindArrowCUDA.cmake b/cpp/cmake_modules/FindArrowCUDA.cmake index 7bc2f5b745b..014386f3012 100644 --- a/cpp/cmake_modules/FindArrowCUDA.cmake +++ b/cpp/cmake_modules/FindArrowCUDA.cmake @@ -74,13 +74,10 @@ mark_as_advanced(ARROW_CUDA_IMPORT_LIB ARROW_CUDA_VERSION ARROW_CUDA_VERSION_MATCH) -find_package_handle_standard_args(ArrowCUDA - REQUIRED_VARS - ARROW_CUDA_INCLUDE_DIR - ARROW_CUDA_LIB_DIR - ARROW_CUDA_VERSION_MATCH - VERSION_VAR - ARROW_CUDA_VERSION) +find_package_handle_standard_args( + ArrowCUDA + REQUIRED_VARS ARROW_CUDA_INCLUDE_DIR ARROW_CUDA_LIB_DIR ARROW_CUDA_VERSION_MATCH + VERSION_VAR ARROW_CUDA_VERSION) set(ARROW_CUDA_FOUND ${ArrowCUDA_FOUND}) if(ArrowCUDA_FOUND AND NOT ArrowCUDA_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindArrowDataset.cmake b/cpp/cmake_modules/FindArrowDataset.cmake index d45fae6799b..fe74f247fc3 100644 --- a/cpp/cmake_modules/FindArrowDataset.cmake +++ b/cpp/cmake_modules/FindArrowDataset.cmake @@ -74,13 +74,11 @@ mark_as_advanced(ARROW_DATASET_IMPORT_LIB ARROW_DATASET_VERSION ARROW_DATASET_VERSION_MATCH) -find_package_handle_standard_args(ArrowDataset - REQUIRED_VARS - ARROW_DATASET_INCLUDE_DIR - 
ARROW_DATASET_LIB_DIR - ARROW_DATASET_VERSION_MATCH - VERSION_VAR - ARROW_DATASET_VERSION) +find_package_handle_standard_args( + ArrowDataset + REQUIRED_VARS ARROW_DATASET_INCLUDE_DIR ARROW_DATASET_LIB_DIR + ARROW_DATASET_VERSION_MATCH + VERSION_VAR ARROW_DATASET_VERSION) set(ARROW_DATASET_FOUND ${ArrowDataset_FOUND}) if(ArrowDataset_FOUND AND NOT ArrowDataset_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindArrowFlight.cmake b/cpp/cmake_modules/FindArrowFlight.cmake index 344c408995c..805a4ff3803 100644 --- a/cpp/cmake_modules/FindArrowFlight.cmake +++ b/cpp/cmake_modules/FindArrowFlight.cmake @@ -75,13 +75,10 @@ mark_as_advanced(ARROW_FLIGHT_IMPORT_LIB ARROW_FLIGHT_VERSION ARROW_FLIGHT_VERSION_MATCH) -find_package_handle_standard_args(ArrowFlight - REQUIRED_VARS - ARROW_FLIGHT_INCLUDE_DIR - ARROW_FLIGHT_LIB_DIR - ARROW_FLIGHT_VERSION_MATCH - VERSION_VAR - ARROW_FLIGHT_VERSION) +find_package_handle_standard_args( + ArrowFlight + REQUIRED_VARS ARROW_FLIGHT_INCLUDE_DIR ARROW_FLIGHT_LIB_DIR ARROW_FLIGHT_VERSION_MATCH + VERSION_VAR ARROW_FLIGHT_VERSION) set(ARROW_FLIGHT_FOUND ${ArrowFlight_FOUND}) if(ArrowFlight_FOUND AND NOT ArrowFlight_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindArrowFlightTesting.cmake b/cpp/cmake_modules/FindArrowFlightTesting.cmake index feb2790dfc6..c0756cf637c 100644 --- a/cpp/cmake_modules/FindArrowFlightTesting.cmake +++ b/cpp/cmake_modules/FindArrowFlightTesting.cmake @@ -79,25 +79,20 @@ mark_as_advanced(ARROW_FLIGHT_TESTING_IMPORT_LIB ARROW_FLIGHT_TESTING_VERSION ARROW_FLIGHT_TESTING_VERSION_MATCH) -find_package_handle_standard_args(ArrowFlightTesting - REQUIRED_VARS - ARROW_FLIGHT_TESTING_INCLUDE_DIR - ARROW_FLIGHT_TESTING_LIB_DIR - ARROW_FLIGHT_TESTING_VERSION_MATCH - VERSION_VAR - ARROW_FLIGHT_TESTING_VERSION) +find_package_handle_standard_args( + ArrowFlightTesting + REQUIRED_VARS ARROW_FLIGHT_TESTING_INCLUDE_DIR ARROW_FLIGHT_TESTING_LIB_DIR + ARROW_FLIGHT_TESTING_VERSION_MATCH + VERSION_VAR ARROW_FLIGHT_TESTING_VERSION) 
set(ARROW_FLIGHT_TESTING_FOUND ${ArrowFlightTesting_FOUND}) if(ArrowFlightTesting_FOUND AND NOT ArrowFlightTesting_FIND_QUIETLY) - message( - STATUS "Found the Arrow Flight testing by ${ARROW_FLIGHT_TESTING_FIND_APPROACH}") - message( - STATUS - "Found the Arrow Flight testing shared library: ${ARROW_FLIGHT_TESTING_SHARED_LIB}") - message( - STATUS - "Found the Arrow Flight testing import library: ${ARROW_FLIGHT_TESTING_IMPORT_LIB}") - message( - STATUS - "Found the Arrow Flight testing static library: ${ARROW_FLIGHT_TESTING_STATIC_LIB}") + message(STATUS "Found the Arrow Flight testing by ${ARROW_FLIGHT_TESTING_FIND_APPROACH}" + ) + message(STATUS "Found the Arrow Flight testing shared library: ${ARROW_FLIGHT_TESTING_SHARED_LIB}" + ) + message(STATUS "Found the Arrow Flight testing import library: ${ARROW_FLIGHT_TESTING_IMPORT_LIB}" + ) + message(STATUS "Found the Arrow Flight testing static library: ${ARROW_FLIGHT_TESTING_STATIC_LIB}" + ) endif() diff --git a/cpp/cmake_modules/FindArrowPython.cmake b/cpp/cmake_modules/FindArrowPython.cmake index 3d1280dff72..b503e6a9e02 100644 --- a/cpp/cmake_modules/FindArrowPython.cmake +++ b/cpp/cmake_modules/FindArrowPython.cmake @@ -73,13 +73,10 @@ mark_as_advanced(ARROW_PYTHON_IMPORT_LIB ARROW_PYTHON_VERSION ARROW_PYTHON_VERSION_MATCH) -find_package_handle_standard_args(ArrowPython - REQUIRED_VARS - ARROW_PYTHON_INCLUDE_DIR - ARROW_PYTHON_LIB_DIR - ARROW_PYTHON_VERSION_MATCH - VERSION_VAR - ARROW_PYTHON_VERSION) +find_package_handle_standard_args( + ArrowPython + REQUIRED_VARS ARROW_PYTHON_INCLUDE_DIR ARROW_PYTHON_LIB_DIR ARROW_PYTHON_VERSION_MATCH + VERSION_VAR ARROW_PYTHON_VERSION) set(ARROW_PYTHON_FOUND ${ArrowPython_FOUND}) if(ArrowPython_FOUND AND NOT ArrowPython_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindArrowPythonFlight.cmake b/cpp/cmake_modules/FindArrowPythonFlight.cmake index acb22c64231..3a639928ce5 100644 --- a/cpp/cmake_modules/FindArrowPythonFlight.cmake +++ 
b/cpp/cmake_modules/FindArrowPythonFlight.cmake @@ -76,24 +76,19 @@ mark_as_advanced(ARROW_PYTHON_FLIGHT_IMPORT_LIB ARROW_PYTHON_FLIGHT_VERSION ARROW_PYTHON_FLIGHT_VERSION_MATCH) -find_package_handle_standard_args(ArrowPythonFlight - REQUIRED_VARS - ARROW_PYTHON_FLIGHT_INCLUDE_DIR - ARROW_PYTHON_FLIGHT_LIB_DIR - ARROW_PYTHON_FLIGHT_VERSION_MATCH - VERSION_VAR - ARROW_PYTHON_FLIGHT_VERSION) +find_package_handle_standard_args( + ArrowPythonFlight + REQUIRED_VARS ARROW_PYTHON_FLIGHT_INCLUDE_DIR ARROW_PYTHON_FLIGHT_LIB_DIR + ARROW_PYTHON_FLIGHT_VERSION_MATCH + VERSION_VAR ARROW_PYTHON_FLIGHT_VERSION) set(ARROW_PYTHON_FLIGHT_FOUND ${ArrowPythonFlight_FOUND}) if(ArrowPythonFlight_FOUND AND NOT ArrowPythonFlight_FIND_QUIETLY) message(STATUS "Found the Arrow Python Flight by ${ARROW_PYTHON_FLIGHT_FIND_APPROACH}") - message( - STATUS - "Found the Arrow Python Flight shared library: ${ARROW_PYTHON_FLIGHT_SHARED_LIB}") - message( - STATUS - "Found the Arrow Python Flight import library: ${ARROW_PYTHON_FLIGHT_IMPORT_LIB}") - message( - STATUS - "Found the Arrow Python Flight static library: ${ARROW_PYTHON_FLIGHT_STATIC_LIB}") + message(STATUS "Found the Arrow Python Flight shared library: ${ARROW_PYTHON_FLIGHT_SHARED_LIB}" + ) + message(STATUS "Found the Arrow Python Flight import library: ${ARROW_PYTHON_FLIGHT_IMPORT_LIB}" + ) + message(STATUS "Found the Arrow Python Flight static library: ${ARROW_PYTHON_FLIGHT_STATIC_LIB}" + ) endif() diff --git a/cpp/cmake_modules/FindArrowTesting.cmake b/cpp/cmake_modules/FindArrowTesting.cmake index ed5a28cd3e4..c405003ad70 100644 --- a/cpp/cmake_modules/FindArrowTesting.cmake +++ b/cpp/cmake_modules/FindArrowTesting.cmake @@ -74,13 +74,11 @@ mark_as_advanced(ARROW_TESTING_IMPORT_LIB ARROW_TESTING_VERSION ARROW_TESTING_VERSION_MATCH) -find_package_handle_standard_args(ArrowTesting - REQUIRED_VARS - ARROW_TESTING_INCLUDE_DIR - ARROW_TESTING_LIB_DIR - ARROW_TESTING_VERSION_MATCH - VERSION_VAR - ARROW_TESTING_VERSION) 
+find_package_handle_standard_args( + ArrowTesting + REQUIRED_VARS ARROW_TESTING_INCLUDE_DIR ARROW_TESTING_LIB_DIR + ARROW_TESTING_VERSION_MATCH + VERSION_VAR ARROW_TESTING_VERSION) set(ARROW_TESTING_FOUND ${ArrowTesting_FOUND}) if(ArrowTesting_FOUND AND NOT ArrowTesting_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindBrotli.cmake b/cpp/cmake_modules/FindBrotli.cmake index b46a0f1a0cf..e2670b51a9e 100644 --- a/cpp/cmake_modules/FindBrotli.cmake +++ b/cpp/cmake_modules/FindBrotli.cmake @@ -110,12 +110,9 @@ else() endif() endif() -find_package_handle_standard_args(Brotli - REQUIRED_VARS - BROTLI_COMMON_LIBRARY - BROTLI_ENC_LIBRARY - BROTLI_DEC_LIBRARY - BROTLI_INCLUDE_DIR) +find_package_handle_standard_args( + Brotli REQUIRED_VARS BROTLI_COMMON_LIBRARY BROTLI_ENC_LIBRARY BROTLI_DEC_LIBRARY + BROTLI_INCLUDE_DIR) if(Brotli_FOUND OR BROTLI_FOUND) set(Brotli_FOUND TRUE) add_library(Brotli::brotlicommon UNKNOWN IMPORTED) diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake index 88171abed92..52fc59895b8 100644 --- a/cpp/cmake_modules/FindClangTools.cmake +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -69,15 +69,18 @@ function(FIND_CLANG_TOOL NAME OUTPUT VERSION_CHECK_PATTERN) endif() endif() if(CLANG_TOOL_BIN) - set(${OUTPUT} ${CLANG_TOOL_BIN} PARENT_SCOPE) + set(${OUTPUT} + ${CLANG_TOOL_BIN} + PARENT_SCOPE) else() - set(${OUTPUT} "${OUTPUT}-NOTFOUND" PARENT_SCOPE) + set(${OUTPUT} + "${OUTPUT}-NOTFOUND" + PARENT_SCOPE) endif() endfunction() -string(REGEX - REPLACE "\\." "\\\\." ARROW_CLANG_TOOLS_VERSION_ESCAPED - "${ARROW_CLANG_TOOLS_VERSION}") +string(REGEX REPLACE "\\." "\\\\." 
ARROW_CLANG_TOOLS_VERSION_ESCAPED + "${ARROW_CLANG_TOOLS_VERSION}") find_clang_tool(clang-tidy CLANG_TIDY_BIN "LLVM version ${ARROW_CLANG_TOOLS_VERSION_ESCAPED}") @@ -100,4 +103,4 @@ else() endif() find_package_handle_standard_args(ClangTools REQUIRED_VARS CLANG_FORMAT_BIN - CLANG_TIDY_BIN) + CLANG_TIDY_BIN) diff --git a/cpp/cmake_modules/FindGLOG.cmake b/cpp/cmake_modules/FindGLOG.cmake index 81c3f2ec57e..d67eb005621 100644 --- a/cpp/cmake_modules/FindGLOG.cmake +++ b/cpp/cmake_modules/FindGLOG.cmake @@ -38,7 +38,9 @@ elseif(GLOG_ROOT) NO_DEFAULT_PATH PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() - find_library(GLOG_LIB NAMES glog PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_library(GLOG_LIB + NAMES glog + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) find_path(GLOG_INCLUDE_DIR NAMES glog/logging.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) diff --git a/cpp/cmake_modules/FindGandiva.cmake b/cpp/cmake_modules/FindGandiva.cmake index 15279fd841a..c533abed733 100644 --- a/cpp/cmake_modules/FindGandiva.cmake +++ b/cpp/cmake_modules/FindGandiva.cmake @@ -79,14 +79,11 @@ mark_as_advanced(GANDIVA_ABI_VERSION GANDIVA_VERSION GANDIVA_VERSION_MATCH) -find_package_handle_standard_args(Gandiva - REQUIRED_VARS - GANDIVA_INCLUDE_DIR - GANDIVA_LIB_DIR - GANDIVA_SO_VERSION - GANDIVA_VERSION_MATCH - VERSION_VAR - GANDIVA_VERSION) +find_package_handle_standard_args( + Gandiva + REQUIRED_VARS GANDIVA_INCLUDE_DIR GANDIVA_LIB_DIR GANDIVA_SO_VERSION + GANDIVA_VERSION_MATCH + VERSION_VAR GANDIVA_VERSION) set(GANDIVA_FOUND ${Gandiva_FOUND}) if(Gandiva_FOUND AND NOT Gandiva_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindLLVMAlt.cmake b/cpp/cmake_modules/FindLLVMAlt.cmake index 7695c09ae8c..380f2d47c72 100644 --- a/cpp/cmake_modules/FindLLVMAlt.cmake +++ b/cpp/cmake_modules/FindLLVMAlt.cmake @@ -58,22 +58,17 @@ if(LLVM_FOUND) add_library(LLVM::LLVM_INTERFACE INTERFACE IMPORTED) set_target_properties(LLVM::LLVM_INTERFACE - PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - 
"${LLVM_INCLUDE_DIRS}" - INTERFACE_COMPILE_FLAGS - "${LLVM_DEFINITIONS}" - INTERFACE_LINK_LIBRARIES - "${LLVM_LIBS}") + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LLVM_INCLUDE_DIRS}" + INTERFACE_COMPILE_FLAGS "${LLVM_DEFINITIONS}" + INTERFACE_LINK_LIBRARIES "${LLVM_LIBS}") endif() mark_as_advanced(CLANG_EXECUTABLE LLVM_LINK_EXECUTABLE) -find_package_handle_standard_args(LLVMAlt - REQUIRED_VARS # The first variable is used for display. - LLVM_PACKAGE_VERSION - CLANG_EXECUTABLE - LLVM_FOUND - LLVM_LINK_EXECUTABLE) +find_package_handle_standard_args( + LLVMAlt + REQUIRED_VARS # The first variable is used for display. + LLVM_PACKAGE_VERSION CLANG_EXECUTABLE LLVM_FOUND LLVM_LINK_EXECUTABLE) if(LLVMAlt_FOUND) message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") message(STATUS "Found llvm-link ${LLVM_LINK_EXECUTABLE}") diff --git a/cpp/cmake_modules/FindLz4.cmake b/cpp/cmake_modules/FindLz4.cmake index 14b6d93b983..bc8051fe9c5 100644 --- a/cpp/cmake_modules/FindLz4.cmake +++ b/cpp/cmake_modules/FindLz4.cmake @@ -23,16 +23,13 @@ set(LZ4_LIB_NAME_BASE "${LZ4_MSVC_LIB_PREFIX}lz4") if(ARROW_LZ4_USE_SHARED) set(LZ4_LIB_NAMES) if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list( - APPEND - LZ4_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}${LZ4_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" - ) + list(APPEND + LZ4_LIB_NAMES + "${CMAKE_IMPORT_LIBRARY_PREFIX}${LZ4_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" + ) endif() - list( - APPEND - LZ4_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}${LZ4_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}") + list(APPEND LZ4_LIB_NAMES + "${CMAKE_SHARED_LIBRARY_PREFIX}${LZ4_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}") else() if(MSVC AND NOT DEFINED LZ4_MSVC_STATIC_LIB_SUFFIX) set(LZ4_MSVC_STATIC_LIB_SUFFIX "_static") @@ -70,7 +67,9 @@ else() find_library(LZ4_LIB NAMES ${LZ4_LIB_NAMES} PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_path(LZ4_INCLUDE_DIR NAMES lz4.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_path(LZ4_INCLUDE_DIR + NAMES lz4.h + 
PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() endif() diff --git a/cpp/cmake_modules/FindORC.cmake b/cpp/cmake_modules/FindORC.cmake index 061a0df2e9e..d45b1607833 100644 --- a/cpp/cmake_modules/FindORC.cmake +++ b/cpp/cmake_modules/FindORC.cmake @@ -33,7 +33,9 @@ if(ORC_ROOT) NO_DEFAULT_PATH PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() - find_library(ORC_STATIC_LIB NAMES orc PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_library(ORC_STATIC_LIB + NAMES orc + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) find_path(ORC_INCLUDE_DIR NAMES orc/orc-config.hh PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake index 99124b2c037..e071fc822b6 100644 --- a/cpp/cmake_modules/FindParquet.cmake +++ b/cpp/cmake_modules/FindParquet.cmake @@ -83,13 +83,12 @@ if(ARROW_FOUND) arrow_extract_macro_value(PARQUET_SO_VERSION_QUOTED "PARQUET_SO_VERSION" "${PARQUET_VERSION_H_CONTENT}") - string(REGEX - REPLACE "^\"(.+)\"$" "\\1" PARQUET_SO_VERSION "${PARQUET_SO_VERSION_QUOTED}") + string(REGEX REPLACE "^\"(.+)\"$" "\\1" PARQUET_SO_VERSION + "${PARQUET_SO_VERSION_QUOTED}") arrow_extract_macro_value(PARQUET_FULL_SO_VERSION_QUOTED "PARQUET_FULL_SO_VERSION" "${PARQUET_VERSION_H_CONTENT}") - string(REGEX - REPLACE "^\"(.+)\"$" "\\1" PARQUET_FULL_SO_VERSION - "${PARQUET_FULL_SO_VERSION_QUOTED}") + string(REGEX REPLACE "^\"(.+)\"$" "\\1" PARQUET_FULL_SO_VERSION + "${PARQUET_FULL_SO_VERSION_QUOTED}") endif() else() if(PARQUET_USE_CMAKE_PACKAGE_CONFIG) @@ -113,13 +112,10 @@ mark_as_advanced(PARQUET_ABI_VERSION PARQUET_STATIC_LIB PARQUET_VERSION) -find_package_handle_standard_args(Parquet - REQUIRED_VARS - PARQUET_INCLUDE_DIR - PARQUET_LIB_DIR - PARQUET_SO_VERSION - VERSION_VAR - PARQUET_VERSION) +find_package_handle_standard_args( + Parquet + REQUIRED_VARS PARQUET_INCLUDE_DIR PARQUET_LIB_DIR PARQUET_SO_VERSION + VERSION_VAR PARQUET_VERSION) set(PARQUET_FOUND ${Parquet_FOUND}) if(Parquet_FOUND AND 
NOT Parquet_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindPlasma.cmake b/cpp/cmake_modules/FindPlasma.cmake index d56b7141320..2e634844c59 100644 --- a/cpp/cmake_modules/FindPlasma.cmake +++ b/cpp/cmake_modules/FindPlasma.cmake @@ -87,14 +87,10 @@ mark_as_advanced(PLASMA_ABI_VERSION PLASMA_STORE_SERVER PLASMA_VERSION) -find_package_handle_standard_args(Plasma - REQUIRED_VARS - PLASMA_INCLUDE_DIR - PLASMA_LIB_DIR - PLASMA_SO_VERSION - PLASMA_STORE_SERVER - VERSION_VAR - PLASMA_VERSION) +find_package_handle_standard_args( + Plasma + REQUIRED_VARS PLASMA_INCLUDE_DIR PLASMA_LIB_DIR PLASMA_SO_VERSION PLASMA_STORE_SERVER + VERSION_VAR PLASMA_VERSION) set(PLASMA_FOUND ${Plasma_FOUND}) if(Plasma_FOUND AND NOT Plasma_FIND_QUIETLY) diff --git a/cpp/cmake_modules/FindPython3Alt.cmake b/cpp/cmake_modules/FindPython3Alt.cmake index 131a0d395fc..ab91c7be052 100644 --- a/cpp/cmake_modules/FindPython3Alt.cmake +++ b/cpp/cmake_modules/FindPython3Alt.cmake @@ -33,11 +33,8 @@ if(${CMAKE_VERSION} VERSION_LESS "3.15.0") find_package(PythonLibsNew) find_package(NumPy) endif() - find_package_handle_standard_args(Python3Alt - REQUIRED_VARS - PYTHON_EXECUTABLE - PYTHON_INCLUDE_DIRS - NUMPY_INCLUDE_DIRS) + find_package_handle_standard_args( + Python3Alt REQUIRED_VARS PYTHON_EXECUTABLE PYTHON_INCLUDE_DIRS NUMPY_INCLUDE_DIRS) return() endif() @@ -46,13 +43,17 @@ if(${CMAKE_VERSION} VERSION_LESS "3.18.0" OR ARROW_BUILD_TESTS) # the full "Development" component. Also ask for it on CMake < 3.18, # where "Development.Module" is not available. 
if(Python3Alt_FIND_REQUIRED) - find_package(Python3 COMPONENTS Interpreter Development NumPy REQUIRED) + find_package(Python3 + COMPONENTS Interpreter Development NumPy + REQUIRED) else() find_package(Python3 COMPONENTS Interpreter Development NumPy) endif() else() if(Python3Alt_FIND_REQUIRED) - find_package(Python3 COMPONENTS Interpreter Development.Module NumPy REQUIRED) + find_package(Python3 + COMPONENTS Interpreter Development.Module NumPy + REQUIRED) else() find_package(Python3 COMPONENTS Interpreter Development.Module NumPy) endif() @@ -72,12 +73,11 @@ get_target_property(NUMPY_INCLUDE_DIRS Python3::NumPy INTERFACE_INCLUDE_DIRECTOR # CMake's python3_add_library() doesn't apply the required extension suffix, # detect it ourselves. # (https://gitlab.kitware.com/cmake/cmake/issues/20408) -execute_process( - COMMAND "${PYTHON_EXECUTABLE}" "-c" - "from distutils import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))" - RESULT_VARIABLE _PYTHON_RESULT - OUTPUT_VARIABLE _PYTHON_STDOUT - ERROR_VARIABLE _PYTHON_STDERR) +execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "from distutils import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))" + RESULT_VARIABLE _PYTHON_RESULT + OUTPUT_VARIABLE _PYTHON_STDOUT + ERROR_VARIABLE _PYTHON_STDERR) if(NOT _PYTHON_RESULT MATCHES 0) if(Python3Alt_FIND_REQUIRED) @@ -92,8 +92,5 @@ function(PYTHON_ADD_MODULE name) set_target_properties(${name} PROPERTIES SUFFIX ${_EXT_SUFFIX}) endfunction() -find_package_handle_standard_args(Python3Alt - REQUIRED_VARS - PYTHON_EXECUTABLE - PYTHON_INCLUDE_DIRS - NUMPY_INCLUDE_DIRS) +find_package_handle_standard_args( + Python3Alt REQUIRED_VARS PYTHON_EXECUTABLE PYTHON_INCLUDE_DIRS NUMPY_INCLUDE_DIRS) diff --git a/cpp/cmake_modules/FindRapidJSONAlt.cmake b/cpp/cmake_modules/FindRapidJSONAlt.cmake index a967ef61a66..9a449a5280e 100644 --- a/cpp/cmake_modules/FindRapidJSONAlt.cmake +++ b/cpp/cmake_modules/FindRapidJSONAlt.cmake @@ -36,39 +36,37 @@ if(RapidJSON_ROOT) NO_DEFAULT_PATH 
PATH_SUFFIXES "include") else() - find_path(RAPIDJSON_INCLUDE_DIR NAMES rapidjson/rapidjson.h PATH_SUFFIXES "include") + find_path(RAPIDJSON_INCLUDE_DIR + NAMES rapidjson/rapidjson.h + PATH_SUFFIXES "include") endif() if(RAPIDJSON_INCLUDE_DIR) file(READ "${RAPIDJSON_INCLUDE_DIR}/rapidjson/rapidjson.h" RAPIDJSON_H_CONTENT) string(REGEX MATCH "#define RAPIDJSON_MAJOR_VERSION ([0-9]+)" RAPIDJSON_MAJOR_VERSION_DEFINITION "${RAPIDJSON_H_CONTENT}") - string(REGEX - REPLACE "^.+ ([0-9]+)$" "\\1" RAPIDJSON_MAJOR_VERSION - "${RAPIDJSON_MAJOR_VERSION_DEFINITION}") + string(REGEX REPLACE "^.+ ([0-9]+)$" "\\1" RAPIDJSON_MAJOR_VERSION + "${RAPIDJSON_MAJOR_VERSION_DEFINITION}") string(REGEX MATCH "#define RAPIDJSON_MINOR_VERSION ([0-9]+)" RAPIDJSON_MINOR_VERSION_DEFINITION "${RAPIDJSON_H_CONTENT}") - string(REGEX - REPLACE "^.+ ([0-9]+)$" "\\1" RAPIDJSON_MINOR_VERSION - "${RAPIDJSON_MINOR_VERSION_DEFINITION}") + string(REGEX REPLACE "^.+ ([0-9]+)$" "\\1" RAPIDJSON_MINOR_VERSION + "${RAPIDJSON_MINOR_VERSION_DEFINITION}") string(REGEX MATCH "#define RAPIDJSON_PATCH_VERSION ([0-9]+)" RAPIDJSON_PATCH_VERSION_DEFINITION "${RAPIDJSON_H_CONTENT}") - string(REGEX - REPLACE "^.+ ([0-9]+)$" "\\1" RAPIDJSON_PATCH_VERSION - "${RAPIDJSON_PATCH_VERSION_DEFINITION}") + string(REGEX REPLACE "^.+ ([0-9]+)$" "\\1" RAPIDJSON_PATCH_VERSION + "${RAPIDJSON_PATCH_VERSION_DEFINITION}") if("${RAPIDJSON_MAJOR_VERSION}" STREQUAL "" OR "${RAPIDJSON_MINOR_VERSION}" STREQUAL "" OR "${RAPIDJSON_PATCH_VERSION}" STREQUAL "") set(RAPIDJSON_VERSION "0.0.0") else() - set( - RAPIDJSON_VERSION - "${RAPIDJSON_MAJOR_VERSION}.${RAPIDJSON_MINOR_VERSION}.${RAPIDJSON_PATCH_VERSION}") + set(RAPIDJSON_VERSION + "${RAPIDJSON_MAJOR_VERSION}.${RAPIDJSON_MINOR_VERSION}.${RAPIDJSON_PATCH_VERSION}" + ) endif() endif() -find_package_handle_standard_args(RapidJSONAlt - REQUIRED_VARS - RAPIDJSON_INCLUDE_DIR - VERSION_VAR - RAPIDJSON_VERSION) +find_package_handle_standard_args( + RapidJSONAlt + REQUIRED_VARS RAPIDJSON_INCLUDE_DIR + 
VERSION_VAR RAPIDJSON_VERSION) diff --git a/cpp/cmake_modules/FindSnappy.cmake b/cpp/cmake_modules/FindSnappy.cmake index 26cccb786c5..747df31854d 100644 --- a/cpp/cmake_modules/FindSnappy.cmake +++ b/cpp/cmake_modules/FindSnappy.cmake @@ -19,20 +19,19 @@ if(ARROW_SNAPPY_USE_SHARED) set(SNAPPY_LIB_NAMES) if(CMAKE_IMPORT_LIBRARY_SUFFIX) list(APPEND SNAPPY_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}snappy${CMAKE_IMPORT_LIBRARY_SUFFIX}") + "${CMAKE_IMPORT_LIBRARY_PREFIX}snappy${CMAKE_IMPORT_LIBRARY_SUFFIX}") endif() list(APPEND SNAPPY_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}snappy${CMAKE_SHARED_LIBRARY_SUFFIX}") + "${CMAKE_SHARED_LIBRARY_PREFIX}snappy${CMAKE_SHARED_LIBRARY_SUFFIX}") else() set(SNAPPY_STATIC_LIB_NAME_BASE "snappy") if(MSVC) set(SNAPPY_STATIC_LIB_NAME_BASE "${SNAPPY_STATIC_LIB_NAME_BASE}${SNAPPY_MSVC_STATIC_LIB_SUFFIX}") endif() - set( - SNAPPY_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(SNAPPY_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) endif() if(Snappy_ROOT) diff --git a/cpp/cmake_modules/FindThrift.cmake b/cpp/cmake_modules/FindThrift.cmake index 273d907ed07..750d8ce8341 100644 --- a/cpp/cmake_modules/FindThrift.cmake +++ b/cpp/cmake_modules/FindThrift.cmake @@ -33,9 +33,13 @@ function(EXTRACT_THRIFT_VERSION) string(REGEX MATCH "#define PACKAGE_VERSION \"[0-9.]+\"" THRIFT_VERSION_DEFINITION "${THRIFT_CONFIG_H_CONTENT}") string(REGEX MATCH "[0-9.]+" THRIFT_VERSION "${THRIFT_VERSION_DEFINITION}") - set(THRIFT_VERSION "${THRIFT_VERSION}" PARENT_SCOPE) + set(THRIFT_VERSION + "${THRIFT_VERSION}" + PARENT_SCOPE) else() - set(THRIFT_VERSION "" PARENT_SCOPE) + set(THRIFT_VERSION + "" + PARENT_SCOPE) endif() endfunction(EXTRACT_THRIFT_VERSION) @@ -53,21 +57,19 @@ set(THRIFT_LIB_NAME_BASE "thrift${THRIFT_MSVC_LIB_SUFFIX}") if(ARROW_THRIFT_USE_SHARED) set(THRIFT_LIB_NAMES thrift) if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list( - 
APPEND - THRIFT_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" - ) - endif() - list( - APPEND - THRIFT_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" + list(APPEND + THRIFT_LIB_NAMES + "${CMAKE_IMPORT_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" ) + endif() + list(APPEND + THRIFT_LIB_NAMES + "${CMAKE_SHARED_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" + ) else() - set( - THRIFT_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(THRIFT_LIB_NAMES + "${CMAKE_STATIC_LIBRARY_PREFIX}${THRIFT_LIB_NAME_BASE}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) endif() if(Thrift_ROOT) @@ -78,7 +80,9 @@ if(Thrift_ROOT) find_path(THRIFT_INCLUDE_DIR thrift/Thrift.h PATHS ${Thrift_ROOT} PATH_SUFFIXES "include") - find_program(THRIFT_COMPILER thrift PATHS ${Thrift_ROOT} PATH_SUFFIXES "bin") + find_program(THRIFT_COMPILER thrift + PATHS ${Thrift_ROOT} + PATH_SUFFIXES "bin") extract_thrift_version() else() # THRIFT-4760: The pkgconfig files are currently only installed when using autotools. @@ -115,13 +119,11 @@ else() set(Thrift_COMPILER_FOUND FALSE) endif() -find_package_handle_standard_args(Thrift - REQUIRED_VARS - THRIFT_LIB - THRIFT_INCLUDE_DIR - VERSION_VAR - THRIFT_VERSION - HANDLE_COMPONENTS) +find_package_handle_standard_args( + Thrift + REQUIRED_VARS THRIFT_LIB THRIFT_INCLUDE_DIR + VERSION_VAR THRIFT_VERSION + HANDLE_COMPONENTS) if(Thrift_FOUND OR THRIFT_FOUND) set(Thrift_FOUND TRUE) diff --git a/cpp/cmake_modules/Findc-aresAlt.cmake b/cpp/cmake_modules/Findc-aresAlt.cmake new file mode 100644 index 00000000000..5213e8d12a1 --- /dev/null +++ b/cpp/cmake_modules/Findc-aresAlt.cmake @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(find_package_args) +if(c-aresAlt_FIND_VERSION) + list(APPEND find_package_args ${c-aresAlt_FIND_VERSION}) +endif() +if(c-aresAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() +find_package(c-ares ${find_package_args}) +if(c-ares_FOUND) + set(c-aresAlt_FOUND TRUE) + return() +endif() + +find_package(PkgConfig QUIET) +pkg_check_modules(c-ares_PC libcares) +if(c-ares_PC_FOUND) + set(c-ares_INCLUDE_DIR "${c-ares_PC_INCLUDEDIR}") + + list(APPEND c-ares_PC_LIBRARY_DIRS "${c-ares_PC_LIBDIR}") + find_library(c-ares_LIB cares + PATHS ${c-ares_PC_LIBRARY_DIRS} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} + NO_DEFAULT_PATH) +elseif(c-ares_ROOT) + find_library(c-ares_LIB + NAMES cares + "${CMAKE_SHARED_LIBRARY_PREFIX}cares${CMAKE_SHARED_LIBRARY_SUFFIX}" + PATHS ${c-ares_ROOT} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} + NO_DEFAULT_PATH) + find_path(c-ares_INCLUDE_DIR + NAMES ares.h + PATHS ${c-ares_ROOT} + NO_DEFAULT_PATH + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) +else() + find_library(c-ares_LIB + NAMES cares + "${CMAKE_SHARED_LIBRARY_PREFIX}cares${CMAKE_SHARED_LIBRARY_SUFFIX}" + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_path(c-ares_INCLUDE_DIR + NAMES ares.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) +endif() + 
+find_package_handle_standard_args(c-aresAlt REQUIRED_VARS c-ares_LIB c-ares_INCLUDE_DIR) + +if(c-aresAlt_FOUND) + if(NOT TARGET c-ares::cares) + add_library(c-ares::cares UNKNOWN IMPORTED) + set_target_properties(c-ares::cares + PROPERTIES IMPORTED_LOCATION "${c-ares_LIB}" + INTERFACE_INCLUDE_DIRECTORIES + "${c-ares_INCLUDE_DIR}") + endif() +endif() diff --git a/cpp/cmake_modules/FindgRPCAlt.cmake b/cpp/cmake_modules/FindgRPCAlt.cmake index 79fe01744d3..18b23f32269 100644 --- a/cpp/cmake_modules/FindgRPCAlt.cmake +++ b/cpp/cmake_modules/FindgRPCAlt.cmake @@ -24,224 +24,53 @@ if(gRPC_FOUND) return() endif() -unset(GRPC_ALT_VERSION) - -if(ARROW_GRPC_USE_SHARED) - set(GRPC_GPR_LIB_NAMES) - set(GRPC_GRPC_LIB_NAMES) - set(GRPC_GRPCPP_LIB_NAMES) - set(GRPC_ADDRESS_SORTING_LIB_NAMES) - set(GRPC_UPB_LIB_NAMES) - if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list(APPEND GRPC_GPR_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}gpr${CMAKE_IMPORT_LIBRARY_SUFFIX}") - list(APPEND GRPC_GRPC_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}grpc${CMAKE_IMPORT_LIBRARY_SUFFIX}") - list(APPEND GRPC_GRPCPP_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}grpc++${CMAKE_IMPORT_LIBRARY_SUFFIX}") - list( - APPEND GRPC_ADDRESS_SORTING_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}address_sorting${CMAKE_IMPORT_LIBRARY_SUFFIX}" - ) - list(APPEND GRPC_UPB_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}upb${CMAKE_IMPORT_LIBRARY_SUFFIX}") +find_package(PkgConfig QUIET) +pkg_check_modules(GRPCPP_PC grpc++) +if(GRPCPP_PC_FOUND) + set(gRPCAlt_VERSION "${GRPCPP_PC_VERSION}") + set(GRPCPP_INCLUDE_DIRECTORIES ${GRPCPP_PC_INCLUDEDIR}) + if(ARROW_GRPC_USE_SHARED) + set(GRPCPP_LINK_LIBRARIES ${GRPCPP_PC_LINK_LIBRARIES}) + set(GRPCPP_LINK_OPTIONS ${GRPCPP_PC_LDFLAGS_OTHER}) + set(GRPCPP_COMPILE_OPTIONS ${GRPCPP_PC_CFLAGS_OTHER}) + else() + set(GRPCPP_LINK_LIBRARIES) + foreach(GRPCPP_LIBRARY_NAME ${GRPCPP_PC_STATIC_LIBRARIES}) + find_library(GRPCPP_LIBRARY_${GRPCPP_LIBRARY_NAME} + NAMES 
"${CMAKE_STATIC_LIBRARY_PREFIX}${GRPCPP_LIBRARY_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + HINTS ${GRPCPP_PC_STATIC_LIBRARY_DIRS}) + list(APPEND GRPCPP_LINK_LIBRARIES "${GRPCPP_LIBRARY_${GRPCPP_LIBRARY_NAME}}") + endforeach() + set(GRPCPP_LINK_OPTIONS ${GRPCPP_PC_STATIC_LDFLAGS_OTHER}) + set(GRPCPP_COMPILE_OPTIONS ${GRPCPP_PC_STATIC_CFLAGS_OTHER}) endif() - list(APPEND GRPC_GPR_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}gpr${CMAKE_SHARED_LIBRARY_SUFFIX}") - list(APPEND GRPC_GRPC_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}grpc${CMAKE_SHARED_LIBRARY_SUFFIX}") - list(APPEND GRPC_GRPCPP_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}grpc++${CMAKE_SHARED_LIBRARY_SUFFIX}") - list( - APPEND GRPC_ADDRESS_SORTING_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}address_sorting${CMAKE_SHARED_LIBRARY_SUFFIX}") - list(APPEND GRPC_UPB_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}upb${CMAKE_SHARED_LIBRARY_SUFFIX}") -else() - set(GRPC_GPR_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}gpr${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_GRPC_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}grpc${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_GRPCPP_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}grpc++${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_ADDRESS_SORTING_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}address_sorting${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(GRPC_UPB_LIB_NAMES - "${CMAKE_STATIC_LIBRARY_PREFIX}upb${CMAKE_STATIC_LIBRARY_SUFFIX}") -endif() - -if(gRPC_ROOT) - find_library(GRPC_GPR_LIB - NAMES ${GRPC_GPR_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_GRPC_LIB - NAMES ${GRPC_GRPC_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_GRPCPP_LIB - NAMES ${GRPC_GRPCPP_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_ADDRESS_SORTING_LIB - NAMES ${GRPC_ADDRESS_SORTING_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES 
${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_UPB_LIB - NAMES ${GRPC_UPB_LIB_NAMES} - PATHS ${gRPC_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin NO_DEFAULT_PATH - PATHS ${gRPC_ROOT} + list(GET GRPCPP_LINK_LIBRARIES 0 GRPCPP_IMPORTED_LOCATION) + list(REMOVE_AT GRPCPP_LINK_LIBRARIES 0) + find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin + HINTS ${GRPCPP_PC_PREFIX} + NO_DEFAULT_PATH PATH_SUFFIXES "bin") - find_path(GRPC_INCLUDE_DIR - NAMES grpc/grpc.h - PATHS ${gRPC_ROOT} - NO_DEFAULT_PATH - PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) -else() - find_package(PkgConfig QUIET) - pkg_check_modules(GRPC_PC grpc++) - if(GRPC_PC_FOUND) - set(GRPC_ALT_VERSION "${GRPC_PC_VERSION}") - set(GRPC_INCLUDE_DIR "${GRPC_PC_INCLUDEDIR}") - list(APPEND GRPC_PC_LIBRARY_DIRS "${GRPC_PC_LIBDIR}") - message(STATUS "${GRPC_PC_LIBRARY_DIRS}") - - find_library(GRPC_GPR_LIB - NAMES ${GRPC_GPR_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_GRPC_LIB - NAMES ${GRPC_GRPC_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_GRPCPP_LIB - NAMES ${GRPC_GRPCPP_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_ADDRESS_SORTING_LIB - NAMES ${GRPC_ADDRESS_SORTING_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_library(GRPC_UPB_LIB - NAMES ${GRPC_UPB_LIB_NAMES} - PATHS ${GRPC_PC_LIBRARY_DIRS} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) - find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin - HINTS ${GRPC_PC_PREFIX} - NO_DEFAULT_PATH - PATH_SUFFIXES "bin") - else() - find_library(GRPC_GPR_LIB - NAMES ${GRPC_GPR_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_library(GRPC_GRPC_LIB - 
NAMES ${GRPC_GRPC_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_library(GRPC_GRPCPP_LIB - NAMES ${GRPC_GRPCPP_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_library(GRPC_ADDRESS_SORTING_LIB - NAMES ${GRPC_ADDRESS_SORTING_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_library(GRPC_UPB_LIB - NAMES ${GRPC_UPB_LIB_NAMES} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin PATH_SUFFIXES "bin") - find_path(GRPC_INCLUDE_DIR - NAMES grpc/grpc.h - PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + set(gRPCAlt_FIND_PACKAGE_ARGS gRPCAlt REQUIRED_VARS GRPCPP_IMPORTED_LOCATION + GRPC_CPP_PLUGIN) + if(gRPCAlt_VERSION) + list(APPEND gRPCAlt_FIND_PACKAGE_ARGS VERSION_VAR gRPCAlt_VERSION) endif() + find_package_handle_standard_args(${gRPCAlt_FIND_PACKAGE_ARGS}) +else() + set(gRPCAlt_FOUND FALSE) endif() -set(GRPC_ALT_FIND_PACKAGE_ARGS - gRPCAlt - REQUIRED_VARS - GRPC_INCLUDE_DIR - GRPC_GPR_LIB - GRPC_GRPC_LIB - GRPC_GRPCPP_LIB - GRPC_CPP_PLUGIN) -if(GRPC_ALT_VERSION) - list(APPEND GRPC_ALT_FIND_PACKAGE_ARGS VERSION_VAR GRPC_ALT_VERSION) -endif() -find_package_handle_standard_args(${GRPC_ALT_FIND_PACKAGE_ARGS}) - if(gRPCAlt_FOUND) - add_library(gRPC::gpr UNKNOWN IMPORTED) - set_target_properties(gRPC::gpr - PROPERTIES IMPORTED_LOCATION "${GRPC_GPR_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - - add_library(gRPC::grpc UNKNOWN IMPORTED) - set_target_properties( - gRPC::grpc - PROPERTIES IMPORTED_LOCATION - "${GRPC_GRPC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES - "${GRPC_INCLUDE_DIR}" - INTERFACE_LINK_LIBRARIES - "OpenSSL::SSL;OpenSSL::Crypto;ZLIB::ZLIB;c-ares::cares") - - set(_GRPCPP_LINK_LIBRARIES "gRPC::grpc;gRPC::gpr") - - if(GRPC_ADDRESS_SORTING_LIB) - # Address sorting is optional and not always required. 
- add_library(gRPC::address_sorting UNKNOWN IMPORTED) - set_target_properties(gRPC::address_sorting - PROPERTIES IMPORTED_LOCATION "${GRPC_ADDRESS_SORTING_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - set(_GRPCPP_LINK_LIBRARIES "${_GRPCPP_LINK_LIBRARIES};gRPC::address_sorting") - endif() - - if(GRPC_UPB_LIB) - # upb is used by recent gRPC versions - add_library(gRPC::upb UNKNOWN IMPORTED) - set_target_properties(gRPC::upb - PROPERTIES IMPORTED_LOCATION "${GRPC_UPB_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - set(_GRPCPP_LINK_LIBRARIES "${_GRPCPP_LINK_LIBRARIES};gRPC::upb") - endif() - - find_package(absl CONFIG) - if(absl_FOUND) - # Abseil libraries that recent gRPC versions depend on - set(_ABSL_LIBS - bad_optional_access - int128 - raw_logging_internal - str_format_internal - strings - throw_delegate - time - time_zone) - - foreach(_ABSL_LIB ${_ABSL_LIBS}) - set(_GRPCPP_LINK_LIBRARIES "${_GRPCPP_LINK_LIBRARIES};absl::${_ABSL_LIB}") - endforeach() - endif() - add_library(gRPC::grpc++ UNKNOWN IMPORTED) set_target_properties(gRPC::grpc++ - PROPERTIES IMPORTED_LOCATION - "${GRPC_GRPCPP_LIB}" - INTERFACE_LINK_LIBRARIES - "${_GRPCPP_LINK_LIBRARIES}" + PROPERTIES IMPORTED_LOCATION "${GRPCPP_IMPORTED_LOCATION}" + INTERFACE_COMPILE_OPTIONS "${GRPCPP_COMPILE_OPTIONS}" INTERFACE_INCLUDE_DIRECTORIES - "${GRPC_INCLUDE_DIR}") + "${GRPCPP_INCLUDE_DIRECTORIES}" + INTERFACE_LINK_LIBRARIES "${GRPCPP_LINK_LIBRARIES}" + INTERFACE_LINK_OPTIONS "${GRPCPP_LINK_OPTIONS}") add_executable(gRPC::grpc_cpp_plugin IMPORTED) - set_target_properties(gRPC::grpc_cpp_plugin - PROPERTIES IMPORTED_LOCATION ${GRPC_CPP_PLUGIN}) + set_target_properties(gRPC::grpc_cpp_plugin PROPERTIES IMPORTED_LOCATION + ${GRPC_CPP_PLUGIN}) endif() diff --git a/cpp/cmake_modules/Findre2Alt.cmake b/cpp/cmake_modules/Findre2Alt.cmake index 93b69ce77cb..68abf1b75fe 100644 --- a/cpp/cmake_modules/Findre2Alt.cmake +++ b/cpp/cmake_modules/Findre2Alt.cmake @@ -42,35 +42,37 @@ 
if(RE2_PC_FOUND) # On Fedora, the reported prefix is wrong. As users likely run into this, # workaround. # https://bugzilla.redhat.com/show_bug.cgi?id=1652589 - if(UNIX AND NOT APPLE AND NOT RE2_LIB) + if(UNIX + AND NOT APPLE + AND NOT RE2_LIB) if(RE2_PC_PREFIX STREQUAL "/usr/local") find_library(RE2_LIB re2) endif() endif() elseif(RE2_ROOT) - find_library( - RE2_LIB - NAMES - re2_static re2 - "${CMAKE_STATIC_LIBRARY_PREFIX}re2${RE2_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - "${CMAKE_SHARED_LIBRARY_PREFIX}re2${CMAKE_SHARED_LIBRARY_SUFFIX}" - PATHS ${RE2_ROOT} - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} - NO_DEFAULT_PATH) + find_library(RE2_LIB + NAMES re2_static + re2 + "${CMAKE_STATIC_LIBRARY_PREFIX}re2${RE2_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${CMAKE_SHARED_LIBRARY_PREFIX}re2${CMAKE_SHARED_LIBRARY_SUFFIX}" + PATHS ${RE2_ROOT} + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES} + NO_DEFAULT_PATH) find_path(RE2_INCLUDE_DIR NAMES re2/re2.h PATHS ${RE2_ROOT} NO_DEFAULT_PATH PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) else() - find_library( - RE2_LIB - NAMES - re2_static re2 - "${CMAKE_STATIC_LIBRARY_PREFIX}re2${RE2_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - "${CMAKE_SHARED_LIBRARY_PREFIX}re2${CMAKE_SHARED_LIBRARY_SUFFIX}" - PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_path(RE2_INCLUDE_DIR NAMES re2/re2.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_library(RE2_LIB + NAMES re2_static + re2 + "${CMAKE_STATIC_LIBRARY_PREFIX}re2${RE2_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" + "${CMAKE_SHARED_LIBRARY_PREFIX}re2${CMAKE_SHARED_LIBRARY_SUFFIX}" + PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) + find_path(RE2_INCLUDE_DIR + NAMES re2/re2.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() find_package_handle_standard_args(re2Alt REQUIRED_VARS RE2_LIB RE2_INCLUDE_DIR) diff --git a/cpp/cmake_modules/Findutf8proc.cmake b/cpp/cmake_modules/Findutf8proc.cmake index edea73b8dae..4d732f18694 100644 
--- a/cpp/cmake_modules/Findutf8proc.cmake +++ b/cpp/cmake_modules/Findutf8proc.cmake @@ -15,14 +15,41 @@ # specific language governing permissions and limitations # under the License. +function(extract_utf8proc_version) + if(utf8proc_INCLUDE_DIR) + file(READ "${utf8proc_INCLUDE_DIR}/utf8proc.h" UTF8PROC_H_CONTENT) + + string(REGEX MATCH "#define UTF8PROC_VERSION_MAJOR [0-9]+" + UTF8PROC_MAJOR_VERSION_DEFINITION "${UTF8PROC_H_CONTENT}") + string(REGEX MATCH "#define UTF8PROC_VERSION_MINOR [0-9]+" + UTF8PROC_MINOR_VERSION_DEFINITION "${UTF8PROC_H_CONTENT}") + string(REGEX MATCH "#define UTF8PROC_VERSION_PATCH [0-9]+" + UTF8PROC_PATCH_VERSION_DEFINITION "${UTF8PROC_H_CONTENT}") + + string(REGEX MATCH "[0-9]+$" UTF8PROC_MAJOR_VERSION + "${UTF8PROC_MAJOR_VERSION_DEFINITION}") + string(REGEX MATCH "[0-9]+$" UTF8PROC_MINOR_VERSION + "${UTF8PROC_MINOR_VERSION_DEFINITION}") + string(REGEX MATCH "[0-9]+$" UTF8PROC_PATCH_VERSION + "${UTF8PROC_PATCH_VERSION_DEFINITION}") + set(utf8proc_VERSION + "${UTF8PROC_MAJOR_VERSION}.${UTF8PROC_MINOR_VERSION}.${UTF8PROC_PATCH_VERSION}" + PARENT_SCOPE) + else() + set(utf8proc_VERSION + "" + PARENT_SCOPE) + endif() +endfunction(extract_utf8proc_version) + if(ARROW_UTF8PROC_USE_SHARED) set(utf8proc_LIB_NAMES) if(CMAKE_IMPORT_LIBRARY_SUFFIX) list(APPEND utf8proc_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}utf8proc${CMAKE_IMPORT_LIBRARY_SUFFIX}") + "${CMAKE_IMPORT_LIBRARY_PREFIX}utf8proc${CMAKE_IMPORT_LIBRARY_SUFFIX}") endif() list(APPEND utf8proc_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}utf8proc${CMAKE_SHARED_LIBRARY_SUFFIX}") + "${CMAKE_SHARED_LIBRARY_PREFIX}utf8proc${CMAKE_SHARED_LIBRARY_SUFFIX}") else() if(MSVC AND NOT DEFINED utf8proc_MSVC_STATIC_LIB_SUFFIX) set(utf8proc_MSVC_STATIC_LIB_SUFFIX "_static") @@ -44,6 +71,7 @@ if(utf8proc_ROOT) PATHS ${utf8proc_ROOT} NO_DEFAULT_PATH PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + extract_utf8proc_version() else() find_library(utf8proc_LIB NAMES ${utf8proc_LIB_NAMES} @@ -51,20 +79,23 @@ 
else() find_path(utf8proc_INCLUDE_DIR NAMES utf8proc.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + extract_utf8proc_version() endif() -find_package_handle_standard_args(utf8proc REQUIRED_VARS utf8proc_LIB - utf8proc_INCLUDE_DIR) +find_package_handle_standard_args( + utf8proc + REQUIRED_VARS utf8proc_LIB utf8proc_INCLUDE_DIR + VERSION_VAR utf8proc_VERSION) if(utf8proc_FOUND) set(utf8proc_FOUND TRUE) add_library(utf8proc::utf8proc UNKNOWN IMPORTED) - set_target_properties( - utf8proc::utf8proc - PROPERTIES IMPORTED_LOCATION "${utf8proc_LIB}" INTERFACE_INCLUDE_DIRECTORIES - "${utf8proc_INCLUDE_DIR}") + set_target_properties(utf8proc::utf8proc + PROPERTIES IMPORTED_LOCATION "${utf8proc_LIB}" + INTERFACE_INCLUDE_DIRECTORIES + "${utf8proc_INCLUDE_DIR}") if(NOT ARROW_UTF8PROC_USE_SHARED) - set_target_properties(utf8proc::utf8proc - PROPERTIES INTERFACE_COMPILER_DEFINITIONS "UTF8PROC_STATIC") + set_target_properties(utf8proc::utf8proc PROPERTIES INTERFACE_COMPILER_DEFINITIONS + "UTF8PROC_STATIC") endif() endif() diff --git a/cpp/cmake_modules/Findzstd.cmake b/cpp/cmake_modules/Findzstd.cmake index f32892aecb8..3fc14ec0d72 100644 --- a/cpp/cmake_modules/Findzstd.cmake +++ b/cpp/cmake_modules/Findzstd.cmake @@ -23,16 +23,14 @@ set(ZSTD_LIB_NAME_BASE "${ZSTD_MSVC_LIB_PREFIX}zstd") if(ARROW_ZSTD_USE_SHARED) set(ZSTD_LIB_NAMES) if(CMAKE_IMPORT_LIBRARY_SUFFIX) - list( - APPEND - ZSTD_LIB_NAMES - "${CMAKE_IMPORT_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" - ) + list(APPEND + ZSTD_LIB_NAMES + "${CMAKE_IMPORT_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${CMAKE_IMPORT_LIBRARY_SUFFIX}" + ) endif() - list( - APPEND - ZSTD_LIB_NAMES - "${CMAKE_SHARED_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}") + list(APPEND ZSTD_LIB_NAMES + "${CMAKE_SHARED_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}" + ) else() if(MSVC AND NOT DEFINED ZSTD_MSVC_STATIC_LIB_SUFFIX) set(ZSTD_MSVC_STATIC_LIB_SUFFIX "_static") @@ -43,7 +41,7 @@ else() 
"${CMAKE_STATIC_LIBRARY_PREFIX}${ZSTD_LIB_NAME_BASE}${ZSTD_STATIC_LIB_SUFFIX}") endif() -# First, find via if specified ZTD_ROOT +# First, find via if specified ZSTD_ROOT if(ZSTD_ROOT) message(STATUS "Using ZSTD_ROOT: ${ZSTD_ROOT}") find_library(ZSTD_LIB @@ -75,7 +73,9 @@ else() find_library(ZSTD_LIB NAMES ${ZSTD_LIB_NAMES} PATH_SUFFIXES ${ARROW_LIBRARY_PATH_SUFFIXES}) - find_path(ZSTD_INCLUDE_DIR NAMES zstd.h PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) + find_path(ZSTD_INCLUDE_DIR + NAMES zstd.h + PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() endif() diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 9f68c560472..86c6e9706e0 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -28,7 +28,7 @@ if(NOT DEFINED ARROW_CPU_FLAG) set(ARROW_CPU_FLAG "armv8") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv7") set(ARROW_CPU_FLAG "armv7") - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "powerpc|ppc") set(ARROW_CPU_FLAG "ppc") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") set(ARROW_CPU_FLAG "s390x") @@ -76,12 +76,13 @@ if(ARROW_CPU_FLAG STREQUAL "x86") char out[32]; _mm512_storeu_si512(out, mask); return 0; - }" CXX_SUPPORTS_AVX512) + }" + CXX_SUPPORTS_AVX512) set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) endif() # Runtime SIMD level it can get from compiler and ARROW_RUNTIME_SIMD_LEVEL - if(CXX_SUPPORTS_SSE4_2 - AND ARROW_RUNTIME_SIMD_LEVEL MATCHES "^(SSE4_2|AVX2|AVX512|MAX)$") + if(CXX_SUPPORTS_SSE4_2 AND ARROW_RUNTIME_SIMD_LEVEL MATCHES + "^(SSE4_2|AVX2|AVX512|MAX)$") set(ARROW_HAVE_RUNTIME_SSE4_2 ON) add_definitions(-DARROW_HAVE_RUNTIME_SSE4_2) endif() @@ -252,30 +253,26 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /wd4365") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /wd4267") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /wd4838") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" - OR 
CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wextra") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdocumentation") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-missing-braces") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-parameter") - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unknown-warning-option") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-constant-logical-operand") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-conversion") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated-declarations") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-sign-conversion") - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-variable") elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") if(WIN32) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wno-deprecated") - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wno-unused-variable") else() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated") - set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-variable") endif() else() message(FATAL_ERROR "${UNKNOWN_COMPILER_MESSAGE}") @@ -289,8 +286,8 @@ elseif("${BUILD_WARNING_LEVEL}" STREQUAL "EVERYTHING") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /Wall") # https://docs.microsoft.com/en-us/cpp/build/reference/compiler-option-warning-level # /wdnnnn disables a warning where "nnnn" is a warning number - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" - OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Weverything") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} 
-Wno-c++98-compat") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-c++98-compat-pedantic") @@ -344,9 +341,10 @@ if(MSVC) # Disable "switch statement contains 'default' but no 'case' labels" warning # (required for protobuf, see https://github.com/protocolbuffers/protobuf/issues/6885) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} /wd4065") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" - OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "7.0") + if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "7.0" OR CMAKE_CXX_COMPILER_VERSION + VERSION_GREATER "7.0") # Without this, gcc >= 7 warns related to changes in C++17 set(CXX_ONLY_FLAGS "${CXX_ONLY_FLAGS} -Wno-noexcept-type") endif() @@ -373,8 +371,8 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(CXX_ONLY_FLAGS "${CXX_ONLY_FLAGS} -Wno-subobject-linkage") endif() -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" - OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") # Clang options for all builds # Using Clang with ccache causes a bunch of spurious warnings that are @@ -385,7 +383,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Qunused-arguments") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") - # Avoid clang error when an unknown warning flag is passed + # Avoid error when an unknown warning flag is passed set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unknown-warning-option") # Add colors when paired with ninja set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics") @@ -443,28 +441,31 @@ if(ARROW_CPU_FLAG STREQUAL "ppc") endif() if(ARROW_CPU_FLAG STREQUAL "armv8") - if(NOT CXX_SUPPORTS_ARMV8_ARCH) - message(FATAL_ERROR "Unsupported arch flag: ${ARROW_ARMV8_ARCH_FLAG}.") - endif() - if(ARROW_ARMV8_ARCH_FLAG MATCHES "native") - message(FATAL_ERROR "native arch not allowed, please specify arch explicitly.") - endif() - 
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}") - if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") - add_definitions(-DARROW_HAVE_NEON) - endif() + set(ARROW_HAVE_NEON ON) - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" - AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4") - message(WARNING "Disable Armv8 CRC and Crypto as compiler doesn't support them well.") - else() - if(ARROW_ARMV8_ARCH_FLAG MATCHES "\\+crypto") - add_definitions(-DARROW_HAVE_ARMV8_CRYPTO) + if(NOT CXX_SUPPORTS_ARMV8_ARCH) + message(FATAL_ERROR "Unsupported arch flag: ${ARROW_ARMV8_ARCH_FLAG}.") + endif() + if(ARROW_ARMV8_ARCH_FLAG MATCHES "native") + message(FATAL_ERROR "native arch not allowed, please specify arch explicitly.") endif() - # armv8.1+ implies crc support - if(ARROW_ARMV8_ARCH_FLAG MATCHES "armv8\\.[1-9]|\\+crc") - add_definitions(-DARROW_HAVE_ARMV8_CRC) + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}") + + add_definitions(-DARROW_HAVE_NEON) + + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS + "5.4") + message(WARNING "Disable Armv8 CRC and Crypto as compiler doesn't support them well." + ) + else() + if(ARROW_ARMV8_ARCH_FLAG MATCHES "\\+crypto") + add_definitions(-DARROW_HAVE_ARMV8_CRYPTO) + endif() + # armv8.1+ implies crc support + if(ARROW_ARMV8_ARCH_FLAG MATCHES "armv8\\.[1-9]|\\+crc") + add_definitions(-DARROW_HAVE_ARMV8_CRC) + endif() endif() endif() endif() @@ -493,7 +494,9 @@ function(GET_GOLD_VERSION) message(SEND_ERROR "Could not extract GNU gold version. " "Linker version output: ${LINKER_OUTPUT}") endif() - set(GOLD_VERSION "${CMAKE_MATCH_1}" PARENT_SCOPE) + set(GOLD_VERSION + "${CMAKE_MATCH_1}" + PARENT_SCOPE) endif() endfunction() @@ -590,9 +593,8 @@ set(CXX_FLAGS_PROFILE_GEN "${CXX_FLAGS_RELEASE} -fprofile-generate") set(CXX_FLAGS_PROFILE_BUILD "${CXX_FLAGS_RELEASE} -fprofile-use") # Set compile flags based on the build type. 
-message( - "Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})" - ) +message("Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})" +) if("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${C_FLAGS_DEBUG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_DEBUG}") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 9f240e448f6..0631d277b08 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -196,6 +196,7 @@ endmacro() macro(resolve_dependency DEPENDENCY_NAME) set(options) set(one_value_args HAVE_ALT IS_RUNTIME_DEPENDENCY REQUIRED_VERSION USE_CONFIG) + set(multi_value_args PC_PACKAGE_NAMES) cmake_parse_arguments(ARG "${options}" "${one_value_args}" @@ -236,6 +237,17 @@ macro(resolve_dependency DEPENDENCY_NAME) if(${DEPENDENCY_NAME}_SOURCE STREQUAL "SYSTEM" AND ARG_IS_RUNTIME_DEPENDENCY) provide_find_module(${PACKAGE_NAME}) list(APPEND ARROW_SYSTEM_DEPENDENCIES ${PACKAGE_NAME}) + find_package(PkgConfig QUIET) + foreach(ARG_PC_PACKAGE_NAME ${ARG_PC_PACKAGE_NAMES}) + pkg_check_modules(${ARG_PC_PACKAGE_NAME}_PC + ${ARG_PC_PACKAGE_NAME} + NO_CMAKE_PATH + NO_CMAKE_ENVIRONMENT_PATH + QUIET) + if(${${ARG_PC_PACKAGE_NAME}_PC_FOUND}) + string(APPEND ARROW_PC_REQUIRES_PRIVATE " ${ARG_PC_PACKAGE_NAME}") + endif() + endforeach() endif() endmacro() @@ -279,7 +291,9 @@ if(ARROW_JSON) set(ARROW_WITH_RAPIDJSON ON) endif() -if(ARROW_ORC OR ARROW_FLIGHT OR ARROW_GANDIVA) +if(ARROW_ORC + OR ARROW_FLIGHT + OR ARROW_GANDIVA) set(ARROW_WITH_PROTOBUF ON) endif() @@ -287,11 +301,13 @@ if(ARROW_S3) set(ARROW_WITH_ZLIB ON) endif() -if(NOT ARROW_COMPUTE) - # utf8proc is only potentially used in kernels for now +if((NOT ARROW_COMPUTE) AND (NOT ARROW_GANDIVA)) set(ARROW_WITH_UTF8PROC OFF) endif() -if((NOT ARROW_COMPUTE) AND (NOT ARROW_GANDIVA) AND (NOT 
ARROW_WITH_GRPC)) + +if((NOT ARROW_COMPUTE) + AND (NOT ARROW_GANDIVA) + AND (NOT ARROW_WITH_GRPC)) set(ARROW_WITH_RE2 OFF) endif() @@ -313,9 +329,8 @@ endmacro() file(STRINGS "${THIRDPARTY_DIR}/versions.txt" TOOLCHAIN_VERSIONS_TXT) foreach(_VERSION_ENTRY ${TOOLCHAIN_VERSIONS_TXT}) # Exclude comments - if(NOT - ((_VERSION_ENTRY MATCHES "^[^#][A-Za-z0-9-_]+_VERSION=") - OR (_VERSION_ENTRY MATCHES "^[^#][A-Za-z0-9-_]+_CHECKSUM="))) + if(NOT ((_VERSION_ENTRY MATCHES "^[^#][A-Za-z0-9-_]+_VERSION=") + OR (_VERSION_ENTRY MATCHES "^[^#][A-Za-z0-9-_]+_CHECKSUM="))) continue() endif() @@ -336,46 +351,42 @@ endforeach() if(DEFINED ENV{ARROW_ABSL_URL}) set(ABSL_SOURCE_URL "$ENV{ARROW_ABSL_URL}") else() - set_urls( - ABSL_SOURCE_URL - "https://github.com/abseil/abseil-cpp/archive/${ARROW_ABSL_BUILD_VERSION}.tar.gz") + set_urls(ABSL_SOURCE_URL + "https://github.com/abseil/abseil-cpp/archive/${ARROW_ABSL_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_AWS_C_COMMON_URL}) set(AWS_C_COMMON_SOURCE_URL "$ENV{ARROW_AWS_C_COMMON_URL}") else() - set_urls( - AWS_C_COMMON_SOURCE_URL - "https://github.com/awslabs/aws-c-common/archive/${ARROW_AWS_C_COMMON_BUILD_VERSION}.tar.gz" - ) + set_urls(AWS_C_COMMON_SOURCE_URL + "https://github.com/awslabs/aws-c-common/archive/${ARROW_AWS_C_COMMON_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_AWS_CHECKSUMS_URL}) set(AWS_CHECKSUMS_SOURCE_URL "$ENV{ARROW_AWS_CHECKSUMS_URL}") else() - set_urls( - AWS_CHECKSUMS_SOURCE_URL - "https://github.com/awslabs/aws-checksums/archive/${ARROW_AWS_CHECKSUMS_BUILD_VERSION}.tar.gz" - ) + set_urls(AWS_CHECKSUMS_SOURCE_URL + "https://github.com/awslabs/aws-checksums/archive/${ARROW_AWS_CHECKSUMS_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_AWS_C_EVENT_STREAM_URL}) set(AWS_C_EVENT_STREAM_SOURCE_URL "$ENV{ARROW_AWS_C_EVENT_STREAM_URL}") else() - set_urls( - AWS_C_EVENT_STREAM_SOURCE_URL - "https://github.com/awslabs/aws-c-event-stream/archive/${ARROW_AWS_C_EVENT_STREAM_BUILD_VERSION}.tar.gz" - ) 
+ set_urls(AWS_C_EVENT_STREAM_SOURCE_URL + "https://github.com/awslabs/aws-c-event-stream/archive/${ARROW_AWS_C_EVENT_STREAM_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_AWSSDK_URL}) set(AWSSDK_SOURCE_URL "$ENV{ARROW_AWSSDK_URL}") else() - set_urls( - AWSSDK_SOURCE_URL - "https://github.com/aws/aws-sdk-cpp/archive/${ARROW_AWSSDK_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz" - ) + set_urls(AWSSDK_SOURCE_URL + "https://github.com/aws/aws-sdk-cpp/archive/${ARROW_AWSSDK_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_BOOST_URL}) @@ -383,136 +394,125 @@ if(DEFINED ENV{ARROW_BOOST_URL}) else() string(REPLACE "." "_" ARROW_BOOST_BUILD_VERSION_UNDERSCORES ${ARROW_BOOST_BUILD_VERSION}) - set_urls( - BOOST_SOURCE_URL - # These are trimmed boost bundles we maintain. - # See cpp/build-support/trim-boost.sh - # FIXME(ARROW-6407) automate uploading this archive to ensure it reflects - # our currently used packages and doesn't fall out of sync with - # ${ARROW_BOOST_BUILD_VERSION_UNDERSCORES} - "https://github.com/ursa-labs/thirdparty/releases/download/latest/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" - "https://sourceforge.net/projects/boost/files/boost/${ARROW_BOOST_BUILD_VERSION}/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" - "https://github.com/boostorg/boost/archive/boost-${ARROW_BOOST_BUILD_VERSION}.tar.gz") + set_urls(BOOST_SOURCE_URL + # These are trimmed boost bundles we maintain. 
+ # See cpp/build-support/trim-boost.sh + # FIXME(ARROW-6407) automate uploading this archive to ensure it reflects + # our currently used packages and doesn't fall out of sync with + # ${ARROW_BOOST_BUILD_VERSION_UNDERSCORES} + "https://github.com/ursa-labs/thirdparty/releases/download/latest/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" + "https://sourceforge.net/projects/boost/files/boost/${ARROW_BOOST_BUILD_VERSION}/boost_${ARROW_BOOST_BUILD_VERSION_UNDERSCORES}.tar.gz" + "https://github.com/boostorg/boost/archive/boost-${ARROW_BOOST_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_BROTLI_URL}) set(BROTLI_SOURCE_URL "$ENV{ARROW_BROTLI_URL}") else() - set_urls( - BROTLI_SOURCE_URL - "https://github.com/google/brotli/archive/${ARROW_BROTLI_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/brotli-${ARROW_BROTLI_BUILD_VERSION}.tar.gz" - ) + set_urls(BROTLI_SOURCE_URL + "https://github.com/google/brotli/archive/${ARROW_BROTLI_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/brotli-${ARROW_BROTLI_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_BZIP2_URL}) set(ARROW_BZIP2_SOURCE_URL "$ENV{ARROW_BZIP2_URL}") else() - set_urls( - ARROW_BZIP2_SOURCE_URL - "https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" - ) + set_urls(ARROW_BZIP2_SOURCE_URL + "https://sourceware.org/pub/bzip2/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/bzip2-${ARROW_BZIP2_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_CARES_URL}) set(CARES_SOURCE_URL "$ENV{ARROW_CARES_URL}") else() - set_urls( - CARES_SOURCE_URL - "https://c-ares.haxx.se/download/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" - 
"https://github.com/ursa-labs/thirdparty/releases/download/latest/cares-${ARROW_CARES_BUILD_VERSION}.tar.gz" - ) + set_urls(CARES_SOURCE_URL + "https://c-ares.haxx.se/download/c-ares-${ARROW_CARES_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/cares-${ARROW_CARES_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_GBENCHMARK_URL}) set(GBENCHMARK_SOURCE_URL "$ENV{ARROW_GBENCHMARK_URL}") else() - set_urls( - GBENCHMARK_SOURCE_URL - "https://github.com/google/benchmark/archive/${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/gbenchmark-${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz" - ) + set_urls(GBENCHMARK_SOURCE_URL + "https://github.com/google/benchmark/archive/${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/gbenchmark-${ARROW_GBENCHMARK_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_GFLAGS_URL}) set(GFLAGS_SOURCE_URL "$ENV{ARROW_GFLAGS_URL}") else() - set_urls( - GFLAGS_SOURCE_URL - "https://github.com/gflags/gflags/archive/${ARROW_GFLAGS_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/gflags-${ARROW_GFLAGS_BUILD_VERSION}.tar.gz" - ) + set_urls(GFLAGS_SOURCE_URL + "https://github.com/gflags/gflags/archive/${ARROW_GFLAGS_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/gflags-${ARROW_GFLAGS_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_GLOG_URL}) set(GLOG_SOURCE_URL "$ENV{ARROW_GLOG_URL}") else() - set_urls( - GLOG_SOURCE_URL - "https://github.com/google/glog/archive/${ARROW_GLOG_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/glog-${ARROW_GLOG_BUILD_VERSION}.tar.gz" - ) + set_urls(GLOG_SOURCE_URL + "https://github.com/google/glog/archive/${ARROW_GLOG_BUILD_VERSION}.tar.gz" + 
"https://github.com/ursa-labs/thirdparty/releases/download/latest/glog-${ARROW_GLOG_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_GRPC_URL}) set(GRPC_SOURCE_URL "$ENV{ARROW_GRPC_URL}") else() - set_urls( - GRPC_SOURCE_URL - "https://github.com/grpc/grpc/archive/${ARROW_GRPC_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/grpc-${ARROW_GRPC_BUILD_VERSION}.tar.gz" - ) + set_urls(GRPC_SOURCE_URL + "https://github.com/grpc/grpc/archive/${ARROW_GRPC_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/grpc-${ARROW_GRPC_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_GTEST_URL}) set(GTEST_SOURCE_URL "$ENV{ARROW_GTEST_URL}") else() - set_urls( - GTEST_SOURCE_URL - "https://github.com/google/googletest/archive/release-${ARROW_GTEST_BUILD_VERSION}.tar.gz" - "https://chromium.googlesource.com/external/github.com/google/googletest/+archive/release-${ARROW_GTEST_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/gtest-${ARROW_GTEST_BUILD_VERSION}.tar.gz" - ) + set_urls(GTEST_SOURCE_URL + "https://github.com/google/googletest/archive/release-${ARROW_GTEST_BUILD_VERSION}.tar.gz" + "https://chromium.googlesource.com/external/github.com/google/googletest/+archive/release-${ARROW_GTEST_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/gtest-${ARROW_GTEST_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_JEMALLOC_URL}) set(JEMALLOC_SOURCE_URL "$ENV{ARROW_JEMALLOC_URL}") else() - set_urls( - JEMALLOC_SOURCE_URL - "https://github.com/jemalloc/jemalloc/releases/download/${ARROW_JEMALLOC_BUILD_VERSION}/jemalloc-${ARROW_JEMALLOC_BUILD_VERSION}.tar.bz2" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/jemalloc-${ARROW_JEMALLOC_BUILD_VERSION}.tar.bz2" - ) + set_urls(JEMALLOC_SOURCE_URL + 
"https://github.com/jemalloc/jemalloc/releases/download/${ARROW_JEMALLOC_BUILD_VERSION}/jemalloc-${ARROW_JEMALLOC_BUILD_VERSION}.tar.bz2" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/jemalloc-${ARROW_JEMALLOC_BUILD_VERSION}.tar.bz2" + ) endif() if(DEFINED ENV{ARROW_MIMALLOC_URL}) set(MIMALLOC_SOURCE_URL "$ENV{ARROW_MIMALLOC_URL}") else() - set_urls( - MIMALLOC_SOURCE_URL - "https://github.com/microsoft/mimalloc/archive/${ARROW_MIMALLOC_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/mimalloc-${ARROW_MIMALLOC_BUILD_VERSION}.tar.gz" - ) + set_urls(MIMALLOC_SOURCE_URL + "https://github.com/microsoft/mimalloc/archive/${ARROW_MIMALLOC_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/mimalloc-${ARROW_MIMALLOC_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_LZ4_URL}) set(LZ4_SOURCE_URL "$ENV{ARROW_LZ4_URL}") else() - set_urls( - LZ4_SOURCE_URL "https://github.com/lz4/lz4/archive/${ARROW_LZ4_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/lz4-${ARROW_LZ4_BUILD_VERSION}.tar.gz" - ) + set_urls(LZ4_SOURCE_URL + "https://github.com/lz4/lz4/archive/${ARROW_LZ4_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/lz4-${ARROW_LZ4_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_ORC_URL}) set(ORC_SOURCE_URL "$ENV{ARROW_ORC_URL}") else() - set_urls( - ORC_SOURCE_URL - "https://github.com/apache/orc/archive/rel/release-${ARROW_ORC_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/orc-${ARROW_ORC_BUILD_VERSION}.tar.gz" - ) + set_urls(ORC_SOURCE_URL + "https://github.com/apache/orc/archive/rel/release-${ARROW_ORC_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/orc-${ARROW_ORC_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_PROTOBUF_URL}) @@ -521,109 +521,101 @@ else() string(SUBSTRING 
${ARROW_PROTOBUF_BUILD_VERSION} 1 -1 ARROW_PROTOBUF_STRIPPED_BUILD_VERSION) # strip the leading `v` - set_urls( - PROTOBUF_SOURCE_URL - "https://github.com/protocolbuffers/protobuf/releases/download/${ARROW_PROTOBUF_BUILD_VERSION}/protobuf-all-${ARROW_PROTOBUF_STRIPPED_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/protobuf-${ARROW_PROTOBUF_BUILD_VERSION}.tar.gz" - ) + set_urls(PROTOBUF_SOURCE_URL + "https://github.com/protocolbuffers/protobuf/releases/download/${ARROW_PROTOBUF_BUILD_VERSION}/protobuf-all-${ARROW_PROTOBUF_STRIPPED_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/protobuf-${ARROW_PROTOBUF_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_RE2_URL}) set(RE2_SOURCE_URL "$ENV{ARROW_RE2_URL}") else() - set_urls( - RE2_SOURCE_URL - "https://github.com/google/re2/archive/${ARROW_RE2_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/re2-${ARROW_RE2_BUILD_VERSION}.tar.gz" - ) + set_urls(RE2_SOURCE_URL + "https://github.com/google/re2/archive/${ARROW_RE2_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/re2-${ARROW_RE2_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_RAPIDJSON_URL}) set(RAPIDJSON_SOURCE_URL "$ENV{ARROW_RAPIDJSON_URL}") else() - set_urls( - RAPIDJSON_SOURCE_URL - "https://github.com/miloyip/rapidjson/archive/${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/rapidjson-${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz" - ) + set_urls(RAPIDJSON_SOURCE_URL + "https://github.com/miloyip/rapidjson/archive/${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/rapidjson-${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_SNAPPY_URL}) set(SNAPPY_SOURCE_URL "$ENV{ARROW_SNAPPY_URL}") else() - set_urls( - SNAPPY_SOURCE_URL - 
"https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" - ) + set_urls(SNAPPY_SOURCE_URL + "https://github.com/google/snappy/archive/${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/snappy-${ARROW_SNAPPY_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_THRIFT_URL}) set(THRIFT_SOURCE_URL "$ENV{ARROW_THRIFT_URL}") else() - set_urls( - THRIFT_SOURCE_URL - "http://www.apache.org/dyn/closer.cgi?action=download&filename=/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://downloads.apache.org/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://github.com/apache/thrift/archive/v${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://apache.claz.org/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://apache.cs.utah.edu/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://apache.mirrors.lucidnetworks.net/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://apache.osuosl.org/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://ftp.wayne.edu/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://mirror.olnevhost.net/pub/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://mirrors.gigenet.com/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://mirrors.koehn.com/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://mirrors.ocf.berkeley.edu/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - 
"https://mirrors.sonic.net/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://us.mirrors.quenda.co/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" - ) + set_urls(THRIFT_SOURCE_URL + "http://www.apache.org/dyn/closer.cgi?action=download&filename=/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://downloads.apache.org/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://github.com/apache/thrift/archive/v${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://apache.claz.org/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://apache.cs.utah.edu/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://apache.mirrors.lucidnetworks.net/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://apache.osuosl.org/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://ftp.wayne.edu/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://mirror.olnevhost.net/pub/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://mirrors.gigenet.com/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://mirrors.koehn.com/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://mirrors.ocf.berkeley.edu/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://mirrors.sonic.net/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + 
"https://us.mirrors.quenda.co/apache/thrift/${ARROW_THRIFT_BUILD_VERSION}/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/thrift-${ARROW_THRIFT_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_UTF8PROC_URL}) set(ARROW_UTF8PROC_SOURCE_URL "$ENV{ARROW_UTF8PROC_URL}") else() - set_urls( - ARROW_UTF8PROC_SOURCE_URL - "https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" - ) + set_urls(ARROW_UTF8PROC_SOURCE_URL + "https://github.com/JuliaStrings/utf8proc/archive/${ARROW_UTF8PROC_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_XSIMD_URL}) set(XSIMD_SOURCE_URL "$ENV{ARROW_XSIMD_URL}") else() - set_urls( - XSIMD_SOURCE_URL - "https://github.com/xtensor-stack/xsimd/archive/${ARROW_XSIMD_BUILD_VERSION}.tar.gz") + set_urls(XSIMD_SOURCE_URL + "https://github.com/xtensor-stack/xsimd/archive/${ARROW_XSIMD_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_ZLIB_URL}) set(ZLIB_SOURCE_URL "$ENV{ARROW_ZLIB_URL}") else() - set_urls( - ZLIB_SOURCE_URL "https://zlib.net/fossils/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" - ) + set_urls(ZLIB_SOURCE_URL + "https://zlib.net/fossils/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/zlib-${ARROW_ZLIB_BUILD_VERSION}.tar.gz" + ) endif() if(DEFINED ENV{ARROW_ZSTD_URL}) set(ZSTD_SOURCE_URL "$ENV{ARROW_ZSTD_URL}") else() - set_urls( - ZSTD_SOURCE_URL - "https://github.com/facebook/zstd/archive/${ARROW_ZSTD_BUILD_VERSION}.tar.gz" - "https://github.com/ursa-labs/thirdparty/releases/download/latest/zstd-${ARROW_ZSTD_BUILD_VERSION}.tar.gz" - ) + set_urls(ZSTD_SOURCE_URL + "https://github.com/facebook/zstd/archive/${ARROW_ZSTD_BUILD_VERSION}.tar.gz" + "https://github.com/ursa-labs/thirdparty/releases/download/latest/zstd-${ARROW_ZSTD_BUILD_VERSION}.tar.gz" + ) endif() # 
---------------------------------------------------------------------- # ExternalProject options -set( - EP_CXX_FLAGS - "${CMAKE_CXX_COMPILER_ARG1} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}" - ) +set(EP_CXX_FLAGS + "${CMAKE_CXX_COMPILER_ARG1} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${UPPERCASE_BUILD_TYPE}}" +) set(EP_C_FLAGS "${CMAKE_C_COMPILER_ARG1} ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${UPPERCASE_BUILD_TYPE}}") @@ -730,12 +722,12 @@ macro(build_boost) set(BOOST_BUILD_WITH_LIBRARIES "filesystem" "system") string(REPLACE ";" "," BOOST_CONFIGURE_LIBRARIES "${BOOST_BUILD_WITH_LIBRARIES}") list(APPEND BOOST_CONFIGURE_COMMAND "--prefix=${BOOST_PREFIX}" - "--with-libraries=${BOOST_CONFIGURE_LIBRARIES}") + "--with-libraries=${BOOST_CONFIGURE_LIBRARIES}") set(BOOST_BUILD_COMMAND "./b2" "-j${NPROC}" "link=${BOOST_BUILD_LINK}" "variant=${BOOST_BUILD_VARIANT}") if(MSVC) - string(REGEX - REPLACE "([0-9])$" ".\\1" BOOST_TOOLSET_MSVC_VERSION ${MSVC_TOOLSET_VERSION}) + string(REGEX REPLACE "([0-9])$" ".\\1" BOOST_TOOLSET_MSVC_VERSION + ${MSVC_TOOLSET_VERSION}) list(APPEND BOOST_BUILD_COMMAND "toolset=msvc-${BOOST_TOOLSET_MSVC_VERSION}") set(BOOST_BUILD_WITH_LIBRARIES_MSVC) foreach(_BOOST_LIB ${BOOST_BUILD_WITH_LIBRARIES}) @@ -760,14 +752,12 @@ macro(build_boost) else() set(BOOST_LIBRARY_SUFFIX "") endif() - set( - BOOST_STATIC_SYSTEM_LIBRARY - "${BOOST_LIB_DIR}/libboost_system${BOOST_LIBRARY_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set( - BOOST_STATIC_FILESYSTEM_LIBRARY - "${BOOST_LIB_DIR}/libboost_filesystem${BOOST_LIBRARY_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(BOOST_STATIC_SYSTEM_LIBRARY + "${BOOST_LIB_DIR}/libboost_system${BOOST_LIBRARY_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(BOOST_STATIC_FILESYSTEM_LIBRARY + "${BOOST_LIB_DIR}/libboost_filesystem${BOOST_LIBRARY_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) set(BOOST_BUILD_PRODUCTS 
${BOOST_STATIC_SYSTEM_LIBRARY} @@ -925,14 +915,13 @@ macro(build_snappy) message(STATUS "Building snappy from source") set(SNAPPY_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/snappy_ep/src/snappy_ep-install") set(SNAPPY_STATIC_LIB_NAME snappy) - set( - SNAPPY_STATIC_LIB - "${SNAPPY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(SNAPPY_STATIC_LIB + "${SNAPPY_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}${SNAPPY_STATIC_LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) - set(SNAPPY_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} -DCMAKE_INSTALL_LIBDIR=lib - -DSNAPPY_BUILD_TESTS=OFF - "-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}") + set(SNAPPY_CMAKE_ARGS + ${EP_COMMON_CMAKE_ARGS} -DCMAKE_INSTALL_LIBDIR=lib -DSNAPPY_BUILD_TESTS=OFF + "-DCMAKE_INSTALL_PREFIX=${SNAPPY_PREFIX}") externalproject_add(snappy_ep ${EP_LOG_OPTIONS} @@ -956,7 +945,11 @@ macro(build_snappy) endmacro() if(ARROW_WITH_SNAPPY) - resolve_dependency(Snappy) + resolve_dependency(Snappy PC_PACKAGE_NAMES snappy) + if(${Snappy_SOURCE} STREQUAL "SYSTEM" AND NOT snappy_PC_FOUND) + get_target_property(SNAPPY_LIB Snappy::snappy IMPORTED_LOCATION) + string(APPEND ARROW_PC_LIBS_PRIVATE " ${SNAPPY_LIB}") + endif() # TODO: Don't use global includes but rather target_include_directories get_target_property(SNAPPY_INCLUDE_DIRS Snappy::snappy INTERFACE_INCLUDE_DIRECTORIES) include_directories(SYSTEM ${SNAPPY_INCLUDE_DIRS}) @@ -970,18 +963,15 @@ macro(build_brotli) set(BROTLI_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/brotli_ep/src/brotli_ep-install") set(BROTLI_INCLUDE_DIR "${BROTLI_PREFIX}/include") set(BROTLI_LIB_DIR lib) - set( - BROTLI_STATIC_LIBRARY_ENC - "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set( - BROTLI_STATIC_LIBRARY_DEC - "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlidec-static${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set( - BROTLI_STATIC_LIBRARY_COMMON - 
"${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlicommon-static${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(BROTLI_STATIC_LIBRARY_ENC + "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlienc-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(BROTLI_STATIC_LIBRARY_DEC + "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlidec-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(BROTLI_STATIC_LIBRARY_COMMON + "${BROTLI_PREFIX}/${BROTLI_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}brotlicommon-static${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(BROTLI_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${BROTLI_PREFIX}" -DCMAKE_INSTALL_LIBDIR=${BROTLI_LIB_DIR}) @@ -1016,12 +1006,15 @@ macro(build_brotli) INTERFACE_INCLUDE_DIRECTORIES "${BROTLI_INCLUDE_DIR}") add_dependencies(Brotli::brotlidec brotli_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS Brotli::brotlicommon Brotli::brotlienc - Brotli::brotlidec) + list(APPEND + ARROW_BUNDLED_STATIC_LIBS + Brotli::brotlicommon + Brotli::brotlienc + Brotli::brotlidec) endmacro() if(ARROW_WITH_BROTLI) - resolve_dependency(Brotli) + resolve_dependency(Brotli PC_PACKAGE_NAMES libbrotlidec libbrotlienc) # TODO: Don't use global includes but rather target_include_directories get_target_property(BROTLI_INCLUDE_DIR Brotli::brotlicommon INTERFACE_INCLUDE_DIRECTORIES) @@ -1049,7 +1042,9 @@ if(BREW_BIN AND NOT OPENSSL_ROOT_DIR) endif() set(ARROW_USE_OPENSSL OFF) -if(PARQUET_REQUIRE_ENCRYPTION OR ARROW_FLIGHT OR ARROW_S3) +if(PARQUET_REQUIRE_ENCRYPTION + OR ARROW_FLIGHT + OR ARROW_S3) # OpenSSL is required if(ARROW_OPENSSL_USE_SHARED) # Find shared OpenSSL libraries. @@ -1079,10 +1074,8 @@ if(ARROW_USE_OPENSSL) include_directories(SYSTEM ${OPENSSL_INCLUDE_DIR}) else() - message( - STATUS - "Building without OpenSSL support. Minimum OpenSSL version ${ARROW_OPENSSL_REQUIRED_VERSION} required." - ) + message(STATUS "Building without OpenSSL support. 
Minimum OpenSSL version ${ARROW_OPENSSL_REQUIRED_VERSION} required." + ) endif() # ---------------------------------------------------------------------- @@ -1097,15 +1090,14 @@ macro(build_glog) else() set(GLOG_LIB_SUFFIX "") endif() - set( - GLOG_STATIC_LIB - "${GLOG_BUILD_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}glog${GLOG_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(GLOG_STATIC_LIB + "${GLOG_BUILD_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}glog${GLOG_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(GLOG_CMAKE_C_FLAGS "${EP_C_FLAGS} -fPIC") - if(Threads::Threads) - set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -pthread") - set(GLOG_CMAKE_C_FLAGS "${EP_C_FLAGS} -fPIC -pthread") + if(CMAKE_THREAD_LIBS_INIT) + set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_THREAD_LIBS_INIT}") + set(GLOG_CMAKE_C_FLAGS "${EP_C_FLAGS} ${CMAKE_THREAD_LIBS_INIT}") endif() if(APPLE) @@ -1141,7 +1133,7 @@ macro(build_glog) endmacro() if(ARROW_USE_GLOG) - resolve_dependency(GLOG) + resolve_dependency(GLOG PC_PACKAGE_NAMES libglog) # TODO: Don't use global includes but rather target_include_directories get_target_property(GLOG_INCLUDE_DIR glog::glog INTERFACE_INCLUDE_DIRECTORIES) include_directories(SYSTEM ${GLOG_INCLUDE_DIR}) @@ -1201,8 +1193,8 @@ macro(build_gflags) PROPERTIES INTERFACE_COMPILE_DEFINITIONS "GFLAGS_IS_A_DLL=0" INTERFACE_INCLUDE_DIRECTORIES "${GFLAGS_INCLUDE_DIR}") if(MSVC) - set_target_properties(${GFLAGS_LIBRARY} - PROPERTIES INTERFACE_LINK_LIBRARIES "shlwapi.lib") + set_target_properties(${GFLAGS_LIBRARY} PROPERTIES INTERFACE_LINK_LIBRARIES + "shlwapi.lib") endif() set(GFLAGS_LIBRARIES ${GFLAGS_LIBRARY}) @@ -1237,8 +1229,8 @@ endif() macro(build_thrift) if(CMAKE_VERSION VERSION_LESS 3.10) - message( - FATAL_ERROR "Building thrift using ExternalProject requires at least CMake 3.10") + message(FATAL_ERROR "Building thrift using ExternalProject requires at least CMake 3.10" + ) endif() message("Building 
Apache Thrift from source") set(THRIFT_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/thrift_ep-install") @@ -1316,7 +1308,11 @@ if(ARROW_WITH_THRIFT) # to build Boost, so don't look again if already found. if(NOT Thrift_FOUND AND NOT THRIFT_FOUND) # Thrift c++ code generated by 0.13 requires 0.11 or greater - resolve_dependency(Thrift REQUIRED_VERSION 0.11.0) + resolve_dependency(Thrift + REQUIRED_VERSION + 0.11.0 + PC_PACKAGE_NAMES + thrift) endif() # TODO: Don't use global includes but rather target_include_directories include_directories(SYSTEM ${THRIFT_INCLUDE_DIR}) @@ -1372,8 +1368,8 @@ macro(build_protobuf) if(ZLIB_ROOT) list(APPEND PROTOBUF_CMAKE_ARGS "-DZLIB_ROOT=${ZLIB_ROOT}") endif() - set(PROTOBUF_EXTERNAL_PROJECT_ADD_ARGS CMAKE_ARGS ${PROTOBUF_CMAKE_ARGS} SOURCE_SUBDIR - "cmake") + set(PROTOBUF_EXTERNAL_PROJECT_ADD_ARGS CMAKE_ARGS ${PROTOBUF_CMAKE_ARGS} + SOURCE_SUBDIR "cmake") endif() externalproject_add(protobuf_ep @@ -1386,18 +1382,18 @@ macro(build_protobuf) file(MAKE_DIRECTORY "${PROTOBUF_INCLUDE_DIR}") add_library(arrow::protobuf::libprotobuf STATIC IMPORTED) - set_target_properties( - arrow::protobuf::libprotobuf - PROPERTIES IMPORTED_LOCATION "${PROTOBUF_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES - "${PROTOBUF_INCLUDE_DIR}") + set_target_properties(arrow::protobuf::libprotobuf + PROPERTIES IMPORTED_LOCATION "${PROTOBUF_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES + "${PROTOBUF_INCLUDE_DIR}") add_library(arrow::protobuf::libprotoc STATIC IMPORTED) - set_target_properties( - arrow::protobuf::libprotoc - PROPERTIES IMPORTED_LOCATION "${PROTOC_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES - "${PROTOBUF_INCLUDE_DIR}") + set_target_properties(arrow::protobuf::libprotoc + PROPERTIES IMPORTED_LOCATION "${PROTOC_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES + "${PROTOBUF_INCLUDE_DIR}") add_executable(arrow::protobuf::protoc IMPORTED) - set_target_properties(arrow::protobuf::protoc - PROPERTIES IMPORTED_LOCATION "${PROTOBUF_COMPILER}") + 
set_target_properties(arrow::protobuf::protoc PROPERTIES IMPORTED_LOCATION + "${PROTOBUF_COMPILER}") add_dependencies(toolchain protobuf_ep) add_dependencies(arrow::protobuf::libprotobuf protobuf_ep) @@ -1416,7 +1412,11 @@ if(ARROW_WITH_PROTOBUF) else() set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1") endif() - resolve_dependency(Protobuf REQUIRED_VERSION ${ARROW_PROTOBUF_REQUIRED_VERSION}) + resolve_dependency(Protobuf + REQUIRED_VERSION + ${ARROW_PROTOBUF_REQUIRED_VERSION} + PC_PACKAGE_NAMES + protobuf) if(ARROW_PROTOBUF_USE_SHARED AND MSVC_TOOLCHAIN) add_definitions(-DPROTOBUF_USE_DLLS) @@ -1463,8 +1463,8 @@ if(ARROW_WITH_PROTOBUF) else() if(NOT TARGET protobuf::protoc) add_executable(protobuf::protoc IMPORTED) - set_target_properties(protobuf::protoc - PROPERTIES IMPORTED_LOCATION "${PROTOBUF_PROTOC_EXECUTABLE}") + set_target_properties(protobuf::protoc PROPERTIES IMPORTED_LOCATION + "${PROTOBUF_PROTOC_EXECUTABLE}") endif() set(ARROW_PROTOBUF_PROTOC protobuf::protoc) endif() @@ -1495,37 +1495,40 @@ if(ARROW_JEMALLOC) set(ARROW_JEMALLOC_USE_SHARED OFF) set(JEMALLOC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-prefix/src/jemalloc_ep/dist/") + set(JEMALLOC_LIB_DIR "${JEMALLOC_PREFIX}/lib") set(JEMALLOC_STATIC_LIB - "${JEMALLOC_PREFIX}/lib/libjemalloc_pic${CMAKE_STATIC_LIBRARY_SUFFIX}") + "${JEMALLOC_LIB_DIR}/libjemalloc_pic${CMAKE_STATIC_LIBRARY_SUFFIX}") set(JEMALLOC_CONFIGURE_COMMAND ./configure "AR=${CMAKE_AR}" "CC=${CMAKE_C_COMPILER}") if(CMAKE_OSX_SYSROOT) list(APPEND JEMALLOC_CONFIGURE_COMMAND "SDKROOT=${CMAKE_OSX_SYSROOT}") endif() - list(APPEND JEMALLOC_CONFIGURE_COMMAND - "--prefix=${JEMALLOC_PREFIX}" - "--with-jemalloc-prefix=je_arrow_" - "--with-private-namespace=je_arrow_private_" - "--without-export" - "--disable-shared" - # Don't override operator new() - "--disable-cxx" "--disable-libdl" - # See https://github.com/jemalloc/jemalloc/issues/1237 - "--disable-initial-exec-tls" ${EP_LOG_OPTIONS}) + list(APPEND + JEMALLOC_CONFIGURE_COMMAND + 
"--prefix=${JEMALLOC_PREFIX}" + "--libdir=${JEMALLOC_LIB_DIR}" + "--with-jemalloc-prefix=je_arrow_" + "--with-private-namespace=je_arrow_private_" + "--without-export" + "--disable-shared" + # Don't override operator new() + "--disable-cxx" + "--disable-libdl" + # See https://github.com/jemalloc/jemalloc/issues/1237 + "--disable-initial-exec-tls" + ${EP_LOG_OPTIONS}) set(JEMALLOC_BUILD_COMMAND ${MAKE} ${MAKE_BUILD_ARGS}) if(CMAKE_OSX_SYSROOT) list(APPEND JEMALLOC_BUILD_COMMAND "SDKROOT=${CMAKE_OSX_SYSROOT}") endif() - externalproject_add( - jemalloc_ep - URL ${JEMALLOC_SOURCE_URL} - PATCH_COMMAND - touch doc/jemalloc.3 doc/jemalloc.html - # The prefix "je_arrow_" must be kept in sync with the value in memory_pool.cc - CONFIGURE_COMMAND ${JEMALLOC_CONFIGURE_COMMAND} - BUILD_IN_SOURCE 1 - BUILD_COMMAND ${JEMALLOC_BUILD_COMMAND} - BUILD_BYPRODUCTS "${JEMALLOC_STATIC_LIB}" - INSTALL_COMMAND ${MAKE} -j1 install) + externalproject_add(jemalloc_ep + URL ${JEMALLOC_SOURCE_URL} + PATCH_COMMAND touch doc/jemalloc.3 doc/jemalloc.html + # The prefix "je_arrow_" must be kept in sync with the value in memory_pool.cc + CONFIGURE_COMMAND ${JEMALLOC_CONFIGURE_COMMAND} + BUILD_IN_SOURCE 1 + BUILD_COMMAND ${JEMALLOC_BUILD_COMMAND} + BUILD_BYPRODUCTS "${JEMALLOC_STATIC_LIB}" + INSTALL_COMMAND ${MAKE} -j1 install) # Don't use the include directory directly so that we can point to a path # that is unique to our codebase. 
@@ -1534,10 +1537,8 @@ if(ARROW_JEMALLOC) file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-prefix/src/") add_library(jemalloc::jemalloc STATIC IMPORTED) set_target_properties(jemalloc::jemalloc - PROPERTIES INTERFACE_LINK_LIBRARIES - Threads::Threads - IMPORTED_LOCATION - "${JEMALLOC_STATIC_LIB}" + PROPERTIES INTERFACE_LINK_LIBRARIES Threads::Threads + IMPORTED_LOCATION "${JEMALLOC_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}/jemalloc_ep-prefix/src") add_dependencies(jemalloc::jemalloc jemalloc_ep) @@ -1561,11 +1562,10 @@ if(ARROW_MIMALLOC) endif() set(MIMALLOC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/mimalloc_ep/src/mimalloc_ep") - set(MIMALLOC_INCLUDE_DIR "${MIMALLOC_PREFIX}/lib/mimalloc-2.0/include") - set( - MIMALLOC_STATIC_LIB - "${MIMALLOC_PREFIX}/lib/mimalloc-2.0/${CMAKE_STATIC_LIBRARY_PREFIX}${MIMALLOC_LIB_BASE_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(MIMALLOC_INCLUDE_DIR "${MIMALLOC_PREFIX}/include/mimalloc-1.7") + set(MIMALLOC_STATIC_LIB + "${MIMALLOC_PREFIX}/lib/mimalloc-1.7/${CMAKE_STATIC_LIBRARY_PREFIX}${MIMALLOC_LIB_BASE_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(MIMALLOC_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} @@ -1586,10 +1586,8 @@ if(ARROW_MIMALLOC) add_library(mimalloc::mimalloc STATIC IMPORTED) set_target_properties(mimalloc::mimalloc - PROPERTIES INTERFACE_LINK_LIBRARIES - Threads::Threads - IMPORTED_LOCATION - "${MIMALLOC_STATIC_LIB}" + PROPERTIES INTERFACE_LINK_LIBRARIES Threads::Threads + IMPORTED_LOCATION "${MIMALLOC_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES "${MIMALLOC_INCLUDE_DIR}") add_dependencies(mimalloc::mimalloc mimalloc_ep) @@ -1641,10 +1639,9 @@ macro(build_gtest) "${_GTEST_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest${_GTEST_LIBRARY_SUFFIX}") set(GMOCK_SHARED_LIB "${_GTEST_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gmock${_GTEST_LIBRARY_SUFFIX}") - set( - GTEST_MAIN_SHARED_LIB - "${_GTEST_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main${_GTEST_LIBRARY_SUFFIX}" - ) + 
set(GTEST_MAIN_SHARED_LIB + "${_GTEST_LIBRARY_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main${_GTEST_LIBRARY_SUFFIX}" + ) set(GTEST_INSTALL_NAME_DIR "$/lib") # Fix syntax highlighting mess introduced by unclosed bracket above set(dummy ">") @@ -1682,20 +1679,18 @@ macro(build_gtest) set(_GTEST_RUNTIME_DIR "${GTEST_PREFIX}/bin") set(_GTEST_RUNTIME_SUFFIX "${CMAKE_GTEST_DEBUG_EXTENSION}${CMAKE_SHARED_LIBRARY_SUFFIX}") - set( - _GTEST_RUNTIME_LIB - "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest${_GTEST_RUNTIME_SUFFIX}") - set( - _GMOCK_RUNTIME_LIB - "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gmock${_GTEST_RUNTIME_SUFFIX}") - set( - _GTEST_MAIN_RUNTIME_LIB - "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main${_GTEST_RUNTIME_SUFFIX}" - ) + set(_GTEST_RUNTIME_LIB + "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest${_GTEST_RUNTIME_SUFFIX}" + ) + set(_GMOCK_RUNTIME_LIB + "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gmock${_GTEST_RUNTIME_SUFFIX}" + ) + set(_GTEST_MAIN_RUNTIME_LIB + "${_GTEST_RUNTIME_DIR}/${CMAKE_SHARED_LIBRARY_PREFIX}gtest_main${_GTEST_RUNTIME_SUFFIX}" + ) if(CMAKE_VERSION VERSION_LESS 3.9) - message( - FATAL_ERROR - "Building GoogleTest from source on Windows requires at least CMake 3.9") + message(FATAL_ERROR "Building GoogleTest from source on Windows requires at least CMake 3.9" + ) endif() get_property(_GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) if(_GENERATOR_IS_MULTI_CONFIG) @@ -1706,20 +1701,11 @@ macro(build_gtest) externalproject_add_step(googletest_ep copy COMMAND ${CMAKE_COMMAND} -E make_directory ${_GTEST_RUNTIME_OUTPUT_DIR} - COMMAND ${CMAKE_COMMAND} - -E - copy - ${_GTEST_RUNTIME_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${_GTEST_RUNTIME_LIB} ${_GTEST_RUNTIME_OUTPUT_DIR} - COMMAND ${CMAKE_COMMAND} - -E - copy - ${_GMOCK_RUNTIME_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${_GMOCK_RUNTIME_LIB} ${_GTEST_RUNTIME_OUTPUT_DIR} - COMMAND ${CMAKE_COMMAND} - -E - copy - 
${_GTEST_MAIN_RUNTIME_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${_GTEST_MAIN_RUNTIME_LIB} ${_GTEST_RUNTIME_OUTPUT_DIR} DEPENDEES install) endif() @@ -1798,25 +1784,20 @@ macro(build_benchmark) set(GBENCHMARK_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} -std=c++11") endif() - if(APPLE - AND (CMAKE_CXX_COMPILER_ID - STREQUAL - "AppleClang" - OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")) + if(APPLE AND (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID + STREQUAL "Clang")) set(GBENCHMARK_CMAKE_CXX_FLAGS "${GBENCHMARK_CMAKE_CXX_FLAGS} -stdlib=libc++") endif() set(GBENCHMARK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/gbenchmark_ep/src/gbenchmark_ep-install") set(GBENCHMARK_INCLUDE_DIR "${GBENCHMARK_PREFIX}/include") - set( - GBENCHMARK_STATIC_LIB - "${GBENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set( - GBENCHMARK_MAIN_STATIC_LIB - "${GBENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark_main${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(GBENCHMARK_STATIC_LIB + "${GBENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(GBENCHMARK_MAIN_STATIC_LIB + "${GBENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark_main${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(GBENCHMARK_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} "-DCMAKE_INSTALL_PREFIX=${GBENCHMARK_PREFIX}" @@ -1856,19 +1837,11 @@ endmacro() if(ARROW_BUILD_BENCHMARKS) # ArgsProduct() is available since 1.5.2 set(BENCHMARK_REQUIRED_VERSION 1.5.2) - if("${ARROW_DEPENDENCY_SOURCE}" STREQUAL "CONDA" - AND "${benchmark_SOURCE}" STREQUAL "SYSTEM") - # TODO: Remove this workaround once - # https://github.com/google/benchmark/issues/1046 is resolved. - # - # benchmark doesn't set suitable version when we use released - # archive. So the benchmark package on conda-forge isn't report - # the real version. We accept all the benchmark package with - # conda. Conda users should install benchmark 1.5.2 or later by - # ci/conda_env_cpp.yml. 
- set(BENCHMARK_REQUIRED_VERSION 0.0.0) - endif() - resolve_dependency(benchmark REQUIRED_VERSION ${BENCHMARK_REQUIRED_VERSION}) + resolve_dependency(benchmark + REQUIRED_VERSION + ${BENCHMARK_REQUIRED_VERSION} + IS_RUNTIME_DEPENDENCY + FALSE) # TODO: Don't use global includes but rather target_include_directories get_target_property(BENCHMARK_INCLUDE_DIR benchmark::benchmark INTERFACE_INCLUDE_DIRECTORIES) @@ -1938,8 +1911,8 @@ macro(build_xsimd) set(XSIMD_VENDORED TRUE) endmacro() -# For now xsimd is always bundled from upstream -if(NOT ARROW_SIMD_LEVEL STREQUAL "NONE") +if((NOT ARROW_SIMD_LEVEL STREQUAL "NONE") OR (NOT ARROW_RUNTIME_SIMD_LEVEL STREQUAL "NONE" + )) set(xsimd_SOURCE "BUNDLED") resolve_dependency(xsimd) # TODO: Don't use global includes but rather target_include_directories @@ -1983,7 +1956,7 @@ macro(build_zlib) endmacro() if(ARROW_WITH_ZLIB) - resolve_dependency(ZLIB) + resolve_dependency(ZLIB PC_PACKAGE_NAMES zlib) # TODO: Don't use global includes but rather target_include_directories get_target_property(ZLIB_INCLUDE_DIR ZLIB::ZLIB INTERFACE_INCLUDE_DIRECTORIES) @@ -2006,29 +1979,21 @@ macro(build_lz4) set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/liblz4_static.lib") set(LZ4_BUILD_COMMAND - BUILD_COMMAND - msbuild.exe - /m - /p:Configuration=${CMAKE_BUILD_TYPE} - /p:Platform=x64 - /p:PlatformToolset=v140 - ${LZ4_RUNTIME_LIBRARY_LINKAGE} - /t:Build + BUILD_COMMAND msbuild.exe /m /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 + /p:PlatformToolset=v140 ${LZ4_RUNTIME_LIBRARY_LINKAGE} /t:Build ${LZ4_BUILD_DIR}/build/VS2010/lz4.sln) else() set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/lib/liblz4.a") - set(LZ4_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-lz4-lib.sh - "AR=${CMAKE_AR}" "OS=${CMAKE_SYSTEM_NAME}") + set(LZ4_BUILD_COMMAND + BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-lz4-lib.sh "AR=${CMAKE_AR}" + "OS=${CMAKE_SYSTEM_NAME}") endif() # We need to copy the header in lib to directory 
outside of the build externalproject_add(lz4_ep URL ${LZ4_SOURCE_URL} ${EP_LOG_OPTIONS} - UPDATE_COMMAND ${CMAKE_COMMAND} - -E - copy_directory - "${LZ4_BUILD_DIR}/lib" - "${LZ4_PREFIX}/include" + UPDATE_COMMAND ${CMAKE_COMMAND} -E copy_directory + "${LZ4_BUILD_DIR}/lib" "${LZ4_PREFIX}/include" ${LZ4_PATCH_COMMAND} CONFIGURE_COMMAND "" INSTALL_COMMAND "" @@ -2047,7 +2012,7 @@ macro(build_lz4) endmacro() if(ARROW_WITH_LZ4) - resolve_dependency(Lz4) + resolve_dependency(Lz4 PC_PACKAGE_NAMES liblz4) # TODO: Don't use global includes but rather target_include_directories get_target_property(LZ4_INCLUDE_DIR LZ4::lz4 INTERFACE_INCLUDE_DIRECTORIES) @@ -2111,7 +2076,12 @@ macro(build_zstd) endmacro() if(ARROW_WITH_ZSTD) - resolve_dependency(zstd) + # ARROW-13384: ZSTD_minCLevel was added in v1.4.0, required by ARROW-13091 + resolve_dependency(zstd + PC_PACKAGE_NAMES + libzstd + REQUIRED_VERSION + 1.4.0) if(TARGET zstd::libzstd) set(ARROW_ZSTD_LIBZSTD zstd::libzstd) @@ -2171,7 +2141,14 @@ macro(build_re2) endmacro() if(ARROW_WITH_RE2) + # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may + # include -std=c++11. It's not compatible with C source and C++ + # source not uses C++ 11. 
resolve_dependency(re2 HAVE_ALT TRUE) + if(${re2_SOURCE} STREQUAL "SYSTEM") + get_target_property(RE2_LIB re2::re2 IMPORTED_LOCATION) + string(APPEND ARROW_PC_LIBS_PRIVATE " ${RE2_LIB}") + endif() add_definitions(-DARROW_WITH_RE2) # TODO: Don't use global includes but rather target_include_directories @@ -2182,9 +2159,9 @@ endif() macro(build_bzip2) message(STATUS "Building BZip2 from source") set(BZIP2_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/bzip2_ep-install") - set( - BZIP2_STATIC_LIB - "${BZIP2_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}bz2${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(BZIP2_STATIC_LIB + "${BZIP2_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}bz2${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(BZIP2_EXTRA_ARGS "CC=${CMAKE_C_COMPILER}" "CFLAGS=${EP_C_FLAGS}") @@ -2206,10 +2183,10 @@ macro(build_bzip2) file(MAKE_DIRECTORY "${BZIP2_PREFIX}/include") add_library(BZip2::BZip2 STATIC IMPORTED) - set_target_properties( - BZip2::BZip2 - PROPERTIES IMPORTED_LOCATION "${BZIP2_STATIC_LIB}" INTERFACE_INCLUDE_DIRECTORIES - "${BZIP2_PREFIX}/include") + set_target_properties(BZip2::BZip2 + PROPERTIES IMPORTED_LOCATION "${BZIP2_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES + "${BZIP2_PREFIX}/include") set(BZIP2_INCLUDE_DIR "${BZIP2_PREFIX}/include") add_dependencies(toolchain bzip2_ep) @@ -2220,6 +2197,9 @@ endmacro() if(ARROW_WITH_BZ2) resolve_dependency(BZip2) + if(${BZip2_SOURCE} STREQUAL "SYSTEM") + string(APPEND ARROW_PC_LIBS_PRIVATE " ${BZIP2_LIBRARIES}") + endif() if(NOT TARGET BZip2::BZip2) add_library(BZip2::BZip2 UNKNOWN IMPORTED) @@ -2236,10 +2216,9 @@ macro(build_utf8proc) if(MSVC) set(UTF8PROC_STATIC_LIB "${UTF8PROC_PREFIX}/lib/utf8proc_static.lib") else() - set( - UTF8PROC_STATIC_LIB - "${UTF8PROC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(UTF8PROC_STATIC_LIB + "${UTF8PROC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}utf8proc${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) endif() set(UTF8PROC_CMAKE_ARGS @@ -2259,10 +2238,8 @@ 
macro(build_utf8proc) file(MAKE_DIRECTORY "${UTF8PROC_PREFIX}/include") add_library(utf8proc::utf8proc STATIC IMPORTED) set_target_properties(utf8proc::utf8proc - PROPERTIES IMPORTED_LOCATION - "${UTF8PROC_STATIC_LIB}" - INTERFACE_COMPILER_DEFINITIONS - "UTF8PROC_STATIC" + PROPERTIES IMPORTED_LOCATION "${UTF8PROC_STATIC_LIB}" + INTERFACE_COMPILER_DEFINITIONS "UTF8PROC_STATIC" INTERFACE_INCLUDE_DIRECTORIES "${UTF8PROC_PREFIX}/include") @@ -2273,7 +2250,11 @@ macro(build_utf8proc) endmacro() if(ARROW_WITH_UTF8PROC) - resolve_dependency(utf8proc) + resolve_dependency(utf8proc + REQUIRED_VERSION + "2.2.0" + PC_PACKAGE_NAMES + libutf8proc) add_definitions(-DARROW_WITH_UTF8PROC) @@ -2299,10 +2280,9 @@ macro(build_cares) # If you set -DCARES_SHARED=ON then the build system names the library # libcares_static.a - set( - CARES_STATIC_LIB - "${CARES_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}cares${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(CARES_STATIC_LIB + "${CARES_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}cares${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(CARES_CMAKE_ARGS "${EP_COMMON_CMAKE_ARGS}" @@ -2329,8 +2309,8 @@ macro(build_cares) if(APPLE) # libresolv must be linked from c-ares version 1.16.1 find_library(LIBRESOLV_LIBRARY NAMES resolv libresolv REQUIRED) - set_target_properties(c-ares::cares - PROPERTIES INTERFACE_LINK_LIBRARIES "${LIBRESOLV_LIBRARY}") + set_target_properties(c-ares::cares PROPERTIES INTERFACE_LINK_LIBRARIES + "${LIBRESOLV_LIBRARY}") endif() set(CARES_VENDORED TRUE) @@ -2338,29 +2318,19 @@ macro(build_cares) list(APPEND ARROW_BUNDLED_STATIC_LIBS c-ares::cares) endmacro() -if(ARROW_WITH_GRPC) - if(c-ares_SOURCE STREQUAL "AUTO") - find_package(c-ares QUIET CONFIG) - if(c-ares_FOUND) - set(CARES_INCLUDE_DIR ${c-ares_INCLUDE_DIR}) - else() - build_cares() - endif() - elseif(c-ares_SOURCE STREQUAL "BUNDLED") - build_cares() - elseif(c-ares_SOURCE STREQUAL "SYSTEM") - find_package(c-ares REQUIRED CONFIG) - set(CARES_INCLUDE_DIR ${c-ares_INCLUDE_DIR}) - endif() 
- - # TODO: Don't use global includes but rather target_include_directories - include_directories(SYSTEM ${CARES_INCLUDE_DIR}) -endif() - # ---------------------------------------------------------------------- # Dependencies for Arrow Flight RPC macro(build_grpc) + resolve_dependency(c-ares + HAVE_ALT + TRUE + PC_PACKAGE_NAMES + libcares) + # TODO: Don't use global includes but rather target_include_directories + get_target_property(c-ares_INCLUDE_DIR c-ares::cares INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${c-ares_INCLUDE_DIR}) + message(STATUS "Building gRPC from source") # First need to build Abseil @@ -2401,13 +2371,12 @@ macro(build_grpc) raw_logging_internal) foreach(_ABSL_LIB ${_ABSL_LIBS}) - set( - _ABSL_STATIC_LIBRARY - "${ABSL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}absl_${_ABSL_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(_ABSL_STATIC_LIBRARY + "${ABSL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}absl_${_ABSL_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) add_library(absl::${_ABSL_LIB} STATIC IMPORTED) - set_target_properties(absl::${_ABSL_LIB} - PROPERTIES IMPORTED_LOCATION ${_ABSL_STATIC_LIBRARY}) + set_target_properties(absl::${_ABSL_LIB} PROPERTIES IMPORTED_LOCATION + ${_ABSL_STATIC_LIBRARY}) list(APPEND ABSL_BUILD_BYPRODUCTS ${_ABSL_STATIC_LIBRARY}) list(APPEND ABSL_LIBRARIES absl::${_ABSL_LIB}) endforeach() @@ -2423,23 +2392,21 @@ macro(build_grpc) set(GRPC_HOME "${GRPC_PREFIX}") set(GRPC_INCLUDE_DIR "${GRPC_PREFIX}/include") - set( - GRPC_STATIC_LIBRARY_GPR - "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gpr${CMAKE_STATIC_LIBRARY_SUFFIX}") - set( - GRPC_STATIC_LIBRARY_GRPC - "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}grpc${CMAKE_STATIC_LIBRARY_SUFFIX}") - set( - GRPC_STATIC_LIBRARY_GRPCPP - "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}grpc++${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) - set( - GRPC_STATIC_LIBRARY_ADDRESS_SORTING - "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}address_sorting${CMAKE_STATIC_LIBRARY_SUFFIX}" - 
) - set( - GRPC_STATIC_LIBRARY_UPB - "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}upb${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GRPC_STATIC_LIBRARY_GPR + "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gpr${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(GRPC_STATIC_LIBRARY_GRPC + "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}grpc${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(GRPC_STATIC_LIBRARY_GRPCPP + "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}grpc++${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(GRPC_STATIC_LIBRARY_ADDRESS_SORTING + "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}address_sorting${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(GRPC_STATIC_LIBRARY_UPB + "${GRPC_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}upb${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(GRPC_CPP_PLUGIN "${GRPC_PREFIX}/bin/grpc_cpp_plugin${CMAKE_EXECUTABLE_SUFFIX}") set(GRPC_CMAKE_PREFIX) @@ -2548,31 +2515,38 @@ macro(build_grpc) PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GPR}" INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - add_library(gRPC::grpc STATIC IMPORTED) - set_target_properties(gRPC::grpc - PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GRPC}" - INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") - add_library(gRPC::address_sorting STATIC IMPORTED) set_target_properties(gRPC::address_sorting PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_ADDRESS_SORTING}" INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}") + add_library(gRPC::grpc STATIC IMPORTED) + set(GRPC_LINK_LIBRARIES + gRPC::gpr + gRPC::upb + gRPC::address_sorting + ${ABSL_LIBRARIES} + re2::re2 + c-ares::cares + ZLIB::ZLIB + OpenSSL::SSL + Threads::Threads) + set_target_properties(gRPC::grpc + PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GRPC}" + INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "${GRPC_LINK_LIBRARIES}") + add_library(gRPC::grpc++ STATIC IMPORTED) - set_target_properties( - gRPC::grpc++ - PROPERTIES - IMPORTED_LOCATION - "${GRPC_STATIC_LIBRARY_GRPCPP}" - 
INTERFACE_LINK_LIBRARIES - "gRPC::grpc;gRPC::gpr;gRPC::upb;gRPC::address_sorting;${ABSL_LIBRARIES};Threads::Threads" - INTERFACE_INCLUDE_DIRECTORIES - "${GRPC_INCLUDE_DIR}") + set(GRPCPP_LINK_LIBRARIES gRPC::grpc ${ARROW_PROTOBUF_LIBPROTOBUF}) + set_target_properties(gRPC::grpc++ + PROPERTIES IMPORTED_LOCATION "${GRPC_STATIC_LIBRARY_GRPCPP}" + INTERFACE_INCLUDE_DIRECTORIES "${GRPC_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "${GRPCPP_LINK_LIBRARIES}") add_executable(gRPC::grpc_cpp_plugin IMPORTED) - set_target_properties(gRPC::grpc_cpp_plugin - PROPERTIES IMPORTED_LOCATION ${GRPC_CPP_PLUGIN}) + set_target_properties(gRPC::grpc_cpp_plugin PROPERTIES IMPORTED_LOCATION + ${GRPC_CPP_PLUGIN}) add_dependencies(grpc_ep grpc_dependencies) add_dependencies(toolchain grpc_ep) @@ -2584,15 +2558,11 @@ macro(build_grpc) # continuation character in these scripts, so we have to create a copy of the # static lib that we will bundle later - set( - GRPC_STATIC_LIBRARY_GRPCPP_FOR_AR - "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}grpcpp${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(GRPC_STATIC_LIBRARY_GRPCPP_FOR_AR + "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}grpcpp${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) add_custom_command(OUTPUT ${GRPC_STATIC_LIBRARY_GRPCPP_FOR_AR} - COMMAND ${CMAKE_COMMAND} - -E - copy - $ + COMMAND ${CMAKE_COMMAND} -E copy $ ${GRPC_STATIC_LIBRARY_GRPCPP_FOR_AR} DEPENDS grpc_ep) add_library(gRPC::grpcpp_for_bundling STATIC IMPORTED) @@ -2601,17 +2571,18 @@ macro(build_grpc) "${GRPC_STATIC_LIBRARY_GRPCPP_FOR_AR}") set_source_files_properties("${GRPC_STATIC_LIBRARY_GRPCPP_FOR_AR}" PROPERTIES GENERATED - TRUE) + TRUE) add_custom_target(grpc_copy_grpc++ ALL DEPENDS "${GRPC_STATIC_LIBRARY_GRPCPP_FOR_AR}") add_dependencies(gRPC::grpcpp_for_bundling grpc_copy_grpc++) - list(APPEND ARROW_BUNDLED_STATIC_LIBS - ${ABSL_LIBRARIES} - gRPC::upb - gRPC::gpr - gRPC::grpc - gRPC::address_sorting - gRPC::grpcpp_for_bundling) + list(APPEND + ARROW_BUNDLED_STATIC_LIBS 
+ ${ABSL_LIBRARIES} + gRPC::address_sorting + gRPC::gpr + gRPC::grpc + gRPC::grpcpp_for_bundling + gRPC::upb) endmacro() if(ARROW_WITH_GRPC) @@ -2620,16 +2591,12 @@ if(ARROW_WITH_GRPC) HAVE_ALT TRUE REQUIRED_VERSION - ${ARROW_GRPC_REQUIRED_VERSION}) - - if(TARGET gRPC::address_sorting) - set(GRPC_HAS_ADDRESS_SORTING TRUE) - else() - set(GRPC_HAS_ADDRESS_SORTING FALSE) - endif() + ${ARROW_GRPC_REQUIRED_VERSION} + PC_PACKAGE_NAMES + grpc++) # TODO: Don't use global includes but rather target_include_directories - get_target_property(GRPC_INCLUDE_DIR gRPC::grpc INTERFACE_INCLUDE_DIRECTORIES) + get_target_property(GRPC_INCLUDE_DIR gRPC::grpc++ INTERFACE_INCLUDE_DIRECTORIES) include_directories(SYSTEM ${GRPC_INCLUDE_DIR}) if(GRPC_VENDORED) @@ -2754,8 +2721,8 @@ endif() macro(build_awssdk) message("Building AWS C++ SDK from source") - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" - AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9") + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS + "4.9") message(FATAL_ERROR "AWS C++ SDK requires gcc >= 4.9") endif() set(AWSSDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/awssdk_ep-install") @@ -2782,14 +2749,14 @@ macro(build_awssdk) "-DCMAKE_INSTALL_PREFIX=${AWSSDK_PREFIX}" "-DCMAKE_PREFIX_PATH=${AWSSDK_PREFIX}") - set( - AWSSDK_CMAKE_ARGS - ${AWSSDK_COMMON_CMAKE_ARGS} -DBUILD_DEPS=OFF - -DBUILD_ONLY=config\\$s3\\$transfer\\$identity-management\\$sts - -DMINIMIZE_SIZE=ON) + set(AWSSDK_CMAKE_ARGS + ${AWSSDK_COMMON_CMAKE_ARGS} + -DBUILD_DEPS=OFF + -DBUILD_ONLY=config\\$s3\\$transfer\\$identity-management\\$sts + -DMINIMIZE_SIZE=ON) if(UNIX AND TARGET zlib_ep) list(APPEND AWSSDK_CMAKE_ARGS -DZLIB_INCLUDE_DIR=${ZLIB_INCLUDE_DIRS} - -DZLIB_LIBRARY=${ZLIB_LIBRARIES}) + -DZLIB_LIBRARY=${ZLIB_LIBRARIES}) endif() file(MAKE_DIRECTORY ${AWSSDK_INCLUDE_DIR}) @@ -2810,20 +2777,19 @@ macro(build_awssdk) string(TOUPPER ${_AWSSDK_LIB} _AWSSDK_LIB_UPPER) # AWS-C-COMMON -> AWS_C_COMMON string(REPLACE "-" "_" _AWSSDK_LIB_NAME_PREFIX 
${_AWSSDK_LIB_UPPER}) - set( - _AWSSDK_STATIC_LIBRARY - "${AWSSDK_PREFIX}/${AWSSDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${_AWSSDK_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) + set(_AWSSDK_STATIC_LIBRARY + "${AWSSDK_PREFIX}/${AWSSDK_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${_AWSSDK_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) if(${_AWSSDK_LIB} MATCHES "^aws-cpp-sdk-") set(_AWSSDK_TARGET_NAME ${_AWSSDK_LIB}) else() set(_AWSSDK_TARGET_NAME AWS::${_AWSSDK_LIB}) endif() add_library(${_AWSSDK_TARGET_NAME} STATIC IMPORTED) - set_target_properties( - ${_AWSSDK_TARGET_NAME} - PROPERTIES IMPORTED_LOCATION ${_AWSSDK_STATIC_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES - "${AWSSDK_INCLUDE_DIR}") + set_target_properties(${_AWSSDK_TARGET_NAME} + PROPERTIES IMPORTED_LOCATION ${_AWSSDK_STATIC_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES + "${AWSSDK_INCLUDE_DIR}") set("${_AWSSDK_LIB_NAME_PREFIX}_STATIC_LIBRARY" ${_AWSSDK_STATIC_LIBRARY}) list(APPEND AWSSDK_LIBRARIES ${_AWSSDK_TARGET_NAME}) endforeach() @@ -2880,7 +2846,7 @@ macro(build_awssdk) set_target_properties(CURL::libcurl PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CURL_INCLUDE_DIRS}" IMPORTED_LOCATION - "${CURL_LIBRARIES}") + "${CURL_LIBRARIES}") endif() set_property(TARGET aws-cpp-sdk-core APPEND diff --git a/cpp/cmake_modules/UseCython.cmake b/cpp/cmake_modules/UseCython.cmake index 0d4b17d3e57..f2025efb4c9 100644 --- a/cpp/cmake_modules/UseCython.cmake +++ b/cpp/cmake_modules/UseCython.cmake @@ -107,8 +107,9 @@ function(compile_pyx endif() if(NOT WIN32) - if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug" - OR "${CMAKE_BUILD_TYPE}" STREQUAL "RelWithDebInfo") + string( TOLOWER "${CMAKE_BUILD_TYPE}" build_type ) + if("${build_type}" STREQUAL "debug" + OR "${build_type}" STREQUAL "relwithdebinfo") set(cython_debug_arg "--gdb") endif() endif() @@ -144,6 +145,8 @@ function(compile_pyx ${no_docstrings_arg} ${cython_debug_arg} ${CYTHON_FLAGS} + # Necessary for autodoc of function arguments + --directive embedsignature=True # Necessary for Cython code 
coverage --working ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/cpp/cmake_modules/Usevcpkg.cmake b/cpp/cmake_modules/Usevcpkg.cmake index 781bec436f3..06ac4dd075d 100644 --- a/cpp/cmake_modules/Usevcpkg.cmake +++ b/cpp/cmake_modules/Usevcpkg.cmake @@ -22,7 +22,9 @@ message(STATUS "Using vcpkg to find dependencies") # macro to list subdirectirectories (non-recursive) macro(list_subdirs SUBDIRS DIR) - file(GLOB children_ RELATIVE ${DIR} ${DIR}/*) + file(GLOB children_ + RELATIVE ${DIR} + ${DIR}/*) set(subdirs_ "") foreach(child_ ${children_}) if(IS_DIRECTORY "${DIR}/${child_}") @@ -44,24 +46,27 @@ if(DEFINED CMAKE_TOOLCHAIN_FILE) get_filename_component(_VCPKG_BUILDSYSTEMS_DIR "${CMAKE_TOOLCHAIN_FILE}" DIRECTORY) get_filename_component(VCPKG_ROOT "${_VCPKG_BUILDSYSTEMS_DIR}/../.." ABSOLUTE) else() - message( - FATAL_ERROR - "vcpkg toolchain file not found at path specified in -DCMAKE_TOOLCHAIN_FILE") + message(FATAL_ERROR "vcpkg toolchain file not found at path specified in -DCMAKE_TOOLCHAIN_FILE" + ) endif() else() if(DEFINED VCPKG_ROOT) # Get it from the CMake variable VCPKG_ROOT - find_program(_VCPKG_BIN vcpkg PATHS "${VCPKG_ROOT}" NO_DEFAULT_PATH) + find_program(_VCPKG_BIN vcpkg + PATHS "${VCPKG_ROOT}" + NO_DEFAULT_PATH) if(NOT _VCPKG_BIN) message(FATAL_ERROR "vcpkg not found in directory specified in -DVCPKG_ROOT") endif() elseif(DEFINED ENV{VCPKG_ROOT}) # Get it from the environment variable VCPKG_ROOT set(VCPKG_ROOT $ENV{VCPKG_ROOT}) - find_program(_VCPKG_BIN vcpkg PATHS "${VCPKG_ROOT}" NO_DEFAULT_PATH) + find_program(_VCPKG_BIN vcpkg + PATHS "${VCPKG_ROOT}" + NO_DEFAULT_PATH) if(NOT _VCPKG_BIN) - message( - FATAL_ERROR "vcpkg not found in directory in environment variable VCPKG_ROOT") + message(FATAL_ERROR "vcpkg not found in directory in environment variable VCPKG_ROOT" + ) endif() else() # Get it from the file vcpkg.path.txt @@ -78,12 +83,13 @@ else() if(EXISTS "${_VCPKG_PATH_TXT}") file(READ "${_VCPKG_PATH_TXT}" VCPKG_ROOT) else() - message( - FATAL_ERROR - 
"vcpkg not found. Install vcpkg if not installed, " - "then run vcpkg integrate install or set environment variable VCPKG_ROOT.") + message(FATAL_ERROR "vcpkg not found. Install vcpkg if not installed, " + "then run vcpkg integrate install or set environment variable VCPKG_ROOT." + ) endif() - find_program(_VCPKG_BIN vcpkg PATHS "${VCPKG_ROOT}" NO_DEFAULT_PATH) + find_program(_VCPKG_BIN vcpkg + PATHS "${VCPKG_ROOT}" + NO_DEFAULT_PATH) if(NOT _VCPKG_BIN) message(FATAL_ERROR "vcpkg not found. Re-run vcpkg integrate install " "or set environment variable VCPKG_ROOT.") @@ -105,7 +111,9 @@ if(DEFINED ENV{VCPKG_DEFAULT_TRIPLET} AND NOT DEFINED VCPKG_TARGET_TRIPLET) endif() # Explicitly set manifest mode on if it is not set and vcpkg.json exists if(NOT DEFINED VCPKG_MANIFEST_MODE AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg.json") - set(VCPKG_MANIFEST_MODE ON CACHE BOOL "Use vcpkg.json manifest") + set(VCPKG_MANIFEST_MODE + ON + CACHE BOOL "Use vcpkg.json manifest") message(STATUS "vcpkg.json manifest found. Using VCPKG_MANIFEST_MODE: ON") endif() # vcpkg can install packages in three different places @@ -113,13 +121,7 @@ set(_INST_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/vcpkg_installed") # try here fi set(_INST_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/vcpkg_installed") # try here second set(_INST_VCPKG_ROOT "${VCPKG_ROOT}/installed") # Iterate over the places -foreach(_INST_DIR - IN - LISTS - _INST_BUILD_DIR - _INST_SOURCE_DIR - _INST_VCPKG_ROOT - "notfound") +foreach(_INST_DIR IN LISTS _INST_BUILD_DIR _INST_SOURCE_DIR _INST_VCPKG_ROOT "notfound") if(_INST_DIR STREQUAL "notfound") message(FATAL_ERROR "vcpkg installed libraries directory not found. " "Install packages with vcpkg before executing cmake.") @@ -158,10 +160,8 @@ if(NOT DEFINED VCPKG_TARGET_TRIPLET) message(FATAL_ERROR "Could not infer VCPKG_TARGET_TRIPLET. 
" "Specify triplet with -DVCPKG_TARGET_TRIPLET.") elseif(NOT DEFINED _VCPKG_INSTALLED_DIR) - message( - FATAL_ERROR - "Could not find installed vcpkg packages for triplet ${VCPKG_TARGET_TRIPLET}. " - "Install packages with vcpkg before executing cmake.") + message(FATAL_ERROR "Could not find installed vcpkg packages for triplet ${VCPKG_TARGET_TRIPLET}. " + "Install packages with vcpkg before executing cmake.") endif() set(VCPKG_TARGET_TRIPLET @@ -194,24 +194,56 @@ set(ARROW_VCPKG_PREFIX "${_VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}" CACHE PATH "Path to target triplet subdirectory in vcpkg installed directory") -set(ARROW_VCPKG ON CACHE BOOL "Use vcpkg for dependencies") +set(ARROW_VCPKG + ON + CACHE BOOL "Use vcpkg for dependencies") set(ARROW_DEPENDENCY_SOURCE "SYSTEM" CACHE STRING "The specified value VCPKG is implemented internally as SYSTEM" FORCE) -set(BOOST_ROOT "${ARROW_VCPKG_PREFIX}" CACHE STRING "") -set(BOOST_INCLUDEDIR "${ARROW_VCPKG_PREFIX}/include/boost" CACHE STRING "") -set(BOOST_LIBRARYDIR "${ARROW_VCPKG_PREFIX}/lib" CACHE STRING "") -set(OPENSSL_INCLUDE_DIR "${ARROW_VCPKG_PREFIX}/include" CACHE STRING "") -set(OPENSSL_LIBRARIES "${ARROW_VCPKG_PREFIX}/lib" CACHE STRING "") -set(OPENSSL_ROOT_DIR "${ARROW_VCPKG_PREFIX}" CACHE STRING "") -set(Thrift_ROOT "${ARROW_VCPKG_PREFIX}/lib" CACHE STRING "") -set(ZSTD_INCLUDE_DIR "${ARROW_VCPKG_PREFIX}/include" CACHE STRING "") -set(ZSTD_ROOT "${ARROW_VCPKG_PREFIX}" CACHE STRING "") +set(BOOST_ROOT + "${ARROW_VCPKG_PREFIX}" + CACHE STRING "") +set(BOOST_INCLUDEDIR + "${ARROW_VCPKG_PREFIX}/include/boost" + CACHE STRING "") +set(BOOST_LIBRARYDIR + "${ARROW_VCPKG_PREFIX}/lib" + CACHE STRING "") +set(OPENSSL_INCLUDE_DIR + "${ARROW_VCPKG_PREFIX}/include" + CACHE STRING "") +set(OPENSSL_LIBRARIES + "${ARROW_VCPKG_PREFIX}/lib" + CACHE STRING "") +set(OPENSSL_ROOT_DIR + "${ARROW_VCPKG_PREFIX}" + CACHE STRING "") +set(Thrift_ROOT + "${ARROW_VCPKG_PREFIX}/lib" + CACHE STRING "") +set(ZSTD_INCLUDE_DIR + 
"${ARROW_VCPKG_PREFIX}/include" + CACHE STRING "") +set(ZSTD_ROOT + "${ARROW_VCPKG_PREFIX}" + CACHE STRING "") +set(BROTLI_ROOT + "${ARROW_VCPKG_PREFIX}" + CACHE STRING "") +set(LZ4_ROOT + "${ARROW_VCPKG_PREFIX}" + CACHE STRING "") if(CMAKE_HOST_WIN32) - set(LZ4_MSVC_LIB_PREFIX "" CACHE STRING "") - set(LZ4_MSVC_STATIC_LIB_SUFFIX "" CACHE STRING "") - set(ZSTD_MSVC_LIB_PREFIX "" CACHE STRING "") + set(LZ4_MSVC_LIB_PREFIX + "" + CACHE STRING "") + set(LZ4_MSVC_STATIC_LIB_SUFFIX + "" + CACHE STRING "") + set(ZSTD_MSVC_LIB_PREFIX + "" + CACHE STRING "") endif() diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index 5eee6278009..bde9af23e57 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -20,10 +20,8 @@ endif() if(${ARROW_USE_ASAN}) if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" - OR (CMAKE_CXX_COMPILER_ID - STREQUAL - "GNU" - AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.8")) + OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION + VERSION_GREATER "4.8")) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -DADDRESS_SANITIZER") else() message(SEND_ERROR "Cannot use ASAN without clang or gcc >= 4.8") @@ -41,18 +39,16 @@ endif() # (https://bugs.llvm.org/show_bug.cgi?id=17000#c1) # Note: GCC does not support the 'function' flag. 
if(${ARROW_USE_UBSAN}) - if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" - OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set( - CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function,float-divide-by-zero -fno-sanitize-recover=all" - ) - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" - AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "5.1") - set( - CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover=all" - ) + if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr,function,float-divide-by-zero -fno-sanitize-recover=all" + ) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION + VERSION_GREATER_EQUAL "5.1") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover=all" + ) else() message(SEND_ERROR "Cannot use UBSAN without clang or gcc >= 5.1") endif() @@ -61,14 +57,10 @@ endif() # Flag to enable thread sanitizer (clang or gcc 4.8) if(${ARROW_USE_TSAN}) if(NOT - (CMAKE_CXX_COMPILER_ID - STREQUAL - "AppleClang" + (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" - OR (CMAKE_CXX_COMPILER_ID - STREQUAL - "GNU" - AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.8"))) + OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION + VERSION_GREATER "4.8"))) message(SEND_ERROR "Cannot use TSAN without clang or gcc >= 4.8") endif() @@ -100,34 +92,31 @@ if(${ARROW_USE_TSAN}) endif() if(${ARROW_USE_COVERAGE}) - if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" - OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - add_definitions( - "-fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep" - ) + if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") + 
add_definitions("-fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep" + ) - set( - CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep" - ) + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -fsanitize-coverage=pc-table,inline-8bit-counters,edge,no-prune,trace-cmp,trace-div,trace-gep" + ) else() message(SEND_ERROR "You can only enable coverage with clang") endif() endif() -if("${ARROW_USE_UBSAN}" OR "${ARROW_USE_ASAN}" OR "${ARROW_USE_TSAN}") +if("${ARROW_USE_UBSAN}" + OR "${ARROW_USE_ASAN}" + OR "${ARROW_USE_TSAN}") # GCC 4.8 and 4.9 (latest as of this writing) don't allow you to specify # disallowed entries for the sanitizer. - if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" - OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set( - CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -fsanitize-blacklist=${BUILD_SUPPORT_DIR}/sanitizer-disallowed-entries.txt" - ) + if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL + "Clang") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -fsanitize-blacklist=${BUILD_SUPPORT_DIR}/sanitizer-disallowed-entries.txt" + ) else() - message( - WARNING - "GCC does not support specifying a sanitizer disallowed entries list. Known sanitizer check failures will not be suppressed." - ) + message(WARNING "GCC does not support specifying a sanitizer disallowed entries list. Known sanitizer check failures will not be suppressed." 
+ ) endif() endif() diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 1abbf52ac3e..0bcf5de0ad1 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -17,6 +17,10 @@ ADD_ARROW_EXAMPLE(row_wise_conversion_example) +if (ARROW_COMPUTE) + ADD_ARROW_EXAMPLE(compute_register_example) +endif() + if (ARROW_PARQUET AND ARROW_DATASET) if (ARROW_BUILD_SHARED) set(DATASET_EXAMPLES_LINK_LIBS arrow_dataset_shared) @@ -28,4 +32,9 @@ if (ARROW_PARQUET AND ARROW_DATASET) EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(dataset_parquet_scan_example parquet) + + ADD_ARROW_EXAMPLE(dataset_documentation_example + EXTRA_LINK_LIBS + ${DATASET_EXAMPLES_LINK_LIBS}) + add_dependencies(dataset_documentation_example parquet) endif() diff --git a/cpp/examples/arrow/compute_register_example.cc b/cpp/examples/arrow/compute_register_example.cc new file mode 100644 index 00000000000..3c20a3d2a87 --- /dev/null +++ b/cpp/examples/arrow/compute_register_example.cc @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include +#include +#include + +// Demonstrate registering an Arrow compute function outside of the Arrow source tree + +namespace cp = ::arrow::compute; + +#define ABORT_ON_FAILURE(expr) \ + do { \ + arrow::Status status_ = (expr); \ + if (!status_.ok()) { \ + std::cerr << status_.message() << std::endl; \ + abort(); \ + } \ + } while (0); + +class ExampleFunctionOptionsType : public cp::FunctionOptionsType { + const char* type_name() const override { return "ExampleFunctionOptionsType"; } + std::string Stringify(const cp::FunctionOptions&) const override { + return "ExampleFunctionOptionsType"; + } + bool Compare(const cp::FunctionOptions&, const cp::FunctionOptions&) const override { + return true; + } + // optional: support for serialization + // Result> Serialize(const FunctionOptions&) const override; + // Result> Deserialize(const Buffer&) const override; +}; + +cp::FunctionOptionsType* GetExampleFunctionOptionsType() { + static ExampleFunctionOptionsType options_type; + return &options_type; +} + +class ExampleFunctionOptions : public cp::FunctionOptions { + public: + ExampleFunctionOptions() : cp::FunctionOptions(GetExampleFunctionOptionsType()) {} +}; + +arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out) { + *out->mutable_array() = *batch[0].array(); + return arrow::Status::OK(); +} + +const cp::FunctionDoc func_doc{ + "Example function to demonstrate registering an out-of-tree function", + "", + {"x"}, + "ExampleFunctionOptions"}; + +int main(int argc, char** argv) { + const std::string name = "compute_register_example"; + auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc); + ABORT_ON_FAILURE(func->AddKernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), + ExampleFunctionImpl)); + + auto registry = cp::GetFunctionRegistry(); + ABORT_ON_FAILURE(registry->AddFunction(std::move(func))); + + arrow::Int64Builder 
builder(arrow::default_memory_pool()); + std::shared_ptr arr; + ABORT_ON_FAILURE(builder.Append(42)); + ABORT_ON_FAILURE(builder.Finish(&arr)); + auto options = std::make_shared(); + auto maybe_result = cp::CallFunction(name, {arr}, options.get()); + ABORT_ON_FAILURE(maybe_result.status()); + + std::cout << maybe_result->make_array()->ToString() << std::endl; + + // Expression serialization will raise NotImplemented if an expression includes + // FunctionOptions for which serialization is not supported. + auto expr = cp::call(name, {}, options); + auto maybe_serialized = cp::Serialize(expr); + std::cerr << maybe_serialized.status().ToString() << std::endl; + + return EXIT_SUCCESS; +} diff --git a/cpp/examples/arrow/dataset_documentation_example.cc b/cpp/examples/arrow/dataset_documentation_example.cc new file mode 100644 index 00000000000..1aac66d4a6c --- /dev/null +++ b/cpp/examples/arrow/dataset_documentation_example.cc @@ -0,0 +1,374 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This example showcases various ways to work with Datasets. It's +// intended to be paired with the documentation. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace ds = arrow::dataset; +namespace fs = arrow::fs; +namespace cp = arrow::compute; + +#define ABORT_ON_FAILURE(expr) \ + do { \ + arrow::Status status_ = (expr); \ + if (!status_.ok()) { \ + std::cerr << status_.message() << std::endl; \ + abort(); \ + } \ + } while (0); + +// (Doc section: Reading Datasets) +// Generate some data for the rest of this example. +std::shared_ptr CreateTable() { + auto schema = + arrow::schema({arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64()), + arrow::field("c", arrow::int64())}); + std::shared_ptr array_a; + std::shared_ptr array_b; + std::shared_ptr array_c; + arrow::NumericBuilder builder; + ABORT_ON_FAILURE(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); + ABORT_ON_FAILURE(builder.Finish(&array_a)); + builder.Reset(); + ABORT_ON_FAILURE(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0})); + ABORT_ON_FAILURE(builder.Finish(&array_b)); + builder.Reset(); + ABORT_ON_FAILURE(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2})); + ABORT_ON_FAILURE(builder.Finish(&array_c)); + return arrow::Table::Make(schema, {array_a, array_b, array_c}); +} + +// Set up a dataset by writing two Parquet files. 
+std::string CreateExampleParquetDataset(const std::shared_ptr& filesystem, + const std::string& root_path) { + auto base_path = root_path + "/parquet_dataset"; + ABORT_ON_FAILURE(filesystem->CreateDir(base_path)); + // Create an Arrow Table + auto table = CreateTable(); + // Write it into two Parquet files + auto output = filesystem->OpenOutputStream(base_path + "/data1.parquet").ValueOrDie(); + ABORT_ON_FAILURE(parquet::arrow::WriteTable( + *table->Slice(0, 5), arrow::default_memory_pool(), output, /*chunk_size=*/2048)); + output = filesystem->OpenOutputStream(base_path + "/data2.parquet").ValueOrDie(); + ABORT_ON_FAILURE(parquet::arrow::WriteTable( + *table->Slice(5), arrow::default_memory_pool(), output, /*chunk_size=*/2048)); + return base_path; +} +// (Doc section: Reading Datasets) + +// (Doc section: Reading different file formats) +// Set up a dataset by writing two Feather files. +std::string CreateExampleFeatherDataset(const std::shared_ptr& filesystem, + const std::string& root_path) { + auto base_path = root_path + "/feather_dataset"; + ABORT_ON_FAILURE(filesystem->CreateDir(base_path)); + // Create an Arrow Table + auto table = CreateTable(); + // Write it into two Feather files + auto output = filesystem->OpenOutputStream(base_path + "/data1.feather").ValueOrDie(); + auto writer = arrow::ipc::MakeFileWriter(output.get(), table->schema()).ValueOrDie(); + ABORT_ON_FAILURE(writer->WriteTable(*table->Slice(0, 5))); + ABORT_ON_FAILURE(writer->Close()); + output = filesystem->OpenOutputStream(base_path + "/data2.feather").ValueOrDie(); + writer = arrow::ipc::MakeFileWriter(output.get(), table->schema()).ValueOrDie(); + ABORT_ON_FAILURE(writer->WriteTable(*table->Slice(5))); + ABORT_ON_FAILURE(writer->Close()); + return base_path; +} +// (Doc section: Reading different file formats) + +// (Doc section: Reading and writing partitioned data) +// Set up a dataset by writing files with partitioning +std::string CreateExampleParquetHivePartitionedDataset( + 
const std::shared_ptr& filesystem, const std::string& root_path) { + auto base_path = root_path + "/parquet_dataset"; + ABORT_ON_FAILURE(filesystem->CreateDir(base_path)); + // Create an Arrow Table + auto schema = arrow::schema( + {arrow::field("a", arrow::int64()), arrow::field("b", arrow::int64()), + arrow::field("c", arrow::int64()), arrow::field("part", arrow::utf8())}); + std::vector> arrays(4); + arrow::NumericBuilder builder; + ABORT_ON_FAILURE(builder.AppendValues({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); + ABORT_ON_FAILURE(builder.Finish(&arrays[0])); + builder.Reset(); + ABORT_ON_FAILURE(builder.AppendValues({9, 8, 7, 6, 5, 4, 3, 2, 1, 0})); + ABORT_ON_FAILURE(builder.Finish(&arrays[1])); + builder.Reset(); + ABORT_ON_FAILURE(builder.AppendValues({1, 2, 1, 2, 1, 2, 1, 2, 1, 2})); + ABORT_ON_FAILURE(builder.Finish(&arrays[2])); + arrow::StringBuilder string_builder; + ABORT_ON_FAILURE( + string_builder.AppendValues({"a", "a", "a", "a", "a", "b", "b", "b", "b", "b"})); + ABORT_ON_FAILURE(string_builder.Finish(&arrays[3])); + auto table = arrow::Table::Make(schema, arrays); + // Write it using Datasets + auto dataset = std::make_shared(table); + auto scanner_builder = dataset->NewScan().ValueOrDie(); + auto scanner = scanner_builder->Finish().ValueOrDie(); + + // The partition schema determines which fields are part of the partitioning. + auto partition_schema = arrow::schema({arrow::field("part", arrow::utf8())}); + // We'll use Hive-style partitioning, which creates directories with "key=value" pairs. + auto partitioning = std::make_shared(partition_schema); + // We'll write Parquet files. 
+ auto format = std::make_shared(); + ds::FileSystemDatasetWriteOptions write_options; + write_options.file_write_options = format->DefaultWriteOptions(); + write_options.filesystem = filesystem; + write_options.base_dir = base_path; + write_options.partitioning = partitioning; + write_options.basename_template = "part{i}.parquet"; + ABORT_ON_FAILURE(ds::FileSystemDataset::Write(write_options, scanner)); + return base_path; +} +// (Doc section: Reading and writing partitioned data) + +// (Doc section: Dataset discovery) +// Read the whole dataset with the given format, without partitioning. +std::shared_ptr ScanWholeDataset( + const std::shared_ptr& filesystem, + const std::shared_ptr& format, const std::string& base_dir) { + // Create a dataset by scanning the filesystem for files + fs::FileSelector selector; + selector.base_dir = base_dir; + auto factory = ds::FileSystemDatasetFactory::Make(filesystem, selector, format, + ds::FileSystemFactoryOptions()) + .ValueOrDie(); + auto dataset = factory->Finish().ValueOrDie(); + // Print out the fragments + for (const auto& fragment : dataset->GetFragments().ValueOrDie()) { + std::cout << "Found fragment: " << (*fragment)->ToString() << std::endl; + } + // Read the entire dataset as a Table + auto scan_builder = dataset->NewScan().ValueOrDie(); + auto scanner = scan_builder->Finish().ValueOrDie(); + return scanner->ToTable().ValueOrDie(); +} +// (Doc section: Dataset discovery) + +// (Doc section: Filtering data) +// Read a dataset, but select only column "b" and only rows where b < 4. +// +// This is useful when you only want a few columns from a dataset. Where possible, +// Datasets will push down the column selection such that less work is done. 
+std::shared_ptr FilterAndSelectDataset( + const std::shared_ptr& filesystem, + const std::shared_ptr& format, const std::string& base_dir) { + fs::FileSelector selector; + selector.base_dir = base_dir; + auto factory = ds::FileSystemDatasetFactory::Make(filesystem, selector, format, + ds::FileSystemFactoryOptions()) + .ValueOrDie(); + auto dataset = factory->Finish().ValueOrDie(); + // Read specified columns with a row filter + auto scan_builder = dataset->NewScan().ValueOrDie(); + ABORT_ON_FAILURE(scan_builder->Project({"b"})); + ABORT_ON_FAILURE(scan_builder->Filter(cp::less(cp::field_ref("b"), cp::literal(4)))); + auto scanner = scan_builder->Finish().ValueOrDie(); + return scanner->ToTable().ValueOrDie(); +} +// (Doc section: Filtering data) + +// (Doc section: Projecting columns) +// Read a dataset, but with column projection. +// +// This is useful to derive new columns from existing data. For example, here we +// demonstrate casting a column to a different type, and turning a numeric column into a +// boolean column based on a predicate. You could also rename columns or perform +// computations involving multiple columns. +std::shared_ptr ProjectDataset( + const std::shared_ptr& filesystem, + const std::shared_ptr& format, const std::string& base_dir) { + fs::FileSelector selector; + selector.base_dir = base_dir; + auto factory = ds::FileSystemDatasetFactory::Make(filesystem, selector, format, + ds::FileSystemFactoryOptions()) + .ValueOrDie(); + auto dataset = factory->Finish().ValueOrDie(); + // Read specified columns with a row filter + auto scan_builder = dataset->NewScan().ValueOrDie(); + ABORT_ON_FAILURE(scan_builder->Project( + { + // Leave column "a" as-is. + cp::field_ref("a"), + // Cast column "b" to float32. + cp::call("cast", {cp::field_ref("b")}, + arrow::compute::CastOptions::Safe(arrow::float32())), + // Derive a boolean column from "c". 
+ cp::equal(cp::field_ref("c"), cp::literal(1)), + }, + {"a_renamed", "b_as_float32", "c_1"})); + auto scanner = scan_builder->Finish().ValueOrDie(); + return scanner->ToTable().ValueOrDie(); +} +// (Doc section: Projecting columns) + +// (Doc section: Projecting columns #2) +// Read a dataset, but with column projection. +// +// This time, we read all original columns plus one derived column. This simply combines +// the previous two examples: selecting a subset of columns by name, and deriving new +// columns with an expression. +std::shared_ptr SelectAndProjectDataset( + const std::shared_ptr& filesystem, + const std::shared_ptr& format, const std::string& base_dir) { + fs::FileSelector selector; + selector.base_dir = base_dir; + auto factory = ds::FileSystemDatasetFactory::Make(filesystem, selector, format, + ds::FileSystemFactoryOptions()) + .ValueOrDie(); + auto dataset = factory->Finish().ValueOrDie(); + // Read specified columns with a row filter + auto scan_builder = dataset->NewScan().ValueOrDie(); + std::vector names; + std::vector exprs; + // Read all the original columns. + for (const auto& field : dataset->schema()->fields()) { + names.push_back(field->name()); + exprs.push_back(cp::field_ref(field->name())); + } + // Also derive a new column. + names.emplace_back("b_large"); + exprs.push_back(cp::greater(cp::field_ref("b"), cp::literal(1))); + ABORT_ON_FAILURE(scan_builder->Project(exprs, names)); + auto scanner = scan_builder->Finish().ValueOrDie(); + return scanner->ToTable().ValueOrDie(); +} +// (Doc section: Projecting columns #2) + +// (Doc section: Reading and writing partitioned data #2) +// Read an entire dataset, but with partitioning information. 
+std::shared_ptr ScanPartitionedDataset( + const std::shared_ptr& filesystem, + const std::shared_ptr& format, const std::string& base_dir) { + fs::FileSelector selector; + selector.base_dir = base_dir; + selector.recursive = true; // Make sure to search subdirectories + ds::FileSystemFactoryOptions options; + // We'll use Hive-style partitioning. We'll let Arrow Datasets infer the partition + // schema. + options.partitioning = ds::HivePartitioning::MakeFactory(); + auto factory = ds::FileSystemDatasetFactory::Make(filesystem, selector, format, options) + .ValueOrDie(); + auto dataset = factory->Finish().ValueOrDie(); + // Print out the fragments + for (const auto& fragment : dataset->GetFragments().ValueOrDie()) { + std::cout << "Found fragment: " << (*fragment)->ToString() << std::endl; + std::cout << "Partition expression: " + << (*fragment)->partition_expression().ToString() << std::endl; + } + auto scan_builder = dataset->NewScan().ValueOrDie(); + auto scanner = scan_builder->Finish().ValueOrDie(); + return scanner->ToTable().ValueOrDie(); +} +// (Doc section: Reading and writing partitioned data #2) + +// (Doc section: Reading and writing partitioned data #3) +// Read an entire dataset, but with partitioning information. Also, filter the dataset on +// the partition values. +std::shared_ptr FilterPartitionedDataset( + const std::shared_ptr& filesystem, + const std::shared_ptr& format, const std::string& base_dir) { + fs::FileSelector selector; + selector.base_dir = base_dir; + selector.recursive = true; + ds::FileSystemFactoryOptions options; + options.partitioning = ds::HivePartitioning::MakeFactory(); + auto factory = ds::FileSystemDatasetFactory::Make(filesystem, selector, format, options) + .ValueOrDie(); + auto dataset = factory->Finish().ValueOrDie(); + auto scan_builder = dataset->NewScan().ValueOrDie(); + // Filter based on the partition values. This will mean that we won't even read the + // files whose partition expressions don't match the filter. 
+ ABORT_ON_FAILURE( + scan_builder->Filter(cp::equal(cp::field_ref("part"), cp::literal("b")))); + auto scanner = scan_builder->Finish().ValueOrDie(); + return scanner->ToTable().ValueOrDie(); +} +// (Doc section: Reading and writing partitioned data #3) + +int main(int argc, char** argv) { + if (argc < 3) { + // Fake success for CI purposes. + return EXIT_SUCCESS; + } + + std::string uri = argv[1]; + std::string format_name = argv[2]; + std::string mode = argc > 3 ? argv[3] : "no_filter"; + std::string root_path; + auto fs = fs::FileSystemFromUri(uri, &root_path).ValueOrDie(); + + std::string base_path; + std::shared_ptr format; + if (format_name == "feather") { + format = std::make_shared(); + base_path = CreateExampleFeatherDataset(fs, root_path); + } else if (format_name == "parquet") { + format = std::make_shared(); + base_path = CreateExampleParquetDataset(fs, root_path); + } else if (format_name == "parquet_hive") { + format = std::make_shared(); + base_path = CreateExampleParquetHivePartitionedDataset(fs, root_path); + } else { + std::cerr << "Unknown format: " << format_name << std::endl; + std::cerr << "Supported formats: feather, parquet, parquet_hive" << std::endl; + return EXIT_FAILURE; + } + + std::shared_ptr table; + if (mode == "no_filter") { + table = ScanWholeDataset(fs, format, base_path); + } else if (mode == "filter") { + table = FilterAndSelectDataset(fs, format, base_path); + } else if (mode == "project") { + table = ProjectDataset(fs, format, base_path); + } else if (mode == "select_project") { + table = SelectAndProjectDataset(fs, format, base_path); + } else if (mode == "partitioned") { + table = ScanPartitionedDataset(fs, format, base_path); + } else if (mode == "filter_partitioned") { + table = FilterPartitionedDataset(fs, format, base_path); + } else { + std::cerr << "Unknown mode: " << mode << std::endl; + std::cerr + << "Supported modes: no_filter, filter, project, select_project, partitioned" + << std::endl; + return EXIT_FAILURE; + 
} + std::cout << "Read " << table->num_rows() << " rows" << std::endl; + std::cout << table->ToString() << std::endl; + return EXIT_SUCCESS; +} diff --git a/cpp/examples/arrow/dataset_parquet_scan_example.cc b/cpp/examples/arrow/dataset_parquet_scan_example.cc index 197ca5aa4c6..cd9b89fe380 100644 --- a/cpp/examples/arrow/dataset_parquet_scan_example.cc +++ b/cpp/examples/arrow/dataset_parquet_scan_example.cc @@ -16,9 +16,9 @@ // under the License. #include +#include #include #include -#include #include #include #include @@ -37,6 +37,8 @@ namespace fs = arrow::fs; namespace ds = arrow::dataset; +namespace cp = arrow::compute; + #define ABORT_ON_FAILURE(expr) \ do { \ arrow::Status status_ = (expr); \ @@ -60,8 +62,8 @@ struct Configuration { // Indicates the filter by which rows will be filtered. This optimization can // make use of partition information and/or file metadata if possible. - ds::Expression filter = - ds::greater(ds::field_ref("total_amount"), ds::literal(1000.0f)); + cp::Expression filter = + cp::greater(cp::field_ref("total_amount"), cp::literal(1000.0f)); ds::InspectOptions inspect_options{}; ds::FinishOptions finish_options{}; @@ -146,7 +148,7 @@ std::shared_ptr GetDatasetFromPath( std::shared_ptr GetScannerFromDataset(std::shared_ptr dataset, std::vector columns, - ds::Expression filter, + cp::Expression filter, bool use_threads) { auto scanner_builder = dataset->NewScan().ValueOrDie(); diff --git a/cpp/examples/minimal_build/example.cc b/cpp/examples/minimal_build/example.cc index e1b5c123a85..2ca163155ee 100644 --- a/cpp/examples/minimal_build/example.cc +++ b/cpp/examples/minimal_build/example.cc @@ -38,8 +38,7 @@ Status RunMain(int argc, char** argv) { arrow::io::ReadableFile::Open(csv_filename)); ARROW_ASSIGN_OR_RAISE( auto csv_reader, - arrow::csv::TableReader::Make(arrow::default_memory_pool(), - arrow::io::default_io_context(), + arrow::csv::TableReader::Make(arrow::io::default_io_context(), input_file, arrow::csv::ReadOptions::Defaults(), 
arrow::csv::ParseOptions::Defaults(), diff --git a/cpp/examples/minimal_build/minimal.dockerfile b/cpp/examples/minimal_build/minimal.dockerfile index 95f73e9a549..9361fc5e81d 100644 --- a/cpp/examples/minimal_build/minimal.dockerfile +++ b/cpp/examples/minimal_build/minimal.dockerfile @@ -22,5 +22,6 @@ ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ build-essential \ - cmake && \ + cmake \ + pkg-config && \ apt-get clean && rm -rf /var/lib/apt/lists* diff --git a/cpp/examples/minimal_build/run_static.sh b/cpp/examples/minimal_build/run_static.sh index 05804a0366c..ff3bb894570 100755 --- a/cpp/examples/minimal_build/run_static.sh +++ b/cpp/examples/minimal_build/run_static.sh @@ -67,10 +67,12 @@ popd echo echo "==" +echo "== CMake:" echo "== Building example project using Arrow C++ library" echo "==" echo +rm -rf $EXAMPLE_BUILD_DIR mkdir -p $EXAMPLE_BUILD_DIR pushd $EXAMPLE_BUILD_DIR @@ -81,10 +83,39 @@ popd echo echo "==" +echo "== CMake:" echo "== Running example project" echo "==" echo pushd $EXAMPLE_DIR -${EXAMPLE_BUILD_DIR}/arrow_example +$EXAMPLE_BUILD_DIR/arrow_example + +echo +echo "==" +echo "== pkg-config" +echo "== Building example project using Arrow C++ library" +echo "==" +echo + +rm -rf $EXAMPLE_BUILD_DIR +mkdir -p $EXAMPLE_BUILD_DIR +${CXX:-c++} \ + -o $EXAMPLE_BUILD_DIR/arrow_example \ + $EXAMPLE_DIR/example.cc \ + $(PKG_CONFIG_PATH=$ARROW_BUILD_DIR/lib/pkgconfig \ + pkg-config --cflags --libs --static arrow) + +popd + +echo +echo "==" +echo "== pkg-config:" +echo "== Running example project" +echo "==" +echo + +pushd $EXAMPLE_DIR + +$EXAMPLE_BUILD_DIR/arrow_example diff --git a/cpp/examples/minimal_build/system_dependency.dockerfile b/cpp/examples/minimal_build/system_dependency.dockerfile index f0b29cef990..926fcaf6f4b 100644 --- a/cpp/examples/minimal_build/system_dependency.dockerfile +++ b/cpp/examples/minimal_build/system_dependency.dockerfile @@ -37,6 +37,7 @@ RUN apt-get 
update -y -q && \ libthrift-dev \ libutf8proc-dev \ libzstd-dev \ + pkg-config \ protobuf-compiler \ rapidjson-dev \ zlib1g-dev && \ diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index df72dcc5b6b..f13e5b1ef75 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -119,6 +119,22 @@ function(ADD_ARROW_BENCHMARK REL_TEST_NAME) ${ARG_UNPARSED_ARGUMENTS}) endfunction() +macro(append_avx2_src SRC) + if(ARROW_HAVE_RUNTIME_AVX2) + list(APPEND ARROW_SRCS ${SRC}) + set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) + set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS ${ARROW_AVX2_FLAG}) + endif() +endmacro() + +macro(append_avx512_src SRC) + if(ARROW_HAVE_RUNTIME_AVX512) + list(APPEND ARROW_SRCS ${SRC}) + set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) + set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS ${ARROW_AVX512_FLAG}) + endif() +endmacro() + set(ARROW_SRCS array/array_base.cc array/array_binary.cc @@ -172,6 +188,7 @@ set(ARROW_SRCS io/interfaces.cc io/memory.cc io/slow.cc + io/stdio.cc io/transform.cc util/basic_decimal.cc util/bit_block_counter.cc @@ -201,6 +218,7 @@ set(ARROW_SRCS util/thread_pool.cc util/time.cc util/trie.cc + util/unreachable.cc util/uri.cc util/utf8.cc util/value_parsing.cc @@ -215,18 +233,11 @@ set(ARROW_SRCS vendored/double-conversion/diy-fp.cc vendored/double-conversion/strtod.cc) -if(ARROW_HAVE_RUNTIME_AVX2) - list(APPEND ARROW_SRCS util/bpacking_avx2.cc) - set_source_files_properties(util/bpacking_avx2.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON) - set_source_files_properties(util/bpacking_avx2.cc PROPERTIES COMPILE_FLAGS - ${ARROW_AVX2_FLAG}) -endif() -if(ARROW_HAVE_RUNTIME_AVX512) - list(APPEND ARROW_SRCS util/bpacking_avx512.cc) - set_source_files_properties(util/bpacking_avx512.cc PROPERTIES SKIP_PRECOMPILE_HEADERS - ON) - set_source_files_properties(util/bpacking_avx512.cc PROPERTIES COMPILE_FLAGS - ${ARROW_AVX512_FLAG}) 
+append_avx2_src(util/bpacking_avx2.cc) +append_avx512_src(util/bpacking_avx512.cc) + +if(ARROW_HAVE_NEON) + list(APPEND ARROW_SRCS util/bpacking_neon.cc) endif() if(APPLE) @@ -252,11 +263,8 @@ set(ARROW_C_SRCS vendored/uriparser/UriShorten.c) set_source_files_properties(vendored/datetime/tz.cpp - PROPERTIES - SKIP_PRECOMPILE_HEADERS - ON - SKIP_UNITY_BUILD_INCLUSION - ON) + PROPERTIES SKIP_PRECOMPILE_HEADERS ON + SKIP_UNITY_BUILD_INCLUSION ON) # Disable DLL exports in vendored uriparser library add_definitions(-DURI_STATIC_BUILD) @@ -317,16 +325,12 @@ endif() if(_allocator_dependencies) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set_source_files_properties(memory_pool.cc PROPERTIES OBJECT_DEPENDS - "${_allocator_dependencies}") + "${_allocator_dependencies}") else() add_dependencies(arrow_dependencies ${_allocator_dependencies}) endif() - set_source_files_properties(memory_pool.cc - PROPERTIES - SKIP_PRECOMPILE_HEADERS - ON - SKIP_UNITY_BUILD_INCLUSION - ON) + set_source_files_properties(memory_pool.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON + SKIP_UNITY_BUILD_INCLUSION ON) endif() unset(_allocator_dependencies) @@ -342,14 +346,15 @@ endif() # if(ARROW_CSV) - list(APPEND ARROW_SRCS - csv/converter.cc - csv/chunker.cc - csv/column_builder.cc - csv/column_decoder.cc - csv/options.cc - csv/parser.cc - csv/reader.cc) + list(APPEND + ARROW_SRCS + csv/converter.cc + csv/chunker.cc + csv/column_builder.cc + csv/column_decoder.cc + csv/options.cc + csv/parser.cc + csv/reader.cc) if(ARROW_COMPUTE) list(APPEND ARROW_SRCS csv/writer.cc) endif() @@ -358,56 +363,65 @@ if(ARROW_CSV) endif() if(ARROW_COMPUTE) - list(APPEND ARROW_SRCS - compute/api_aggregate.cc - compute/api_scalar.cc - compute/api_vector.cc - compute/cast.cc - compute/exec.cc - compute/function.cc - compute/kernel.cc - compute/registry.cc - compute/kernels/aggregate_basic.cc - compute/kernels/aggregate_mode.cc - compute/kernels/aggregate_quantile.cc - compute/kernels/aggregate_tdigest.cc - 
compute/kernels/aggregate_var_std.cc - compute/kernels/codegen_internal.cc - compute/kernels/hash_aggregate.cc - compute/kernels/scalar_arithmetic.cc - compute/kernels/scalar_boolean.cc - compute/kernels/scalar_cast_boolean.cc - compute/kernels/scalar_cast_internal.cc - compute/kernels/scalar_cast_nested.cc - compute/kernels/scalar_cast_numeric.cc - compute/kernels/scalar_cast_string.cc - compute/kernels/scalar_cast_temporal.cc - compute/kernels/scalar_compare.cc - compute/kernels/scalar_nested.cc - compute/kernels/scalar_set_lookup.cc - compute/kernels/scalar_string.cc - compute/kernels/scalar_validity.cc - compute/kernels/scalar_fill_null.cc - compute/kernels/util_internal.cc - compute/kernels/vector_hash.cc - compute/kernels/vector_nested.cc - compute/kernels/vector_selection.cc - compute/kernels/vector_sort.cc) - - if(ARROW_HAVE_RUNTIME_AVX2) - list(APPEND ARROW_SRCS compute/kernels/aggregate_basic_avx2.cc) - set_source_files_properties(compute/kernels/aggregate_basic_avx2.cc PROPERTIES - SKIP_PRECOMPILE_HEADERS ON) - set_source_files_properties(compute/kernels/aggregate_basic_avx2.cc PROPERTIES - COMPILE_FLAGS ${ARROW_AVX2_FLAG}) - endif() - if(ARROW_HAVE_RUNTIME_AVX512) - list(APPEND ARROW_SRCS compute/kernels/aggregate_basic_avx512.cc) - set_source_files_properties(compute/kernels/aggregate_basic_avx512.cc PROPERTIES - SKIP_PRECOMPILE_HEADERS ON) - set_source_files_properties(compute/kernels/aggregate_basic_avx512.cc PROPERTIES - COMPILE_FLAGS ${ARROW_AVX512_FLAG}) - endif() + list(APPEND + ARROW_SRCS + compute/api_aggregate.cc + compute/api_scalar.cc + compute/api_vector.cc + compute/cast.cc + compute/exec.cc + compute/exec/exec_plan.cc + compute/exec/expression.cc + compute/function.cc + compute/function_internal.cc + compute/kernel.cc + compute/registry.cc + compute/kernels/aggregate_basic.cc + compute/kernels/aggregate_mode.cc + compute/kernels/aggregate_quantile.cc + compute/kernels/aggregate_tdigest.cc + compute/kernels/aggregate_var_std.cc + 
compute/kernels/codegen_internal.cc + compute/kernels/hash_aggregate.cc + compute/kernels/scalar_arithmetic.cc + compute/kernels/scalar_boolean.cc + compute/kernels/scalar_cast_boolean.cc + compute/kernels/scalar_cast_dictionary.cc + compute/kernels/scalar_cast_internal.cc + compute/kernels/scalar_cast_nested.cc + compute/kernels/scalar_cast_numeric.cc + compute/kernels/scalar_cast_string.cc + compute/kernels/scalar_cast_temporal.cc + compute/kernels/scalar_compare.cc + compute/kernels/scalar_nested.cc + compute/kernels/scalar_set_lookup.cc + compute/kernels/scalar_string.cc + compute/kernels/scalar_temporal.cc + compute/kernels/scalar_validity.cc + compute/kernels/scalar_fill_null.cc + compute/kernels/scalar_if_else.cc + compute/kernels/util_internal.cc + compute/kernels/vector_hash.cc + compute/kernels/vector_nested.cc + compute/kernels/vector_replace.cc + compute/kernels/vector_selection.cc + compute/kernels/vector_sort.cc + compute/exec/key_hash.cc + compute/exec/key_map.cc + compute/exec/key_compare.cc + compute/exec/key_encode.cc + compute/exec/util.cc) + + append_avx2_src(compute/kernels/aggregate_basic_avx2.cc) + append_avx512_src(compute/kernels/aggregate_basic_avx512.cc) + + append_avx2_src(compute/exec/key_hash_avx2.cc) + append_avx2_src(compute/exec/key_map_avx2.cc) + append_avx2_src(compute/exec/key_compare_avx2.cc) + append_avx2_src(compute/exec/key_encode_avx2.cc) + append_avx2_src(compute/exec/util_avx2.cc) + + list(APPEND ARROW_TESTING_SRCS compute/exec/test_util.cc) endif() if(ARROW_FILESYSTEM) @@ -415,12 +429,13 @@ if(ARROW_FILESYSTEM) add_definitions(-DARROW_HDFS) endif() - list(APPEND ARROW_SRCS - filesystem/filesystem.cc - filesystem/localfs.cc - filesystem/mockfs.cc - filesystem/path_util.cc - filesystem/util_internal.cc) + list(APPEND + ARROW_SRCS + filesystem/filesystem.cc + filesystem/localfs.cc + filesystem/mockfs.cc + filesystem/path_util.cc + filesystem/util_internal.cc) if(ARROW_HDFS) list(APPEND ARROW_SRCS filesystem/hdfs.cc) @@ 
-428,25 +443,23 @@ if(ARROW_FILESYSTEM) if(ARROW_S3) list(APPEND ARROW_SRCS filesystem/s3fs.cc) set_source_files_properties(filesystem/s3fs.cc - PROPERTIES - SKIP_PRECOMPILE_HEADERS - ON - SKIP_UNITY_BUILD_INCLUSION - ON) + PROPERTIES SKIP_PRECOMPILE_HEADERS ON + SKIP_UNITY_BUILD_INCLUSION ON) endif() list(APPEND ARROW_TESTING_SRCS filesystem/test_util.cc) endif() if(ARROW_IPC) - list(APPEND ARROW_SRCS - ipc/dictionary.cc - ipc/feather.cc - ipc/message.cc - ipc/metadata_internal.cc - ipc/options.cc - ipc/reader.cc - ipc/writer.cc) + list(APPEND + ARROW_SRCS + ipc/dictionary.cc + ipc/feather.cc + ipc/message.cc + ipc/metadata_internal.cc + ipc/options.cc + ipc/reader.cc + ipc/writer.cc) if(ARROW_JSON) list(APPEND ARROW_SRCS ipc/json_simple.cc) @@ -454,25 +467,23 @@ if(ARROW_IPC) endif() if(ARROW_JSON) - list(APPEND ARROW_SRCS - json/options.cc - json/chunked_builder.cc - json/chunker.cc - json/converter.cc - json/object_parser.cc - json/object_writer.cc - json/parser.cc - json/reader.cc) + list(APPEND + ARROW_SRCS + json/options.cc + json/chunked_builder.cc + json/chunker.cc + json/converter.cc + json/object_parser.cc + json/object_writer.cc + json/parser.cc + json/reader.cc) endif() if(ARROW_ORC) list(APPEND ARROW_SRCS adapters/orc/adapter.cc adapters/orc/adapter_util.cc) endif() -if(NOT APPLE AND NOT MSVC_TOOLCHAIN) - # Localize thirdparty symbols using a linker version script. This hides them - # from the client application. The OS X linker does not support the - # version-script option. 
+if(CXX_LINKER_SUPPORTS_VERSION_SCRIPT) set(ARROW_VERSION_SCRIPT_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map") set(ARROW_SHARED_LINK_FLAGS ${ARROW_VERSION_SCRIPT_FLAGS}) @@ -480,6 +491,21 @@ endif() set(ARROW_ALL_SRCS ${ARROW_SRCS} ${ARROW_C_SRCS}) +if(ARROW_BUILD_STATIC AND ARROW_BUNDLED_STATIC_LIBS) + set(ARROW_BUILD_BUNDLED_DEPENDENCIES TRUE) +else() + set(ARROW_BUILD_BUNDLED_DEPENDENCIES FALSE) +endif() + +if(ARROW_BUILD_BUNDLED_DEPENDENCIES) + string(APPEND ARROW_PC_LIBS_PRIVATE " -larrow_bundled_dependencies") +endif() +# Need -latomic on Raspbian. +# See also: https://issues.apache.org/jira/browse/ARROW-12860 +if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux" AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + string(APPEND ARROW_PC_LIBS_PRIVATE " -latomic") +endif() + add_arrow_lib(arrow CMAKE_PACKAGE_NAME Arrow @@ -512,18 +538,21 @@ if(ARROW_BUILD_STATIC AND WIN32) target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) endif() +foreach(LIB_TARGET ${ARROW_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) +endforeach() + if(ARROW_WITH_BACKTRACE) find_package(Backtrace) foreach(LIB_TARGET ${ARROW_LIBRARIES}) - target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) if(Backtrace_FOUND AND ARROW_WITH_BACKTRACE) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_WITH_BACKTRACE) endif() endforeach() endif() -if(ARROW_BUILD_STATIC AND ARROW_BUNDLED_STATIC_LIBS) +if(ARROW_BUILD_BUNDLED_DEPENDENCIES) arrow_car(_FIRST_LIB ${ARROW_BUNDLED_STATIC_LIBS}) arrow_cdr(_OTHER_LIBS ${ARROW_BUNDLED_STATIC_LIBS}) create_merged_static_lib(arrow_bundled_dependencies @@ -610,12 +639,8 @@ add_arrow_test(misc_test add_arrow_test(public_api_test) -set_source_files_properties(public_api_test.cc - PROPERTIES - SKIP_PRECOMPILE_HEADERS - ON - SKIP_UNITY_BUILD_INCLUSION - ON) +set_source_files_properties(public_api_test.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON + SKIP_UNITY_BUILD_INCLUSION ON) add_arrow_test(scalar_test) 
add_arrow_test(type_test) diff --git a/cpp/src/arrow/adapters/orc/CMakeLists.txt b/cpp/src/arrow/adapters/orc/CMakeLists.txt index 516196c2eef..ca901b07dfd 100644 --- a/cpp/src/arrow/adapters/orc/CMakeLists.txt +++ b/cpp/src/arrow/adapters/orc/CMakeLists.txt @@ -53,9 +53,5 @@ add_arrow_test(adapter_test STATIC_LINK_LIBS ${ORC_STATIC_TEST_LINK_LIBS}) -set_source_files_properties(adapter_test.cc - PROPERTIES - SKIP_PRECOMPILE_HEADERS - ON - SKIP_UNITY_BUILD_INCLUSION - ON) +set_source_files_properties(adapter_test.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON + SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 2c61f8995de..2f74b40e40d 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -16,10 +16,10 @@ // under the License. #include "arrow/adapters/orc/adapter.h" -#include "arrow/adapters/orc/adapter_util.h" #include #include +#include #include #include #include @@ -27,6 +27,7 @@ #include #include +#include "arrow/adapters/orc/adapter_util.h" #include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/io/interfaces.h" @@ -44,20 +45,11 @@ #include "arrow/util/macros.h" #include "arrow/util/range.h" #include "arrow/util/visibility.h" - #include "orc/Exceptions.hh" -#include "orc/OrcFile.hh" // alias to not interfere with nested orc namespace namespace liborc = orc; -namespace arrow { - -using internal::checked_cast; - -namespace adapters { -namespace orc { - #define ORC_THROW_NOT_OK(s) \ do { \ Status _s = (s); \ @@ -77,6 +69,35 @@ namespace orc { ORC_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \ lhs, rexpr); +#define ORC_BEGIN_CATCH_NOT_OK try { +#define ORC_END_CATCH_NOT_OK \ + } \ + catch (const liborc::ParseError& e) { \ + return Status::IOError(e.what()); \ + } \ + catch (const liborc::InvalidArgument& e) { \ + return Status::Invalid(e.what()); \ + } \ + catch (const liborc::NotImplementedYet& e) { \ + 
return Status::NotImplemented(e.what()); \ + } + +#define ORC_CATCH_NOT_OK(_s) \ + ORC_BEGIN_CATCH_NOT_OK(_s); \ + ORC_END_CATCH_NOT_OK + +namespace arrow { +namespace adapters { +namespace orc { + +namespace { + +// The following are required by ORC to be uint64_t +constexpr uint64_t kOrcWriterBatchSize = 128 * 1024; +constexpr uint64_t kOrcNaturalWriteSize = 128 * 1024; + +using internal::checked_cast; + class ArrowInputFile : public liborc::InputStream { public: explicit ArrowInputFile(const std::shared_ptr& file) @@ -129,11 +150,7 @@ class OrcStripeReader : public RecordBatchReader { Status ReadNext(std::shared_ptr* out) override { std::unique_ptr batch; - try { - batch = row_reader_->createRowBatch(batch_size_); - } catch (const liborc::ParseError& e) { - return Status::Invalid(e.what()); - } + ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_)); const liborc::Type& type = row_reader_->getSelectedType(); if (!row_reader_->next(*batch)) { @@ -163,6 +180,8 @@ class OrcStripeReader : public RecordBatchReader { int64_t batch_size_; }; +} // namespace + class ORCFileReader::Impl { public: Impl() {} @@ -172,11 +191,7 @@ class ORCFileReader::Impl { std::unique_ptr io_wrapper(new ArrowInputFile(file)); liborc::ReaderOptions options; std::unique_ptr liborc_reader; - try { - liborc_reader = createReader(std::move(io_wrapper), options); - } catch (const liborc::ParseError& e) { - return Status::IOError(e.what()); - } + ORC_CATCH_NOT_OK(liborc_reader = createReader(std::move(io_wrapper), options)); pool_ = pool; reader_ = std::move(liborc_reader); current_row_ = 0; @@ -209,15 +224,20 @@ class ORCFileReader::Impl { Status ReadSchema(const liborc::RowReaderOptions& opts, std::shared_ptr* out) { std::unique_ptr row_reader; - try { - row_reader = reader_->createRowReader(opts); - } catch (const liborc::ParseError& e) { - return Status::Invalid(e.what()); - } + ORC_CATCH_NOT_OK(row_reader = reader_->createRowReader(opts)); const liborc::Type& type = 
row_reader->getSelectedType(); return GetArrowSchema(type, out); } + Result> ReadMetadata() { + const std::list keys = reader_->getMetadataKeys(); + auto metadata = std::make_shared(); + for (const auto& key : keys) { + metadata->Append(key, reader_->getMetadataValue(key)); + } + return std::const_pointer_cast(metadata); + } + Status GetArrowSchema(const liborc::Type& type, std::shared_ptr* out) { if (type.getKind() != liborc::STRUCT) { return Status::NotImplemented( @@ -232,16 +252,8 @@ class ORCFileReader::Impl { std::string name = type.getFieldName(child); fields.push_back(field(name, elemtype)); } - std::list keys = reader_->getMetadataKeys(); - std::shared_ptr metadata; - if (!keys.empty()) { - metadata = std::make_shared(); - for (auto it = keys.begin(); it != keys.end(); ++it) { - metadata->Append(*it, reader_->getMetadataValue(*it)); - } - } - - *out = std::make_shared(fields, metadata); + ARROW_ASSIGN_OR_RAISE(auto metadata, ReadMetadata()); + *out = std::make_shared(std::move(fields), std::move(metadata)); return Status::OK(); } @@ -342,12 +354,12 @@ class ORCFileReader::Impl { std::shared_ptr* out) { std::unique_ptr row_reader; std::unique_ptr batch; - try { - row_reader = reader_->createRowReader(opts); - batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch)); - } catch (const liborc::ParseError& e) { - return Status::Invalid(e.what()); - } + + ORC_BEGIN_CATCH_NOT_OK + row_reader = reader_->createRowReader(opts); + batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch)); + ORC_END_CATCH_NOT_OK + std::unique_ptr builder; RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder)); @@ -389,13 +401,12 @@ class ORCFileReader::Impl { std::shared_ptr schema; RETURN_NOT_OK(ReadSchema(opts, &schema)); std::unique_ptr row_reader; - try { - row_reader = reader_->createRowReader(opts); - row_reader->seekToRow(current_row_); - current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows; - } catch (const 
liborc::ParseError& e) { - return Status::Invalid(e.what()); - } + + ORC_BEGIN_CATCH_NOT_OK + row_reader = reader_->createRowReader(opts); + row_reader->seekToRow(current_row_); + current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows; + ORC_END_CATCH_NOT_OK *out = std::shared_ptr( new OrcStripeReader(std::move(row_reader), schema, batch_size, pool_)); @@ -425,6 +436,10 @@ Status ORCFileReader::Open(const std::shared_ptr& file, return Status::OK(); } +Result> ORCFileReader::ReadMetadata() { + return impl_->ReadMetadata(); +} + Status ORCFileReader::ReadSchema(std::shared_ptr* out) { return impl_->ReadSchema(out); } @@ -473,6 +488,108 @@ int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); } int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); } +namespace { + +class ArrowOutputStream : public liborc::OutputStream { + public: + explicit ArrowOutputStream(arrow::io::OutputStream& output_stream) + : output_stream_(output_stream), length_(0) {} + + uint64_t getLength() const override { return length_; } + + uint64_t getNaturalWriteSize() const override { return kOrcNaturalWriteSize; } + + void write(const void* buf, size_t length) override { + ORC_THROW_NOT_OK(output_stream_.Write(buf, static_cast(length))); + length_ += static_cast(length); + } + + // Mandatory due to us implementing an ORC virtual class. 
+ // Used by ORC for error messages, not used by Arrow + const std::string& getName() const override { + static const std::string filename("ArrowOutputFile"); + return filename; + } + + void close() override { + if (!output_stream_.closed()) { + ORC_THROW_NOT_OK(output_stream_.Close()); + } + } + + void set_length(int64_t length) { length_ = length; } + + private: + arrow::io::OutputStream& output_stream_; + int64_t length_; +}; + +} // namespace + +class ORCFileWriter::Impl { + public: + Status Open(arrow::io::OutputStream* output_stream) { + out_stream_ = std::unique_ptr( + checked_cast(new ArrowOutputStream(*output_stream))); + return Status::OK(); + } + + Status Write(const Table& table) { + std::unique_ptr orc_options = + std::unique_ptr(new liborc::WriterOptions()); + ARROW_ASSIGN_OR_RAISE(auto orc_schema, GetOrcType(*(table.schema()))); + ORC_CATCH_NOT_OK( + writer_ = liborc::createWriter(*orc_schema, out_stream_.get(), *orc_options)) + + int64_t num_rows = table.num_rows(); + const int num_cols_ = table.num_columns(); + std::vector arrow_index_offset(num_cols_, 0); + std::vector arrow_chunk_offset(num_cols_, 0); + std::unique_ptr batch = + writer_->createRowBatch(kOrcWriterBatchSize); + liborc::StructVectorBatch* root = + internal::checked_cast(batch.get()); + while (num_rows > 0) { + for (int i = 0; i < num_cols_; i++) { + RETURN_NOT_OK(adapters::orc::WriteBatch( + *(table.column(i)), kOrcWriterBatchSize, &(arrow_chunk_offset[i]), + &(arrow_index_offset[i]), (root->fields)[i])); + } + root->numElements = (root->fields)[0]->numElements; + writer_->add(*batch); + batch->clear(); + num_rows -= kOrcWriterBatchSize; + } + return Status::OK(); + } + + Status Close() { + writer_->close(); + return Status::OK(); + } + + private: + std::unique_ptr writer_; + std::unique_ptr out_stream_; +}; + +ORCFileWriter::~ORCFileWriter() {} + +ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); } + +Result> ORCFileWriter::Open( + io::OutputStream* 
output_stream) { + std::unique_ptr result = + std::unique_ptr(new ORCFileWriter()); + Status status = result->impl_->Open(output_stream); + RETURN_NOT_OK(status); + return std::move(result); +} + +Status ORCFileWriter::Write(const Table& table) { return impl_->Write(table); } + +Status ORCFileWriter::Close() { return impl_->Close(); } + } // namespace orc } // namespace adapters } // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h index 9bf18674af4..012c1701980 100644 --- a/cpp/src/arrow/adapters/orc/adapter.h +++ b/cpp/src/arrow/adapters/orc/adapter.h @@ -26,12 +26,11 @@ #include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/util/visibility.h" namespace arrow { - namespace adapters { - namespace orc { /// \class ORCFileReader @@ -49,6 +48,11 @@ class ARROW_EXPORT ORCFileReader { static Status Open(const std::shared_ptr& file, MemoryPool* pool, std::unique_ptr* reader); + /// \brief Return the metadata read from the ORC file + /// + /// \return A KeyValueMetadata object containing the ORC metadata + Result> ReadMetadata(); + /// \brief Return the schema read from the ORC file /// /// \param[out] out the returned Schema object @@ -142,8 +146,36 @@ class ARROW_EXPORT ORCFileReader { ORCFileReader(); }; -} // namespace orc +/// \class ORCFileWriter +/// \brief Write an Arrow Table or RecordBatch to an ORC file. +class ARROW_EXPORT ORCFileWriter { + public: + ~ORCFileWriter(); + /// \brief Creates a new ORC writer. 
+ /// + /// \param[in] output_stream a pointer to the io::OutputStream to write into + /// \return the returned writer object + static Result> Open(io::OutputStream* output_stream); -} // namespace adapters + /// \brief Write a table + /// + /// \param[in] table the Arrow table from which data is extracted + /// \return Status + Status Write(const Table& table); + + /// \brief Close an ORC writer (orc::Writer) + /// + /// \return Status + Status Close(); + + private: + class Impl; + std::unique_ptr impl_; + private: + ORCFileWriter(); +}; + +} // namespace orc +} // namespace adapters } // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc index 09e47fb7626..9f7fb561362 100644 --- a/cpp/src/arrow/adapters/orc/adapter_test.cc +++ b/cpp/src/arrow/adapters/orc/adapter_test.cc @@ -15,20 +15,47 @@ // specific language governing permissions and limitations // under the License. -#include - #include "arrow/adapters/orc/adapter.h" -#include "arrow/array.h" -#include "arrow/io/api.h" #include + #include +#include + +#include "arrow/adapters/orc/adapter_util.h" +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/buffer_builder.h" +#include "arrow/chunked_array.h" +#include "arrow/compute/cast.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/util/decimal.h" +#include "arrow/util/key_value_metadata.h" namespace liborc = orc; namespace arrow { -constexpr int DEFAULT_MEM_STREAM_SIZE = 100 * 1024 * 1024; +using internal::checked_pointer_cast; + +constexpr int kDefaultSmallMemStreamSize = 16384 * 5; // 80KB +constexpr int kDefaultMemStreamSize = 10 * 1024 * 1024; +constexpr int64_t kNanoMax = std::numeric_limits::max(); +constexpr int64_t kNanoMin = std::numeric_limits::lowest(); +const int64_t kMicroMax = 
std::floor(kNanoMax / 1000); +const int64_t kMicroMin = std::ceil(kNanoMin / 1000); +const int64_t kMilliMax = std::floor(kMicroMax / 1000); +const int64_t kMilliMin = std::ceil(kMicroMin / 1000); +const int64_t kSecondMax = std::floor(kMilliMax / 1000); +const int64_t kSecondMin = std::ceil(kMilliMin / 1000); + +static constexpr random::SeedType kRandomSeed = 0x0ff1ce; class MemoryOutputStream : public liborc::OutputStream { public: @@ -58,6 +85,189 @@ class MemoryOutputStream : public liborc::OutputStream { uint64_t length_, natural_write_size_; }; +std::shared_ptr GenerateFixedDifferenceBuffer(int32_t fixed_length, + int64_t length) { + BufferBuilder builder; + int32_t offsets[length]; + ARROW_EXPECT_OK(builder.Resize(4 * length)); + for (int32_t i = 0; i < length; i++) { + offsets[i] = fixed_length * i; + } + ARROW_EXPECT_OK(builder.Append(offsets, 4 * length)); + std::shared_ptr buffer; + ARROW_EXPECT_OK(builder.Finish(&buffer)); + return buffer; +} + +std::shared_ptr CastFixedSizeBinaryArrayToBinaryArray( + std::shared_ptr array) { + auto fixed_size_binary_array = checked_pointer_cast(array); + std::shared_ptr value_offsets = GenerateFixedDifferenceBuffer( + fixed_size_binary_array->byte_width(), array->length() + 1); + return std::make_shared(array->length(), value_offsets, + array->data()->buffers[1], + array->data()->buffers[0]); +} + +template +std::shared_ptr CastInt64ArrayToTemporalArray( + const std::shared_ptr& type, std::shared_ptr array) { + std::shared_ptr new_array_data = + ArrayData::Make(type, array->length(), array->data()->buffers); + return std::make_shared(new_array_data); +} + +Result> GenerateRandomDate64Array(int64_t size, + double null_probability) { + arrow::random::RandomArrayGenerator rand(kRandomSeed); + return CastInt64ArrayToTemporalArray( + date64(), rand.Int64(size, kMilliMin, kMilliMax, null_probability)); +} + +Result> GenerateRandomTimestampArray(int64_t size, + arrow::TimeUnit::type type, + double null_probability) { + 
arrow::random::RandomArrayGenerator rand(kRandomSeed); + switch (type) { + case arrow::TimeUnit::type::SECOND: { + return CastInt64ArrayToTemporalArray( + timestamp(TimeUnit::SECOND), + rand.Int64(size, kSecondMin, kSecondMax, null_probability)); + } + case arrow::TimeUnit::type::MILLI: { + return CastInt64ArrayToTemporalArray( + timestamp(TimeUnit::MILLI), + rand.Int64(size, kMilliMin, kMilliMax, null_probability)); + } + case arrow::TimeUnit::type::MICRO: { + return CastInt64ArrayToTemporalArray( + timestamp(TimeUnit::MICRO), + rand.Int64(size, kMicroMin, kMicroMax, null_probability)); + } + case arrow::TimeUnit::type::NANO: { + return CastInt64ArrayToTemporalArray( + timestamp(TimeUnit::NANO), + rand.Int64(size, kNanoMin, kNanoMax, null_probability)); + } + default: { + return arrow::Status::TypeError("Unknown or unsupported Arrow TimeUnit: ", type); + } + } +} + +/// \brief Construct a random weak composition of a nonnegative integer +/// i.e. a way of writing it as the sum of a sequence of n non-negative +/// integers. 
+/// +/// \param[in] n the number of integers in the weak composition +/// \param[in] sum the integer of which a random weak composition is generated +/// \param[out] out The generated weak composition +template +void RandWeakComposition(int64_t n, T sum, std::vector* out) { + const int random_seed = 0; + std::default_random_engine gen(random_seed); + out->resize(n, static_cast(0)); + T remaining_sum = sum; + std::generate(out->begin(), out->end() - 1, [&gen, &remaining_sum] { + std::uniform_int_distribution d(static_cast(0), remaining_sum); + auto res = d(gen); + remaining_sum -= res; + return static_cast(res); + }); + (*out)[n - 1] += remaining_sum; + std::random_shuffle(out->begin(), out->end()); +} + +std::shared_ptr GenerateRandomChunkedArray( + const std::shared_ptr& data_type, int64_t size, int64_t min_num_chunks, + int64_t max_num_chunks, double null_probability) { + arrow::random::RandomArrayGenerator rand(kRandomSeed); + std::vector num_chunks(1, 0); + std::vector current_size_chunks; + arrow::randint(1, min_num_chunks, max_num_chunks, &num_chunks); + int64_t current_num_chunks = num_chunks[0]; + ArrayVector arrays(current_num_chunks, nullptr); + arrow::RandWeakComposition(current_num_chunks, size, ¤t_size_chunks); + for (int j = 0; j < current_num_chunks; j++) { + switch (data_type->id()) { + case arrow::Type::type::DATE64: { + EXPECT_OK_AND_ASSIGN(arrays[j], GenerateRandomDate64Array(current_size_chunks[j], + null_probability)); + break; + } + case arrow::Type::type::TIMESTAMP: { + EXPECT_OK_AND_ASSIGN( + arrays[j], + GenerateRandomTimestampArray( + current_size_chunks[j], + arrow::internal::checked_pointer_cast(data_type) + ->unit(), + null_probability)); + break; + } + default: + arrays[j] = rand.ArrayOf(data_type, current_size_chunks[j], null_probability); + } + } + return std::make_shared(arrays); +} + +std::shared_ptr GenerateRandomTable(const std::shared_ptr& schema, + int64_t size, int64_t min_num_chunks, + int64_t max_num_chunks, + double 
null_probability) { + int num_cols = schema->num_fields(); + ChunkedArrayVector cv; + for (int col = 0; col < num_cols; col++) { + cv.push_back(GenerateRandomChunkedArray(schema->field(col)->type(), size, + min_num_chunks, max_num_chunks, + null_probability)); + } + return Table::Make(schema, cv); +} + +void AssertTableWriteReadEqual(const std::shared_ptr
& input_table, + const std::shared_ptr
& expected_output_table, + const int64_t max_size = kDefaultSmallMemStreamSize) { + EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, + io::BufferOutputStream::Create(max_size)); + EXPECT_OK_AND_ASSIGN(auto writer, + adapters::orc::ORCFileWriter::Open(buffer_output_stream.get())); + ARROW_EXPECT_OK(writer->Write(*input_table)); + ARROW_EXPECT_OK(writer->Close()); + EXPECT_OK_AND_ASSIGN(auto buffer, buffer_output_stream->Finish()); + std::shared_ptr in_stream(new io::BufferReader(buffer)); + std::unique_ptr reader; + ARROW_EXPECT_OK( + adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), &reader)); + std::shared_ptr
actual_output_table; + ARROW_EXPECT_OK(reader->Read(&actual_output_table)); + AssertTablesEqual(*expected_output_table, *actual_output_table, false, false); +} +void AssertArrayWriteReadEqual(const std::shared_ptr& input_array, + const std::shared_ptr& expected_output_array, + const int64_t max_size = kDefaultSmallMemStreamSize) { + std::shared_ptr input_schema = schema({field("col0", input_array->type())}), + output_schema = + schema({field("col0", expected_output_array->type())}); + auto input_chunked_array = std::make_shared(input_array), + expected_output_chunked_array = + std::make_shared(expected_output_array); + std::shared_ptr
input_table = Table::Make(input_schema, {input_chunked_array}), + expected_output_table = + Table::Make(output_schema, {expected_output_chunked_array}); + AssertTableWriteReadEqual(input_table, expected_output_table, max_size); +} + +void SchemaORCWriteReadTest(const std::shared_ptr& schema, int64_t size, + int64_t min_num_chunks, int64_t max_num_chunks, + double null_probability, + int64_t max_size = kDefaultSmallMemStreamSize) { + const std::shared_ptr
table = + GenerateRandomTable(schema, size, min_num_chunks, max_num_chunks, null_probability); + AssertTableWriteReadEqual(table, table, max_size); +} + std::unique_ptr CreateWriter(uint64_t stripe_size, const liborc::Type& type, liborc::OutputStream* stream) { @@ -69,32 +279,34 @@ std::unique_ptr CreateWriter(uint64_t stripe_size, return liborc::createWriter(type, stream, options); } -TEST(TestAdapter, readIntAndStringFileMultipleStripes) { - MemoryOutputStream mem_stream(DEFAULT_MEM_STREAM_SIZE); +TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) { + MemoryOutputStream mem_stream(kDefaultMemStreamSize); ORC_UNIQUE_PTR type( liborc::Type::buildTypeFromString("struct")); constexpr uint64_t stripe_size = 1024; // 1K constexpr uint64_t stripe_count = 10; - constexpr uint64_t stripe_row_count = 65535; + constexpr uint64_t stripe_row_count = 16384; constexpr uint64_t reader_batch_size = 1024; auto writer = CreateWriter(stripe_size, *type, &mem_stream); auto batch = writer->createRowBatch(stripe_row_count); - auto struct_batch = dynamic_cast(batch.get()); - auto long_batch = dynamic_cast(struct_batch->fields[0]); - auto str_batch = dynamic_cast(struct_batch->fields[1]); + auto struct_batch = internal::checked_cast(batch.get()); + auto long_batch = + internal::checked_cast(struct_batch->fields[0]); + auto str_batch = + internal::checked_cast(struct_batch->fields[1]); int64_t accumulated = 0; for (uint64_t j = 0; j < stripe_count; ++j) { - char data_buffer[327675]; + std::string data_buffer(stripe_row_count * 5, '\0'); uint64_t offset = 0; for (uint64_t i = 0; i < stripe_row_count; ++i) { std::string str_data = std::to_string(accumulated % stripe_row_count); long_batch->data[i] = static_cast(accumulated % stripe_row_count); - str_batch->data[i] = data_buffer + offset; + str_batch->data[i] = &data_buffer[offset]; str_batch->length[i] = static_cast(str_data.size()); - memcpy(data_buffer + offset, str_data.c_str(), str_data.size()); + memcpy(&data_buffer[offset], 
str_data.c_str(), str_data.size()); accumulated++; offset += str_data.size(); } @@ -115,6 +327,10 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) { ASSERT_TRUE( adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool(), &reader).ok()); + EXPECT_OK_AND_ASSIGN(auto metadata, reader->ReadMetadata()); + auto expected_metadata = std::const_pointer_cast( + key_value_metadata(std::vector(), std::vector())); + ASSERT_TRUE(metadata->Equals(*expected_metadata)); ASSERT_EQ(stripe_row_count * stripe_count, reader->NumberOfRows()); ASSERT_EQ(stripe_count, reader->NumberOfStripes()); accumulated = 0; @@ -124,8 +340,8 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) { std::shared_ptr record_batch; EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok()); while (record_batch) { - auto int32_array = std::dynamic_pointer_cast(record_batch->column(0)); - auto str_array = std::dynamic_pointer_cast(record_batch->column(1)); + auto int32_array = checked_pointer_cast(record_batch->column(0)); + auto str_array = checked_pointer_cast(record_batch->column(1)); for (int j = 0; j < record_batch->num_rows(); ++j) { EXPECT_EQ(accumulated % stripe_row_count, int32_array->Value(j)); EXPECT_EQ(std::to_string(accumulated % stripe_row_count), @@ -157,4 +373,317 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) { EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok()); } } + +// WriteORC tests +// Trivial + +class TestORCWriterTrivialNoConversion : public ::testing::Test { + public: + TestORCWriterTrivialNoConversion() { + table_schema = schema( + {field("bool", boolean()), field("int8", int8()), field("int16", int16()), + field("int32", int32()), field("int64", int64()), field("float", float32()), + field("double", float64()), field("decimal128nz", decimal128(25, 6)), + field("decimal128z", decimal128(32, 0)), field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()), + field("binary", binary()), + field("struct", 
struct_({field("a", utf8()), field("b", int64())})), + field("list", list(int32())), + field("lsl", list(struct_({field("lsl0", list(int32()))}))), + field("map", map(utf8(), utf8()))}); + } + + protected: + std::shared_ptr table_schema; +}; +TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunk) { + std::shared_ptr
table = TableFromJSON(table_schema, {R"([])"}); + AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16); +} +TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) { + std::shared_ptr
table = TableFromJSON(table_schema, {}); + AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16); +} +class TestORCWriterTrivialWithConversion : public ::testing::Test { + public: + TestORCWriterTrivialWithConversion() { + input_schema = schema( + {field("date64", date64()), field("ts0", timestamp(TimeUnit::SECOND)), + field("ts1", timestamp(TimeUnit::MILLI)), + field("ts2", timestamp(TimeUnit::MICRO)), field("large_string", large_utf8()), + field("large_binary", large_binary()), + field("fixed_size_binary0", fixed_size_binary(0)), + field("fixed_size_binary", fixed_size_binary(5)), + field("large_list", large_list(int32())), + field("fixed_size_list", fixed_size_list(int32(), 3))}), + output_schema = schema( + {field("date64", timestamp(TimeUnit::NANO)), + field("ts0", timestamp(TimeUnit::NANO)), field("ts1", timestamp(TimeUnit::NANO)), + field("ts2", timestamp(TimeUnit::NANO)), field("large_string", utf8()), + field("large_binary", binary()), field("fixed_size_binary0", binary()), + field("fixed_size_binary", binary()), field("large_list", list(int32())), + field("fixed_size_list", list(int32()))}); + } + + protected: + std::shared_ptr input_schema, output_schema; +}; +TEST_F(TestORCWriterTrivialWithConversion, writeTrivialChunk) { + std::shared_ptr
input_table = TableFromJSON(input_schema, {R"([])"}), + expected_output_table = TableFromJSON(output_schema, {R"([])"}); + AssertTableWriteReadEqual(input_table, expected_output_table, + kDefaultSmallMemStreamSize / 16); +} +TEST_F(TestORCWriterTrivialWithConversion, writeChunkless) { + std::shared_ptr
input_table = TableFromJSON(input_schema, {}), + expected_output_table = TableFromJSON(output_schema, {}); + AssertTableWriteReadEqual(input_table, expected_output_table, + kDefaultSmallMemStreamSize / 16); +} + +// General + +class TestORCWriterNoConversion : public ::testing::Test { + public: + TestORCWriterNoConversion() { + table_schema = schema( + {field("bool", boolean()), field("int8", int8()), field("int16", int16()), + field("int32", int32()), field("int64", int64()), field("float", float32()), + field("double", float64()), field("date32", date32()), + field("decimal64", decimal128(18, 4)), field("decimal64z", decimal128(18, 0)), + field("ts3", timestamp(TimeUnit::NANO)), field("string", utf8()), + field("binary", binary())}); + } + + protected: + std::shared_ptr table_schema; +}; +TEST_F(TestORCWriterNoConversion, writeNoNulls) { + SchemaORCWriteReadTest(table_schema, 11203, 5, 10, 0, kDefaultSmallMemStreamSize * 5); +} +TEST_F(TestORCWriterNoConversion, writeMixed) { + SchemaORCWriteReadTest(table_schema, 9405, 1, 20, 0.6, kDefaultSmallMemStreamSize * 5); +} +TEST_F(TestORCWriterNoConversion, writeAllNulls) { + SchemaORCWriteReadTest(table_schema, 4006, 1, 5, 1); +} + +// Converts +// Since Arrow has way more types than ORC type conversions are unavoidable +class TestORCWriterWithConversion : public ::testing::Test { + public: + TestORCWriterWithConversion() { + input_schema = schema( + {field("date64", date64()), field("ts0", timestamp(TimeUnit::SECOND)), + field("ts1", timestamp(TimeUnit::MILLI)), + field("ts2", timestamp(TimeUnit::MICRO)), field("large_string", large_utf8()), + field("large_binary", large_binary()), + field("fixed_size_binary0", fixed_size_binary(0)), + field("fixed_size_binary", fixed_size_binary(5))}), + output_schema = schema( + {field("date64", timestamp(TimeUnit::NANO)), + field("ts0", timestamp(TimeUnit::NANO)), field("ts1", timestamp(TimeUnit::NANO)), + field("ts2", timestamp(TimeUnit::NANO)), field("large_string", utf8()), + 
field("large_binary", binary()), field("fixed_size_binary0", binary()), + field("fixed_size_binary", binary())}); + } + void RunTest(int64_t num_rows, double null_possibility, + int64_t max_size = kDefaultSmallMemStreamSize) { + int64_t num_cols = (input_schema->fields()).size(); + std::shared_ptr
input_table = + GenerateRandomTable(input_schema, num_rows, 1, 1, null_possibility); + ArrayVector av(num_cols); + for (int i = 0; i < num_cols - 2; i++) { + EXPECT_OK_AND_ASSIGN(av[i], + arrow::compute::Cast(*(input_table->column(i)->chunk(0)), + output_schema->field(i)->type())); + } + for (int i = num_cols - 2; i < num_cols; i++) { + av[i] = CastFixedSizeBinaryArrayToBinaryArray(input_table->column(i)->chunk(0)); + } + std::shared_ptr
expected_output_table = Table::Make(output_schema, av); + AssertTableWriteReadEqual(input_table, expected_output_table, max_size); + } + + protected: + std::shared_ptr input_schema, output_schema; +}; +TEST_F(TestORCWriterWithConversion, writeAllNulls) { RunTest(12000, 1); } +TEST_F(TestORCWriterWithConversion, writeNoNulls) { RunTest(10009, 0); } +TEST_F(TestORCWriterWithConversion, writeMixed) { RunTest(8021, 0.5); } + +class TestORCWriterSingleArray : public ::testing::Test { + public: + TestORCWriterSingleArray() : rand(kRandomSeed) {} + + protected: + arrow::random::RandomArrayGenerator rand; +}; + +// Nested types +TEST_F(TestORCWriterSingleArray, WriteStruct) { + std::vector> subfields{field("int32", boolean())}; + const int64_t num_rows = 1234; + int num_subcols = subfields.size(); + ArrayVector av0(num_subcols); + for (int i = 0; i < num_subcols; i++) { + av0[i] = rand.ArrayOf(subfields[i]->type(), num_rows, 0.4); + } + std::shared_ptr bitmap = rand.NullBitmap(num_rows, 0.5); + std::shared_ptr array = + std::make_shared(struct_(subfields), num_rows, av0, bitmap); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 10); +} + +TEST_F(TestORCWriterSingleArray, WriteStructOfStruct) { + std::vector> subsubfields{ + field("bool", boolean()), + field("int8", int8()), + field("int16", int16()), + field("int32", int32()), + field("int64", int64()), + field("date32", date32()), + field("ts3", timestamp(TimeUnit::NANO)), + field("string", utf8()), + field("binary", binary())}; + const int64_t num_rows = 1234; + int num_subsubcols = subsubfields.size(); + ArrayVector av00(num_subsubcols), av0(1); + for (int i = 0; i < num_subsubcols; i++) { + av00[i] = rand.ArrayOf(subsubfields[i]->type(), num_rows, 0); + } + std::shared_ptr bitmap0 = rand.NullBitmap(num_rows, 0); + av0[0] = std::make_shared(struct_(subsubfields), num_rows, av00, bitmap0); + std::shared_ptr bitmap = rand.NullBitmap(num_rows, 0.2); + std::shared_ptr array = std::make_shared( + 
struct_({field("struct2", struct_(subsubfields))}), num_rows, av0, bitmap); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 10); +} + +TEST_F(TestORCWriterSingleArray, WriteList) { + const int64_t num_rows = 1234; + auto value_array = rand.ArrayOf(int32(), 125 * num_rows, 0); + std::shared_ptr array = rand.List(*value_array, num_rows, 1); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 100); +} + +TEST_F(TestORCWriterSingleArray, WriteLargeList) { + const int64_t num_rows = 1234; + auto value_array = rand.ArrayOf(int32(), 5 * num_rows, 0.5); + auto output_offsets = rand.Offsets(num_rows + 1, 0, 5 * num_rows, 0.6, false); + EXPECT_OK_AND_ASSIGN(auto input_offsets, + arrow::compute::Cast(*output_offsets, int64())); + EXPECT_OK_AND_ASSIGN(auto input_array, + arrow::LargeListArray::FromArrays(*input_offsets, *value_array)); + EXPECT_OK_AND_ASSIGN(auto output_array, + arrow::ListArray::FromArrays(*output_offsets, *value_array)); + AssertArrayWriteReadEqual(input_array, output_array, kDefaultSmallMemStreamSize * 10); +} + +TEST_F(TestORCWriterSingleArray, WriteFixedSizeList) { + const int64_t num_rows = 1234; + std::shared_ptr value_array = rand.ArrayOf(int32(), 3 * num_rows, 0.8); + std::shared_ptr bitmap = rand.NullBitmap(num_rows, 1); + std::shared_ptr buffer = GenerateFixedDifferenceBuffer(3, num_rows + 1); + std::shared_ptr input_array = std::make_shared( + fixed_size_list(int32(), 3), num_rows, value_array, bitmap), + output_array = std::make_shared( + list(int32()), num_rows, buffer, value_array, bitmap); + AssertArrayWriteReadEqual(input_array, output_array, kDefaultSmallMemStreamSize * 10); +} + +TEST_F(TestORCWriterSingleArray, WriteListOfList) { + const int64_t num_rows = 1234; + auto value_value_array = rand.ArrayOf(utf8(), 4 * num_rows, 0.5); + std::shared_ptr value_array = rand.List(*value_value_array, 2 * num_rows, 0.7); + std::shared_ptr array = rand.List(*value_array, num_rows, 0.4); + 
AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 10); +} + +TEST_F(TestORCWriterSingleArray, WriteListOfListOfList) { + const int64_t num_rows = 1234; + auto value3_array = rand.ArrayOf(int64(), 12 * num_rows, 0.1); + std::shared_ptr value2_array = rand.List(*value3_array, 5 * num_rows, 0); + std::shared_ptr value_array = rand.List(*value2_array, 2 * num_rows, 0.1); + std::shared_ptr array = rand.List(*value_array, num_rows, 0.1); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 35); +} + +TEST_F(TestORCWriterSingleArray, WriteListOfStruct) { + const int64_t num_rows = 1234, num_values = 3 * num_rows; + ArrayVector av00(1); + av00[0] = rand.ArrayOf(int32(), num_values, 0); + std::shared_ptr bitmap = rand.NullBitmap(num_values, 0.2); + std::shared_ptr value_array = std::make_shared( + struct_({field("a", int32())}), num_values, av00, bitmap); + std::shared_ptr array = rand.List(*value_array, num_rows, 0); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 30); +} + +TEST_F(TestORCWriterSingleArray, WriteStructOfList) { + const int64_t num_rows = 1234; + ArrayVector av0(1); + auto value_array = rand.ArrayOf(int32(), 5 * num_rows, 0.2); + av0[0] = rand.List(*value_array, num_rows, 0); + std::shared_ptr bitmap = rand.NullBitmap(num_rows, 0.2); + std::shared_ptr array = std::make_shared( + struct_({field("a", list(int32()))}), num_rows, av0, bitmap); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 20); +} + +TEST_F(TestORCWriterSingleArray, WriteMap) { + const int64_t num_rows = 1234; + auto key_array = rand.ArrayOf(int32(), 20 * num_rows, 0); + auto item_array = rand.ArrayOf(int32(), 20 * num_rows, 1); + std::shared_ptr array = rand.Map(key_array, item_array, num_rows, 0.1); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 50); +} + +TEST_F(TestORCWriterSingleArray, WriteStructOfMap) { + const int64_t num_rows = 1234, num_values = 5 * num_rows; + ArrayVector 
av0(1); + auto key_array = rand.ArrayOf(binary(), num_values, 0); + auto item_array = rand.ArrayOf(int32(), num_values, 0.5); + av0[0] = rand.Map(key_array, item_array, num_rows, 0.2); + std::shared_ptr array = std::make_shared( + struct_({field("a", map(binary(), int32()))}), num_rows, av0); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 20); +} + +TEST_F(TestORCWriterSingleArray, WriteMapOfStruct) { + const int64_t num_rows = 1234, num_values = 10 * num_rows; + std::shared_ptr key_array = rand.ArrayOf(utf8(), num_values, 0); + ArrayVector av00(1); + av00[0] = rand.ArrayOf(int32(), num_values, 0.1); + std::shared_ptr bitmap = rand.NullBitmap(num_values, 0.2); + std::shared_ptr item_array = std::make_shared( + struct_({field("a", int32())}), num_values, av00, bitmap); + std::shared_ptr array = rand.Map(key_array, item_array, num_rows, 0.1); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 10); +} + +TEST_F(TestORCWriterSingleArray, WriteMapOfMap) { + const int64_t num_rows = 1234; + auto key_key_array = rand.ArrayOf(utf8(), 4 * num_rows, 0); + auto key_item_array = rand.ArrayOf(int32(), 4 * num_rows, 0.5); + std::shared_ptr key_array = + rand.Map(key_key_array, key_item_array, 2 * num_rows, 0); + auto item_key_array = rand.ArrayOf(utf8(), 4 * num_rows, 0); + auto item_item_array = rand.ArrayOf(int32(), 4 * num_rows, 0.2); + std::shared_ptr item_array = + rand.Map(item_key_array, item_item_array, 2 * num_rows, 0.3); + std::shared_ptr array = rand.Map(key_array, item_array, num_rows, 0.4); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 10); +} + +TEST_F(TestORCWriterSingleArray, WriteListOfMap) { + const int64_t num_rows = 1234; + auto value_key_array = rand.ArrayOf(utf8(), 4 * num_rows, 0); + auto value_item_array = rand.ArrayOf(int32(), 4 * num_rows, 0.5); + std::shared_ptr value_array = + rand.Map(value_key_array, value_item_array, 2 * num_rows, 0.2); + std::shared_ptr array = 
rand.List(*value_array, num_rows, 0.4); + AssertArrayWriteReadEqual(array, array, kDefaultSmallMemStreamSize * 10); +} + } // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/adapter_util.cc b/cpp/src/arrow/adapters/orc/adapter_util.cc index 5a36e2c0100..f956a6f6217 100644 --- a/cpp/src/arrow/adapters/orc/adapter_util.cc +++ b/cpp/src/arrow/adapters/orc/adapter_util.cc @@ -15,18 +15,25 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/adapters/orc/adapter_util.h" + +#include #include #include -#include "arrow/adapters/orc/adapter_util.h" #include "arrow/array/builder_base.h" #include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/scalar.h" #include "arrow/status.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/range.h" - +#include "arrow/util/string_view.h" +#include "arrow/visitor_inline.h" #include "orc/Exceptions.hh" +#include "orc/MemoryPool.hh" #include "orc/OrcFile.hh" // alias to not interfere with nested orc namespace @@ -34,19 +41,25 @@ namespace liborc = orc; namespace arrow { -namespace adapters { +using internal::checked_cast; +namespace adapters { namespace orc { -using internal::checked_cast; +namespace { -// The number of nanoseconds in a second +// The number of milliseconds, microseconds and nanoseconds in a second +constexpr int64_t kOneSecondMillis = 1000LL; +constexpr int64_t kOneMicroNanos = 1000LL; +constexpr int64_t kOneSecondMicros = 1000000LL; +constexpr int64_t kOneMilliNanos = 1000000LL; constexpr int64_t kOneSecondNanos = 1000000000LL; -Status AppendStructBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch, - int64_t offset, int64_t length, ArrayBuilder* abuilder) { +Status AppendStructBatch(const liborc::Type* type, + liborc::ColumnVectorBatch* column_vector_batch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { auto builder = checked_cast(abuilder); - 
auto batch = checked_cast(cbatch); + auto batch = checked_cast(column_vector_batch); const uint8_t* valid_bytes = nullptr; if (batch->hasNulls) { @@ -61,10 +74,11 @@ Status AppendStructBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cb return Status::OK(); } -Status AppendListBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch, - int64_t offset, int64_t length, ArrayBuilder* abuilder) { +Status AppendListBatch(const liborc::Type* type, + liborc::ColumnVectorBatch* column_vector_batch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { auto builder = checked_cast(abuilder); - auto batch = checked_cast(cbatch); + auto batch = checked_cast(column_vector_batch); liborc::ColumnVectorBatch* elements = batch->elements.get(); const liborc::Type* elemtype = type->getSubtype(0); @@ -83,37 +97,38 @@ Status AppendListBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbat return Status::OK(); } -Status AppendMapBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch, - int64_t offset, int64_t length, ArrayBuilder* abuilder) { - auto list_builder = checked_cast(abuilder); - auto struct_builder = checked_cast(list_builder->value_builder()); - auto batch = checked_cast(cbatch); +Status AppendMapBatch(const liborc::Type* type, + liborc::ColumnVectorBatch* column_vector_batch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { + auto builder = checked_cast(abuilder); + auto batch = checked_cast(column_vector_batch); liborc::ColumnVectorBatch* keys = batch->keys.get(); - liborc::ColumnVectorBatch* vals = batch->elements.get(); - const liborc::Type* keytype = type->getSubtype(0); - const liborc::Type* valtype = type->getSubtype(1); + liborc::ColumnVectorBatch* items = batch->elements.get(); + const liborc::Type* key_type = type->getSubtype(0); + const liborc::Type* item_type = type->getSubtype(1); const bool has_nulls = batch->hasNulls; for (int64_t i = offset; i < length + offset; i++) { - 
RETURN_NOT_OK(list_builder->Append()); - int64_t start = batch->offsets[i]; - int64_t list_length = batch->offsets[i + 1] - start; - if (list_length && (!has_nulls || batch->notNull[i])) { - RETURN_NOT_OK(struct_builder->AppendValues(list_length, nullptr)); - RETURN_NOT_OK(AppendBatch(keytype, keys, start, list_length, - struct_builder->field_builder(0))); - RETURN_NOT_OK(AppendBatch(valtype, vals, start, list_length, - struct_builder->field_builder(1))); + if (!has_nulls || batch->notNull[i]) { + int64_t start = batch->offsets[i]; + int64_t end = batch->offsets[i + 1]; + RETURN_NOT_OK(builder->Append()); + RETURN_NOT_OK( + AppendBatch(key_type, keys, start, end - start, builder->key_builder())); + RETURN_NOT_OK( + AppendBatch(item_type, items, start, end - start, builder->item_builder())); + } else { + RETURN_NOT_OK(builder->AppendNull()); } } return Status::OK(); } -template -Status AppendNumericBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, +template +Status AppendNumericBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset, int64_t length, ArrayBuilder* abuilder) { - auto builder = checked_cast(abuilder); - auto batch = checked_cast(cbatch); + auto builder = checked_cast(abuilder); + auto batch = checked_cast(column_vector_batch); if (length == 0) { return Status::OK(); @@ -122,16 +137,16 @@ Status AppendNumericBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, if (batch->hasNulls) { valid_bytes = reinterpret_cast(batch->notNull.data()) + offset; } - const elem_type* source = batch->data.data() + offset; + const ElemType* source = batch->data.data() + offset; RETURN_NOT_OK(builder->AppendValues(source, length, valid_bytes)); return Status::OK(); } -template -Status AppendNumericBatchCast(liborc::ColumnVectorBatch* cbatch, int64_t offset, - int64_t length, ArrayBuilder* abuilder) { - auto builder = checked_cast(abuilder); - auto batch = checked_cast(cbatch); +template +Status AppendNumericBatchCast(liborc::ColumnVectorBatch* 
column_vector_batch, + int64_t offset, int64_t length, ArrayBuilder* abuilder) { + auto builder = checked_cast(abuilder); + auto batch = checked_cast(column_vector_batch); if (length == 0) { return Status::OK(); @@ -141,9 +156,9 @@ Status AppendNumericBatchCast(liborc::ColumnVectorBatch* cbatch, int64_t offset, if (batch->hasNulls) { valid_bytes = reinterpret_cast(batch->notNull.data()) + offset; } - const source_type* source = batch->data.data() + offset; + const SourceType* source = batch->data.data() + offset; auto cast_iter = internal::MakeLazyRange( - [&source](int64_t index) { return static_cast(source[index]); }, + [&source](int64_t index) { return static_cast(source[index]); }, length); RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes)); @@ -151,10 +166,10 @@ Status AppendNumericBatchCast(liborc::ColumnVectorBatch* cbatch, int64_t offset, return Status::OK(); } -Status AppendBoolBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, int64_t length, - ArrayBuilder* abuilder) { +Status AppendBoolBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { auto builder = checked_cast(abuilder); - auto batch = checked_cast(cbatch); + auto batch = checked_cast(column_vector_batch); if (length == 0) { return Status::OK(); @@ -174,10 +189,10 @@ Status AppendBoolBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, int64_ return Status::OK(); } -Status AppendTimestampBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, - int64_t length, ArrayBuilder* abuilder) { +Status AppendTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch, + int64_t offset, int64_t length, ArrayBuilder* abuilder) { auto builder = checked_cast(abuilder); - auto batch = checked_cast(cbatch); + auto batch = checked_cast(column_vector_batch); if (length == 0) { return Status::OK(); @@ -202,11 +217,11 @@ Status AppendTimestampBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, return 
Status::OK(); } -template -Status AppendBinaryBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, +template +Status AppendBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset, int64_t length, ArrayBuilder* abuilder) { - auto builder = checked_cast(abuilder); - auto batch = checked_cast(cbatch); + auto builder = checked_cast(abuilder); + auto batch = checked_cast(column_vector_batch); const bool has_nulls = batch->hasNulls; for (int64_t i = offset; i < length + offset; i++) { @@ -220,10 +235,10 @@ Status AppendBinaryBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, return Status::OK(); } -Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, - int64_t length, ArrayBuilder* abuilder) { +Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch, + int64_t offset, int64_t length, ArrayBuilder* abuilder) { auto builder = checked_cast(abuilder); - auto batch = checked_cast(cbatch); + auto batch = checked_cast(column_vector_batch); const bool has_nulls = batch->hasNulls; for (int64_t i = offset; i < length + offset; i++) { @@ -236,13 +251,14 @@ Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* cbatch, int64_t offset, return Status::OK(); } -Status AppendDecimalBatch(const liborc::Type* type, liborc::ColumnVectorBatch* cbatch, - int64_t offset, int64_t length, ArrayBuilder* abuilder) { +Status AppendDecimalBatch(const liborc::Type* type, + liborc::ColumnVectorBatch* column_vector_batch, int64_t offset, + int64_t length, ArrayBuilder* abuilder) { auto builder = checked_cast(abuilder); - const bool has_nulls = cbatch->hasNulls; + const bool has_nulls = column_vector_batch->hasNulls; if (type->getPrecision() == 0 || type->getPrecision() > 18) { - auto batch = checked_cast(cbatch); + auto batch = checked_cast(column_vector_batch); for (int64_t i = offset; i < length + offset; i++) { if (!has_nulls || batch->notNull[i]) { RETURN_NOT_OK(builder->Append( @@ -252,7 +268,7 @@ Status 
AppendDecimalBatch(const liborc::Type* type, liborc::ColumnVectorBatch* c } } } else { - auto batch = checked_cast(cbatch); + auto batch = checked_cast(column_vector_batch); for (int64_t i = offset; i < length + offset; i++) { if (!has_nulls || batch->notNull[i]) { RETURN_NOT_OK(builder->Append(Decimal128(batch->values[i]))); @@ -264,6 +280,8 @@ Status AppendDecimalBatch(const liborc::Type* type, liborc::ColumnVectorBatch* c return Status::OK(); } +} // namespace + Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch, int64_t offset, int64_t length, ArrayBuilder* builder) { if (type == nullptr) { @@ -316,6 +334,615 @@ Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch, } } +namespace { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +Status WriteBatch(const Array& parray, int64_t orc_offset, + liborc::ColumnVectorBatch* column_vector_batch); + +// Make sure children of StructArray have appropriate null. +Result> NormalizeArray(const std::shared_ptr& array) { + Type::type kind = array->type_id(); + switch (kind) { + case Type::type::STRUCT: { + if (array->null_count() == 0) { + return array; + } else { + auto struct_array = checked_pointer_cast(array); + const std::shared_ptr bitmap = struct_array->null_bitmap(); + std::shared_ptr struct_type = struct_array->type(); + std::size_t size = struct_type->fields().size(); + std::vector> new_children(size, nullptr); + for (std::size_t i = 0; i < size; i++) { + std::shared_ptr child = struct_array->field(i); + const std::shared_ptr child_bitmap = child->null_bitmap(); + std::shared_ptr final_child_bitmap; + if (child_bitmap == nullptr) { + final_child_bitmap = bitmap; + } else { + ARROW_ASSIGN_OR_RAISE( + final_child_bitmap, + internal::BitmapAnd(default_memory_pool(), bitmap->data(), 0, + child_bitmap->data(), 0, struct_array->length(), 0)); + } + std::shared_ptr child_array_data = child->data(); + std::vector> child_buffers = 
child_array_data->buffers; + child_buffers[0] = final_child_bitmap; + std::shared_ptr new_child_array_data = + ArrayData::Make(child->type(), child->length(), child_buffers, + child_array_data->child_data, child_array_data->dictionary); + ARROW_ASSIGN_OR_RAISE(new_children[i], + NormalizeArray(MakeArray(new_child_array_data))); + } + return std::make_shared(struct_type, struct_array->length(), + new_children, bitmap); + } + } + case Type::type::LIST: { + auto list_array = checked_pointer_cast(array); + ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values())); + return std::make_shared(list_array->type(), list_array->length(), + list_array->value_offsets(), value_array, + list_array->null_bitmap()); + } + case Type::type::LARGE_LIST: { + auto list_array = checked_pointer_cast(array); + ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values())); + return std::make_shared(list_array->type(), list_array->length(), + list_array->value_offsets(), value_array, + list_array->null_bitmap()); + } + case Type::type::FIXED_SIZE_LIST: { + auto list_array = checked_pointer_cast(array); + ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values())); + return std::make_shared(list_array->type(), + list_array->length(), value_array, + list_array->null_bitmap()); + } + case Type::type::MAP: { + auto map_array = checked_pointer_cast(array); + ARROW_ASSIGN_OR_RAISE(auto key_array, NormalizeArray(map_array->keys())); + ARROW_ASSIGN_OR_RAISE(auto item_array, NormalizeArray(map_array->items())); + return std::make_shared(map_array->type(), map_array->length(), + map_array->value_offsets(), key_array, item_array, + map_array->null_bitmap()); + } + default: { + return array; + } + } +} + +template +struct Appender {}; + +// Types for long/double-like Appender, that is, numeric, boolean or date32 +template +using is_generic_type = + std::integral_constant::value || + std::is_same::value || + is_boolean_type::value>; +template +using 
enable_if_generic = enable_if_t::value, R>; + +// Number-like +template +struct Appender> { + using ArrayType = typename TypeTraits::ArrayType; + using ValueType = typename TypeTraits::CType; + Status VisitNull() { + batch->notNull[running_orc_offset] = false; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + Status VisitValue(ValueType v) { + batch->data[running_orc_offset] = array.Value(running_arrow_offset); + batch->notNull[running_orc_offset] = true; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + const ArrayType& array; + BatchType* batch; + int64_t running_orc_offset, running_arrow_offset; +}; + +// Binary +template +struct Appender { + using ArrayType = typename TypeTraits::ArrayType; + using COffsetType = typename TypeTraits::OffsetType::c_type; + Status VisitNull() { + batch->notNull[running_orc_offset] = false; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + Status VisitValue(util::string_view v) { + batch->notNull[running_orc_offset] = true; + COffsetType data_length = 0; + batch->data[running_orc_offset] = reinterpret_cast( + const_cast(array.GetValue(running_arrow_offset, &data_length))); + batch->length[running_orc_offset] = data_length; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + const ArrayType& array; + liborc::StringVectorBatch* batch; + int64_t running_orc_offset, running_arrow_offset; +}; + +// Decimal +template <> +struct Appender { + Status VisitNull() { + batch->notNull[running_orc_offset] = false; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + Status VisitValue(util::string_view v) { + batch->notNull[running_orc_offset] = true; + const Decimal128 dec_value(array.GetValue(running_arrow_offset)); + batch->values[running_orc_offset] = static_cast(dec_value.low_bits()); + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + const Decimal128Array& array; + 
liborc::Decimal64VectorBatch* batch; + int64_t running_orc_offset, running_arrow_offset; +}; + +template <> +struct Appender { + Status VisitNull() { + batch->notNull[running_orc_offset] = false; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + Status VisitValue(util::string_view v) { + batch->notNull[running_orc_offset] = true; + const Decimal128 dec_value(array.GetValue(running_arrow_offset)); + batch->values[running_orc_offset] = + liborc::Int128(dec_value.high_bits(), dec_value.low_bits()); + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + const Decimal128Array& array; + liborc::Decimal128VectorBatch* batch; + int64_t running_orc_offset, running_arrow_offset; +}; + +// Date64 and Timestamp +template +struct TimestampAppender { + using ArrayType = typename TypeTraits::ArrayType; + Status VisitNull() { + batch->notNull[running_orc_offset] = false; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + Status VisitValue(int64_t v) { + int64_t data = array.Value(running_arrow_offset); + batch->notNull[running_orc_offset] = true; + batch->data[running_orc_offset] = + static_cast(std::floor(data / conversion_factor_from_second)); + batch->nanoseconds[running_orc_offset] = + (data - conversion_factor_from_second * batch->data[running_orc_offset]) * + conversion_factor_to_nano; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + const ArrayType& array; + liborc::TimestampVectorBatch* batch; + int64_t running_orc_offset, running_arrow_offset; + int64_t conversion_factor_from_second, conversion_factor_to_nano; +}; + +// FSB +struct FixedSizeBinaryAppender { + Status VisitNull() { + batch->notNull[running_orc_offset] = false; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + Status VisitValue(util::string_view v) { + batch->notNull[running_orc_offset] = true; + batch->data[running_orc_offset] = reinterpret_cast( + 
const_cast(array.GetValue(running_arrow_offset))); + batch->length[running_orc_offset] = data_length; + running_orc_offset++; + running_arrow_offset++; + return Status::OK(); + } + const FixedSizeBinaryArray& array; + liborc::StringVectorBatch* batch; + int64_t running_orc_offset, running_arrow_offset; + const int32_t data_length; +}; + +// static_cast from int64_t or double to itself shouldn't introduce overhead +// Pleae see +// https://stackoverflow.com/questions/19106826/ +// can-static-cast-to-same-type-introduce-runtime-overhead +template +Status WriteGenericBatch(const Array& array, int64_t orc_offset, + liborc::ColumnVectorBatch* column_vector_batch) { + using ArrayType = typename TypeTraits::ArrayType; + const ArrayType& array_(checked_cast(array)); + auto batch = checked_cast(column_vector_batch); + if (array.null_count()) { + batch->hasNulls = true; + } + Appender appender{array_, batch, orc_offset, 0}; + ArrayDataVisitor visitor; + RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender)); + return Status::OK(); +} + +template +Status WriteTimestampBatch(const Array& array, int64_t orc_offset, + liborc::ColumnVectorBatch* column_vector_batch, + const int64_t& conversion_factor_from_second, + const int64_t& conversion_factor_to_nano) { + using ArrayType = typename TypeTraits::ArrayType; + const ArrayType& array_(checked_cast(array)); + auto batch = checked_cast(column_vector_batch); + if (array.null_count()) { + batch->hasNulls = true; + } + TimestampAppender appender{array_, + batch, + orc_offset, + 0, + conversion_factor_from_second, + conversion_factor_to_nano}; + ArrayDataVisitor visitor; + RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender)); + return Status::OK(); +} + +Status WriteFixedSizeBinaryBatch(const Array& array, int64_t orc_offset, + liborc::ColumnVectorBatch* column_vector_batch) { + const FixedSizeBinaryArray& array_(checked_cast(array)); + auto batch = checked_cast(column_vector_batch); + if (array.null_count()) { + 
batch->hasNulls = true; + } + FixedSizeBinaryAppender appender{array_, batch, orc_offset, 0, array_.byte_width()}; + ArrayDataVisitor visitor; + RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender)); + return Status::OK(); +} + +Status WriteStructBatch(const Array& array, int64_t orc_offset, + liborc::ColumnVectorBatch* column_vector_batch) { + std::shared_ptr array_ = MakeArray(array.data()); + std::shared_ptr struct_array(checked_pointer_cast(array_)); + auto batch = checked_cast(column_vector_batch); + std::size_t size = array.type()->fields().size(); + int64_t arrow_length = array.length(); + int64_t running_arrow_offset = 0, running_orc_offset = orc_offset; + // First fill fields of ColumnVectorBatch + if (array.null_count()) { + batch->hasNulls = true; + } + for (; running_arrow_offset < arrow_length; + running_orc_offset++, running_arrow_offset++) { + if (array.IsNull(running_arrow_offset)) { + batch->notNull[running_orc_offset] = false; + } else { + batch->notNull[running_orc_offset] = true; + } + } + // Fill the fields + for (std::size_t i = 0; i < size; i++) { + batch->fields[i]->resize(orc_offset + arrow_length); + RETURN_NOT_OK(WriteBatch(*(struct_array->field(i)), orc_offset, batch->fields[i])); + } + return Status::OK(); +} + +template +Status WriteListBatch(const Array& array, int64_t orc_offset, + liborc::ColumnVectorBatch* column_vector_batch) { + const ArrayType& list_array(checked_cast(array)); + auto batch = checked_cast(column_vector_batch); + liborc::ColumnVectorBatch* element_batch = (batch->elements).get(); + int64_t arrow_length = array.length(); + int64_t running_arrow_offset = 0, running_orc_offset = orc_offset; + if (orc_offset == 0) { + batch->offsets[0] = 0; + } + if (array.null_count()) { + batch->hasNulls = true; + } + for (; running_arrow_offset < arrow_length; + running_orc_offset++, running_arrow_offset++) { + if (array.IsNull(running_arrow_offset)) { + batch->notNull[running_orc_offset] = false; + 
batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset]; + } else { + batch->notNull[running_orc_offset] = true; + batch->offsets[running_orc_offset + 1] = + batch->offsets[running_orc_offset] + + list_array.value_offset(running_arrow_offset + 1) - + list_array.value_offset(running_arrow_offset); + element_batch->resize(batch->offsets[running_orc_offset + 1]); + int64_t subarray_arrow_offset = list_array.value_offset(running_arrow_offset), + subarray_orc_offset = batch->offsets[running_orc_offset], + subarray_orc_length = + batch->offsets[running_orc_offset + 1] - subarray_orc_offset; + RETURN_NOT_OK(WriteBatch( + *(list_array.values()->Slice(subarray_arrow_offset, subarray_orc_length)), + subarray_orc_offset, element_batch)); + } + } + return Status::OK(); +} + +Status WriteMapBatch(const Array& array, int64_t orc_offset, + liborc::ColumnVectorBatch* column_vector_batch) { + const MapArray& map_array(checked_cast(array)); + auto batch = checked_cast(column_vector_batch); + liborc::ColumnVectorBatch* key_batch = (batch->keys).get(); + liborc::ColumnVectorBatch* element_batch = (batch->elements).get(); + std::shared_ptr key_array = map_array.keys(); + std::shared_ptr element_array = map_array.items(); + int64_t arrow_length = array.length(); + int64_t running_arrow_offset = 0, running_orc_offset = orc_offset; + if (orc_offset == 0) { + batch->offsets[0] = 0; + } + if (array.null_count()) { + batch->hasNulls = true; + } + for (; running_arrow_offset < arrow_length; + running_orc_offset++, running_arrow_offset++) { + if (array.IsNull(running_arrow_offset)) { + batch->notNull[running_orc_offset] = false; + batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset]; + } else { + batch->notNull[running_orc_offset] = true; + batch->offsets[running_orc_offset + 1] = + batch->offsets[running_orc_offset] + + map_array.value_offset(running_arrow_offset + 1) - + map_array.value_offset(running_arrow_offset); + int64_t subarray_arrow_offset = 
map_array.value_offset(running_arrow_offset), + subarray_orc_offset = batch->offsets[running_orc_offset], + new_subarray_orc_offset = batch->offsets[running_orc_offset + 1], + subarray_orc_length = new_subarray_orc_offset - subarray_orc_offset; + key_batch->resize(new_subarray_orc_offset); + element_batch->resize(new_subarray_orc_offset); + RETURN_NOT_OK( + WriteBatch(*(key_array->Slice(subarray_arrow_offset, subarray_orc_length)), + subarray_orc_offset, key_batch)); + RETURN_NOT_OK( + WriteBatch(*(element_array->Slice(subarray_arrow_offset, subarray_orc_length)), + subarray_orc_offset, element_batch)); + } + } + return Status::OK(); +} + +Status WriteBatch(const Array& array, int64_t orc_offset, + liborc::ColumnVectorBatch* column_vector_batch) { + Type::type kind = array.type_id(); + column_vector_batch->numElements = orc_offset; + switch (kind) { + case Type::type::BOOL: + return WriteGenericBatch(array, orc_offset, + column_vector_batch); + case Type::type::INT8: + return WriteGenericBatch(array, orc_offset, + column_vector_batch); + case Type::type::INT16: + return WriteGenericBatch(array, orc_offset, + column_vector_batch); + case Type::type::INT32: + return WriteGenericBatch(array, orc_offset, + column_vector_batch); + case Type::type::INT64: + return WriteGenericBatch(array, orc_offset, + column_vector_batch); + case Type::type::FLOAT: + return WriteGenericBatch(array, orc_offset, + column_vector_batch); + case Type::type::DOUBLE: + return WriteGenericBatch( + array, orc_offset, column_vector_batch); + case Type::type::BINARY: + return WriteGenericBatch( + array, orc_offset, column_vector_batch); + case Type::type::LARGE_BINARY: + return WriteGenericBatch( + array, orc_offset, column_vector_batch); + case Type::type::STRING: + return WriteGenericBatch( + array, orc_offset, column_vector_batch); + case Type::type::LARGE_STRING: + return WriteGenericBatch( + array, orc_offset, column_vector_batch); + case Type::type::FIXED_SIZE_BINARY: + return 
WriteFixedSizeBinaryBatch(array, orc_offset, column_vector_batch); + case Type::type::DATE32: + return WriteGenericBatch(array, orc_offset, + column_vector_batch); + case Type::type::DATE64: + return WriteTimestampBatch(array, orc_offset, column_vector_batch, + kOneSecondMillis, kOneMilliNanos); + case Type::type::TIMESTAMP: { + switch (internal::checked_pointer_cast(array.type())->unit()) { + case TimeUnit::type::SECOND: + return WriteTimestampBatch( + array, orc_offset, column_vector_batch, 1, kOneSecondNanos); + case TimeUnit::type::MILLI: + return WriteTimestampBatch( + array, orc_offset, column_vector_batch, kOneSecondMillis, kOneMilliNanos); + case TimeUnit::type::MICRO: + return WriteTimestampBatch( + array, orc_offset, column_vector_batch, kOneSecondMicros, kOneMicroNanos); + case TimeUnit::type::NANO: + return WriteTimestampBatch( + array, orc_offset, column_vector_batch, kOneSecondNanos, 1); + default: + return Status::TypeError("Unknown or unsupported Arrow type: ", + array.type()->ToString()); + } + } + case Type::type::DECIMAL128: { + int32_t precision = checked_pointer_cast(array.type())->precision(); + if (precision > 18) { + return WriteGenericBatch( + array, orc_offset, column_vector_batch); + } else { + return WriteGenericBatch( + array, orc_offset, column_vector_batch); + } + } + case Type::type::STRUCT: + return WriteStructBatch(array, orc_offset, column_vector_batch); + case Type::type::LIST: + return WriteListBatch(array, orc_offset, column_vector_batch); + case Type::type::LARGE_LIST: + return WriteListBatch(array, orc_offset, column_vector_batch); + case Type::type::FIXED_SIZE_LIST: + return WriteListBatch(array, orc_offset, column_vector_batch); + case Type::type::MAP: + return WriteMapBatch(array, orc_offset, column_vector_batch); + default: { + return Status::NotImplemented("Unknown or unsupported Arrow type: ", + array.type()->ToString()); + } + } + return Status::OK(); +} + +Result> GetOrcType(const DataType& type) { + Type::type kind = 
type.id(); + switch (kind) { + case Type::type::BOOL: + return liborc::createPrimitiveType(liborc::TypeKind::BOOLEAN); + case Type::type::INT8: + return liborc::createPrimitiveType(liborc::TypeKind::BYTE); + case Type::type::INT16: + return liborc::createPrimitiveType(liborc::TypeKind::SHORT); + case Type::type::INT32: + return liborc::createPrimitiveType(liborc::TypeKind::INT); + case Type::type::INT64: + return liborc::createPrimitiveType(liborc::TypeKind::LONG); + case Type::type::FLOAT: + return liborc::createPrimitiveType(liborc::TypeKind::FLOAT); + case Type::type::DOUBLE: + return liborc::createPrimitiveType(liborc::TypeKind::DOUBLE); + // Use STRING instead of VARCHAR for now, both use UTF-8 + case Type::type::STRING: + case Type::type::LARGE_STRING: + return liborc::createPrimitiveType(liborc::TypeKind::STRING); + case Type::type::BINARY: + case Type::type::LARGE_BINARY: + case Type::type::FIXED_SIZE_BINARY: + return liborc::createPrimitiveType(liborc::TypeKind::BINARY); + case Type::type::DATE32: + return liborc::createPrimitiveType(liborc::TypeKind::DATE); + case Type::type::DATE64: + case Type::type::TIMESTAMP: + return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP); + case Type::type::DECIMAL128: { + const uint64_t precision = + static_cast(checked_cast(type).precision()); + const uint64_t scale = + static_cast(checked_cast(type).scale()); + return liborc::createDecimalType(precision, scale); + } + case Type::type::LIST: + case Type::type::FIXED_SIZE_LIST: + case Type::type::LARGE_LIST: { + std::shared_ptr arrow_child_type = + checked_cast(type).value_type(); + ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type)); + return liborc::createListType(std::move(orc_subtype)); + } + case Type::type::STRUCT: { + ORC_UNIQUE_PTR out_type = liborc::createStructType(); + std::vector> arrow_fields = + checked_cast(type).fields(); + for (std::vector>::iterator it = arrow_fields.begin(); + it != arrow_fields.end(); ++it) { + std::string 
field_name = (*it)->name(); + std::shared_ptr arrow_child_type = (*it)->type(); + ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type)); + out_type->addStructField(field_name, std::move(orc_subtype)); + } + return std::move(out_type); + } + case Type::type::MAP: { + std::shared_ptr key_arrow_type = + checked_cast(type).key_type(); + std::shared_ptr item_arrow_type = + checked_cast(type).item_type(); + ARROW_ASSIGN_OR_RAISE(auto key_orc_type, GetOrcType(*key_arrow_type)); + ARROW_ASSIGN_OR_RAISE(auto item_orc_type, GetOrcType(*item_arrow_type)); + return liborc::createMapType(std::move(key_orc_type), std::move(item_orc_type)); + } + case Type::type::DENSE_UNION: + case Type::type::SPARSE_UNION: { + ORC_UNIQUE_PTR out_type = liborc::createUnionType(); + std::vector> arrow_fields = + checked_cast(type).fields(); + for (std::vector>::iterator it = arrow_fields.begin(); + it != arrow_fields.end(); ++it) { + std::string field_name = (*it)->name(); + std::shared_ptr arrow_child_type = (*it)->type(); + ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type)); + out_type->addUnionChild(std::move(orc_subtype)); + } + return std::move(out_type); + } + default: { + return Status::NotImplemented("Unknown or unsupported Arrow type: ", + type.ToString()); + } + } +} + +} // namespace + +Status WriteBatch(const ChunkedArray& chunked_array, int64_t length, + int* arrow_chunk_offset, int64_t* arrow_index_offset, + liborc::ColumnVectorBatch* column_vector_batch) { + int num_batch = chunked_array.num_chunks(); + int64_t orc_offset = 0; + while (*arrow_chunk_offset < num_batch && orc_offset < length) { + ARROW_ASSIGN_OR_RAISE(auto array, + NormalizeArray(chunked_array.chunk(*arrow_chunk_offset))); + int64_t num_written_elements = + std::min(length - orc_offset, array->length() - *arrow_index_offset); + if (num_written_elements > 0) { + RETURN_NOT_OK(WriteBatch(*(array->Slice(*arrow_index_offset, num_written_elements)), + orc_offset, column_vector_batch)); 
+ orc_offset += num_written_elements; + *arrow_index_offset += num_written_elements; + } + if (orc_offset < length) { // Another Arrow Array done + *arrow_index_offset = 0; + (*arrow_chunk_offset)++; + } + } + column_vector_batch->numElements = orc_offset; + return Status::OK(); +} + Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { // When subselecting fields on read, liborc will set some nodes to nullptr, // so we need to check for nullptr before progressing @@ -369,15 +996,15 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { const int scale = static_cast(type->getScale()); if (precision == 0) { // In HIVE 0.11/0.12 precision is set as 0, but means max precision - *out = decimal(38, 6); + *out = decimal128(38, 6); } else { - *out = decimal(precision, scale); + *out = decimal128(precision, scale); } break; } case liborc::LIST: { if (subtype_count != 1) { - return Status::Invalid("Invalid Orc List type"); + return Status::TypeError("Invalid Orc List type"); } std::shared_ptr elemtype; RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &elemtype)); @@ -386,22 +1013,21 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { } case liborc::MAP: { if (subtype_count != 2) { - return Status::Invalid("Invalid Orc Map type"); + return Status::TypeError("Invalid Orc Map type"); } - std::shared_ptr keytype; - std::shared_ptr valtype; - RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &keytype)); - RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &valtype)); - *out = list(struct_({field("key", keytype), field("value", valtype)})); + std::shared_ptr key_type, item_type; + RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &key_type)); + RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &item_type)); + *out = map(key_type, item_type); break; } case liborc::STRUCT: { std::vector> fields; for (int child = 0; child < subtype_count; ++child) { - std::shared_ptr elemtype; - RETURN_NOT_OK(GetArrowType(type->getSubtype(child), 
&elemtype)); + std::shared_ptr elem_type; + RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type)); std::string name = type->getFieldName(child); - fields.push_back(field(name, elemtype)); + fields.push_back(field(name, elem_type)); } *out = struct_(fields); break; @@ -410,21 +1036,34 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr* out) { std::vector> fields; std::vector type_codes; for (int child = 0; child < subtype_count; ++child) { - std::shared_ptr elemtype; - RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elemtype)); - fields.push_back(field("_union_" + std::to_string(child), elemtype)); + std::shared_ptr elem_type; + RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type)); + fields.push_back(field("_union_" + std::to_string(child), elem_type)); type_codes.push_back(static_cast(child)); } *out = sparse_union(fields, type_codes); break; } default: { - return Status::Invalid("Unknown Orc type kind: ", kind); + return Status::TypeError("Unknown Orc type kind: ", type->toString()); } } return Status::OK(); } +Result> GetOrcType(const Schema& schema) { + int numFields = schema.num_fields(); + ORC_UNIQUE_PTR out_type = liborc::createStructType(); + for (int i = 0; i < numFields; i++) { + std::shared_ptr field = schema.field(i); + std::string field_name = field->name(); + std::shared_ptr arrow_child_type = field->type(); + ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type)); + out_type->addStructField(field_name, std::move(orc_subtype)); + } + return std::move(out_type); +} + } // namespace orc } // namespace adapters } // namespace arrow diff --git a/cpp/src/arrow/adapters/orc/adapter_util.h b/cpp/src/arrow/adapters/orc/adapter_util.h index 13a62f2bbd3..3e6d0fcc660 100644 --- a/cpp/src/arrow/adapters/orc/adapter_util.h +++ b/cpp/src/arrow/adapters/orc/adapter_util.h @@ -34,8 +34,24 @@ namespace orc { Status GetArrowType(const liborc::Type* type, std::shared_ptr* out); +Result> GetOrcType(const Schema& 
schema); + Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch, - int64_t offset, int64_t length, ArrayBuilder* builder); + int64_t offset, int64_t length, arrow::ArrayBuilder* builder); + +/// \brief Write a chunked array to an orc::ColumnVectorBatch +/// +/// \param[in] chunked_array the chunked array +/// \param[in] length the orc::ColumnVectorBatch size limit +/// \param[in,out] arrow_chunk_offset The current chunk being processed +/// \param[in,out] arrow_index_offset The index of the arrow_chunk_offset array +/// before or after a process +/// \param[in,out] column_vector_batch the orc::ColumnVectorBatch to be filled +/// \return Status +Status WriteBatch(const ChunkedArray& chunked_array, int64_t length, + int* arrow_chunk_offset, int64_t* arrow_index_offset, + liborc::ColumnVectorBatch* column_vector_batch); + } // namespace orc } // namespace adapters } // namespace arrow diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index 67c5ca84e1f..dad689d3ca7 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -103,28 +103,30 @@ struct ScalarFromArraySlotImpl { } Status Visit(const SparseUnionArray& a) { + const auto type_code = a.type_code(index_); // child array which stores the actual value - auto arr = a.field(a.child_id(index_)); + const auto arr = a.field(a.child_id(index_)); // no need to adjust the index ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(index_)); if (value->is_valid) { - out_ = std::shared_ptr(new SparseUnionScalar(value, a.type())); + out_ = std::shared_ptr(new SparseUnionScalar(value, type_code, a.type())); } else { - out_ = MakeNullScalar(a.type()); + out_ = std::shared_ptr(new SparseUnionScalar(type_code, a.type())); } return Status::OK(); } Status Visit(const DenseUnionArray& a) { + const auto type_code = a.type_code(index_); // child array which stores the actual value auto arr = a.field(a.child_id(index_)); // need to look up the value 
based on offsets auto offset = a.value_offset(index_); ARROW_ASSIGN_OR_RAISE(auto value, arr->GetScalar(offset)); if (value->is_valid) { - out_ = std::shared_ptr(new DenseUnionScalar(value, a.type())); + out_ = std::shared_ptr(new DenseUnionScalar(value, type_code, a.type())); } else { - out_ = MakeNullScalar(a.type()); + out_ = std::shared_ptr(new DenseUnionScalar(type_code, a.type())); } return Status::OK(); } diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h index e29db00cfcf..2add572e7a4 100644 --- a/cpp/src/arrow/array/array_base.h +++ b/cpp/src/arrow/array/array_base.h @@ -56,15 +56,17 @@ class ARROW_EXPORT Array { /// \brief Return true if value at index is null. Does not boundscheck bool IsNull(int64_t i) const { - return null_bitmap_data_ != NULLPTR && - !BitUtil::GetBit(null_bitmap_data_, i + data_->offset); + return null_bitmap_data_ != NULLPTR + ? !BitUtil::GetBit(null_bitmap_data_, i + data_->offset) + : data_->null_count == data_->length; } /// \brief Return true if value at index is valid (not null). Does not /// boundscheck bool IsValid(int64_t i) const { - return null_bitmap_data_ == NULLPTR || - BitUtil::GetBit(null_bitmap_data_, i + data_->offset); + return null_bitmap_data_ != NULLPTR + ? BitUtil::GetBit(null_bitmap_data_, i + data_->offset) + : data_->null_count != data_->length; } /// \brief Return a Scalar containing the value of this array at i diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index db3c640b9a4..f8e8c4f8a44 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -71,6 +71,13 @@ class BaseBinaryArray : public FlatArray { raw_value_offsets_[i + 1] - pos); } + /// \brief Get binary value as a string_view + /// Provided for consistency with other arrays. 
+ /// + /// \param i the value index + /// \return the view over the selected value + util::string_view Value(int64_t i) const { return GetView(i); } + /// \brief Get binary value as a std::string /// /// \param i the value index diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 5c247a6dc66..e593cf7e6c4 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -473,6 +473,70 @@ class TestStringBuilder : public TestBuilder { CheckStringArray(*result_, strings, is_valid, reps); } + void TestExtendCurrent() { + std::vector strings = {"", "bbbb", "aaaaa", "", "ccc"}; + std::vector is_valid = {1, 1, 1, 0, 1}; + + int N = static_cast(strings.size()); + int reps = 10; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (!is_valid[i]) { + ASSERT_OK(builder_->AppendNull()); + } else if (strings[i].length() > 3) { + ASSERT_OK(builder_->Append(strings[i].substr(0, 3))); + ASSERT_OK(builder_->ExtendCurrent(strings[i].substr(3))); + } else { + ASSERT_OK(builder_->Append(strings[i])); + } + } + } + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 12, result_->value_data()->size()); + + CheckStringArray(*result_, strings, is_valid, reps); + } + + void TestExtendCurrentUnsafe() { + std::vector strings = {"", "bbbb", "aaaaa", "", "ccc"}; + std::vector is_valid = {1, 1, 1, 0, 1}; + + int N = static_cast(strings.size()); + int reps = 13; + int64_t total_length = 0; + for (const auto& s : strings) { + total_length += static_cast(s.size()); + } + + ASSERT_OK(builder_->Reserve(N * reps)); + ASSERT_OK(builder_->ReserveData(total_length * reps)); + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (!is_valid[i]) { + builder_->UnsafeAppendNull(); + } else if (strings[i].length() > 3) { + builder_->UnsafeAppend(strings[i].substr(0, 3)); + 
builder_->UnsafeExtendCurrent(strings[i].substr(3)); + } else { + builder_->UnsafeAppend(strings[i]); + } + } + } + ASSERT_EQ(builder_->value_data_length(), total_length * reps); + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 12, result_->value_data()->size()); + + CheckStringArray(*result_, strings, is_valid, reps); + } + void TestVectorAppend() { std::vector strings = {"", "bb", "a", "", "ccc"}; std::vector valid_bytes = {1, 1, 1, 0, 1}; @@ -608,6 +672,12 @@ TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->TestScalarAppend(); } TYPED_TEST(TestStringBuilder, TestScalarAppendUnsafe) { this->TestScalarAppendUnsafe(); } +TYPED_TEST(TestStringBuilder, TestExtendCurrent) { this->TestExtendCurrent(); } + +TYPED_TEST(TestStringBuilder, TestExtendCurrentUnsafe) { + this->TestExtendCurrentUnsafe(); +} + TYPED_TEST(TestStringBuilder, TestVectorAppend) { this->TestVectorAppend(); } TYPED_TEST(TestStringBuilder, TestAppendCStringsWithValidBytes) { diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index a50cbcc13cf..faeeaf56333 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1036,7 +1036,7 @@ void ValidateBasicFixedSizeListArray(const FixedSizeListArray* result, ASSERT_EQ(is_valid[i] == 0, result->IsNull(i)); } - ASSERT_EQ(result->length() * result->value_length(), result->values()->length()); + ASSERT_LE(result->length() * result->value_length(), result->values()->length()); auto varr = std::dynamic_pointer_cast(result->values()); for (size_t i = 0; i < values.size(); ++i) { @@ -1084,7 +1084,7 @@ TEST_F(TestFixedSizeListArray, BulkAppend) { ValidateBasicFixedSizeListArray(result_.get(), values, is_valid); } -TEST_F(TestFixedSizeListArray, BulkAppendInvalid) { +TEST_F(TestFixedSizeListArray, BulkAppendExcess) { std::vector values = {0, 1, 2, 3, 4, 5}; std::vector is_valid = {1, 0, 1}; @@ -1099,7 +1099,8 @@ 
TEST_F(TestFixedSizeListArray, BulkAppendInvalid) { } Done(); - ASSERT_RAISES(Invalid, result_->ValidateFull()); + // We appended too many values to the child array, but that's OK + ValidateBasicFixedSizeListArray(result_.get(), values, is_valid); } TEST_F(TestFixedSizeListArray, TestZeroLength) { @@ -1131,4 +1132,16 @@ TEST_F(TestFixedSizeListArray, NegativeLength) { ASSERT_RAISES(Invalid, result_->ValidateFull()); } +TEST_F(TestFixedSizeListArray, NotEnoughValues) { + type_ = fixed_size_list(value_type_, 2); + auto values = ArrayFromJSON(value_type_, "[]"); + result_ = std::make_shared(type_, 1, values); + ASSERT_RAISES(Invalid, result_->ValidateFull()); + + // ARROW-13437: too many values is OK though + values = ArrayFromJSON(value_type_, "[1, 2, 3, 4]"); + result_ = std::make_shared(type_, 1, values); + ASSERT_OK(result_->ValidateFull()); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index f967127c5f1..102a82512e1 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -730,8 +730,6 @@ Result> SparseUnionArray::Make( return std::make_shared(std::move(internal_data)); } -std::shared_ptr UnionArray::child(int i) const { return field(i); } - std::shared_ptr UnionArray::field(int i) const { if (i < 0 || static_cast(i) >= boxed_fields_.size()) { diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index d39f33f4702..bd5abaa3a8f 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -378,6 +378,9 @@ class ARROW_EXPORT UnionArray : public Array { const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; } + /// The logical type code of the value at index. + type_code_t type_code(int64_t i) const { return raw_type_codes_[i + data_->offset]; } + /// The physical child id containing value at index. 
int child_id(int64_t i) const { return union_type_->child_ids()[raw_type_codes_[i + data_->offset]]; @@ -387,12 +390,6 @@ class ARROW_EXPORT UnionArray : public Array { UnionMode::type mode() const { return union_type_->mode(); } - // Return the given field as an individual array. - // For sparse unions, the returned array has its offset, length and null - // count adjusted. - ARROW_DEPRECATED("Deprecated in 1.0.0. Use field(pos)") - std::shared_ptr child(int pos) const; - /// \brief Return the given field as an individual array. /// /// For sparse unions, the returned array has its offset, length and null diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index a97bf134604..5cee0a2691f 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -111,6 +111,14 @@ TEST_F(TestArray, TestLength) { ASSERT_EQ(arr->length(), 100); } +TEST_F(TestArray, TestNullToString) { + // Invalid NULL buffer + auto data = std::make_shared(nullptr, 400); + + std::unique_ptr arr(new Int32Array(100, data)); + ASSERT_EQ(arr->ToString(), ""); +} + TEST_F(TestArray, TestSliceSafe) { std::vector original_data{1, 2, 3, 4, 5, 6, 7}; auto arr = std::make_shared(7, Buffer::Wrap(original_data)); @@ -322,8 +330,6 @@ TEST_F(TestArray, BuildLargeInMemoryArray) { ASSERT_EQ(length, result->length()); } -TEST_F(TestArray, TestCopy) {} - TEST_F(TestArray, TestMakeArrayOfNull) { std::shared_ptr types[] = { // clang-format off @@ -356,6 +362,10 @@ TEST_F(TestArray, TestMakeArrayOfNull) { ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), length); + for (int64_t i = 0; i < length; ++i) { + ASSERT_TRUE(array->IsNull(i)); + ASSERT_FALSE(array->IsValid(i)); + } } } } @@ -397,38 +407,68 @@ TEST_F(TestArray, TestMakeArrayOfNullUnion) { } } -TEST_F(TestArray, TestMakeArrayFromScalar) { - ASSERT_OK_AND_ASSIGN(auto null_array, MakeArrayFromScalar(NullScalar(), 5)); - 
ASSERT_OK(null_array->ValidateFull()); - ASSERT_EQ(null_array->length(), 5); - ASSERT_EQ(null_array->null_count(), 5); +void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) { + std::unique_ptr builder; + auto null_scalar = MakeNullScalar(scalar->type); + ASSERT_OK(MakeBuilder(pool, scalar->type, &builder)); + ASSERT_OK(builder->AppendScalar(*scalar)); + ASSERT_OK(builder->AppendScalar(*scalar)); + ASSERT_OK(builder->AppendScalar(*null_scalar)); + ASSERT_OK(builder->AppendScalars({scalar, null_scalar})); + ASSERT_OK(builder->AppendScalar(*scalar, /*n_repeats=*/2)); + ASSERT_OK(builder->AppendScalar(*null_scalar, /*n_repeats=*/2)); + + std::shared_ptr out; + FinishAndCheckPadding(builder.get(), &out); + ASSERT_OK(out->ValidateFull()); + AssertTypeEqual(scalar->type, out->type()); + ASSERT_EQ(out->length(), 9); + + const bool can_check_nulls = internal::HasValidityBitmap(out->type()->id()); + + if (can_check_nulls) { + ASSERT_EQ(out->null_count(), 4); + } + for (const auto index : {0, 1, 3, 5, 6}) { + ASSERT_FALSE(out->IsNull(index)); + ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index)); + AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true); + } + for (const auto index : {2, 4, 7, 8}) { + ASSERT_EQ(out->IsNull(index), can_check_nulls); + ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index)); + AssertScalarsEqual(*null_scalar, *scalar_i, /*verbose=*/true); + } +} +static ScalarVector GetScalars() { auto hello = Buffer::FromString("hello"); DayTimeIntervalType::DayMilliseconds daytime{1, 100}; - ScalarVector scalars{ - std::make_shared(false), - std::make_shared(3), - std::make_shared(3), - std::make_shared(3), - std::make_shared(3), - std::make_shared(3.0), - std::make_shared(10), - std::make_shared(11), + FieldVector union_fields{field("string", utf8()), field("number", int32()), + field("other_number", int32())}; + std::vector union_type_codes{5, 6, 42}; + + const auto sparse_union_ty = ::arrow::sparse_union(union_fields, 
union_type_codes); + const auto dense_union_ty = ::arrow::dense_union(union_fields, union_type_codes); + + return { + std::make_shared(false), std::make_shared(3), + std::make_shared(3), std::make_shared(3), + std::make_shared(3), std::make_shared(3.0), + std::make_shared(10), std::make_shared(11), std::make_shared(1000, time32(TimeUnit::SECOND)), std::make_shared(1111, time64(TimeUnit::MICRO)), std::make_shared(1111, timestamp(TimeUnit::MILLI)), std::make_shared(1), std::make_shared(daytime), std::make_shared(60, duration(TimeUnit::SECOND)), - std::make_shared(hello), - std::make_shared(hello), + std::make_shared(hello), std::make_shared(hello), std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), std::make_shared(Decimal256(10), decimal(76, 38)), - std::make_shared(hello), - std::make_shared(hello), + std::make_shared(hello), std::make_shared(hello), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3, 4]")), @@ -437,7 +477,25 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { std::make_shared(2), std::make_shared(6), }, - struct_({field("min", int32()), field("max", int32())}))}; + struct_({field("min", int32()), field("max", int32())})), + // Same values, different union type codes + std::make_shared(std::make_shared(100), 6, + sparse_union_ty), + std::make_shared(std::make_shared(100), 42, + sparse_union_ty), + std::make_shared(std::make_shared(101), 6, + dense_union_ty), + std::make_shared(std::make_shared(101), 42, + dense_union_ty)}; +} + +TEST_F(TestArray, TestMakeArrayFromScalar) { + ASSERT_OK_AND_ASSIGN(auto null_array, MakeArrayFromScalar(NullScalar(), 5)); + ASSERT_OK(null_array->ValidateFull()); + ASSERT_EQ(null_array->length(), 5); + ASSERT_EQ(null_array->null_count(), 5); + + auto scalars = GetScalars(); for (int64_t length : {16}) { for (auto scalar : scalars) { @@ 
-445,8 +503,32 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), 0); + + // test case for ARROW-13321 + for (int64_t i : std::vector{0, length / 2, length - 1}) { + ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i)); + AssertScalarsEqual(*s, *scalar, /*verbose=*/true); + } } } + + for (auto scalar : scalars) { + AssertAppendScalar(pool_, scalar); + } +} + +TEST_F(TestArray, TestMakeArrayFromScalarSliced) { + // Regression test for ARROW-13437 + auto scalars = GetScalars(); + + for (auto scalar : scalars) { + SCOPED_TRACE(scalar->type->ToString()); + ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 32)); + auto sliced = array->Slice(1, 4); + ASSERT_EQ(sliced->length(), 4); + ASSERT_EQ(sliced->null_count(), 0); + ARROW_EXPECT_OK(sliced->ValidateFull()); + } } TEST_F(TestArray, TestMakeArrayFromDictionaryScalar) { @@ -481,6 +563,8 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) { ASSERT_OK_AND_ASSIGN(auto item, array->GetScalar(i)); ASSERT_TRUE(item->Equals(scalar)); } + + AssertAppendScalar(pool_, std::make_shared(scalar)); } TEST_F(TestArray, ValidateBuffersPrimitive) { diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index 88d25e823bb..d3afe40df8d 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-#include - #include +#include + #include "arrow/array.h" #include "arrow/array/builder_nested.h" #include "arrow/array/builder_union.h" @@ -107,11 +107,11 @@ class TestUnionArrayFactories : public ::testing::Test { public: void SetUp() { pool_ = default_memory_pool(); - type_codes_ = {1, 2, 4, 8}; + type_codes_ = {1, 2, 4, 127}; ArrayFromVector({0, 1, 2, 0, 1, 3, 2, 0, 2, 1}, &type_ids_); - ArrayFromVector({1, 2, 4, 1, 2, 8, 4, 1, 4, 2}, &logical_type_ids_); - ArrayFromVector({1, 2, 4, 1, -2, 8, 4, 1, 4, 2}, &invalid_type_ids1_); - ArrayFromVector({1, 2, 4, 1, 3, 8, 4, 1, 4, 2}, &invalid_type_ids2_); + ArrayFromVector({1, 2, 4, 1, 2, 127, 4, 1, 4, 2}, &logical_type_ids_); + ArrayFromVector({1, 2, 4, 1, -2, 127, 4, 1, 4, 2}, &invalid_type_ids1_); + ArrayFromVector({1, 2, 4, 1, 3, 127, 4, 1, 4, 2}, &invalid_type_ids2_); } void CheckUnionArray(const UnionArray& array, UnionMode::type mode, diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index b92cc285894..2f4e63b546d 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -18,14 +18,18 @@ #include "arrow/array/builder_base.h" #include +#include #include #include "arrow/array/array_base.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" namespace arrow { @@ -92,6 +96,210 @@ Status ArrayBuilder::Advance(int64_t elements) { return null_bitmap_builder_.Advance(elements); } +namespace { + +struct AppendScalarImpl { + template + enable_if_t::value || is_decimal_type::value || + is_fixed_size_binary_type::value, + Status> + Visit(const T&) { + auto builder = internal::checked_cast::BuilderType*>(builder_); + RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); + + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* raw 
= scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + builder->UnsafeAppend(scalar->value); + } else { + builder->UnsafeAppendNull(); + } + } + } + return Status::OK(); + } + + template + enable_if_base_binary Visit(const T&) { + int64_t data_size = 0; + for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + data_size += scalar->value->size(); + } + } + + auto builder = internal::checked_cast::BuilderType*>(builder_); + RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); + RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size)); + + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_; + raw++) { + auto scalar = + internal::checked_cast::ScalarType*>(raw->get()); + if (scalar->is_valid) { + builder->UnsafeAppend(util::string_view{*scalar->value}); + } else { + builder->UnsafeAppendNull(); + } + } + } + return Status::OK(); + } + + template + enable_if_list_like Visit(const T&) { + auto builder = internal::checked_cast::BuilderType*>(builder_); + int64_t num_children = 0; + for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; + scalar++) { + if (!(*scalar)->is_valid) continue; + num_children += + internal::checked_cast(**scalar).value->length(); + } + RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_)); + + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_; + scalar++) { + if ((*scalar)->is_valid) { + RETURN_NOT_OK(builder->Append()); + const Array& list = + *internal::checked_cast(**scalar).value; + for (int64_t i = 0; i < list.length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); + 
RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); + } + } else { + RETURN_NOT_OK(builder_->AppendNull()); + } + } + } + return Status::OK(); + } + + Status Visit(const StructType& type) { + auto* builder = internal::checked_cast(builder_); + auto count = n_repeats_ * (scalars_end_ - scalars_begin_); + RETURN_NOT_OK(builder->Reserve(count)); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count)); + } + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* s = scalars_begin_; s != scalars_end_; s++) { + const auto& scalar = internal::checked_cast(**s); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + if (!scalar.is_valid || !scalar.value[field_index]) { + RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull()); + } else { + RETURN_NOT_OK(builder->field_builder(field_index) + ->AppendScalar(*scalar.value[field_index])); + } + } + RETURN_NOT_OK(builder->Append(scalar.is_valid)); + } + } + return Status::OK(); + } + + Status Visit(const SparseUnionType& type) { return MakeUnionArray(type); } + + Status Visit(const DenseUnionType& type) { return MakeUnionArray(type); } + + template + Status MakeUnionArray(const T& type) { + using BuilderType = typename TypeTraits::BuilderType; + constexpr bool is_dense = std::is_same::value; + + auto* builder = internal::checked_cast(builder_); + const auto count = n_repeats_ * (scalars_end_ - scalars_begin_); + + RETURN_NOT_OK(builder->Reserve(count)); + + DCHECK_EQ(type.num_fields(), builder->num_children()); + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + RETURN_NOT_OK(builder->child_builder(field_index)->Reserve(count)); + } + + for (int64_t i = 0; i < n_repeats_; i++) { + for (const std::shared_ptr* s = scalars_begin_; s != scalars_end_; s++) { + // For each scalar, + // 1. append the type code, + // 2. 
append the value to the corresponding child, + // 3. if the union is sparse, append null to the other children. + const auto& scalar = internal::checked_cast(**s); + const auto scalar_field_index = type.child_ids()[scalar.type_code]; + RETURN_NOT_OK(builder->Append(scalar.type_code)); + + for (int field_index = 0; field_index < type.num_fields(); ++field_index) { + auto* child_builder = builder->child_builder(field_index).get(); + if (field_index == scalar_field_index) { + if (scalar.is_valid) { + RETURN_NOT_OK(child_builder->AppendScalar(*scalar.value)); + } else { + RETURN_NOT_OK(child_builder->AppendNull()); + } + } else if (!is_dense) { + RETURN_NOT_OK(child_builder->AppendNull()); + } + } + } + } + return Status::OK(); + } + + Status Visit(const DataType& type) { + return Status::NotImplemented("AppendScalar for type ", type); + } + + Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); } + + const std::shared_ptr* scalars_begin_; + const std::shared_ptr* scalars_end_; + int64_t n_repeats_; + ArrayBuilder* builder_; +}; + +} // namespace + +Status ArrayBuilder::AppendScalar(const Scalar& scalar) { + if (!scalar.type->Equals(type())) { + return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), + " to builder for type ", type()->ToString()); + } + std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}}; + return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert(); +} + +Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { + if (!scalar.type->Equals(type())) { + return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), + " to builder for type ", type()->ToString()); + } + std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}}; + return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert(); +} + +Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { + if (scalars.empty()) return Status::OK(); + const auto ty = type(); + 
for (const auto& scalar : scalars) { + if (!scalar->type->Equals(ty)) { + return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(), + " to builder for type ", type()->ToString()); + } + } + return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(), + /*n_repeats=*/1, this} + .Convert(); +} + Status ArrayBuilder::Finish(std::shared_ptr* out) { std::shared_ptr internal_data; RETURN_NOT_OK(FinishInternal(&internal_data)); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 15c726241b5..c2aba4e959f 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -50,6 +50,8 @@ class ARROW_EXPORT ArrayBuilder { public: explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {} + ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder); + virtual ~ArrayBuilder() = default; /// For nested types. Since the objects are owned by this class instance, we @@ -116,6 +118,11 @@ class ARROW_EXPORT ArrayBuilder { /// This method is useful when appending null values to a parent nested type. virtual Status AppendEmptyValues(int64_t length) = 0; + /// \brief Append a value from a scalar + Status AppendScalar(const Scalar& scalar); + Status AppendScalar(const Scalar& scalar, int64_t n_repeats); + Status AppendScalars(const ScalarVector& scalars); + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. 
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index bc49c7d6787..7653eeca5c4 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -77,6 +77,23 @@ class BaseBinaryBuilder : public ArrayBuilder { return Append(value.data(), static_cast(value.size())); } + /// Extend the last appended value by appending more data at the end + /// + /// Unlike Append, this does not create a new offset. + Status ExtendCurrent(const uint8_t* value, offset_type length) { + // Safety check for UBSAN. + if (ARROW_PREDICT_TRUE(length > 0)) { + ARROW_RETURN_NOT_OK(ValidateOverflow(length)); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + } + return Status::OK(); + } + + Status ExtendCurrent(util::string_view value) { + return ExtendCurrent(reinterpret_cast(value.data()), + static_cast(value.size())); + } + Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); ARROW_RETURN_NOT_OK(Reserve(length)); @@ -133,12 +150,28 @@ class BaseBinaryBuilder : public ArrayBuilder { UnsafeAppend(value.data(), static_cast(value.size())); } + /// Like ExtendCurrent, but do not check capacity + void UnsafeExtendCurrent(const uint8_t* value, offset_type length) { + value_data_builder_.UnsafeAppend(value, length); + } + + void UnsafeExtendCurrent(util::string_view value) { + UnsafeExtendCurrent(reinterpret_cast(value.data()), + static_cast(value.size())); + } + void UnsafeAppendNull() { const int64_t num_bytes = value_data_builder_.length(); offsets_builder_.UnsafeAppend(static_cast(num_bytes)); UnsafeAppendToBitmap(false); } + void UnsafeAppendEmptyValue() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(true); + } + /// \brief Append a sequence of strings in one shot. 
/// /// \param[in] values a vector of strings @@ -258,14 +291,7 @@ class BaseBinaryBuilder : public ArrayBuilder { } Status Resize(int64_t capacity) override { - // XXX Why is this check necessary? There is no reason to disallow, say, - // binary arrays with more than 2**31 empty or null values. - if (capacity > memory_limit()) { - return Status::CapacityError("BinaryBuilder cannot reserve space for more than ", - memory_limit(), " child elements, got ", capacity); - } ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); - // One more than requested for offsets ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); return ArrayBuilder::Resize(capacity); @@ -441,6 +467,14 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status Append(const Buffer& s) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(util::string_view(s)); + return Status::OK(); + } + + Status Append(const std::shared_ptr& s) { return Append(*s); } + template Status Append(const std::array& value) { ARROW_RETURN_NOT_OK(Reserve(1)); @@ -476,6 +510,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); } + + void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0); diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h index 8c75e7dd674..f48392ed001 100644 --- a/cpp/src/arrow/array/builder_decimal.h +++ b/cpp/src/arrow/array/builder_decimal.h @@ -32,6 +32,7 @@ namespace arrow { class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { public: using TypeClass = Decimal128Type; + using ValueType = Decimal128; explicit Decimal128Builder(const std::shared_ptr& type, MemoryPool* pool = default_memory_pool()); @@ -61,6 +62,7 @@ class ARROW_EXPORT 
Decimal128Builder : public FixedSizeBinaryBuilder { class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder { public: using TypeClass = Decimal256Type; + using ValueType = Decimal256; explicit Decimal256Builder(const std::shared_ptr& type, MemoryPool* pool = default_memory_pool()); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 40d6ce1ba9a..455cb3df7b1 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -29,6 +29,7 @@ #include "arrow/array/builder_primitive.h" // IWYU pragma: export #include "arrow/array/data.h" #include "arrow/array/util.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc index 037a1ecbf91..e403c42411d 100644 --- a/cpp/src/arrow/array/builder_primitive.cc +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -65,9 +65,8 @@ Status BooleanBuilder::Resize(int64_t capacity) { } Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr null_bitmap, data; - RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - RETURN_NOT_OK(data_builder_.Finish(&data)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); *out = ArrayData::Make(boolean(), length_, {null_bitmap, data}, null_count_); diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index e10f11fdd6c..e0f39f97967 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -23,6 +23,7 @@ #include "arrow/array/builder_base.h" #include "arrow/array/data.h" +#include "arrow/result.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -185,9 +186,9 @@ class NumericBuilder : public ArrayBuilder { } Status 
FinishInternal(std::shared_ptr* out) override { - std::shared_ptr data, null_bitmap; - ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - ARROW_RETURN_NOT_OK(data_builder_.Finish(&data)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, + null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_); capacity_ = length_ = null_count_ = 0; return Status::OK(); diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 90d4f42084a..8617cb73fce 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -65,8 +65,8 @@ BasicUnionBuilder::BasicUnionBuilder( children_ = children; type_id_to_children_.resize(union_type.max_type_code() + 1, nullptr); - DCHECK_LT( - type_id_to_children_.size(), + DCHECK_LE( + type_id_to_children_.size() - 1, static_cast(UnionType::kMaxTypeCode)); for (size_t i = 0; i < children.size(); ++i) { diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 32478783394..e2a5898c209 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -482,9 +482,4 @@ Result> Concatenate(const ArrayVector& arrays, MemoryPool return MakeArray(std::move(out_data)); } -Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, - std::shared_ptr* out) { - return Concatenate(arrays, pool).Value(out); -} - } // namespace arrow diff --git a/cpp/src/arrow/array/concatenate.h b/cpp/src/arrow/array/concatenate.h index a6c1c3cf3c1..e7597aad812 100644 --- a/cpp/src/arrow/array/concatenate.h +++ b/cpp/src/arrow/array/concatenate.h @@ -34,9 +34,4 @@ ARROW_EXPORT Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool = default_memory_pool()); -ARROW_DEPRECATED("Use Result-returning version") -ARROW_EXPORT -Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, - 
std::shared_ptr* out); - } // namespace arrow diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index e397a752cd8..5a214473972 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -56,41 +56,39 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length, } } -std::shared_ptr ArrayData::Make(const std::shared_ptr& type, - int64_t length, +std::shared_ptr ArrayData::Make(std::shared_ptr type, int64_t length, std::vector> buffers, int64_t null_count, int64_t offset) { AdjustNonNullable(type->id(), length, &buffers, &null_count); - return std::make_shared(type, length, std::move(buffers), null_count, - offset); + return std::make_shared(std::move(type), length, std::move(buffers), + null_count, offset); } std::shared_ptr ArrayData::Make( - const std::shared_ptr& type, int64_t length, + std::shared_ptr type, int64_t length, std::vector> buffers, std::vector> child_data, int64_t null_count, int64_t offset) { AdjustNonNullable(type->id(), length, &buffers, &null_count); - return std::make_shared(type, length, std::move(buffers), + return std::make_shared(std::move(type), length, std::move(buffers), std::move(child_data), null_count, offset); } std::shared_ptr ArrayData::Make( - const std::shared_ptr& type, int64_t length, + std::shared_ptr type, int64_t length, std::vector> buffers, std::vector> child_data, std::shared_ptr dictionary, int64_t null_count, int64_t offset) { AdjustNonNullable(type->id(), length, &buffers, &null_count); - auto data = std::make_shared(type, length, std::move(buffers), + auto data = std::make_shared(std::move(type), length, std::move(buffers), std::move(child_data), null_count, offset); data->dictionary = std::move(dictionary); return data; } -std::shared_ptr ArrayData::Make(const std::shared_ptr& type, - int64_t length, int64_t null_count, - int64_t offset) { - return std::make_shared(type, length, null_count, offset); +std::shared_ptr ArrayData::Make(std::shared_ptr type, int64_t 
length, + int64_t null_count, int64_t offset) { + return std::make_shared(std::move(type), length, null_count, offset); } std::shared_ptr ArrayData::Slice(int64_t off, int64_t len) const { diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 02a49949e1f..418d09def6b 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -71,49 +71,47 @@ constexpr int64_t kUnknownNullCount = -1; /// input array and replace them with newly-allocated data, changing the output /// data type as well. struct ARROW_EXPORT ArrayData { - ArrayData() : length(0), null_count(0), offset(0) {} + ArrayData() = default; - ArrayData(const std::shared_ptr& type, int64_t length, + ArrayData(std::shared_ptr type, int64_t length, int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : type(type), length(length), null_count(null_count), offset(offset) {} + : type(std::move(type)), length(length), null_count(null_count), offset(offset) {} - ArrayData(const std::shared_ptr& type, int64_t length, + ArrayData(std::shared_ptr type, int64_t length, std::vector> buffers, int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : ArrayData(type, length, null_count, offset) { + : ArrayData(std::move(type), length, null_count, offset) { this->buffers = std::move(buffers); } - ArrayData(const std::shared_ptr& type, int64_t length, + ArrayData(std::shared_ptr type, int64_t length, std::vector> buffers, std::vector> child_data, int64_t null_count = kUnknownNullCount, int64_t offset = 0) - : ArrayData(type, length, null_count, offset) { + : ArrayData(std::move(type), length, null_count, offset) { this->buffers = std::move(buffers); this->child_data = std::move(child_data); } - static std::shared_ptr Make(const std::shared_ptr& type, - int64_t length, + static std::shared_ptr Make(std::shared_ptr type, int64_t length, std::vector> buffers, int64_t null_count = kUnknownNullCount, int64_t offset = 0); static std::shared_ptr Make( - const std::shared_ptr& type, 
int64_t length, + std::shared_ptr type, int64_t length, std::vector> buffers, std::vector> child_data, int64_t null_count = kUnknownNullCount, int64_t offset = 0); static std::shared_ptr Make( - const std::shared_ptr& type, int64_t length, + std::shared_ptr type, int64_t length, std::vector> buffers, std::vector> child_data, std::shared_ptr dictionary, int64_t null_count = kUnknownNullCount, int64_t offset = 0); - static std::shared_ptr Make(const std::shared_ptr& type, - int64_t length, + static std::shared_ptr Make(std::shared_ptr type, int64_t length, int64_t null_count = kUnknownNullCount, int64_t offset = 0); @@ -232,11 +230,11 @@ struct ARROW_EXPORT ArrayData { } std::shared_ptr type; - int64_t length; - mutable std::atomic null_count; + int64_t length = 0; + mutable std::atomic null_count{0}; // The logical start point into the physical buffers (in values, not bytes). // Note that, for child data, this must be *added* to the child data's own offset. - int64_t offset; + int64_t offset = 0; std::vector> buffers; std::vector> child_data; diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 297745a2b17..fae379e51f4 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -34,6 +34,7 @@ #include "arrow/buffer.h" #include "arrow/buffer_builder.h" #include "arrow/extension_type.h" +#include "arrow/result.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" @@ -286,7 +287,7 @@ std::shared_ptr MakeArray(const std::shared_ptr& data) { // ---------------------------------------------------------------------- // Misc APIs -namespace internal { +namespace { // get the maximum buffer length required, then allocate a single zeroed buffer // to use anywhere a buffer is required @@ -510,16 +511,26 @@ class RepeatedArrayFactory { } template - enable_if_t::value || is_fixed_size_binary_type::value || - is_temporal_type::value, - Status> - Visit(const T&) { + enable_if_t::value || is_temporal_type::value, 
Status> Visit( + const T&) { auto value = checked_cast::ScalarType&>(scalar_).value; return FinishFixedWidth(&value, sizeof(value)); } - Status Visit(const Decimal128Type&) { - auto value = checked_cast(scalar_).value.ToBytes(); + Status Visit(const FixedSizeBinaryType& type) { + auto value = checked_cast(scalar_).value; + return FinishFixedWidth(value->data(), type.byte_width()); + } + + template + enable_if_decimal Visit(const T&) { + using ScalarType = typename TypeTraits::ScalarType; + auto value = checked_cast(scalar_).value.ToBytes(); + return FinishFixedWidth(value.data(), value.size()); + } + + Status Visit(const Decimal256Type&) { + auto value = checked_cast(scalar_).value.ToBytes(); return FinishFixedWidth(value.data(), value.size()); } @@ -603,18 +614,85 @@ class RepeatedArrayFactory { return Status::OK(); } - Status Visit(const ExtensionType& type) { - return Status::NotImplemented("construction from scalar of type ", *scalar_.type); + Status Visit(const SparseUnionType& type) { + const auto& union_scalar = checked_cast(scalar_); + const auto& union_type = checked_cast(*scalar_.type); + const auto scalar_type_code = union_scalar.type_code; + const auto scalar_child_id = union_type.child_ids()[scalar_type_code]; + + // Create child arrays: most of them are all-null, except for the child array + // for the given type code (if the scalar is valid). 
+ ArrayVector fields; + for (int i = 0; i < type.num_fields(); ++i) { + fields.emplace_back(); + if (i == scalar_child_id && scalar_.is_valid) { + ARROW_ASSIGN_OR_RAISE(fields.back(), + MakeArrayFromScalar(*union_scalar.value, length_, pool_)); + } else { + ARROW_ASSIGN_OR_RAISE( + fields.back(), MakeArrayOfNull(union_type.field(i)->type(), length_, pool_)); + } + } + + ARROW_ASSIGN_OR_RAISE(auto type_codes_buffer, CreateUnionTypeCodes(scalar_type_code)); + + out_ = std::make_shared(scalar_.type, length_, std::move(fields), + std::move(type_codes_buffer)); + return Status::OK(); } Status Visit(const DenseUnionType& type) { - return Status::NotImplemented("construction from scalar of type ", *scalar_.type); + const auto& union_scalar = checked_cast(scalar_); + const auto& union_type = checked_cast(*scalar_.type); + const auto scalar_type_code = union_scalar.type_code; + const auto scalar_child_id = union_type.child_ids()[scalar_type_code]; + + // Create child arrays: all of them are empty, except for the child array + // for the given type code (if length > 0). 
+ ArrayVector fields; + for (int i = 0; i < type.num_fields(); ++i) { + fields.emplace_back(); + if (i == scalar_child_id && length_ > 0) { + if (scalar_.is_valid) { + // One valid element (will be referenced by multiple offsets) + ARROW_ASSIGN_OR_RAISE(fields.back(), + MakeArrayFromScalar(*union_scalar.value, 1, pool_)); + } else { + // One null element (will be referenced by multiple offsets) + ARROW_ASSIGN_OR_RAISE(fields.back(), + MakeArrayOfNull(union_type.field(i)->type(), 1, pool_)); + } + } else { + // Zero element (will not be referenced by any offset) + ARROW_ASSIGN_OR_RAISE(fields.back(), + MakeArrayOfNull(union_type.field(i)->type(), 0, pool_)); + } + } + + // Create an offsets buffer with all offsets equal to 0 + ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, + AllocateBuffer(length_ * sizeof(int32_t), pool_)); + memset(offsets_buffer->mutable_data(), 0, offsets_buffer->size()); + + ARROW_ASSIGN_OR_RAISE(auto type_codes_buffer, CreateUnionTypeCodes(scalar_type_code)); + + out_ = std::make_shared(scalar_.type, length_, std::move(fields), + std::move(type_codes_buffer), + std::move(offsets_buffer)); + return Status::OK(); } - Status Visit(const SparseUnionType& type) { + Status Visit(const ExtensionType& type) { return Status::NotImplemented("construction from scalar of type ", *scalar_.type); } + Result> CreateUnionTypeCodes(int8_t type_code) { + TypedBufferBuilder builder(pool_); + RETURN_NOT_OK(builder.Resize(length_)); + builder.UnsafeAppend(length_, type_code); + return builder.Finish(); + } + template Status CreateOffsetsBuffer(OffsetType value_length, std::shared_ptr* out) { TypedBufferBuilder builder(pool_); @@ -650,12 +728,11 @@ class RepeatedArrayFactory { std::shared_ptr out_; }; -} // namespace internal +} // namespace Result> MakeArrayOfNull(const std::shared_ptr& type, int64_t length, MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE(auto data, - internal::NullArrayFactory(pool, type, length).Create()); + ARROW_ASSIGN_OR_RAISE(auto data, 
NullArrayFactory(pool, type, length).Create()); return MakeArray(data); } @@ -664,7 +741,7 @@ Result> MakeArrayFromScalar(const Scalar& scalar, int64_t if (!scalar.is_valid) { return MakeArrayOfNull(scalar.type, length, pool); } - return internal::RepeatedArrayFactory(pool, scalar, length).Create(); + return RepeatedArrayFactory(pool, scalar, length).Create(); } namespace internal { diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 6ac885f8443..0ffba4a5071 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -85,9 +85,9 @@ struct ValidateArrayImpl { int64_t expected_values_length = -1; if (MultiplyWithOverflow(data.length, list_size, &expected_values_length) || - values.length != expected_values_length) { + values.length < expected_values_length) { return Status::Invalid("Values length (", values.length, - ") is not equal to the length (", data.length, + ") is less than the length (", data.length, ") multiplied by the value size (", list_size, ")"); } @@ -555,7 +555,7 @@ struct ValidateArrayFullImpl { const ArrayData& field = *data.child_data[i]; const Status field_valid = ValidateArrayFull(field); if (!field_valid.ok()) { - return Status::Invalid("Struct child array #", i, + return Status::Invalid("Union child array #", i, " invalid: ", field_valid.ToString()); } } diff --git a/cpp/src/arrow/arrow.pc.in b/cpp/src/arrow/arrow.pc.in index 947d534fdbf..ef995fdc3db 100644 --- a/cpp/src/arrow/arrow.pc.in +++ b/cpp/src/arrow/arrow.pc.in @@ -25,5 +25,7 @@ full_so_version=@ARROW_FULL_SO_VERSION@ Name: Apache Arrow Description: Arrow is a set of technologies that enable big-data systems to process and move data fast. 
Version: @ARROW_VERSION@ +Requires.private:@ARROW_PC_REQUIRES_PRIVATE@ Libs: -L${libdir} -larrow +Libs.private:@ARROW_PC_LIBS_PRIVATE@ Cflags: -I${includedir} diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 9215d9ab544..b1b2945d0f5 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -21,7 +21,6 @@ #include #include -#include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/bit_util.h" @@ -171,111 +170,6 @@ MutableBuffer::MutableBuffer(const std::shared_ptr& parent, const int64_ parent_ = parent; } -// ----------------------------------------------------------------------- -// Pool buffer and allocation - -/// A Buffer whose lifetime is tied to a particular MemoryPool -class PoolBuffer : public ResizableBuffer { - public: - explicit PoolBuffer(std::shared_ptr mm, MemoryPool* pool) - : ResizableBuffer(nullptr, 0, std::move(mm)), pool_(pool) {} - - ~PoolBuffer() override { - if (mutable_data_ != nullptr) { - pool_->Free(mutable_data_, capacity_); - } - } - - Status Reserve(const int64_t capacity) override { - if (capacity < 0) { - return Status::Invalid("Negative buffer capacity: ", capacity); - } - if (!mutable_data_ || capacity > capacity_) { - uint8_t* new_data; - int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(capacity); - if (mutable_data_) { - RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &mutable_data_)); - } else { - RETURN_NOT_OK(pool_->Allocate(new_capacity, &new_data)); - mutable_data_ = new_data; - } - data_ = mutable_data_; - capacity_ = new_capacity; - } - return Status::OK(); - } - - Status Resize(const int64_t new_size, bool shrink_to_fit = true) override { - if (ARROW_PREDICT_FALSE(new_size < 0)) { - return Status::Invalid("Negative buffer resize: ", new_size); - } - if (mutable_data_ && shrink_to_fit && new_size <= size_) { - // Buffer is non-null and is not growing, so shrink to the requested size without - // excess space. 
- int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(new_size); - if (capacity_ != new_capacity) { - // Buffer hasn't got yet the requested size. - RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &mutable_data_)); - data_ = mutable_data_; - capacity_ = new_capacity; - } - } else { - RETURN_NOT_OK(Reserve(new_size)); - } - size_ = new_size; - - return Status::OK(); - } - - static std::shared_ptr MakeShared(MemoryPool* pool) { - std::shared_ptr mm; - if (pool == nullptr) { - pool = default_memory_pool(); - mm = default_cpu_memory_manager(); - } else { - mm = CPUDevice::memory_manager(pool); - } - return std::make_shared(std::move(mm), pool); - } - - static std::unique_ptr MakeUnique(MemoryPool* pool) { - std::shared_ptr mm; - if (pool == nullptr) { - pool = default_memory_pool(); - mm = default_cpu_memory_manager(); - } else { - mm = CPUDevice::memory_manager(pool); - } - return std::unique_ptr(new PoolBuffer(std::move(mm), pool)); - } - - private: - MemoryPool* pool_; -}; - -namespace { -// A utility that does most of the work of the `AllocateBuffer` and -// `AllocateResizableBuffer` methods. The argument `buffer` should be a smart pointer to -// a PoolBuffer. 
-template -inline Result ResizePoolBuffer(PoolBufferPtr&& buffer, const int64_t size) { - RETURN_NOT_OK(buffer->Resize(size)); - buffer->ZeroPadding(); - return std::move(buffer); -} - -} // namespace - -Result> AllocateBuffer(const int64_t size, MemoryPool* pool) { - return ResizePoolBuffer>(PoolBuffer::MakeUnique(pool), size); -} - -Result> AllocateResizableBuffer(const int64_t size, - MemoryPool* pool) { - return ResizePoolBuffer>(PoolBuffer::MakeUnique(pool), - size); -} - Result> AllocateBitmap(int64_t length, MemoryPool* pool) { ARROW_ASSIGN_OR_RAISE(auto buf, AllocateBuffer(BitUtil::BytesForBits(length), pool)); // Zero out any trailing bits diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 1a3bb29e439..cfd525ab2d6 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -56,23 +56,13 @@ class ARROW_EXPORT Buffer { /// /// \note The passed memory must be kept alive through some other means Buffer(const uint8_t* data, int64_t size) - : is_mutable_(false), - is_cpu_(true), - data_(data), - mutable_data_(NULLPTR), - size_(size), - capacity_(size) { + : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) { SetMemoryManager(default_cpu_memory_manager()); } Buffer(const uint8_t* data, int64_t size, std::shared_ptr mm, std::shared_ptr parent = NULLPTR) - : is_mutable_(false), - data_(data), - mutable_data_(NULLPTR), - size_(size), - capacity_(size), - parent_(parent) { + : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) { SetMemoryManager(std::move(mm)); } @@ -131,7 +121,7 @@ class ARROW_EXPORT Buffer { #endif // A zero-capacity buffer can have a null data pointer if (capacity_ != 0) { - memset(mutable_data_ + size_, 0, static_cast(capacity_ - size_)); + memset(mutable_data() + size_, 0, static_cast(capacity_ - size_)); } } @@ -205,7 +195,8 @@ class ARROW_EXPORT Buffer { CheckCPU(); CheckMutable(); #endif - return ARROW_PREDICT_TRUE(is_cpu_) ? 
mutable_data_ : NULLPTR; + return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast(data_) + : NULLPTR; } /// \brief Return the device address of the buffer's data @@ -219,7 +210,7 @@ class ARROW_EXPORT Buffer { #ifndef NDEBUG CheckMutable(); #endif - return reinterpret_cast(mutable_data_); + return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast(data_) : 0; } /// \brief Return the buffer's size in bytes @@ -289,7 +280,6 @@ class ARROW_EXPORT Buffer { bool is_mutable_; bool is_cpu_; const uint8_t* data_; - uint8_t* mutable_data_; int64_t size_; int64_t capacity_; @@ -389,13 +379,11 @@ Result> SliceMutableBufferSafe( class ARROW_EXPORT MutableBuffer : public Buffer { public: MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) { - mutable_data_ = data; is_mutable_ = true; } MutableBuffer(uint8_t* data, const int64_t size, std::shared_ptr mm) : Buffer(data, size, std::move(mm)) { - mutable_data_ = data; is_mutable_ = true; } @@ -428,7 +416,10 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { /// /// @param new_size The new size for the buffer. /// @param shrink_to_fit Whether to shrink the capacity if new size < current size - virtual Status Resize(const int64_t new_size, bool shrink_to_fit = true) = 0; + virtual Status Resize(const int64_t new_size, bool shrink_to_fit) = 0; + Status Resize(const int64_t new_size) { + return Resize(new_size, /*shrink_to_fit=*/true); + } /// Ensure that buffer has enough memory allocated to fit the indicated /// capacity (and meets the 64 byte padding requirement in Layout.md). 
diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index f525ec23c58..eb3f68affc0 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -45,8 +45,7 @@ class ARROW_EXPORT BufferBuilder { explicit BufferBuilder(MemoryPool* pool = default_memory_pool()) : pool_(pool), data_(/*ensure never null to make ubsan happy and avoid check penalties below*/ - &util::internal::non_null_filler), - + util::MakeNonNull()), capacity_(0), size_(0) {} @@ -64,15 +63,12 @@ class ARROW_EXPORT BufferBuilder { /// \brief Resize the buffer to the nearest multiple of 64 bytes /// /// \param new_capacity the new capacity of the of the builder. Will be - /// rounded up to a multiple of 64 bytes for padding \param shrink_to_fit if - /// new capacity is smaller than the existing size, reallocate internal - /// buffer. Set to false to avoid reallocations when shrinking the builder. + /// rounded up to a multiple of 64 bytes for padding + /// \param shrink_to_fit if new capacity is smaller than the existing, + /// reallocate internal buffer. Set to false to avoid reallocations when + /// shrinking the builder. /// \return Status Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { - // Resize(0) is a no-op - if (new_capacity == 0) { - return Status::OK(); - } if (buffer_ == NULLPTR) { ARROW_ASSIGN_OR_RAISE(buffer_, AllocateResizableBuffer(new_capacity, pool_)); } else { @@ -168,6 +164,17 @@ class ARROW_EXPORT BufferBuilder { return out; } + /// \brief Like Finish, but override the final buffer size + /// + /// This is useful after writing data directly into the builder memory + /// without calling the Append methods (basically, when using BufferBuilder + /// mostly for memory allocation). 
+ Result> FinishWithLength(int64_t final_length, + bool shrink_to_fit = true) { + size_ = final_length; + return Finish(shrink_to_fit); + } + void Reset() { buffer_ = NULLPTR; capacity_ = size_ = 0; @@ -273,6 +280,16 @@ class TypedBufferBuilder< return out; } + /// \brief Like Finish, but override the final buffer size + /// + /// This is useful after writing data directly into the builder memory + /// without calling the Append methods (basically, when using TypedBufferBuilder + /// only for memory allocation). + Result> FinishWithLength(int64_t final_length, + bool shrink_to_fit = true) { + return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit); + } + void Reset() { bytes_builder_.Reset(); } int64_t length() const { return bytes_builder_.length() / sizeof(T); } @@ -399,6 +416,19 @@ class TypedBufferBuilder { return out; } + /// \brief Like Finish, but override the final buffer size + /// + /// This is useful after writing data directly into the builder memory + /// without calling the Append methods (basically, when using TypedBufferBuilder + /// only for memory allocation). 
+ Result> FinishWithLength(int64_t final_length, + bool shrink_to_fit = true) { + const auto final_byte_length = BitUtil::BytesForBits(final_length); + bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length()); + bit_length_ = false_count_ = 0; + return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit); + } + void Reset() { bytes_builder_.Reset(); bit_length_ = false_count_ = 0; diff --git a/cpp/src/arrow/buffer_test.cc b/cpp/src/arrow/buffer_test.cc index 02b96c3b493..4295d4ca692 100644 --- a/cpp/src/arrow/buffer_test.cc +++ b/cpp/src/arrow/buffer_test.cc @@ -653,18 +653,77 @@ TEST(TestBufferBuilder, ResizeReserve) { ASSERT_OK(builder.Resize(128)); ASSERT_EQ(128, builder.capacity()); + ASSERT_EQ(9, builder.length()); // Do not shrink to fit ASSERT_OK(builder.Resize(64, false)); ASSERT_EQ(128, builder.capacity()); + ASSERT_EQ(9, builder.length()); // Shrink to fit ASSERT_OK(builder.Resize(64)); ASSERT_EQ(64, builder.capacity()); + ASSERT_EQ(9, builder.length()); // Reserve elements ASSERT_OK(builder.Reserve(60)); ASSERT_EQ(128, builder.capacity()); + ASSERT_EQ(9, builder.length()); +} + +TEST(TestBufferBuilder, Finish) { + const std::string data = "some data"; + auto data_ptr = data.c_str(); + + for (const bool shrink_to_fit : {true, false}) { + ARROW_SCOPED_TRACE("shrink_to_fit = ", shrink_to_fit); + BufferBuilder builder; + ASSERT_OK(builder.Append(data_ptr, 9)); + ASSERT_OK(builder.Append(data_ptr, 9)); + ASSERT_EQ(18, builder.length()); + ASSERT_EQ(64, builder.capacity()); + + ASSERT_OK_AND_ASSIGN(auto buf, builder.Finish(shrink_to_fit)); + ASSERT_EQ(buf->size(), 18); + ASSERT_EQ(buf->capacity(), 64); + } + for (const bool shrink_to_fit : {true, false}) { + ARROW_SCOPED_TRACE("shrink_to_fit = ", shrink_to_fit); + BufferBuilder builder; + ASSERT_OK(builder.Reserve(1024)); + builder.UnsafeAppend(data_ptr, 9); + builder.UnsafeAppend(data_ptr, 9); + ASSERT_EQ(18, builder.length()); + ASSERT_EQ(builder.capacity(), 1024); + + 
ASSERT_OK_AND_ASSIGN(auto buf, builder.Finish(shrink_to_fit)); + ASSERT_EQ(buf->size(), 18); + ASSERT_EQ(buf->capacity(), shrink_to_fit ? 64 : 1024); + } +} + +TEST(TestBufferBuilder, FinishEmpty) { + for (const bool shrink_to_fit : {true, false}) { + ARROW_SCOPED_TRACE("shrink_to_fit = ", shrink_to_fit); + BufferBuilder builder; + ASSERT_EQ(0, builder.length()); + ASSERT_EQ(0, builder.capacity()); + + ASSERT_OK_AND_ASSIGN(auto buf, builder.Finish(shrink_to_fit)); + ASSERT_EQ(buf->size(), 0); + ASSERT_EQ(buf->capacity(), 0); + } + for (const bool shrink_to_fit : {true, false}) { + ARROW_SCOPED_TRACE("shrink_to_fit = ", shrink_to_fit); + BufferBuilder builder; + ASSERT_OK(builder.Reserve(1024)); + ASSERT_EQ(0, builder.length()); + ASSERT_EQ(1024, builder.capacity()); + + ASSERT_OK_AND_ASSIGN(auto buf, builder.Finish(shrink_to_fit)); + ASSERT_EQ(buf->size(), 0); + ASSERT_EQ(buf->capacity(), shrink_to_fit ? 0 : 1024); + } } template @@ -717,7 +776,7 @@ TYPED_TEST(TypedTestBufferBuilder, AppendCopies) { } } -TEST(TestBufferBuilder, BasicBoolBufferBuilderUsage) { +TEST(TestBoolBufferBuilder, Basics) { TypedBufferBuilder builder; ASSERT_OK(builder.Append(false)); @@ -746,7 +805,7 @@ TEST(TestBufferBuilder, BasicBoolBufferBuilderUsage) { ASSERT_EQ(built->size(), BitUtil::BytesForBits(nvalues + 1)); } -TEST(TestBufferBuilder, BoolBufferBuilderAppendCopies) { +TEST(TestBoolBufferBuilder, AppendCopies) { TypedBufferBuilder builder; ASSERT_OK(builder.Append(13, true)); @@ -766,6 +825,21 @@ TEST(TestBufferBuilder, BoolBufferBuilderAppendCopies) { ASSERT_EQ(built->size(), BitUtil::BytesForBits(13 + 17)); } +TEST(TestBoolBufferBuilder, Reserve) { + TypedBufferBuilder builder; + + ASSERT_OK(builder.Reserve(13 + 17)); + builder.UnsafeAppend(13, true); + builder.UnsafeAppend(17, false); + ASSERT_EQ(builder.length(), 13 + 17); + ASSERT_EQ(builder.capacity(), 64 * 8); + ASSERT_EQ(builder.false_count(), 17); + + ASSERT_OK_AND_ASSIGN(auto built, builder.Finish()); + 
AssertIsCPUBuffer(*built); + ASSERT_EQ(built->size(), BitUtil::BytesForBits(13 + 17)); +} + template class TypedTestBuffer : public ::testing::Test {}; diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 5cb3e577235..a43bf8104f2 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -984,11 +984,11 @@ struct SchemaImporter { if (prec_scale.size() != 2 && prec_scale.size() != 3) { return f_parser_.Invalid(); } - if (prec_scale[0] <= 0 || prec_scale[1] <= 0) { + if (prec_scale[0] <= 0) { return f_parser_.Invalid(); } if (prec_scale.size() == 2 || prec_scale[2] == 128) { - type_ = decimal(prec_scale[0], prec_scale[1]); + type_ = decimal128(prec_scale[0], prec_scale[1]); } else if (prec_scale[2] == 256) { type_ = decimal256(prec_scale[0], prec_scale[1]); } else { diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 317fd01f17c..54ce0efcf9d 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -283,6 +283,12 @@ TEST_F(TestSchemaExport, Primitive) { TestPrimitive(decimal(16, 4), "d:16,4"); TestPrimitive(decimal256(16, 4), "d:16,4,256"); + + TestPrimitive(decimal(15, 0), "d:15,0"); + TestPrimitive(decimal256(15, 0), "d:15,0,256"); + + TestPrimitive(decimal(15, -4), "d:15,-4"); + TestPrimitive(decimal256(15, -4), "d:15,-4,256"); } TEST_F(TestSchemaExport, Temporal) { @@ -1196,6 +1202,20 @@ TEST_F(TestSchemaImport, Primitive) { CheckImport(field("", decimal128(16, 4))); FillPrimitive("d:16,4,256"); CheckImport(field("", decimal256(16, 4))); + + FillPrimitive("d:16,0"); + CheckImport(field("", decimal128(16, 0))); + FillPrimitive("d:16,0,128"); + CheckImport(field("", decimal128(16, 0))); + FillPrimitive("d:16,0,256"); + CheckImport(field("", decimal256(16, 0))); + + FillPrimitive("d:16,-4"); + CheckImport(field("", decimal128(16, -4))); + FillPrimitive("d:16,-4,128"); + CheckImport(field("", decimal128(16, -4))); + FillPrimitive("d:16,-4,256"); + CheckImport(field("", 
decimal256(16, -4))); } TEST_F(TestSchemaImport, Temporal) { @@ -1395,6 +1415,8 @@ TEST_F(TestSchemaImport, FormatStringError) { CheckImportError(); FillPrimitive("d:15.4"); CheckImportError(); + FillPrimitive("d:15,z"); + CheckImportError(); FillPrimitive("t"); CheckImportError(); FillPrimitive("td"); @@ -2382,9 +2404,12 @@ TEST_F(TestSchemaRoundtrip, Primitive) { TestWithTypeFactory(boolean); TestWithTypeFactory(float16); - TestWithTypeFactory(std::bind(decimal, 19, 4)); TestWithTypeFactory(std::bind(decimal128, 19, 4)); TestWithTypeFactory(std::bind(decimal256, 19, 4)); + TestWithTypeFactory(std::bind(decimal128, 19, 0)); + TestWithTypeFactory(std::bind(decimal256, 19, 0)); + TestWithTypeFactory(std::bind(decimal128, 19, -5)); + TestWithTypeFactory(std::bind(decimal256, 19, -5)); TestWithTypeFactory(std::bind(fixed_size_binary, 3)); TestWithTypeFactory(binary); TestWithTypeFactory(large_utf8); diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index b259b05d7cf..142bd0d8c89 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -118,6 +118,33 @@ bool ChunkedArray::Equals(const std::shared_ptr& other) const { return Equals(*other.get()); } +bool ChunkedArray::ApproxEquals(const ChunkedArray& other, + const EqualOptions& equal_options) const { + if (length_ != other.length()) { + return false; + } + if (null_count_ != other.null_count()) { + return false; + } + // We cannot toggle check_metadata here yet, so we don't check it + if (!type_->Equals(*other.type_, /*check_metadata=*/false)) { + return false; + } + + // Check contents of the underlying arrays. This checks for equality of + // the underlying data independently of the chunk size. 
+ return internal::ApplyBinaryChunked( + *this, other, + [&](const Array& left_piece, const Array& right_piece, + int64_t ARROW_ARG_UNUSED(position)) { + if (!left_piece.ApproxEquals(right_piece, equal_options)) { + return Status::Invalid("Unequal piece"); + } + return Status::OK(); + }) + .ok(); +} + std::shared_ptr ChunkedArray::Slice(int64_t offset, int64_t length) const { ARROW_CHECK_LE(offset, length_) << "Slice offset greater than array length"; bool offset_equals_length = offset == length_; diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 5c0dda91850..2ace045c2bf 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -23,6 +23,7 @@ #include #include +#include "arrow/compare.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" @@ -136,6 +137,9 @@ class ARROW_EXPORT ChunkedArray { bool Equals(const ChunkedArray& other) const; /// \brief Determine if two chunked arrays are equal. bool Equals(const std::shared_ptr& other) const; + /// \brief Determine if two chunked arrays approximately equal + bool ApproxEquals(const ChunkedArray& other, + const EqualOptions& = EqualOptions::Defaults()) const; /// \return PrettyPrint representation suitable for debugging std::string ToString() const; diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index e781dff90e2..897dc32f357 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -68,3 +68,5 @@ add_arrow_compute_test(internals_test add_arrow_benchmark(function_benchmark PREFIX "arrow-compute") add_subdirectory(kernels) + +add_subdirectory(exec) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 5afa1048960..1b00c366bfd 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -18,35 +18,151 @@ #include "arrow/compute/api_aggregate.h" #include "arrow/compute/exec.h" 
+#include "arrow/compute/function_internal.h" +#include "arrow/compute/registry.h" +#include "arrow/compute/util_internal.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" namespace arrow { + +namespace internal { +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "QuantileOptions::Interpolation"; } + static std::string value_name(compute::QuantileOptions::Interpolation value) { + switch (value) { + case compute::QuantileOptions::LINEAR: + return "LINEAR"; + case compute::QuantileOptions::LOWER: + return "LOWER"; + case compute::QuantileOptions::HIGHER: + return "HIGHER"; + case compute::QuantileOptions::NEAREST: + return "NEAREST"; + case compute::QuantileOptions::MIDPOINT: + return "MIDPOINT"; + } + return ""; + } +}; +} // namespace internal + namespace compute { +// ---------------------------------------------------------------------- +// Function options + +using ::arrow::internal::checked_cast; + +namespace internal { +namespace { +using ::arrow::internal::DataMember; +static auto kScalarAggregateOptionsType = GetFunctionOptionsType( + DataMember("skip_nulls", &ScalarAggregateOptions::skip_nulls), + DataMember("min_count", &ScalarAggregateOptions::min_count)); +static auto kModeOptionsType = + GetFunctionOptionsType(DataMember("n", &ModeOptions::n)); +static auto kVarianceOptionsType = + GetFunctionOptionsType(DataMember("ddof", &VarianceOptions::ddof)); +static auto kQuantileOptionsType = GetFunctionOptionsType( + DataMember("q", &QuantileOptions::q), + DataMember("interpolation", &QuantileOptions::interpolation)); +static auto kTDigestOptionsType = GetFunctionOptionsType( + DataMember("q", &TDigestOptions::q), DataMember("delta", &TDigestOptions::delta), + DataMember("buffer_size", &TDigestOptions::buffer_size)); +static auto kIndexOptionsType = + GetFunctionOptionsType(DataMember("value", &IndexOptions::value)); +} // namespace +} // namespace internal + 
+ScalarAggregateOptions::ScalarAggregateOptions(bool skip_nulls, uint32_t min_count) + : FunctionOptions(internal::kScalarAggregateOptionsType), + skip_nulls(skip_nulls), + min_count(min_count) {} +constexpr char ScalarAggregateOptions::kTypeName[]; + +ModeOptions::ModeOptions(int64_t n) : FunctionOptions(internal::kModeOptionsType), n(n) {} +constexpr char ModeOptions::kTypeName[]; + +VarianceOptions::VarianceOptions(int ddof) + : FunctionOptions(internal::kVarianceOptionsType), ddof(ddof) {} +constexpr char VarianceOptions::kTypeName[]; + +QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation) + : FunctionOptions(internal::kQuantileOptionsType), + q{q}, + interpolation{interpolation} {} +QuantileOptions::QuantileOptions(std::vector q, enum Interpolation interpolation) + : FunctionOptions(internal::kQuantileOptionsType), + q{std::move(q)}, + interpolation{interpolation} {} +constexpr char QuantileOptions::kTypeName[]; + +TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size) + : FunctionOptions(internal::kTDigestOptionsType), + q{q}, + delta{delta}, + buffer_size{buffer_size} {} +TDigestOptions::TDigestOptions(std::vector q, uint32_t delta, + uint32_t buffer_size) + : FunctionOptions(internal::kTDigestOptionsType), + q{std::move(q)}, + delta{delta}, + buffer_size{buffer_size} {} +constexpr char TDigestOptions::kTypeName[]; + +IndexOptions::IndexOptions(std::shared_ptr value) + : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {} +IndexOptions::IndexOptions() : IndexOptions(std::make_shared()) {} +constexpr char IndexOptions::kTypeName[]; + +namespace internal { +void RegisterAggregateOptions(FunctionRegistry* registry) { + DCHECK_OK(registry->AddFunctionOptionsType(kScalarAggregateOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType)); + 
DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType)); +} +} // namespace internal + // ---------------------------------------------------------------------- // Scalar aggregates -Result Count(const Datum& value, CountOptions options, ExecContext* ctx) { +Result Count(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { return CallFunction("count", {value}, &options, ctx); } -Result Mean(const Datum& value, ExecContext* ctx) { - return CallFunction("mean", {value}, ctx); +Result Mean(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { + return CallFunction("mean", {value}, &options, ctx); } -Result Sum(const Datum& value, ExecContext* ctx) { - return CallFunction("sum", {value}, ctx); +Result Sum(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { + return CallFunction("sum", {value}, &options, ctx); } -Result MinMax(const Datum& value, const MinMaxOptions& options, ExecContext* ctx) { +Result MinMax(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { return CallFunction("min_max", {value}, &options, ctx); } -Result Any(const Datum& value, ExecContext* ctx) { - return CallFunction("any", {value}, ctx); +Result Any(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { + return CallFunction("any", {value}, &options, ctx); } -Result All(const Datum& value, ExecContext* ctx) { - return CallFunction("all", {value}, ctx); +Result All(const Datum& value, const ScalarAggregateOptions& options, + ExecContext* ctx) { + return CallFunction("all", {value}, &options, ctx); } Result Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) { @@ -73,5 +189,9 @@ Result TDigest(const Datum& value, const TDigestOptions& options, return CallFunction("tdigest", {value}, &options, ctx); } +Result Index(const Datum& value, const IndexOptions& options, 
ExecContext* ctx) { + return CallFunction("index", {value}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index ca118ec5678..d66d4f1517c 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -40,49 +40,27 @@ class ExecContext; /// \addtogroup compute-concrete-options /// @{ -/// \brief Control Count kernel behavior -/// -/// By default, all non-null values are counted. -struct ARROW_EXPORT CountOptions : public FunctionOptions { - enum Mode { - /// Count all non-null values. - COUNT_NON_NULL = 0, - /// Count all null values. - COUNT_NULL, - }; - - explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {} - - static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); } - - enum Mode count_mode; -}; - -/// \brief Control MinMax kernel behavior +/// \brief Control general scalar aggregate kernel behavior /// /// By default, null values are ignored -struct ARROW_EXPORT MinMaxOptions : public FunctionOptions { - enum Mode { - /// Skip null values - SKIP = 0, - /// Any nulls will result in null output - EMIT_NULL - }; - - explicit MinMaxOptions(enum Mode null_handling = SKIP) : null_handling(null_handling) {} - - static MinMaxOptions Defaults() { return MinMaxOptions{}; } +class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions { + public: + explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1); + constexpr static char const kTypeName[] = "ScalarAggregateOptions"; + static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; } - enum Mode null_handling; + bool skip_nulls; + uint32_t min_count; }; /// \brief Control Mode kernel behavior /// /// Returns top-n common values and counts. /// By default, returns the most common value and count. 
-struct ARROW_EXPORT ModeOptions : public FunctionOptions { - explicit ModeOptions(int64_t n = 1) : n(n) {} - +class ARROW_EXPORT ModeOptions : public FunctionOptions { + public: + explicit ModeOptions(int64_t n = 1); + constexpr static char const kTypeName[] = "ModeOptions"; static ModeOptions Defaults() { return ModeOptions{}; } int64_t n = 1; @@ -92,9 +70,10 @@ struct ARROW_EXPORT ModeOptions : public FunctionOptions { /// /// The divisor used in calculations is N - ddof, where N is the number of elements. /// By default, ddof is zero, and population variance or stddev is returned. -struct ARROW_EXPORT VarianceOptions : public FunctionOptions { - explicit VarianceOptions(int ddof = 0) : ddof(ddof) {} - +class ARROW_EXPORT VarianceOptions : public FunctionOptions { + public: + explicit VarianceOptions(int ddof = 0); + constexpr static char const kTypeName[] = "VarianceOptions"; static VarianceOptions Defaults() { return VarianceOptions{}; } int ddof = 0; @@ -103,7 +82,8 @@ struct ARROW_EXPORT VarianceOptions : public FunctionOptions { /// \brief Control Quantile kernel behavior /// /// By default, returns the median value. 
-struct ARROW_EXPORT QuantileOptions : public FunctionOptions { +class ARROW_EXPORT QuantileOptions : public FunctionOptions { + public: /// Interpolation method to use when quantile lies between two data points enum Interpolation { LINEAR = 0, @@ -113,13 +93,12 @@ struct ARROW_EXPORT QuantileOptions : public FunctionOptions { MIDPOINT, }; - explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR) - : q{q}, interpolation{interpolation} {} + explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR); explicit QuantileOptions(std::vector q, - enum Interpolation interpolation = LINEAR) - : q{std::move(q)}, interpolation{interpolation} {} + enum Interpolation interpolation = LINEAR); + constexpr static char const kTypeName[] = "QuantileOptions"; static QuantileOptions Defaults() { return QuantileOptions{}; } /// quantile must be between 0 and 1 inclusive @@ -130,15 +109,13 @@ struct ARROW_EXPORT QuantileOptions : public FunctionOptions { /// \brief Control TDigest approximate quantile kernel behavior /// /// By default, returns the median value. 
-struct ARROW_EXPORT TDigestOptions : public FunctionOptions { +class ARROW_EXPORT TDigestOptions : public FunctionOptions { + public: explicit TDigestOptions(double q = 0.5, uint32_t delta = 100, - uint32_t buffer_size = 500) - : q{q}, delta{delta}, buffer_size{buffer_size} {} - + uint32_t buffer_size = 500); explicit TDigestOptions(std::vector q, uint32_t delta = 100, - uint32_t buffer_size = 500) - : q{std::move(q)}, delta{delta}, buffer_size{buffer_size} {} - + uint32_t buffer_size = 500); + constexpr static char const kTypeName[] = "TDigestOptions"; static TDigestOptions Defaults() { return TDigestOptions{}; } /// quantile must be between 0 and 1 inclusive @@ -149,11 +126,22 @@ struct ARROW_EXPORT TDigestOptions : public FunctionOptions { uint32_t buffer_size; }; +/// \brief Control Index kernel behavior +class ARROW_EXPORT IndexOptions : public FunctionOptions { + public: + explicit IndexOptions(std::shared_ptr value); + // Default constructor for serialization + IndexOptions(); + constexpr static char const kTypeName[] = "IndexOptions"; + + std::shared_ptr value; +}; + /// @} /// \brief Count non-null (or null) values in an array. /// -/// \param[in] options counting options, see CountOptions for more information +/// \param[in] options counting options, see ScalarAggregateOptions for more information /// \param[in] datum to count /// \param[in] ctx the function execution context, optional /// \return out resulting datum @@ -161,30 +149,40 @@ struct ARROW_EXPORT TDigestOptions : public FunctionOptions { /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Count(const Datum& datum, CountOptions options = CountOptions::Defaults(), - ExecContext* ctx = NULLPTR); +Result Count( + const Datum& datum, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Compute the mean of a numeric array. 
/// /// \param[in] value datum to compute the mean, expecting Array +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return datum of the computed mean as a DoubleScalar /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Mean(const Datum& value, ExecContext* ctx = NULLPTR); +Result Mean( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Sum values of a numeric array. /// /// \param[in] value datum to sum, expecting Array or ChunkedArray +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return datum of the computed sum as a Scalar /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Sum(const Datum& value, ExecContext* ctx = NULLPTR); +Result Sum( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Calculate the min / max of a numeric array /// @@ -192,44 +190,59 @@ Result Sum(const Datum& value, ExecContext* ctx = NULLPTR); /// struct, where T is the input type /// /// \param[in] value input datum, expecting Array or ChunkedArray -/// \param[in] options see MinMaxOptions for more information +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return resulting datum as a struct scalar /// /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result MinMax(const Datum& value, - const MinMaxOptions& options = MinMaxOptions::Defaults(), - ExecContext* ctx = NULLPTR); +Result MinMax( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Test whether any element in a boolean array evaluates to true. 
/// /// This function returns true if any of the elements in the array evaluates -/// to true and false otherwise. Null values are skipped. +/// to true and false otherwise. Null values are ignored by default. +/// If null values are taken into account by setting ScalarAggregateOptions +/// parameter skip_nulls = false then Kleene logic is used. +/// See KleeneOr for more details on Kleene logic. /// /// \param[in] value input datum, expecting a boolean array +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return resulting datum as a BooleanScalar /// /// \since 3.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Any(const Datum& value, ExecContext* ctx = NULLPTR); +Result Any( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Test whether all elements in a boolean array evaluate to true. /// /// This function returns true if all of the elements in the array evaluate -/// to true and false otherwise. Null values are skipped. +/// to true and false otherwise. Null values are ignored by default. +/// If null values are taken into account by setting ScalarAggregateOptions +/// parameter skip_nulls = false then Kleene logic is used. +/// See KleeneAnd for more details on Kleene logic. 
/// /// \param[in] value input datum, expecting a boolean array +/// \param[in] options see ScalarAggregateOptions for more information /// \param[in] ctx the function execution context, optional /// \return resulting datum as a BooleanScalar /// \since 3.0.0 /// \note API not yet finalized ARROW_EXPORT -Result All(const Datum& value, ExecContext* ctx = NULLPTR); +Result All( + const Datum& value, + const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); /// \brief Calculate the modal (most common) value of a numeric array /// @@ -306,6 +319,19 @@ Result TDigest(const Datum& value, const TDigestOptions& options = TDigestOptions::Defaults(), ExecContext* ctx = NULLPTR); +/// \brief Find the first index of a value in an array. +/// +/// \param[in] value The array to search. +/// \param[in] options The array to search for. See IndexOoptions. +/// \param[in] ctx the function execution context, optional +/// \return out a Scalar containing the index (or -1 if not found). +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Index(const Datum& value, const IndexOptions& options, + ExecContext* ctx = NULLPTR); + namespace internal { /// Internal use only: streaming group identifier. @@ -399,7 +425,7 @@ struct ARROW_EXPORT Aggregate { /// This will be replaced by streaming execution operators. 
ARROW_EXPORT Result GroupBy(const std::vector& arguments, const std::vector& keys, - const std::vector& aggregates, + const std::vector& aggregates, bool use_threads = false, ExecContext* ctx = default_exec_context()); } // namespace internal diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index f4696fbe02a..1feb4e7eee0 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -21,13 +21,287 @@ #include #include +#include "arrow/array/array_base.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function_internal.h" +#include "arrow/compute/registry.h" +#include "arrow/compute/util_internal.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" namespace arrow { + +namespace internal { +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "JoinOptions::NullHandlingBehavior"; } + static std::string value_name(compute::JoinOptions::NullHandlingBehavior value) { + switch (value) { + case compute::JoinOptions::NullHandlingBehavior::EMIT_NULL: + return "EMIT_NULL"; + case compute::JoinOptions::NullHandlingBehavior::SKIP: + return "SKIP"; + case compute::JoinOptions::NullHandlingBehavior::REPLACE: + return "REPLACE"; + } + return ""; + } +}; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "TimeUnit::type"; } + static std::string value_name(TimeUnit::type value) { + switch (value) { + case TimeUnit::type::SECOND: + return "SECOND"; + case TimeUnit::type::MILLI: + return "MILLI"; + case TimeUnit::type::MICRO: + return "MICRO"; + case TimeUnit::type::NANO: + return "NANO"; + } + return ""; + } +}; +template <> +struct EnumTraits + : BasicEnumTraits< + compute::CompareOperator, compute::CompareOperator::EQUAL, + compute::CompareOperator::NOT_EQUAL, compute::CompareOperator::GREATER, + compute::CompareOperator::GREATER_EQUAL, 
compute::CompareOperator::LESS, + compute::CompareOperator::LESS_EQUAL> { + static std::string name() { return "compute::CompareOperator"; } + static std::string value_name(compute::CompareOperator value) { + switch (value) { + case compute::CompareOperator::EQUAL: + return "EQUAL"; + case compute::CompareOperator::NOT_EQUAL: + return "NOT_EQUAL"; + case compute::CompareOperator::GREATER: + return "GREATER"; + case compute::CompareOperator::GREATER_EQUAL: + return "GREATER_EQUAL"; + case compute::CompareOperator::LESS: + return "LESS"; + case compute::CompareOperator::LESS_EQUAL: + return "LESS_EQUAL"; + } + return ""; + } +}; +} // namespace internal + namespace compute { +// ---------------------------------------------------------------------- +// Function options + +using ::arrow::internal::checked_cast; + +namespace internal { +namespace { +using ::arrow::internal::DataMember; +static auto kArithmeticOptionsType = GetFunctionOptionsType( + DataMember("check_overflow", &ArithmeticOptions::check_overflow)); +static auto kElementWiseAggregateOptionsType = + GetFunctionOptionsType( + DataMember("skip_nulls", &ElementWiseAggregateOptions::skip_nulls)); +static auto kJoinOptionsType = GetFunctionOptionsType( + DataMember("null_handling", &JoinOptions::null_handling), + DataMember("null_replacement", &JoinOptions::null_replacement)); +static auto kMatchSubstringOptionsType = GetFunctionOptionsType( + DataMember("pattern", &MatchSubstringOptions::pattern), + DataMember("ignore_case", &MatchSubstringOptions::ignore_case)); +static auto kSplitOptionsType = GetFunctionOptionsType( + DataMember("max_splits", &SplitOptions::max_splits), + DataMember("reverse", &SplitOptions::reverse)); +static auto kSplitPatternOptionsType = GetFunctionOptionsType( + DataMember("pattern", &SplitPatternOptions::pattern), + DataMember("max_splits", &SplitPatternOptions::max_splits), + DataMember("reverse", &SplitPatternOptions::reverse)); +static auto kReplaceSliceOptionsType = 
GetFunctionOptionsType( + DataMember("start", &ReplaceSliceOptions::start), + DataMember("stop", &ReplaceSliceOptions::stop), + DataMember("replacement", &ReplaceSliceOptions::replacement)); +static auto kReplaceSubstringOptionsType = + GetFunctionOptionsType( + DataMember("pattern", &ReplaceSubstringOptions::pattern), + DataMember("replacement", &ReplaceSubstringOptions::replacement), + DataMember("max_replacements", &ReplaceSubstringOptions::max_replacements)); +static auto kExtractRegexOptionsType = GetFunctionOptionsType( + DataMember("pattern", &ExtractRegexOptions::pattern)); +static auto kSetLookupOptionsType = GetFunctionOptionsType( + DataMember("value_set", &SetLookupOptions::value_set), + DataMember("skip_nulls", &SetLookupOptions::skip_nulls)); +static auto kStrptimeOptionsType = GetFunctionOptionsType( + DataMember("format", &StrptimeOptions::format), + DataMember("unit", &StrptimeOptions::unit)); +static auto kPadOptionsType = GetFunctionOptionsType( + DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding)); +static auto kTrimOptionsType = GetFunctionOptionsType( + DataMember("characters", &TrimOptions::characters)); +static auto kSliceOptionsType = GetFunctionOptionsType( + DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop), + DataMember("step", &SliceOptions::step)); +static auto kMakeStructOptionsType = GetFunctionOptionsType( + DataMember("field_names", &MakeStructOptions::field_names), + DataMember("field_nullability", &MakeStructOptions::field_nullability), + DataMember("field_metadata", &MakeStructOptions::field_metadata)); +static auto kDayOfWeekOptionsType = GetFunctionOptionsType( + DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering), + DataMember("week_start", &DayOfWeekOptions::week_start)); +} // namespace +} // namespace internal + +ArithmeticOptions::ArithmeticOptions(bool check_overflow) + : FunctionOptions(internal::kArithmeticOptionsType), 
check_overflow(check_overflow) {} +constexpr char ArithmeticOptions::kTypeName[]; + +ElementWiseAggregateOptions::ElementWiseAggregateOptions(bool skip_nulls) + : FunctionOptions(internal::kElementWiseAggregateOptionsType), + skip_nulls(skip_nulls) {} +constexpr char ElementWiseAggregateOptions::kTypeName[]; + +JoinOptions::JoinOptions(NullHandlingBehavior null_handling, std::string null_replacement) + : FunctionOptions(internal::kJoinOptionsType), + null_handling(null_handling), + null_replacement(std::move(null_replacement)) {} +constexpr char JoinOptions::kTypeName[]; + +MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case) + : FunctionOptions(internal::kMatchSubstringOptionsType), + pattern(std::move(pattern)), + ignore_case(ignore_case) {} +MatchSubstringOptions::MatchSubstringOptions() : MatchSubstringOptions("", false) {} +constexpr char MatchSubstringOptions::kTypeName[]; + +SplitOptions::SplitOptions(int64_t max_splits, bool reverse) + : FunctionOptions(internal::kSplitOptionsType), + max_splits(max_splits), + reverse(reverse) {} +constexpr char SplitOptions::kTypeName[]; + +SplitPatternOptions::SplitPatternOptions(std::string pattern, int64_t max_splits, + bool reverse) + : FunctionOptions(internal::kSplitPatternOptionsType), + pattern(std::move(pattern)), + max_splits(max_splits), + reverse(reverse) {} +SplitPatternOptions::SplitPatternOptions() : SplitPatternOptions("", -1, false) {} +constexpr char SplitPatternOptions::kTypeName[]; + +ReplaceSliceOptions::ReplaceSliceOptions(int64_t start, int64_t stop, + std::string replacement) + : FunctionOptions(internal::kReplaceSliceOptionsType), + start(start), + stop(stop), + replacement(std::move(replacement)) {} +ReplaceSliceOptions::ReplaceSliceOptions() : ReplaceSliceOptions(0, 0, "") {} +constexpr char ReplaceSliceOptions::kTypeName[]; + +ReplaceSubstringOptions::ReplaceSubstringOptions(std::string pattern, + std::string replacement, + int64_t max_replacements) + : 
FunctionOptions(internal::kReplaceSubstringOptionsType), + pattern(std::move(pattern)), + replacement(std::move(replacement)), + max_replacements(max_replacements) {} +ReplaceSubstringOptions::ReplaceSubstringOptions() + : ReplaceSubstringOptions("", "", -1) {} +constexpr char ReplaceSubstringOptions::kTypeName[]; + +ExtractRegexOptions::ExtractRegexOptions(std::string pattern) + : FunctionOptions(internal::kExtractRegexOptionsType), pattern(std::move(pattern)) {} +ExtractRegexOptions::ExtractRegexOptions() : ExtractRegexOptions("") {} +constexpr char ExtractRegexOptions::kTypeName[]; + +SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls) + : FunctionOptions(internal::kSetLookupOptionsType), + value_set(std::move(value_set)), + skip_nulls(skip_nulls) {} +SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {} +constexpr char SetLookupOptions::kTypeName[]; + +StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit) + : FunctionOptions(internal::kStrptimeOptionsType), + format(std::move(format)), + unit(unit) {} +StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {} +constexpr char StrptimeOptions::kTypeName[]; + +PadOptions::PadOptions(int64_t width, std::string padding) + : FunctionOptions(internal::kPadOptionsType), + width(width), + padding(std::move(padding)) {} +PadOptions::PadOptions() : PadOptions(0, " ") {} +constexpr char PadOptions::kTypeName[]; + +TrimOptions::TrimOptions(std::string characters) + : FunctionOptions(internal::kTrimOptionsType), characters(std::move(characters)) {} +TrimOptions::TrimOptions() : TrimOptions("") {} +constexpr char TrimOptions::kTypeName[]; + +SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step) + : FunctionOptions(internal::kSliceOptionsType), + start(start), + stop(stop), + step(step) {} +SliceOptions::SliceOptions() : SliceOptions(0, 0, 1) {} +constexpr char SliceOptions::kTypeName[]; + +MakeStructOptions::MakeStructOptions( + 
std::vector n, std::vector r, + std::vector> m) + : FunctionOptions(internal::kMakeStructOptionsType), + field_names(std::move(n)), + field_nullability(std::move(r)), + field_metadata(std::move(m)) {} + +MakeStructOptions::MakeStructOptions(std::vector n) + : FunctionOptions(internal::kMakeStructOptionsType), + field_names(std::move(n)), + field_nullability(field_names.size(), true), + field_metadata(field_names.size(), NULLPTR) {} + +MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector()) {} +constexpr char MakeStructOptions::kTypeName[]; + +DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start) + : FunctionOptions(internal::kDayOfWeekOptionsType), + one_based_numbering(one_based_numbering), + week_start(week_start) {} +constexpr char DayOfWeekOptions::kTypeName[]; + +namespace internal { +void RegisterScalarOptions(FunctionRegistry* registry) { + DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kElementWiseAggregateOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kSplitOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kSplitPatternOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType)); + 
DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType)); +} +} // namespace internal + #define SCALAR_EAGER_UNARY(NAME, REGISTRY_NAME) \ Result NAME(const Datum& value, ExecContext* ctx) { \ return CallFunction(REGISTRY_NAME, {value}, ctx); \ @@ -41,6 +315,26 @@ namespace compute { // ---------------------------------------------------------------------- // Arithmetic +#define SCALAR_ARITHMETIC_UNARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \ + Result NAME(const Datum& arg, ArithmeticOptions options, ExecContext* ctx) { \ + auto func_name = (options.check_overflow) ? REGISTRY_CHECKED_NAME : REGISTRY_NAME; \ + return CallFunction(func_name, {arg}, ctx); \ + } + +SCALAR_ARITHMETIC_UNARY(AbsoluteValue, "abs", "abs_checked") +SCALAR_ARITHMETIC_UNARY(Negate, "negate", "negate_checked") +SCALAR_EAGER_UNARY(Sign, "sign") +SCALAR_ARITHMETIC_UNARY(Sin, "sin", "sin_checked") +SCALAR_ARITHMETIC_UNARY(Cos, "cos", "cos_checked") +SCALAR_ARITHMETIC_UNARY(Asin, "asin", "asin_checked") +SCALAR_ARITHMETIC_UNARY(Acos, "acos", "acos_checked") +SCALAR_ARITHMETIC_UNARY(Tan, "tan", "tan_checked") +SCALAR_EAGER_UNARY(Atan, "atan") +SCALAR_ARITHMETIC_UNARY(Ln, "ln", "ln_checked") +SCALAR_ARITHMETIC_UNARY(Log10, "log10", "log10_checked") +SCALAR_ARITHMETIC_UNARY(Log2, "log2", "log2_checked") +SCALAR_ARITHMETIC_UNARY(Log1p, "log1p", "log1p_checked") + #define SCALAR_ARITHMETIC_BINARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \ Result NAME(const Datum& left, const Datum& right, ArithmeticOptions options, \ ExecContext* ctx) { \ @@ -52,6 +346,23 @@ SCALAR_ARITHMETIC_BINARY(Add, "add", "add_checked") SCALAR_ARITHMETIC_BINARY(Subtract, "subtract", "subtract_checked") SCALAR_ARITHMETIC_BINARY(Multiply, "multiply", "multiply_checked") SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked") +SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked") +SCALAR_ARITHMETIC_BINARY(ShiftLeft, "shift_left", "shift_left_checked") +SCALAR_ARITHMETIC_BINARY(ShiftRight, "shift_right", 
"shift_right_checked") +SCALAR_EAGER_BINARY(Atan2, "atan2") +SCALAR_EAGER_UNARY(Floor, "floor") +SCALAR_EAGER_UNARY(Ceil, "ceil") +SCALAR_EAGER_UNARY(Trunc, "trunc") + +Result MaxElementWise(const std::vector& args, + ElementWiseAggregateOptions options, ExecContext* ctx) { + return CallFunction("max_element_wise", args, &options, ctx); +} + +Result MinElementWise(const std::vector& args, + ElementWiseAggregateOptions options, ExecContext* ctx) { + return CallFunction("min_element_wise", args, &options, ctx); +} // ---------------------------------------------------------------------- // Set-related operations @@ -133,7 +444,7 @@ Result Compare(const Datum& left, const Datum& right, CompareOptions opti func_name = "less_equal"; break; } - return CallFunction(func_name, {left, right}, &options, ctx); + return CallFunction(func_name, {left, right}, nullptr, ctx); } // ---------------------------------------------------------------------- @@ -147,5 +458,41 @@ Result FillNull(const Datum& values, const Datum& fill_value, ExecContext return CallFunction("fill_null", {values, fill_value}, ctx); } +Result IfElse(const Datum& cond, const Datum& if_true, const Datum& if_false, + ExecContext* ctx) { + return CallFunction("if_else", {cond, if_true, if_false}, ctx); +} + +Result CaseWhen(const Datum& cond, const std::vector& cases, + ExecContext* ctx) { + std::vector args = {cond}; + args.reserve(cases.size() + 1); + args.insert(args.end(), cases.begin(), cases.end()); + return CallFunction("case_when", args, ctx); +} + +// ---------------------------------------------------------------------- +// Temporal functions + +SCALAR_EAGER_UNARY(Year, "year") +SCALAR_EAGER_UNARY(Month, "month") +SCALAR_EAGER_UNARY(Day, "day") +SCALAR_EAGER_UNARY(DayOfYear, "day_of_year") +SCALAR_EAGER_UNARY(ISOYear, "iso_year") +SCALAR_EAGER_UNARY(ISOWeek, "iso_week") +SCALAR_EAGER_UNARY(ISOCalendar, "iso_calendar") +SCALAR_EAGER_UNARY(Quarter, "quarter") +SCALAR_EAGER_UNARY(Hour, "hour") 
+SCALAR_EAGER_UNARY(Minute, "minute") +SCALAR_EAGER_UNARY(Second, "second") +SCALAR_EAGER_UNARY(Millisecond, "millisecond") +SCALAR_EAGER_UNARY(Microsecond, "microsecond") +SCALAR_EAGER_UNARY(Nanosecond, "nanosecond") +SCALAR_EAGER_UNARY(Subsecond, "subsecond") + +Result DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* ctx) { + return CallFunction("day_of_week", {arg}, &options, ctx); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index f59426d8f1b..e07e41569a1 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -37,21 +37,58 @@ namespace compute { /// /// @{ -struct ArithmeticOptions : public FunctionOptions { - ArithmeticOptions() : check_overflow(false) {} +class ARROW_EXPORT ArithmeticOptions : public FunctionOptions { + public: + explicit ArithmeticOptions(bool check_overflow = false); + constexpr static char const kTypeName[] = "ArithmeticOptions"; bool check_overflow; }; -struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { - explicit MatchSubstringOptions(std::string pattern) : pattern(std::move(pattern)) {} +class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions { + public: + explicit ElementWiseAggregateOptions(bool skip_nulls = true); + constexpr static char const kTypeName[] = "ElementWiseAggregateOptions"; + static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; } + + bool skip_nulls; +}; + +/// Options for var_args_join. +class ARROW_EXPORT JoinOptions : public FunctionOptions { + public: + /// How to handle null values. (A null separator always results in a null output.) + enum NullHandlingBehavior { + /// A null in any input results in a null in the output. + EMIT_NULL, + /// Nulls in inputs are skipped. + SKIP, + /// Nulls in inputs are replaced with the replacement string. 
+ REPLACE, + }; + explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL, + std::string null_replacement = ""); + constexpr static char const kTypeName[] = "JoinOptions"; + static JoinOptions Defaults() { return JoinOptions(); } + NullHandlingBehavior null_handling; + std::string null_replacement; +}; + +class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { + public: + explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false); + MatchSubstringOptions(); + constexpr static char const kTypeName[] = "MatchSubstringOptions"; /// The exact substring (or regex, depending on kernel) to look for inside input values. std::string pattern; + /// Whether to perform a case-insensitive match. + bool ignore_case = false; }; -struct ARROW_EXPORT SplitOptions : public FunctionOptions { - explicit SplitOptions(int64_t max_splits = -1, bool reverse = false) - : max_splits(max_splits), reverse(reverse) {} +class ARROW_EXPORT SplitOptions : public FunctionOptions { + public: + explicit SplitOptions(int64_t max_splits = -1, bool reverse = false); + constexpr static char const kTypeName[] = "SplitOptions"; /// Maximum number of splits allowed, or unlimited when -1 int64_t max_splits; @@ -59,19 +96,41 @@ struct ARROW_EXPORT SplitOptions : public FunctionOptions { bool reverse; }; -struct ARROW_EXPORT SplitPatternOptions : public SplitOptions { +class ARROW_EXPORT SplitPatternOptions : public FunctionOptions { + public: explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1, - bool reverse = false) - : SplitOptions(max_splits, reverse), pattern(std::move(pattern)) {} + bool reverse = false); + SplitPatternOptions(); + constexpr static char const kTypeName[] = "SplitPatternOptions"; - /// The exact substring to look for inside input values. + /// The exact substring to split on. 
std::string pattern; + /// Maximum number of splits allowed, or unlimited when -1 + int64_t max_splits; + /// Start splitting from the end of the string (only relevant when max_splits != -1) + bool reverse; +}; + +class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions { + public: + explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement); + ReplaceSliceOptions(); + constexpr static char const kTypeName[] = "ReplaceSliceOptions"; + + /// Index to start slicing at + int64_t start; + /// Index to stop slicing at + int64_t stop; + /// String to replace the slice with + std::string replacement; }; -struct ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions { +class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions { + public: explicit ReplaceSubstringOptions(std::string pattern, std::string replacement, - int64_t max_replacements = -1) - : pattern(pattern), replacement(replacement), max_replacements(max_replacements) {} + int64_t max_replacements = -1); + ReplaceSubstringOptions(); + constexpr static char const kTypeName[] = "ReplaceSubstringOptions"; /// Pattern to match, literal, or regular expression depending on which kernel is used std::string pattern; @@ -81,10 +140,22 @@ struct ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions { int64_t max_replacements; }; +class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions { + public: + explicit ExtractRegexOptions(std::string pattern); + ExtractRegexOptions(); + constexpr static char const kTypeName[] = "ExtractRegexOptions"; + + /// Regular expression with named capture fields + std::string pattern; +}; + /// Options for IsIn and IndexIn functions -struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { - explicit SetLookupOptions(Datum value_set, bool skip_nulls = false) - : value_set(std::move(value_set)), skip_nulls(skip_nulls) {} +class ARROW_EXPORT SetLookupOptions : public FunctionOptions { + public: + explicit 
SetLookupOptions(Datum value_set, bool skip_nulls = false); + SetLookupOptions(); + constexpr static char const kTypeName[] = "SetLookupOptions"; /// The set of values to look up input values into. Datum value_set; @@ -97,21 +168,47 @@ struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { bool skip_nulls; }; -struct ARROW_EXPORT StrptimeOptions : public FunctionOptions { - explicit StrptimeOptions(std::string format, TimeUnit::type unit) - : format(std::move(format)), unit(unit) {} +class ARROW_EXPORT StrptimeOptions : public FunctionOptions { + public: + explicit StrptimeOptions(std::string format, TimeUnit::type unit); + StrptimeOptions(); + constexpr static char const kTypeName[] = "StrptimeOptions"; std::string format; TimeUnit::type unit; }; -struct ARROW_EXPORT TrimOptions : public FunctionOptions { - explicit TrimOptions(std::string characters) : characters(std::move(characters)) {} +class ARROW_EXPORT PadOptions : public FunctionOptions { + public: + explicit PadOptions(int64_t width, std::string padding = " "); + PadOptions(); + constexpr static char const kTypeName[] = "PadOptions"; + + /// The desired string length. + int64_t width; + /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII). + std::string padding; +}; + +class ARROW_EXPORT TrimOptions : public FunctionOptions { + public: + explicit TrimOptions(std::string characters); + TrimOptions(); + constexpr static char const kTypeName[] = "TrimOptions"; /// The individual characters that can be trimmed from the string. 
std::string characters; }; +class ARROW_EXPORT SliceOptions : public FunctionOptions { + public: + explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits::max(), + int64_t step = 1); + SliceOptions(); + constexpr static char const kTypeName[] = "SliceOptions"; + int64_t start, stop, step; +}; + enum CompareOperator : int8_t { EQUAL, NOT_EQUAL, @@ -121,23 +218,19 @@ enum CompareOperator : int8_t { LESS_EQUAL, }; -struct CompareOptions : public FunctionOptions { +struct ARROW_EXPORT CompareOptions { explicit CompareOptions(CompareOperator op) : op(op) {} - + CompareOptions() : CompareOptions(CompareOperator::EQUAL) {} enum CompareOperator op; }; -struct ARROW_EXPORT ProjectOptions : public FunctionOptions { - ProjectOptions(std::vector n, std::vector r, - std::vector> m) - : field_names(std::move(n)), - field_nullability(std::move(r)), - field_metadata(std::move(m)) {} - - explicit ProjectOptions(std::vector n) - : field_names(std::move(n)), - field_nullability(field_names.size(), true), - field_metadata(field_names.size(), NULLPTR) {} +class ARROW_EXPORT MakeStructOptions : public FunctionOptions { + public: + MakeStructOptions(std::vector n, std::vector r, + std::vector> m); + explicit MakeStructOptions(std::vector n); + MakeStructOptions(); + constexpr static char const kTypeName[] = "MakeStructOptions"; /// Names for wrapped columns std::vector field_names; @@ -149,8 +242,33 @@ struct ARROW_EXPORT ProjectOptions : public FunctionOptions { std::vector> field_metadata; }; +struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions { + public: + explicit DayOfWeekOptions(bool one_based_numbering = false, uint32_t week_start = 1); + constexpr static char const kTypeName[] = "DayOfWeekOptions"; + static DayOfWeekOptions Defaults() { return DayOfWeekOptions{}; } + + /// Number days from 1 if true and from 0 if false + bool one_based_numbering; + /// What day does the week start with (Monday=1, Sunday=7) + uint32_t week_start; +}; + /// @} +/// 
\brief Get the absolute value of a value. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg the value transformed +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise absolute value +ARROW_EXPORT +Result AbsoluteValue(const Datum& arg, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + /// \brief Add two values together. Array values must be the same length. If /// either addend is null the result will be null. /// @@ -204,6 +322,233 @@ Result Divide(const Datum& left, const Datum& right, ArithmeticOptions options = ArithmeticOptions(), ExecContext* ctx = NULLPTR); +/// \brief Negate values. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg the value negated +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise negation +ARROW_EXPORT +Result Negate(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Raise the values of base array to the power of the exponent array values. +/// Array values must be the same length. If either base or exponent is null the result +/// will be null. +/// +/// \param[in] left the base +/// \param[in] right the exponent +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise base value raised to the power of exponent +ARROW_EXPORT +Result Power(const Datum& left, const Datum& right, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Left shift the left array by the right array. Array values must be the +/// same length. If either operand is null, the result will be null. 
+/// +/// \param[in] left the value to shift +/// \param[in] right the value to shift by +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise left value shifted left by the right value +ARROW_EXPORT +Result ShiftLeft(const Datum& left, const Datum& right, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Right shift the left array by the right array. Array values must be the +/// same length. If either operand is null, the result will be null. Performs a +/// logical shift for unsigned values, and an arithmetic shift for signed values. +/// +/// \param[in] left the value to shift +/// \param[in] right the value to shift by +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise left value shifted right by the right value +ARROW_EXPORT +Result ShiftRight(const Datum& left, const Datum& right, + ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the sine of the array values. +/// \param[in] arg The values to compute the sine for. +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise sine of the values +ARROW_EXPORT +Result Sin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the cosine of the array values. +/// \param[in] arg The values to compute the cosine for. 
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise cosine of the values +ARROW_EXPORT +Result Cos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the inverse sine (arcsine) of the array values. +/// \param[in] arg The values to compute the inverse sine for. +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise inverse sine of the values +ARROW_EXPORT +Result Asin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the inverse cosine (arccosine) of the array values. +/// \param[in] arg The values to compute the inverse cosine for. +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise inverse cosine of the values +ARROW_EXPORT +Result Acos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the tangent of the array values. +/// \param[in] arg The values to compute the tangent for. +/// \param[in] options arithmetic options (enable/disable overflow checking), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise tangent of the values +ARROW_EXPORT +Result Tan(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the inverse tangent (arctangent) of the array values. +/// \param[in] arg The values to compute the inverse tangent for. 
+/// \param[in] ctx the function execution context, optional +/// \return the elementwise inverse tangent of the values +ARROW_EXPORT +Result Atan(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Compute the inverse tangent (arctangent) of y/x, using the +/// argument signs to determine the correct quadrant. +/// \param[in] y The y-values to compute the inverse tangent for. +/// \param[in] x The x-values to compute the inverse tangent for. +/// \param[in] ctx the function execution context, optional +/// \return the elementwise inverse tangent of the values +ARROW_EXPORT +Result Atan2(const Datum& y, const Datum& x, ExecContext* ctx = NULLPTR); + +/// \brief Get the natural log of a value. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg The values to compute the logarithm for. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise natural log +ARROW_EXPORT +Result Ln(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the log base 10 of a value. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg The values to compute the logarithm for. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise log base 10 +ARROW_EXPORT +Result Log10(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the log base 2 of a value. +/// +/// If argument is null the result will be null. +/// +/// \param[in] arg The values to compute the logarithm for. 
+/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise log base 2 +ARROW_EXPORT +Result Log2(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the natural log of (1 + value). +/// +/// If argument is null the result will be null. +/// This function may be more accurate than Log(1 + value) for values close to zero. +/// +/// \param[in] arg The values to compute the logarithm for. +/// \param[in] options arithmetic options (overflow handling), optional +/// \param[in] ctx the function execution context, optional +/// \return the elementwise natural log +ARROW_EXPORT +Result Log1p(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief Round to the nearest integer less than or equal in magnitude to the +/// argument. Array values can be of arbitrary length. If argument is null the +/// result will be null. +/// +/// \param[in] arg the value to round +/// \param[in] ctx the function execution context, optional +/// \return the rounded value +ARROW_EXPORT +Result Floor(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Round to the nearest integer greater than or equal in magnitude to the +/// argument. Array values can be of arbitrary length. If argument is null the +/// result will be null. +/// +/// \param[in] arg the value to round +/// \param[in] ctx the function execution context, optional +/// \return the rounded value +ARROW_EXPORT +Result Ceil(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Get the integral part without fractional digits. Array values can be +/// of arbitrary length. If argument is null the result will be null. 
+/// +/// \param[in] arg the value to truncate +/// \param[in] ctx the function execution context, optional +/// \return the truncated value +ARROW_EXPORT +Result Trunc(const Datum& arg, ExecContext* ctx = NULLPTR); + +/// \brief Find the element-wise maximum of any number of arrays or scalars. +/// Array values must be the same length. +/// +/// \param[in] args arrays or scalars to operate on. +/// \param[in] options options for handling nulls, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise maximum +ARROW_EXPORT +Result MaxElementWise( + const std::vector& args, + ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Find the element-wise minimum of any number of arrays or scalars. +/// Array values must be the same length. +/// +/// \param[in] args arrays or scalars to operate on. +/// \param[in] options options for handling nulls, optional +/// \param[in] ctx the function execution context, optional +/// \return the element-wise minimum +ARROW_EXPORT +Result MinElementWise( + const std::vector& args, + ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Get the sign of a value. Array values can be of arbitrary length. If argument +/// is null the result will be null. +/// +/// \param[in] arg the value to extract sign from +/// \param[in] ctx the function execution context, optional +/// \return the elementwise sign function +ARROW_EXPORT +Result Sign(const Datum& arg, ExecContext* ctx = NULLPTR); + /// \brief Compare a numeric array with a scalar. /// /// \param[in] left datum to compare, must be an Array @@ -217,9 +562,10 @@ Result Divide(const Datum& left, const Datum& right, /// /// \since 1.0.0 /// \note API not yet finalized +ARROW_DEPRECATED("Deprecated in 5.0.0. 
Use each compare function directly") ARROW_EXPORT -Result Compare(const Datum& left, const Datum& right, - struct CompareOptions options, ExecContext* ctx = NULLPTR); +Result Compare(const Datum& left, const Datum& right, CompareOptions options, + ExecContext* ctx = NULLPTR); /// \brief Invert the values of a boolean datum /// \param[in] value datum to invert @@ -416,5 +762,228 @@ ARROW_EXPORT Result FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx = NULLPTR); +/// \brief IfElse returns elements chosen from `left` or `right` +/// depending on `cond`. `null` values in `cond` will be promoted to the result +/// +/// \param[in] cond `Boolean` condition Scalar/ Array +/// \param[in] left Scalar/ Array +/// \param[in] right Scalar/ Array +/// \param[in] ctx the function execution context, optional +/// +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IfElse(const Datum& cond, const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief CaseWhen behaves like a switch/case or if-else if-else statement: for +/// each row, select the first value for which the corresponding condition is +/// true, or (if given) select the 'else' value, else emit null. Note that a +/// null condition is the same as false. +/// +/// \param[in] cond Conditions (Boolean) +/// \param[in] cases Values (any type), along with an optional 'else' value. 
+/// \param[in] ctx the function execution context, optional +/// +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result CaseWhen(const Datum& cond, const std::vector& cases, + ExecContext* ctx = NULLPTR); + +/// \brief Year returns year for each element of `values` +/// +/// \param[in] values input to extract year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Year(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Month returns month for each element of `values`. +/// Month is encoded as January=1, December=12 +/// +/// \param[in] values input to extract month from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Month(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Day returns day number for each element of `values` +/// +/// \param[in] values input to extract day from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Day(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief DayOfWeek returns number of the day of the week value for each element of +/// `values`. +/// +/// By default week starts on Monday denoted by 0 and ends on Sunday denoted +/// by 6. 
Start day of the week (Monday=1, Sunday=7) and numbering base (0 or 1) can be +/// set using DayOfWeekOptions +/// +/// \param[in] values input to extract number of the day of the week from +/// \param[in] options for setting start of the week and day numbering +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result DayOfWeek(const Datum& values, + DayOfWeekOptions options = DayOfWeekOptions(), + ExecContext* ctx = NULLPTR); + +/// \brief DayOfYear returns number of day of the year for each element of `values`. +/// January 1st maps to day number 1, February 1st to 32, etc. +/// +/// \param[in] values input to extract number of day of the year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOYear returns ISO year number for each element of `values`. +/// First week of an ISO year has the majority (4 or more) of its days in January. +/// +/// \param[in] values input to extract ISO year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result ISOYear(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOWeek returns ISO week of year number for each element of `values`. +/// First ISO week has the majority (4 or more) of its days in January. +/// Week of the year starts with 1 and can run up to 53. 
+/// +/// \param[in] values input to extract ISO week of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for +/// each element of `values`. +/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7. +/// +/// \param[in] values input to ISO calendar struct from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Quarter returns the quarter of year number for each element of `values` +/// First quarter maps to 1 and fourth quarter maps to 4. +/// +/// \param[in] values input to extract quarter of year from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Quarter(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Hour returns hour value for each element of `values` +/// +/// \param[in] values input to extract hour from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Hour(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Minute returns minutes value for each element of `values` +/// +/// \param[in] values input to extract minutes from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Minute(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Second returns seconds value for each element of 
`values` +/// +/// \param[in] values input to extract seconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Second(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Millisecond returns number of milliseconds since the last full second +/// for each element of `values` +/// +/// \param[in] values input to extract milliseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Millisecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Microsecond returns number of microseconds since the last full millisecond +/// for each element of `values` +/// +/// \param[in] values input to extract microseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Microsecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Nanosecond returns number of nanoseconds since the last full millisecond +/// for each element of `values` +/// +/// \param[in] values input to extract nanoseconds from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR); + +/// \brief Subsecond returns the fraction of second elapsed since last full second +/// as a float for each element of `values` +/// +/// \param[in] values input to extract subsecond from +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT Result Subsecond(const Datum& values, ExecContext* ctx = NULLPTR); + } // namespace compute } // 
namespace arrow diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 0082d48112d..9f3b3fa71b3 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -18,23 +18,139 @@ #include "arrow/compute/api_vector.h" #include +#include #include #include #include "arrow/array/array_nested.h" #include "arrow/array/builder_primitive.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function_internal.h" +#include "arrow/compute/registry.h" #include "arrow/datum.h" #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" namespace arrow { +using internal::checked_cast; using internal::checked_pointer_cast; +namespace internal { +using compute::DictionaryEncodeOptions; +using compute::FilterOptions; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "FilterOptions::NullSelectionBehavior"; } + static std::string value_name(FilterOptions::NullSelectionBehavior value) { + switch (value) { + case FilterOptions::DROP: + return "DROP"; + case FilterOptions::EMIT_NULL: + return "EMIT_NULL"; + } + return ""; + } +}; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "DictionaryEncodeOptions::NullEncodingBehavior"; } + static std::string value_name(DictionaryEncodeOptions::NullEncodingBehavior value) { + switch (value) { + case DictionaryEncodeOptions::ENCODE: + return "ENCODE"; + case DictionaryEncodeOptions::MASK: + return "MASK"; + } + return ""; + } +}; +} // namespace internal + namespace compute { +// ---------------------------------------------------------------------- +// Function options + +bool SortKey::Equals(const SortKey& other) const { + return name == other.name && order == other.order; +} +std::string SortKey::ToString() const { + std::stringstream ss; + ss << name << ' '; + switch (order) { + case SortOrder::Ascending: + ss << "ASC"; + 
break; + case SortOrder::Descending: + ss << "DESC"; + break; + } + return ss.str(); +} + +namespace internal { +namespace { +using ::arrow::internal::DataMember; +static auto kFilterOptionsType = GetFunctionOptionsType( + DataMember("null_selection_behavior", &FilterOptions::null_selection_behavior)); +static auto kTakeOptionsType = GetFunctionOptionsType( + DataMember("boundscheck", &TakeOptions::boundscheck)); +static auto kDictionaryEncodeOptionsType = + GetFunctionOptionsType(DataMember( + "null_encoding_behavior", &DictionaryEncodeOptions::null_encoding_behavior)); +static auto kArraySortOptionsType = GetFunctionOptionsType( + DataMember("order", &ArraySortOptions::order)); +static auto kSortOptionsType = + GetFunctionOptionsType(DataMember("sort_keys", &SortOptions::sort_keys)); +static auto kPartitionNthOptionsType = GetFunctionOptionsType( + DataMember("pivot", &PartitionNthOptions::pivot)); +} // namespace +} // namespace internal + +FilterOptions::FilterOptions(NullSelectionBehavior null_selection) + : FunctionOptions(internal::kFilterOptionsType), + null_selection_behavior(null_selection) {} +constexpr char FilterOptions::kTypeName[]; + +TakeOptions::TakeOptions(bool boundscheck) + : FunctionOptions(internal::kTakeOptionsType), boundscheck(boundscheck) {} +constexpr char TakeOptions::kTypeName[]; + +DictionaryEncodeOptions::DictionaryEncodeOptions(NullEncodingBehavior null_encoding) + : FunctionOptions(internal::kDictionaryEncodeOptionsType), + null_encoding_behavior(null_encoding) {} +constexpr char DictionaryEncodeOptions::kTypeName[]; + +ArraySortOptions::ArraySortOptions(SortOrder order) + : FunctionOptions(internal::kArraySortOptionsType), order(order) {} +constexpr char ArraySortOptions::kTypeName[]; + +SortOptions::SortOptions(std::vector sort_keys) + : FunctionOptions(internal::kSortOptionsType), sort_keys(std::move(sort_keys)) {} +constexpr char SortOptions::kTypeName[]; + +PartitionNthOptions::PartitionNthOptions(int64_t pivot) + : 
FunctionOptions(internal::kPartitionNthOptionsType), pivot(pivot) {} +constexpr char PartitionNthOptions::kTypeName[]; + +namespace internal { +void RegisterVectorOptions(FunctionRegistry* registry) { + DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kTakeOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kDictionaryEncodeOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kArraySortOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kSortOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kPartitionNthOptionsType)); +} +} // namespace internal + // ---------------------------------------------------------------------- // Direct exec interface to kernels @@ -46,6 +162,11 @@ Result> NthToIndices(const Array& values, int64_t n, return result.make_array(); } +Result ReplaceWithMask(const Datum& values, const Datum& mask, + const Datum& replacements, ExecContext* ctx) { + return CallFunction("replace_with_mask", {values, mask, replacements}, ctx); +} + Result> SortIndices(const Array& values, SortOrder order, ExecContext* ctx) { ArraySortOptions options(order); @@ -115,45 +236,6 @@ Result> Take(const Array& values, const Array& indices, // ---------------------------------------------------------------------- // Deprecated functions -Result> Take(const ChunkedArray& values, - const Array& indices, - const TakeOptions& options, ExecContext* ctx) { - ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(values), Datum(indices), options, ctx)); - return result.chunked_array(); -} - -Result> Take(const ChunkedArray& values, - const ChunkedArray& indices, - const TakeOptions& options, ExecContext* ctx) { - ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(values), Datum(indices), options, ctx)); - return result.chunked_array(); -} - -Result> Take(const Array& values, - const ChunkedArray& indices, - const TakeOptions& options, ExecContext* ctx) { - ARROW_ASSIGN_OR_RAISE(Datum result, 
Take(Datum(values), Datum(indices), options, ctx)); - return result.chunked_array(); -} - -Result> Take(const RecordBatch& batch, const Array& indices, - const TakeOptions& options, ExecContext* ctx) { - ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(batch), Datum(indices), options, ctx)); - return result.record_batch(); -} - -Result> Take(const Table& table, const Array& indices, - const TakeOptions& options, ExecContext* ctx) { - ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(table), Datum(indices), options, ctx)); - return result.table(); -} - -Result> Take(const Table& table, const ChunkedArray& indices, - const TakeOptions& options, ExecContext* ctx) { - ARROW_ASSIGN_OR_RAISE(Datum result, Take(Datum(table), Datum(indices), options, ctx)); - return result.table(); -} - Result> SortToIndices(const Array& values, ExecContext* ctx) { return SortIndices(values, SortOrder::Ascending, ctx); } diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index d67568e1567..2d9522b0732 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include "arrow/compute/function.h" #include "arrow/datum.h" @@ -32,7 +33,8 @@ class ExecContext; /// \addtogroup compute-concrete-options /// @{ -struct FilterOptions : public FunctionOptions { +class ARROW_EXPORT FilterOptions : public FunctionOptions { + public: /// Configure the action taken when a slot of the selection mask is null enum NullSelectionBehavior { /// the corresponding filtered value will be removed in the output @@ -41,30 +43,27 @@ struct FilterOptions : public FunctionOptions { EMIT_NULL, }; - explicit FilterOptions(NullSelectionBehavior null_selection = DROP) - : null_selection_behavior(null_selection) {} - + explicit FilterOptions(NullSelectionBehavior null_selection = DROP); + constexpr static char const kTypeName[] = "FilterOptions"; static FilterOptions Defaults() { return FilterOptions(); } 
NullSelectionBehavior null_selection_behavior = DROP; }; -struct ARROW_EXPORT TakeOptions : public FunctionOptions { - explicit TakeOptions(bool boundscheck = true) : boundscheck(boundscheck) {} - - bool boundscheck = true; +class ARROW_EXPORT TakeOptions : public FunctionOptions { + public: + explicit TakeOptions(bool boundscheck = true); + constexpr static char const kTypeName[] = "TakeOptions"; static TakeOptions BoundsCheck() { return TakeOptions(true); } static TakeOptions NoBoundsCheck() { return TakeOptions(false); } static TakeOptions Defaults() { return BoundsCheck(); } -}; -enum class SortOrder { - Ascending, - Descending, + bool boundscheck = true; }; /// \brief Options for the dictionary encode function -struct DictionaryEncodeOptions : public FunctionOptions { +class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions { + public: /// Configure how null values will be encoded enum NullEncodingBehavior { /// the null value will be added to the dictionary with a proper index @@ -73,18 +72,29 @@ struct DictionaryEncodeOptions : public FunctionOptions { MASK }; - explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK) - : null_encoding_behavior(null_encoding) {} - + explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK); + constexpr static char const kTypeName[] = "DictionaryEncodeOptions"; static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); } NullEncodingBehavior null_encoding_behavior = MASK; }; +enum class SortOrder { + Ascending, + Descending, +}; + /// \brief One sort key for PartitionNthIndices (TODO) and SortIndices -struct ARROW_EXPORT SortKey { +class ARROW_EXPORT SortKey : public util::EqualityComparable { + public: explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending) - : name(name), order(order) {} + : name(std::move(name)), order(order) {} + + using util::EqualityComparable::Equals; + using util::EqualityComparable::operator==; + using 
util::EqualityComparable::operator!=; + bool Equals(const SortKey& other) const; + std::string ToString() const; /// The name of the sort column. std::string name; @@ -92,25 +102,30 @@ struct ARROW_EXPORT SortKey { SortOrder order; }; -struct ARROW_EXPORT ArraySortOptions : public FunctionOptions { - explicit ArraySortOptions(SortOrder order = SortOrder::Ascending) : order(order) {} - +class ARROW_EXPORT ArraySortOptions : public FunctionOptions { + public: + explicit ArraySortOptions(SortOrder order = SortOrder::Ascending); + constexpr static char const kTypeName[] = "ArraySortOptions"; static ArraySortOptions Defaults() { return ArraySortOptions{}; } SortOrder order; }; -struct ARROW_EXPORT SortOptions : public FunctionOptions { - explicit SortOptions(std::vector sort_keys = {}) : sort_keys(sort_keys) {} - +class ARROW_EXPORT SortOptions : public FunctionOptions { + public: + explicit SortOptions(std::vector sort_keys = {}); + constexpr static char const kTypeName[] = "SortOptions"; static SortOptions Defaults() { return SortOptions{}; } std::vector sort_keys; }; /// \brief Partitioning options for NthToIndices -struct ARROW_EXPORT PartitionNthOptions : public FunctionOptions { - explicit PartitionNthOptions(int64_t pivot) : pivot(pivot) {} +class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { + public: + explicit PartitionNthOptions(int64_t pivot); + PartitionNthOptions() : PartitionNthOptions(0) {} + constexpr static char const kTypeName[] = "PartitionNthOptions"; /// The index into the equivalent sorted array of the partition pivot element. int64_t pivot; @@ -157,6 +172,23 @@ Result> GetTakeIndices( } // namespace internal +/// \brief ReplaceWithMask replaces each value in the array corresponding +/// to a true value in the mask with the next element from `replacements`. +/// +/// \param[in] values Array input to replace +/// \param[in] mask Array or Scalar of Boolean mask values +/// \param[in] replacements The replacement values to draw from. 
There must +/// be as many replacement values as true values in the mask. +/// \param[in] ctx the function execution context, optional +/// +/// \return the resulting datum +/// +/// \since 5.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result ReplaceWithMask(const Datum& values, const Datum& mask, + const Datum& replacements, ExecContext* ctx = NULLPTR); + /// \brief Take from an array of values at indices in another array /// /// The output array will be of the same type as the input values @@ -334,42 +366,6 @@ Result DictionaryEncode( // ---------------------------------------------------------------------- // Deprecated functions -ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version") -ARROW_EXPORT -Result> Take( - const ChunkedArray& values, const Array& indices, - const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR); - -ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version") -ARROW_EXPORT -Result> Take( - const ChunkedArray& values, const ChunkedArray& indices, - const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR); - -ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version") -ARROW_EXPORT -Result> Take( - const Array& values, const ChunkedArray& indices, - const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR); - -ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version") -ARROW_EXPORT -Result> Take( - const RecordBatch& batch, const Array& indices, - const TakeOptions& options = TakeOptions::Defaults(), ExecContext* context = NULLPTR); - -ARROW_DEPRECATED("Deprecated in 1.0.0. Use Datum-based version") -ARROW_EXPORT -Result> Take(const Table& table, const Array& indices, - const TakeOptions& options = TakeOptions::Defaults(), - ExecContext* context = NULLPTR); - -ARROW_DEPRECATED("Deprecated in 1.0.0. 
Use Datum-based version") -ARROW_EXPORT -Result> Take(const Table& table, const ChunkedArray& indices, - const TakeOptions& options = TakeOptions::Defaults(), - ExecContext* context = NULLPTR); - ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()") ARROW_EXPORT Result> SortToIndices(const Array& values, diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index 8a091f2355d..4de68ba8d90 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -18,6 +18,7 @@ #include "arrow/compute/cast.h" #include +#include #include #include #include @@ -26,10 +27,12 @@ #include "arrow/compute/cast_internal.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function_internal.h" #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/registry.h" #include "arrow/util/logging.h" +#include "arrow/util/reflection_internal.h" namespace arrow { @@ -38,6 +41,9 @@ using internal::ToTypeName; namespace compute { namespace internal { +// ---------------------------------------------------------------------- +// Function options + namespace { std::unordered_map> g_cast_table; @@ -55,6 +61,7 @@ void InitCastTable() { AddCastFunctions(GetNestedCasts()); AddCastFunctions(GetNumericCasts()); AddCastFunctions(GetTemporalCasts()); + AddCastFunctions(GetDictionaryCasts()); } void EnsureInitCastTable() { std::call_once(cast_table_initialized, InitCastTable); } @@ -116,14 +123,35 @@ class CastMetaFunction : public MetaFunction { } }; +static auto kCastOptionsType = GetFunctionOptionsType( + arrow::internal::DataMember("to_type", &CastOptions::to_type), + arrow::internal::DataMember("allow_int_overflow", &CastOptions::allow_int_overflow), + arrow::internal::DataMember("allow_time_truncate", &CastOptions::allow_time_truncate), + arrow::internal::DataMember("allow_time_overflow", &CastOptions::allow_time_overflow), + arrow::internal::DataMember("allow_decimal_truncate", + 
&CastOptions::allow_decimal_truncate), + arrow::internal::DataMember("allow_float_truncate", + &CastOptions::allow_float_truncate), + arrow::internal::DataMember("allow_invalid_utf8", &CastOptions::allow_invalid_utf8)); } // namespace void RegisterScalarCast(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::make_shared())); + DCHECK_OK(registry->AddFunctionOptionsType(kCastOptionsType)); } - } // namespace internal +CastOptions::CastOptions(bool safe) + : FunctionOptions(internal::kCastOptionsType), + allow_int_overflow(!safe), + allow_time_truncate(!safe), + allow_time_overflow(!safe), + allow_decimal_truncate(!safe), + allow_float_truncate(!safe), + allow_invalid_utf8(!safe) {} + +constexpr char CastOptions::kTypeName[]; + CastFunction::CastFunction(std::string name, Type::type out_type_id) : ScalarFunction(std::move(name), Arity::Unary(), /*doc=*/nullptr), out_type_id_(out_type_id) {} diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index 818f2ef9182..131f57f892f 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -41,15 +41,11 @@ class ExecContext; /// \addtogroup compute-concrete-options /// @{ -struct ARROW_EXPORT CastOptions : public FunctionOptions { - explicit CastOptions(bool safe = true) - : allow_int_overflow(!safe), - allow_time_truncate(!safe), - allow_time_overflow(!safe), - allow_decimal_truncate(!safe), - allow_float_truncate(!safe), - allow_invalid_utf8(!safe) {} +class ARROW_EXPORT CastOptions : public FunctionOptions { + public: + explicit CastOptions(bool safe = true); + constexpr static char const kTypeName[] = "CastOptions"; static CastOptions Safe(std::shared_ptr to_type = NULLPTR) { CastOptions safe(true); safe.to_type = std::move(to_type); diff --git a/cpp/src/arrow/compute/cast_internal.h b/cpp/src/arrow/compute/cast_internal.h index c152d10bd86..0105d08a573 100644 --- a/cpp/src/arrow/compute/cast_internal.h +++ b/cpp/src/arrow/compute/cast_internal.h @@ -36,6 +36,7 @@ 
std::vector> GetNumericCasts(); std::vector> GetTemporalCasts(); std::vector> GetBinaryLikeCasts(); std::vector> GetNestedCasts(); +std::vector> GetDictionaryCasts(); } // namespace internal } // namespace compute diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index c3187a3995a..7d6db9f58db 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -36,6 +37,7 @@ #include "arrow/compute/registry.h" #include "arrow/compute/util_internal.h" #include "arrow/datum.h" +#include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/scalar.h" #include "arrow/status.h" @@ -69,6 +71,54 @@ ExecBatch::ExecBatch(const RecordBatch& batch) std::move(columns.begin(), columns.end(), values.begin()); } +bool ExecBatch::Equals(const ExecBatch& other) const { + return guarantee == other.guarantee && values == other.values; +} + +void PrintTo(const ExecBatch& batch, std::ostream* os) { + *os << "ExecBatch\n"; + + static const std::string indent = " "; + + *os << indent << "# Rows: " << batch.length << "\n"; + if (batch.guarantee != literal(true)) { + *os << indent << "Guarantee: " << batch.guarantee.ToString() << "\n"; + } + + int i = 0; + for (const Datum& value : batch.values) { + *os << indent << "" << i++ << ": "; + + if (value.is_scalar()) { + *os << "Scalar[" << value.scalar()->ToString() << "]\n"; + continue; + } + + auto array = value.make_array(); + PrettyPrintOptions options; + options.skip_new_lines = true; + *os << "Array"; + ARROW_CHECK_OK(PrettyPrint(*array, options, os)); + *os << "\n"; + } +} + +std::string ExecBatch::ToString() const { + std::stringstream ss; + PrintTo(*this, &ss); + return ss.str(); +} + +ExecBatch ExecBatch::Slice(int64_t offset, int64_t length) const { + ExecBatch out = *this; + for (auto& value : out.values) { + if (value.is_scalar()) continue; + value = value.array()->Slice(offset, length); + } + out.length 
= std::min(length, this->length - offset); + return out; +} + Result ExecBatch::Make(std::vector values) { if (values.empty()) { return Status::Invalid("Cannot infer ExecBatch length without at least one value"); @@ -77,9 +127,6 @@ Result ExecBatch::Make(std::vector values) { int64_t length = -1; for (const auto& value : values) { if (value.is_scalar()) { - if (length == -1) { - length = 1; - } continue; } @@ -94,8 +141,29 @@ Result ExecBatch::Make(std::vector values) { } } + if (length == -1) { + length = 1; + } + return ExecBatch(std::move(values), length); } + +Result> ExecBatch::ToRecordBatch( + std::shared_ptr schema, MemoryPool* pool) const { + ArrayVector columns(schema->num_fields()); + + for (size_t i = 0; i < columns.size(); ++i) { + const Datum& value = values[i]; + if (value.is_array()) { + columns[i] = value.make_array(); + continue; + } + ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool)); + } + + return RecordBatch::Make(std::move(schema), length, std::move(columns)); +} + namespace { Result> AllocateDataBuffer(KernelContext* ctx, int64_t length, @@ -106,7 +174,6 @@ Result> AllocateDataBuffer(KernelContext* ctx, int64_t l int64_t buffer_size = BitUtil::BytesForBits(length * bit_width); return ctx->Allocate(buffer_size); } - return Status::OK(); } struct BufferPreallocation { @@ -269,7 +336,7 @@ struct NullGeneralization { // Do not count the bits if they haven't been counted already const int64_t known_null_count = arr.null_count.load(); - if (known_null_count == 0) { + if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) { return ALL_VALID; } @@ -616,8 +683,7 @@ class ScalarExecutor : public KernelExecutorImpl { } } - kernel_->exec(kernel_ctx_, batch, &out); - ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); + RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); if (!preallocate_contiguous_) { // If we are producing chunked output rather than one big array, then // emit each chunk as soon as it's available @@ 
-704,6 +770,7 @@ class ScalarExecutor : public KernelExecutorImpl { preallocate_contiguous_ = (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices && validity_preallocated_ && !is_nested(output_descr_.type->id()) && + !is_dictionary(output_descr_.type->id()) && data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && std::all_of(data_preallocated_.begin(), data_preallocated_.end(), [](const BufferPreallocation& prealloc) { @@ -793,8 +860,7 @@ class VectorExecutor : public KernelExecutorImpl { output_descr_.shape == ValueDescr::ARRAY) { RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array())); } - kernel_->exec(kernel_ctx_, batch, &out); - ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); + RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); if (!kernel_->finalize) { // If there is no result finalizer (e.g. for hash-based functions, we can // emit the processed batch right away rather than waiting @@ -809,8 +875,7 @@ class VectorExecutor : public KernelExecutorImpl { if (kernel_->finalize) { // Intermediate results require post-processing after the execution is // completed (possibly involving some accumulated state) - kernel_->finalize(kernel_ctx_, &results_); - ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); + RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &results_)); for (const auto& result : results_) { RETURN_NOT_OK(listener->OnResult(result)); } @@ -863,8 +928,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { } Datum out; - kernel_->finalize(kernel_ctx_, &out); - ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); + RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &out)); RETURN_NOT_OK(listener->OnResult(std::move(out))); return Status::OK(); } @@ -878,24 +942,19 @@ class ScalarAggExecutor : public KernelExecutorImpl { private: Status Consume(const ExecBatch& batch) { // FIXME(ARROW-11840) don't merge *any* aggegates for every batch - auto batch_state = kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}); - 
ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); + ARROW_ASSIGN_OR_RAISE( + auto batch_state, + kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_})); if (batch_state == nullptr) { - kernel_ctx_->SetStatus( - Status::Invalid("ScalarAggregation requires non-null kernel state")); - return kernel_ctx_->status(); + return Status::Invalid("ScalarAggregation requires non-null kernel state"); } KernelContext batch_ctx(exec_context()); batch_ctx.SetState(batch_state.get()); - kernel_->consume(&batch_ctx, batch); - ARROW_CTX_RETURN_IF_ERROR(&batch_ctx); - - kernel_->merge(kernel_ctx_, std::move(*batch_state), state()); - ARROW_CTX_RETURN_IF_ERROR(kernel_ctx_); - + RETURN_NOT_OK(kernel_->consume(&batch_ctx, batch)); + RETURN_NOT_OK(kernel_->merge(kernel_ctx_, std::move(*batch_state), state())); return Status::OK(); } @@ -951,8 +1010,9 @@ std::unique_ptr KernelExecutor::MakeScalarAggregate() { } // namespace detail -ExecContext::ExecContext(MemoryPool* pool, FunctionRegistry* func_registry) - : pool_(pool) { +ExecContext::ExecContext(MemoryPool* pool, ::arrow::internal::Executor* executor, + FunctionRegistry* func_registry) + : pool_(pool), executor_(executor) { this->func_registry_ = func_registry == nullptr ? 
GetFunctionRegistry() : func_registry; } diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index 7659442d8bf..1b70ee244cb 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -28,11 +28,13 @@ #include #include "arrow/array/data.h" +#include "arrow/compute/exec/expression.h" #include "arrow/datum.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/type_fwd.h" #include "arrow/util/macros.h" +#include "arrow/util/type_fwd.h" #include "arrow/util/visibility.h" namespace arrow { @@ -44,7 +46,7 @@ class CpuInfo; namespace compute { -struct FunctionOptions; +class FunctionOptions; class FunctionRegistry; // It seems like 64K might be a good default chunksize to use for execution @@ -59,6 +61,7 @@ class ARROW_EXPORT ExecContext { public: // If no function registry passed, the default is used. explicit ExecContext(MemoryPool* pool = default_memory_pool(), + ::arrow::internal::Executor* executor = NULLPTR, FunctionRegistry* func_registry = NULLPTR); /// \brief The MemoryPool used for allocations, default is @@ -67,6 +70,9 @@ class ARROW_EXPORT ExecContext { ::arrow::internal::CpuInfo* cpu_info() const; + /// \brief An Executor which may be used to parallelize execution. + ::arrow::internal::Executor* executor() const { return executor_; } + /// \brief The FunctionRegistry for looking up functions by name and /// selecting kernels for execution. Defaults to the library-global function /// registry provided by GetFunctionRegistry. 
@@ -113,6 +119,7 @@ class ARROW_EXPORT ExecContext { private: MemoryPool* pool_; + ::arrow::internal::Executor* executor_; FunctionRegistry* func_registry_; int64_t exec_chunksize_ = std::numeric_limits::max(); bool preallocate_contiguous_ = true; @@ -175,6 +182,9 @@ struct ARROW_EXPORT ExecBatch { static Result Make(std::vector values); + Result> ToRecordBatch( + std::shared_ptr schema, MemoryPool* pool = default_memory_pool()) const; + /// The values representing positional arguments to be passed to a kernel's /// exec function for processing. std::vector values; @@ -186,6 +196,9 @@ struct ARROW_EXPORT ExecBatch { /// ExecBatch::length is equal to the length of this array. std::shared_ptr selection_vector; + /// A predicate Expression guaranteed to evaluate to true for all rows in this batch. + Expression guarantee = literal(true); + /// The semantic length of the ExecBatch. When the values are all scalars, /// the length should be set to 1, otherwise the length is taken from the /// array values, except when there is a selection vector. When there is a @@ -203,9 +216,13 @@ struct ARROW_EXPORT ExecBatch { return values[i]; } + bool Equals(const ExecBatch& other) const; + /// \brief A convenience for the number of values / arguments. int num_values() const { return static_cast(values.size()); } + ExecBatch Slice(int64_t offset, int64_t length) const; + /// \brief A convenience for returning the ValueDescr objects (types and /// shapes) from the batch. 
std::vector GetDescriptors() const { @@ -215,8 +232,15 @@ struct ARROW_EXPORT ExecBatch { } return result; } + + std::string ToString() const; + + ARROW_EXPORT friend void PrintTo(const ExecBatch&, std::ostream*); }; +inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); } +inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); } + /// \defgroup compute-call-function One-shot calls to compute functions /// /// @{ diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt new file mode 100644 index 00000000000..2ed8b1c9480 --- /dev/null +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +arrow_install_all_headers("arrow/compute/exec") + +add_arrow_compute_test(expression_test + PREFIX + "arrow-compute" + SOURCES + expression_test.cc + subtree_test.cc) + +add_arrow_compute_test(plan_test PREFIX "arrow-compute") + +add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/exec/doc/exec_node.md b/cpp/src/arrow/compute/exec/doc/exec_node.md new file mode 100644 index 00000000000..797cc87d90a --- /dev/null +++ b/cpp/src/arrow/compute/exec/doc/exec_node.md @@ -0,0 +1,147 @@ + + +# ExecNodes and logical operators + +`ExecNode`s are intended to implement individual logical operators +in a streaming execution graph. Each node receives batches from +upstream nodes (inputs), processes them in some way, then pushes +results to downstream nodes (outputs). `ExecNode`s are owned and +(to an extent) coordinated by an `ExecPlan`. + +> Terminology: "operator" and "node" are mostly interchangeable, like +> "Interface" and "Abstract Base Class" in c++ space. The latter is +> a formal and specific bit of code which implements the abstract +> concept. + +## Types of logical operators + +Each of these will have at least one corresponding concrete +`ExecNode`. Where possible, compatible implementations of a +logical operator will *not* be exposed as independent subclasses +of `ExecNode`. Instead we prefer that they +be encapsulated internally by a single subclass of `ExecNode` +to permit switching between them during a query. + +- Scan: materializes in-memory batches from storage (e.g. Parquet + files, flight stream, ...) +- Filter: evaluates an `Expression` on each input batch and outputs + a copy with any rows excluded for which the filter did not return + `true`. +- Project: evaluates `Expression`s on each input batch to produce + the columns of an output batch. +- Grouped Aggregate: identify groups based on one or more key columns + in each input batch, then update aggregates corresponding to those + groups.
Note that this is a pipeline breaker; it will wait for its + inputs to complete before outputting any batches. +- Union: merge two or more streams of batches into a single stream + of batches. +- Write: write each batch to storage +- ToTable: Collect batches into a `Table` with stable row ordering where + possible. + +#### Not in scope for Arrow 5.0: + +- Join: perform an inner, left, outer, semi, or anti join given some + join predicates. +- Sort: accumulate all input batches into a single table, reorder its + rows by some sorting condition, then stream the sorted table out as + batches +- Top-K: retrieve a limited subset of rows from a table as though it + were in sorted order. + +For example: a dataset scan with only a filter and a +projection will correspond to a fairly trivial graph: + +``` +ScanNode -> FilterNode -> ProjectNode -> ToTableNode +``` + +A scan node loads batches from disk and pushes to a filter node. +The filter node excludes some rows based on an `Expression` then +pushes filtered batches to a project node. The project node +materializes new columns based on `Expression`s then pushes those +batches to a table collection node. The table collection node +assembles these batches into a `Table` which is handed off as the +result of the `ExecPlan`. + +## Parallelism, pipelines + +The execution graph is orthogonal to parallelism; any +node may push to any other node from any thread. A scan node causes +each batch to arrive on a thread after which it will pass through +each node in the example graph above, never leaving that thread +(memory/other resource pressure permitting). + +The example graph above happens to be simple enough that processing +of any batch by any node is independent of other nodes and other +batches; it is a pipeline. Note that there is no explicit `Pipeline` +class- pipelined execution is an emergent property of some sub +graphs.
+ +Nodes which do not share this property (pipeline breakers) are +responsible for deciding when they have received sufficient input, +when they can start emitting output, etc. For example a `GroupByNode` +will wait for its input to be exhausted before it begins pushing +batches to its own outputs. + +Parallelism is "seeded" by `ScanNode` (or other source nodes)- it +owns a reference to the thread pool on which the graph is executing +and fans out pushing to its outputs across that pool. A subsequent +`ProjectNode` will process the batch immediately after it is handed +off by the `ScanNode`- no explicit scheduling required. +Eventually, individual nodes may internally +parallelize processing of individual batches (for example, if a +`FilterNode`'s filter expression is slow). This decision is also left +up to each `ExecNode` implementation. + +# ExecNode interface and usage + +`ExecNode`s are constructed using one of the available factory +functions, such as `arrow::compute::MakeFilterNode` +or `arrow::dataset::MakeScanNode`. Any inputs to an `ExecNode` +must be provided when the node is constructed, so the first +nodes to be constructed are source nodes with no inputs +such as `ScanNode`. + +The batches yielded by an `ExecNode` always conform precisely +to its output schema. NB: no by-name field lookups or type +checks are performed during execution. The output schema +is usually derived from the output schemas of inputs. For +example a `FilterNode`'s output schema is always identical to +that of its input since batches are only modified by exclusion +of some rows. + +An `ExecNode` will begin producing batches when +`node->StartProducing()` is invoked and will proceed until stopped +with `node->StopProducing()`. Started nodes may not be destroyed +until stopped. `ExecNode`s are not currently restartable. +An `ExecNode` pushes batches to its outputs by passing each batch +to `output->InputReceived()`. It signals exhaustion by invoking +`output->InputFinished()`. 
+ +Error recovery is permitted within a node. For example, if evaluation +of an `Expression` runs out of memory the governing node may +try that evaluation again after some memory has been freed up. +If a node experiences an error from which it cannot recover (for +example an IO error while parsing a CSV file) then it reports this +with `output->ErrorReceived()`. An error which escapes the scope of +a single node should not be considered recoverable (no `FilterNode` +should `try/catch` the IO error above). + diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_1.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_1.jpg new file mode 100644 index 00000000000..814ad8a69f6 Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_1.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_10.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_10.jpg new file mode 100644 index 00000000000..7a75c96dfc5 Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_10.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_11.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_11.jpg new file mode 100644 index 00000000000..59bcc167ed2 Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_11.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_2.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_2.jpg new file mode 100644 index 00000000000..4484c57a81d Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_2.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_3.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_3.jpg new file mode 100644 index 00000000000..afd33aba2e0 Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_3.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_4.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_4.jpg new file mode 100644 index 00000000000..f026aebe9a2 Binary files /dev/null and 
b/cpp/src/arrow/compute/exec/doc/img/key_map_4.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_5.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_5.jpg new file mode 100644 index 00000000000..8e1981b6571 Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_5.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_6.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_6.jpg new file mode 100644 index 00000000000..e976a461459 Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_6.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_7.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_7.jpg new file mode 100644 index 00000000000..7552d5af6af Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_7.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_8.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_8.jpg new file mode 100644 index 00000000000..242f1305328 Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_8.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/img/key_map_9.jpg b/cpp/src/arrow/compute/exec/doc/img/key_map_9.jpg new file mode 100644 index 00000000000..4c064595c9a Binary files /dev/null and b/cpp/src/arrow/compute/exec/doc/img/key_map_9.jpg differ diff --git a/cpp/src/arrow/compute/exec/doc/key_map.md b/cpp/src/arrow/compute/exec/doc/key_map.md new file mode 100644 index 00000000000..fdedc88c4d4 --- /dev/null +++ b/cpp/src/arrow/compute/exec/doc/key_map.md @@ -0,0 +1,223 @@ + + +# Swiss Table + +A specialized hash table implementation used to dynamically map combinations of key field values to a dense set of integer ids. Ids can later be used in place of keys to identify groups of rows with equal keys. + +## Introduction + +Hash group-by in Arrow uses a variant of a hash table based on a data structure called Swiss table. Swiss table uses linear probing. 
There is an array of slots and the information related to inserted keys is stored in these slots. A hash function determines the slot where the search for a matching key will start during hash table lookup. Then the slots are visited sequentially, wrapping around the end of an array, until either a match or an empty slot is found, the latter case meaning that there is no match. Swiss table organizes the slots in blocks of 8 and has a design that enables data level parallelism at the block level. More precisely, it allows for visiting all slots within a block at once during lookups, by simply using 64-bit arithmetic. SIMD instructions can further enhance this data level parallelism allowing to process multiple blocks related to multiple input keys together using SIMD vectors of 64-bit elements. Occupied slots within a block are always clustered together. The name Swiss table comes from likening resulting sequences of empty slots to holes in a one dimensional cheese. + +## Interface + +Hash table used in query processing for implementing join and group-by operators does not need to provide all of the operations that a general purpose hash table would. Simplified requirements can help achieve a simpler and more efficient design. For instance we do not need to be able to remove previously inserted keys. It’s an append-only data structure: new keys can be added but old keys are never erased. Also, only a single copy of each key can be inserted - it is like `std::map` in that sense and not `std::multimap`. + +Our Swiss table is fully vectorized. That means that all methods work on vectors of input keys processing them in batches. Specialized SIMD implementations of processing functions are almost always provided for performance critical operations. All callback interfaces used from the core hash table code are also designed to work on batches of inputs instead of individual keys. The batch size can be almost arbitrary and is selected by the client of the hash table. 
Batch size should be the smallest number of input items, big enough so that the benefits of vectorization and SIMD can be fully experienced. Keeping it small means less memory used for temporary arrays storing intermediate results of computation (vector equivalent of some temporary variables kept on the stack). That in turn means smaller space in CPU caches, which also means less impact on other memory access intensive operations. We pick 1024 as the default size of the batch. We will call it a **mini-batch** to distinguish it from potentially other forms of batches used at higher levels in the code, e.g. when scheduling work for worker threads or relational operators inside an analytic query. + +The main functionality provided by Swiss table is mapping of arbitrarily complex keys to unique integer ids. Let us call it **lookup-or-insert**. Given a sequence of key values, return a corresponding sequence of integer ids, such that all keys that are equal receive the same id and for K distinct keys the integer ids will be assigned from the set of numbers 0 to (K-1). If we find a matching key in a hash table for a given input, we return the **key id** assigned when the key was first inserted into a hash table. If we fail to find an already inserted match, we assign the first unused integer as a key id and add a new entry to a hash table. Due to vectorized processing, which may result in out-of-order processing of individual inputs, it is not guaranteed that if there are two new key values in the same input batch and one of them appears earlier in the input sequence, then it will receive a smaller key id. Additional mapping functionality can be built on top of basic mapping to integer key id, for instance if we want to assign and perhaps keep updating some values to all unique keys, we can keep these values in a resizable vector indexed by obtained key id. + +The implementation of Swiss table does not need to have any information related to the domain of the keys. 
It does not use their logical data type or information about their physical representation and does not even use pointers to keys. All access to keys is delegated to a separate class or classes that provide callback functions for three operations: +- computing hashes of keys; +- checking equality for given pairs of keys; +- appending a given sequence of keys to a stack maintained outside of Swiss table object, so that they can be referenced later on by key ids (key ids will be equal to their positions in the stack). + + +When passing arguments to callback functions the keys are referenced using integer ids. For the left side - that is the keys present in the input mini-batch - ordinal positions within that mini-batch are used. For the right side - that is the keys inserted into the hash table - these are identified by key ids assigned to them and stored inside Swiss table when they were first encountered and processed. + +Diagram with logical view of information passing in callbacks: + +![alt text](img/key_map_1.jpg) + +Hash table values for inserted keys are also stored inside Swiss table. Because of that, hash table logic does not need to ever re-evaluate the hash, and there is actually no need for a hash function callback. It is enough that the caller provides hash values for all entries in the batch when calling lookup-or-insert. + +## Basic architecture and organization of data +The hash table is an array of **slots**. Slots are grouped in groups of 8 called **blocks**. The number of blocks is a power of 2. The empty hash table starts with a single block, with all slots empty. Then, as the keys are getting inserted and the amount of empty slots is shrinking, at some point resizing of the hash table is triggered. The data stored in slots is moved to a new hash table that has the double of the number of blocks. 
+ +The diagram below shows the basic organization of data in our implementation of Swiss table: + +![alt text](img/key_map_2.jpg) + +N is the log of the number of blocks, 2^(N+3) is the number of slots and also the maximum number of inserted keys and hence (N + 3) is the number of bits required to store a key id. We will refer to N as the **size of the hash table**. + +Index of a block within an array will be called **block id**, and similarly index of a slot will be **slot id**. Sometimes we will focus on a single block and refer to slots that belong to it by using a **local slot id**, which is an index from 0 to 7. + +Every slot can either be **empty** or store data related to a single inserted key. There are three pieces of information stored inside a slot: +- status byte, +- key id, +- key hash. + +Status byte, as the name suggests, stores 8 bits. The highest bit indicates if the slot is empty (the highest bit is set) or corresponds to one of inserted keys (the highest bit is zero). The remaining 7 bits contain 7 bits of key hash that we call a **stamp**. The stamp is used to eliminate some false positives when searching for a matching key for a given input. Slot also stores **key id**, which is a non-negative integer smaller than the number of inserted keys, that is used as a reference to the actual inserted key. The last piece of information related to an inserted key is its **hash** value. We store hashes for all keys, so that they never need to be re-computed. That greatly simplifies some operations, like resizing of a hash table, that may not even need to look at the keys at all. For an empty slot, the status byte is 0x80, key id is zero and the hash is not used and can be set to any number. + +A single block contains 8 slots and can be viewed as a micro-stack of up to 8 inserted keys. When the first key is inserted into an empty block, it will occupy a slot with local id 0. The second inserted key will go into slot number 1 and so on.
We use N highest bits of hash to get an index of a **start block**, when searching for a match or an empty slot to insert a previously not seen key when that is the case. If the start block contains any empty slots, then the search for either a match or place to insert a key will end at that block. We will call such a block an **open block**. A block that is not open is a full block. In the case of a full block, the input key related search may continue in the next block modulo the number of blocks. If the key is not inserted into its start block, we will refer to it as an **overflow** entry, other entries being **non-overflow**. Overflow entries are slower to process, since they require visiting more than one block, so we want to keep their percentage low. This is done by choosing the right **load factor** (percentage of occupied slots in the hash table) at which the hash table gets resized and the number of blocks gets doubled. By tuning this value we can control the probability of encountering an overflow entry. + +The most interesting part of each block is the set of status bytes of its slots, which is simply a single 64-bit word. The implementation of efficient searches across these bytes during lookups requires using either leading zero count or trailing zero count intrinsic. Since there are cases when only the first one is available, in order to take advantage of it, we order the bytes in the 64-bit status word so that the first slot within a block uses the highest byte and the last one uses the lowest byte (slots are in reversed bytes order). The diagram below shows how the information about slots is stored within a 64-bit status word: + +![alt text](img/key_map_3.jpg) + +Each status byte has a 7-bit fragment of hash value - a **stamp** - and an empty slot bit. Empty slots have status byte equal to 0x80 - the highest bit is set to 1 to indicate an empty slot and the lowest bits, which are used by a stamp, are set to zero.
+ +The diagram below shows which bits of hash value are used by hash table: + +![alt text](img/key_map_4.jpg) + +If a hash table has 2N blocks, then we use N highest bits of a hash to select a start block when searching for a match. The next 7 bits are used as a stamp. Using the highest bits to pick a start block means that a range of hash values can be easily mapped to a range of block ids of start blocks for hashes in that range. This is useful when resizing a hash table or merging two hash tables together. + +### Interleaving status bytes and key ids + +Status bytes and key ids for all slots are stored in a single array of bytes. They are first grouped by 8 into blocks, then each block of status bytes is interleaved with a corresponding block of key ids. Finally key ids are represented using the smallest possible number of bits and bit-packed (bits representing each next key id start right after the last bit of the previous key id). Note that regardless of the chosen number of bits, a block of bit-packed key ids (that is 8 of them) will start and end on the byte boundary. + +The diagram below shows the organization of bytes and bits of a single block in interleaved array: +![alt text](img/key_map_5.jpg) + +From the size of the hash table we can derive the number K of bits needed in the worst case to encode any key id. K is equal to the number of bits needed to represent slot id (number of keys is not greater than the number of slots and any key id is strictly less than the number of keys), which for a hash table of size N (N blocks) equals (N+3). To simplify bit packing and unpacking and avoid handling of special cases, we will round up K to full bytes for K > 24 bits. + +Status bytes are stored in a single 64-bit word in reverse byte order (the last byte corresponds to the slot with local id 0). On the other hand key ids are stored in the normal order (the order of slot ids). 
+ +Since both status byte and key id for a given slot are stored in the same array close to each other, we can expect that most of the lookups will read only one CPU cache-line from memory inside Swiss table code (then at least another one outside Swiss table to access the bytes of the key for the purpose of comparison). Even if we hit an overflow entry, it is still likely to reside on the same cache-line as the start block data. Hash values, which are stored separately from status byte and key id, are only used when resizing and do not impact the lookups outside these events. + +> Improvement to consider: +> In addition to the Swiss table data, we need to store an array of inserted keys, one for each key id. If keys are of fixed length, then the address of the bytes of the key can be calculated by multiplying key id by the common length of the key. If keys are of varying length, then there will be an additional array with an offset of each key within the array of concatenated bytes of keys. That means that any key comparison during lookup will involve 3 arrays: one to get key id, one to get key offset and final one with bytes of the key. This could be reduced to 2 array lookups if we stored key offset instead of key id interleaved with slot status bytes. Offset indexed by key id and stored in its own array becomes offset indexed by slot id and stored interleaved with slot status bytes. At the same time key id indexed by slot id and interleaved with slot status bytes before becomes key id referenced using offset and stored with key bytes. There may be a slight increase in the total size of memory needed by the hash table, equal to the difference in the number of bits used to store offset and those used to store key id, multiplied by the number of slots, but that should be a small fraction of the total size. + +### 32-bit hash vs 64-bit hash + +Currently we use 32-bit hash values in Swiss table code and 32-bit integers as key ids. 
For the robust implementation, sooner or later we will need to support 64-bit hash and 64-bit key ids. When we use 32-bit hash, it means that we run out of hash bits when hash table size N is greater than 25 (25 bits of hash needed to select a block and 7 bits needed to generate a stamp byte reach 32 total bits). When the number of inserted keys exceeds the maximal number of keys stored in a hash table of size 25 (which is at least 224), the chance of false positives during lookups will start quickly growing. 32-bit hash should not be used with more than about 16 million inserted keys. + +### Low memory footprint and low chance of hash collisions + +Swiss table is a good choice of a hash table for modern hardware, because it combines lookups that can take advantage of special CPU instructions with space efficiency and low chance of hash collisions. + +Space efficiency is important for performance, because the cost of random array accesses, often dominating the lookup cost for larger hash tables, increases with the size of the arrays. This happens due to limited space of CPU caches. Let us look at what is the amortized additional storage cost for a key in a hash table apart from the essential cost of storing data of all those keys. Furthermore, we can skip the storage of hash values, since these are only used during infrequent hash table resize operations (should not have a big impact on CPU cache usage in normal cases). + +Half full hash table of size N will use 2 status bytes per inserted key (because for every filled slot there is one empty slot) and 2\*(N+3) bits for key id (again, one for the occupied slot and one for the empty). For N = 16 for instance this is slightly under 7 bytes per inserted key. + +Swiss table also has a low probability of false positives leading to wasted key comparisons. Here is some rationale behind why this should be the case. Hash table of size N can contain up to 2N+3 keys. 
Search for a match involves (N + 7) hash bits: N to select a start block and 7 to use as a stamp. There are always at least 16 times more combinations of used hash bits than there are keys in the hash table (32 times more if the hash table is half full). These numbers mean that the probability of false positives resulting from a search for a matching slot should be low. That corresponds to an expected number of comparisons per lookup being close to 1 for keys already present and 0 for new keys. + +## Lookup + +Lookup-or-insert operation, given a hash of a key, finds a list of candidate slots with corresponding keys that are likely to be equal to the input key. The list may be empty, which means that the key does not exist yet in the hash table. If it is not empty, then the callback function for key comparison is called for each next candidate to verify that there is indeed a match. False positives get rejected and we end up either finding an actual match or an empty slot, which means that the key is new to the hash table. New keys get assigned next available integers as key ids, and are appended to the set of keys stored in the hash table. As a result of inserting new keys to the hash table, the density of occupied slots may reach an upper limit, at which point the hash table will be resized and will afterwards have twice as many slots. That is in summary lookup-or-insert functionality, but the actual implementation is a bit more involved, because of vectorization of the processing and various optimizations for common cases. + +### Search within a single block + +There are three possible cases that can occur when searching for a match for a given key (that is, for a given stamp of a key) within a single block, illustrated below. + + 1. There is a matching stamp in the block of status bytes: + +![alt text](img/key_map_6.jpg) + + 2. There is no matching stamp in the block, but there is an empty slot in the block: + +![alt text](img/key_map_7.jpg) + + 3. 
There is no matching stamp in the block and the block is full (there are no empty slots left): + +![alt text](img/key_map_8.jpg) + +64-bit arithmetic can be used to search for a matching slot within the entire single block at once, without iterating over all slots in it. Following is an example of a sequence of steps to find the first status byte for a given stamp, returning the first empty slot on miss if the block is not full or 8 (one past maximum local slot id) otherwise. + +Following is a sketch of the possible steps to execute when searching for the matching stamp in a single block. + +*Example will use input stamp 0x5E and a 64-bit status bytes word with one empty slot: +0x 4B17 5E3A 5E2B 1180*. + +1. [1 instruction] Replicate stamp to all bytes by multiplying it by 0x 0101 0101 0101 0101. + + *We obtain: 0x 5E5E 5E5E 5E5E 5E5E.* + +2. [1 instruction] XOR replicated stamp with status bytes word. Bytes corresponding to a matching stamp will be 0, bytes corresponding to empty slots will have a value between 128 and 255, bytes corresponding to non-matching non-empty slots will have a value between 1 and 127. + + *We obtain: 0x 1549 0064 0075 4FDE.* + +3. [2 instructions] In the next step we want to have information about a match in the highest bit of each byte. We can ignore here empty slot bytes, because they will be taken care of at a later step. Set the highest bit in each byte (OR with 0x 8080 8080 8080 8080) and then subtract 1 from each byte (subtract 0x 0101 0101 0101 0101 from 64-bit word). Now if a byte corresponds to a non-empty slot then the highest bit 0 indicates a match and 1 indicates a miss. + + *We obtain: 0x 95C9 80E4 80F5 CFDE, + then 0x 94C8 7FE3 7FF4 CEDD.* + +4. [3 instructions] In the next step we want to obtain in each byte one of two values: 0x80 if it is either an empty slot or a match, 0x00 otherwise. 
We do it in three steps: NOT the result of the previous step to change the meaning of the highest bit; OR with the original status word to set highest bit in a byte to 1 for empty slots; mask out everything other than the highest bits in all bytes (AND with 0x 8080 8080 8080 8080). + + *We obtain: 0x 6B37 801C 800B 3122, + then 0x 6B37 DE3E DE2B 31A2, + finally 0x 0000 8000 8000 0080.* + +5. [2 instructions] Finally, use leading zero bits count and divide it by 8 to find an index of the first byte that corresponds either to a match or an empty slot. If the leading zero count intrinsic returns 64 for a 64-bit input zero, then after dividing by 8 we will also get the desired answer in case of a full block without any matches. + + *We obtain: 16, + then 2 (index of the first slot within the block that matches the stamp).* + +If SIMD instructions with 64-bit lanes are available, multiple single block searches for different keys can be executed together. For instance AVX2 instruction set allows to process quadruplets of 64-bit values in a single instruction, four searches at once. + +### Complete search potentially across multiple blocks + +Full implementation of a search for a matching key may involve visiting multiple blocks beginning with the start block selected based on the hash of the key. We move to the next block modulo the number of blocks, whenever we do not find a match in the current block and the current block is full. The search may also involve visiting one or more slots in each block. Visiting in this case means calling a comparison callback to verify the match whenever a slot with a matching stamp is encountered. Eventually the search stops when either: +- the matching key is found in one of the slots matching the stamp, or + +- an empty slot is reached. 
This is illustrated in the diagram below: +![alt text](img/key_map_9.jpg) + + +### Optimistic processing with two passes + +Hash table lookups may have high cost in the pessimistic case, when we encounter cases of hash collisions and full blocks that lead to visiting further blocks. In the majority of cases we can expect an optimistic situation - the start block is not full, so we will only visit this one block, and all stamps in the block are different, so we will need at most one comparison to find a match. We can expect about 90% of the key lookups for an existing key to go through the optimistic path of processing. For that reason it pays off to optimize especially for this 90% of inputs. + +Lookups in Swiss table are split into two passes over an input batch of keys. The **first pass: fast-path lookup** , is a highly optimized, vectorized, SIMD-friendly, branch-free code that fully handles optimistic cases. The **second pass: slow-path lookup** , is normally executed only for the selection of inputs that have not been finished in the first pass, although it can also be called directly on all of the inputs, skipping fast-path lookup. It handles all special cases and inserts but in order to be robust it is not as efficient as fast-path. Slow-path lookup does not need to repeat the work done in fast-path lookup - it can use the state reached at the end of fast-path lookup as a starting point. + +Fast-path lookup implements search only for the first stamp match and only within the start block. It only makes sense when we already have at least one key inserted into the hash table, since it does not handle inserts. It takes a vector of key hashes as an input and based on it outputs three pieces of information for each key: + +- Key id corresponding to the slot in which a matching stamp was found. Any valid key id if a matching stamp was not found. +- A flag indicating if a match was found or not. 
+- Slot id of a slot from which slow-path should pick up the search if the first match was either not found or it turns out to be false positive after evaluating key comparison. + +> Improvement to consider: +> precomputing 1st pass lookup results. +> +> If the hash table is small, the number of inserted keys is small, we could further simplify and speed up the first pass by storing in a lookup table pre-computed results for all combinations of hash bits. Let us consider the case of Swiss table of size 5 that has 256 slots and up to 128 inserted keys. Only 12 bits of hash are used by lookup in that case: 5 to select a block, 7 to create a stamp. For all 2^12 combinations of those bits we could keep the result of first pass lookup in an array. Key id and a match indicating flag can use one byte: 7 bits for key id and 1 bit for the flag. Note that slot id is only needed if we go into 2nd pass lookup, so it can be stored separately and likely only accessed by a small subset of keys. Fast-path lookup becomes almost a single fetch of result from a 4KB array. Lookup arrays used to implement this need to be kept in sync with the main copy of data about slots, which requires extra care during inserts. Since the number of entries in lookup arrays is much higher than the number of slots, this technique only makes sense for small hash tables. + +### Dense comparisons + +If there is at least one key inserted into a hash table, then every slot contains a key id value that corresponds to some actual key that can be used in comparison. That is because empty slots are initialized with 0 as their key id. After the fast-path lookup we get a match-found flag for each input. If it is set, then we need to run a comparison of the input key with the key in the hash table identified by key id returned by fast-path code. The comparison will verify that there is a true match between the keys. 
We only need to do this for a subset of inputs that have a match candidate, but since we have key id values corresponding to some real key for all inputs, we may as well execute comparisons on all inputs unconditionally. If the majority (e.g. more than 80%) of the keys have a match candidate, the cost of evaluating comparison for the remaining fraction of keys but without filtering may actually be cheaper than the cost of running evaluation only for required keys while referencing filter information. This can be seen as a variant of general preconditioning techniques used to avoid diverging conditional branches in the code. It may be used, based on some heuristic, to verify matches reported by fast-path lookups and is referred to as **dense comparisons**. + +## Resizing + +New hash table is initialized as empty and has only a single block with a space for only a few key entries. Doubling of the hash table size becomes necessary as more keys get inserted. It is invoked during the 2nd pass of the lookups, which also handles inserts. It happens immediately after the number of inserted keys reaches a specific upper limit decided based on a current size of the hash table. There may still be unprocessed entries from the input mini-batch after resizing, so the 2nd pass of the lookup is restarted right after, with the bigger hash table and the remaining subset of unprocessed entries. + +Current policy, that should work reasonably well, is to resize a small hash table (up to 8KB) when it is 50% full. Larger hash tables are resized when 75% full. We want to keep size in memory as small as possible, while maintaining a low probability of blocks becoming full. + +When discussing resizing we will be talking about **resize source** and **resize target** tables. The diagram below shows how the same hash bits are interpreted differently by the source and the target. 
+ +![alt text](img/key_map_10.jpg) + +For a given hash, if a start block id was L in the source table, it will be either (2\*L+0) or (2\*L+1) in the target table. Based on that we can expect data access locality when migrating the data between the tables. + +Resizing is cheap also thanks to the fact that hash values for keys in the hash table are kept together with other slot data and do not need to be recomputed. That means that resizing procedure does not ever need to access the actual bytes of the key. + +### 1st pass + +Based on the hash value for a given slot we can tell whether this slot contains an overflow or non-overflow entry. In the first pass we go over all source slots in sequence, filter out overflow entries and move to the target table all other entries. Non-overflow entries from a block L will be distributed between blocks (2\*L+0) and (2\*L+1) of the target table. None of these target blocks can overflow, since they will be accommodating at most 8 input entries during this pass. + +For every non-overflow entry, the highest bit of a stamp in the source slot decides whether it will go to the left or to the right target block. It is further possible to avoid any conditional branches in this partitioning code, so that the result is friendly to the CPU execution pipeline. + +![alt text](img/key_map_11.jpg) + + +### 2nd pass + +In the second pass of resizing, we scan all source slots again, this time focusing only on the overflow entries that were all skipped in the 1st pass. We simply reinsert them in the target table using generic insertion code with one exception. Since we know that all the source keys are different, there is no need to search for a matching stamp or run key comparisons (or look at the key values). We just need to find the first open block beginning with the start block in the target table and use its first empty slot as the insert destination. 
+ +We expect overflow entries to be rare and therefore the relative cost of that pass should stay low. + diff --git a/cpp/src/arrow/compute/exec/exec_plan.cc b/cpp/src/arrow/compute/exec/exec_plan.cc new file mode 100644 index 00000000000..4a4758c8471 --- /dev/null +++ b/cpp/src/arrow/compute/exec/exec_plan.cc @@ -0,0 +1,1312 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/compute/exec/exec_plan.h" + +#include +#include +#include +#include + +#include "arrow/array/concatenate.h" +#include "arrow/array/util.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/exec/expression.h" +#include "arrow/compute/exec_internal.h" +#include "arrow/compute/registry.h" +#include "arrow/datum.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/util/optional.h" +#include "arrow/util/task_group.h" +#include "arrow/util/thread_pool.h" +#include "arrow/util/unreachable.h" +#include "arrow/util/vector.h" + +namespace arrow { + +using BitUtil::CountLeadingZeros; +using internal::checked_cast; +using internal::checked_pointer_cast; + +namespace compute { + +namespace { + +struct ExecPlanImpl : public ExecPlan { + explicit ExecPlanImpl(ExecContext* exec_context) : ExecPlan(exec_context) {} + + ~ExecPlanImpl() override { + if (started_ && !finished_.is_finished()) { + ARROW_LOG(WARNING) << "Plan was destroyed before finishing"; + StopProducing(); + finished().Wait(); + } + } + + ExecNode* AddNode(std::unique_ptr node) { + if (node->num_inputs() == 0) { + sources_.push_back(node.get()); + } + if (node->num_outputs() == 0) { + sinks_.push_back(node.get()); + } + nodes_.push_back(std::move(node)); + return nodes_.back().get(); + } + + Status Validate() const { + if (nodes_.empty()) { + return Status::Invalid("ExecPlan has no node"); + } + for (const auto& node : nodes_) { + RETURN_NOT_OK(node->Validate()); + } + return Status::OK(); + } + + Status StartProducing() { + if (started_) { + return Status::Invalid("restarted ExecPlan"); + } + started_ = true; + + // producers precede consumers + sorted_nodes_ = TopoSort(); + + std::vector> futures; + + Status st = Status::OK(); + + using rev_it = std::reverse_iterator; + for 
(rev_it it(sorted_nodes_.end()), end(sorted_nodes_.begin()); it != end; ++it) { + auto node = *it; + + st = node->StartProducing(); + if (!st.ok()) { + // Stop nodes that successfully started, in reverse order + stopped_ = true; + StopProducingImpl(it.base(), sorted_nodes_.end()); + break; + } + + futures.push_back(node->finished()); + } + + finished_ = AllComplete(std::move(futures)); + return st; + } + + void StopProducing() { + DCHECK(started_) << "stopped an ExecPlan which never started"; + stopped_ = true; + + StopProducingImpl(sorted_nodes_.begin(), sorted_nodes_.end()); + } + + template + void StopProducingImpl(It begin, It end) { + for (auto it = begin; it != end; ++it) { + auto node = *it; + node->StopProducing(); + } + } + + NodeVector TopoSort() { + struct Impl { + const std::vector>& nodes; + std::unordered_set visited; + NodeVector sorted; + + explicit Impl(const std::vector>& nodes) : nodes(nodes) { + visited.reserve(nodes.size()); + sorted.resize(nodes.size()); + + for (const auto& node : nodes) { + Visit(node.get()); + } + + DCHECK_EQ(visited.size(), nodes.size()); + } + + void Visit(ExecNode* node) { + if (visited.count(node) != 0) return; + + for (auto input : node->inputs()) { + // Ensure that producers are inserted before this consumer + Visit(input); + } + + sorted[visited.size()] = node; + visited.insert(node); + } + }; + + return std::move(Impl{nodes_}.sorted); + } + + Future<> finished_ = Future<>::MakeFinished(); + bool started_ = false, stopped_ = false; + std::vector> nodes_; + NodeVector sources_, sinks_; + NodeVector sorted_nodes_; +}; + +ExecPlanImpl* ToDerived(ExecPlan* ptr) { return checked_cast(ptr); } + +const ExecPlanImpl* ToDerived(const ExecPlan* ptr) { + return checked_cast(ptr); +} + +util::optional GetNodeIndex(const std::vector& nodes, + const ExecNode* node) { + for (int i = 0; i < static_cast(nodes.size()); ++i) { + if (nodes[i] == node) return i; + } + return util::nullopt; +} + +} // namespace + +Result> 
ExecPlan::Make(ExecContext* ctx) { + return std::shared_ptr(new ExecPlanImpl{ctx}); +} + +ExecNode* ExecPlan::AddNode(std::unique_ptr node) { + return ToDerived(this)->AddNode(std::move(node)); +} + +const ExecPlan::NodeVector& ExecPlan::sources() const { + return ToDerived(this)->sources_; +} + +const ExecPlan::NodeVector& ExecPlan::sinks() const { return ToDerived(this)->sinks_; } + +Status ExecPlan::Validate() { return ToDerived(this)->Validate(); } + +Status ExecPlan::StartProducing() { return ToDerived(this)->StartProducing(); } + +void ExecPlan::StopProducing() { ToDerived(this)->StopProducing(); } + +Future<> ExecPlan::finished() { return ToDerived(this)->finished_; } + +ExecNode::ExecNode(ExecPlan* plan, std::string label, NodeVector inputs, + std::vector input_labels, + std::shared_ptr output_schema, int num_outputs) + : plan_(plan), + label_(std::move(label)), + inputs_(std::move(inputs)), + input_labels_(std::move(input_labels)), + output_schema_(std::move(output_schema)), + num_outputs_(num_outputs) { + for (auto input : inputs_) { + input->outputs_.push_back(this); + } +} + +Status ExecNode::Validate() const { + if (inputs_.size() != input_labels_.size()) { + return Status::Invalid("Invalid number of inputs for '", label(), "' (expected ", + num_inputs(), ", actual ", input_labels_.size(), ")"); + } + + if (static_cast(outputs_.size()) != num_outputs_) { + return Status::Invalid("Invalid number of outputs for '", label(), "' (expected ", + num_outputs(), ", actual ", outputs_.size(), ")"); + } + + for (auto out : outputs_) { + auto input_index = GetNodeIndex(out->inputs(), this); + if (!input_index) { + return Status::Invalid("Node '", label(), "' outputs to node '", out->label(), + "' but is not listed as an input."); + } + } + + return Status::OK(); +} + +bool ExecNode::ErrorIfNotOk(Status status) { + if (status.ok()) return false; + + for (auto out : outputs_) { + out->ErrorReceived(this, out == outputs_.back() ? 
std::move(status) : status); + } + return true; +} + +struct SourceNode : ExecNode { + SourceNode(ExecPlan* plan, std::string label, std::shared_ptr output_schema, + AsyncGenerator> generator) + : ExecNode(plan, std::move(label), {}, {}, std::move(output_schema), + /*num_outputs=*/1), + generator_(std::move(generator)) {} + + const char* kind_name() override { return "SourceNode"; } + + [[noreturn]] static void NoInputs() { + Unreachable("no inputs; this should never be called"); + } + [[noreturn]] void InputReceived(ExecNode*, int, ExecBatch) override { NoInputs(); } + [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); } + [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); } + + Status StartProducing() override { + DCHECK(!stop_requested_) << "Restarted SourceNode"; + + CallbackOptions options; + if (auto executor = plan()->exec_context()->executor()) { + // These options will transfer execution to the desired Executor if necessary. + // This can happen for in-memory scans where batches didn't require + // any CPU work to decode. Otherwise, parsing etc should have already + // been placed us on the desired Executor and no queues will be pushed to. 
+ options.executor = executor; + options.should_schedule = ShouldSchedule::IfDifferentExecutor; + } + + finished_ = Loop([this, options] { + std::unique_lock lock(mutex_); + int seq = batch_count_++; + if (stop_requested_) { + return Future>::MakeFinished(Break(seq)); + } + lock.unlock(); + + return generator_().Then( + [=](const util::optional& batch) -> ControlFlow { + std::unique_lock lock(mutex_); + if (IsIterationEnd(batch) || stop_requested_) { + stop_requested_ = true; + return Break(seq); + } + lock.unlock(); + + outputs_[0]->InputReceived(this, seq, *batch); + return Continue(); + }, + [=](const Status& error) -> ControlFlow { + // NB: ErrorReceived is independent of InputFinished, but + // ErrorReceived will usually prompt StopProducing which will + // prompt InputFinished. ErrorReceived may still be called from a + // node which was requested to stop (indeed, the request to stop + // may prompt an error). + std::unique_lock lock(mutex_); + stop_requested_ = true; + lock.unlock(); + outputs_[0]->ErrorReceived(this, error); + return Break(seq); + }, + options); + }).Then([&](int seq) { outputs_[0]->InputFinished(this, seq); }); + + return Status::OK(); + } + + void PauseProducing(ExecNode* output) override {} + + void ResumeProducing(ExecNode* output) override {} + + void StopProducing(ExecNode* output) override { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); + } + + void StopProducing() override { + std::unique_lock lock(mutex_); + stop_requested_ = true; + } + + Future<> finished() override { return finished_; } + + private: + std::mutex mutex_; + bool stop_requested_{false}; + int batch_count_{0}; + Future<> finished_ = Future<>::MakeFinished(); + AsyncGenerator> generator_; +}; + +ExecNode* MakeSourceNode(ExecPlan* plan, std::string label, + std::shared_ptr output_schema, + AsyncGenerator> generator) { + return plan->EmplaceNode(plan, std::move(label), std::move(output_schema), + std::move(generator)); +} + +struct FilterNode : ExecNode { + 
FilterNode(ExecNode* input, std::string label, Expression filter) + : ExecNode(input->plan(), std::move(label), {input}, {"target"}, + /*output_schema=*/input->output_schema(), + /*num_outputs=*/1), + filter_(std::move(filter)) {} + + const char* kind_name() override { return "FilterNode"; } + + Result DoFilter(const ExecBatch& target) { + ARROW_ASSIGN_OR_RAISE(Expression simplified_filter, + SimplifyWithGuarantee(filter_, target.guarantee)); + + ARROW_ASSIGN_OR_RAISE(Datum mask, ExecuteScalarExpression(simplified_filter, target, + plan()->exec_context())); + + if (mask.is_scalar()) { + const auto& mask_scalar = mask.scalar_as(); + if (mask_scalar.is_valid && mask_scalar.value) { + return target; + } + + return target.Slice(0, 0); + } + + // if the values are all scalar then the mask must also be + DCHECK(!std::all_of(target.values.begin(), target.values.end(), + [](const Datum& value) { return value.is_scalar(); })); + + auto values = target.values; + for (auto& value : values) { + if (value.is_scalar()) continue; + ARROW_ASSIGN_OR_RAISE(value, Filter(value, mask, FilterOptions::Defaults())); + } + return ExecBatch::Make(std::move(values)); + } + + void InputReceived(ExecNode* input, int seq, ExecBatch batch) override { + DCHECK_EQ(input, inputs_[0]); + + auto maybe_filtered = DoFilter(std::move(batch)); + if (ErrorIfNotOk(maybe_filtered.status())) return; + + maybe_filtered->guarantee = batch.guarantee; + outputs_[0]->InputReceived(this, seq, maybe_filtered.MoveValueUnsafe()); + } + + void ErrorReceived(ExecNode* input, Status error) override { + DCHECK_EQ(input, inputs_[0]); + outputs_[0]->ErrorReceived(this, std::move(error)); + } + + void InputFinished(ExecNode* input, int seq) override { + DCHECK_EQ(input, inputs_[0]); + outputs_[0]->InputFinished(this, seq); + } + + Status StartProducing() override { return Status::OK(); } + + void PauseProducing(ExecNode* output) override {} + + void ResumeProducing(ExecNode* output) override {} + + void 
StopProducing(ExecNode* output) override { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); + } + + void StopProducing() override { inputs_[0]->StopProducing(this); } + + Future<> finished() override { return inputs_[0]->finished(); } + + private: + Expression filter_; +}; + +Result MakeFilterNode(ExecNode* input, std::string label, Expression filter) { + if (!filter.IsBound()) { + ARROW_ASSIGN_OR_RAISE(filter, filter.Bind(*input->output_schema())); + } + + if (filter.type()->id() != Type::BOOL) { + return Status::TypeError("Filter expression must evaluate to bool, but ", + filter.ToString(), " evaluates to ", + filter.type()->ToString()); + } + + return input->plan()->EmplaceNode(input, std::move(label), + std::move(filter)); +} + +struct ProjectNode : ExecNode { + ProjectNode(ExecNode* input, std::string label, std::shared_ptr output_schema, + std::vector exprs) + : ExecNode(input->plan(), std::move(label), {input}, {"target"}, + /*output_schema=*/std::move(output_schema), + /*num_outputs=*/1), + exprs_(std::move(exprs)) {} + + const char* kind_name() override { return "ProjectNode"; } + + Result DoProject(const ExecBatch& target) { + std::vector values{exprs_.size()}; + for (size_t i = 0; i < exprs_.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(Expression simplified_expr, + SimplifyWithGuarantee(exprs_[i], target.guarantee)); + + ARROW_ASSIGN_OR_RAISE(values[i], ExecuteScalarExpression(simplified_expr, target, + plan()->exec_context())); + } + return ExecBatch{std::move(values), target.length}; + } + + void InputReceived(ExecNode* input, int seq, ExecBatch batch) override { + DCHECK_EQ(input, inputs_[0]); + + auto maybe_projected = DoProject(std::move(batch)); + if (ErrorIfNotOk(maybe_projected.status())) return; + + maybe_projected->guarantee = batch.guarantee; + outputs_[0]->InputReceived(this, seq, maybe_projected.MoveValueUnsafe()); + } + + void ErrorReceived(ExecNode* input, Status error) override { + DCHECK_EQ(input, inputs_[0]); + 
outputs_[0]->ErrorReceived(this, std::move(error)); + } + + void InputFinished(ExecNode* input, int seq) override { + DCHECK_EQ(input, inputs_[0]); + outputs_[0]->InputFinished(this, seq); + } + + Status StartProducing() override { return Status::OK(); } + + void PauseProducing(ExecNode* output) override {} + + void ResumeProducing(ExecNode* output) override {} + + void StopProducing(ExecNode* output) override { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); + } + + void StopProducing() override { inputs_[0]->StopProducing(this); } + + Future<> finished() override { return inputs_[0]->finished(); } + + private: + std::vector exprs_; +}; + +Result MakeProjectNode(ExecNode* input, std::string label, + std::vector exprs, + std::vector names) { + FieldVector fields(exprs.size()); + + if (names.size() == 0) { + names.resize(exprs.size()); + for (size_t i = 0; i < exprs.size(); ++i) { + names[i] = exprs[i].ToString(); + } + } + + int i = 0; + for (auto& expr : exprs) { + if (!expr.IsBound()) { + ARROW_ASSIGN_OR_RAISE(expr, expr.Bind(*input->output_schema())); + } + fields[i] = field(std::move(names[i]), expr.type()); + ++i; + } + + return input->plan()->EmplaceNode( + input, std::move(label), schema(std::move(fields)), std::move(exprs)); +} + +class AtomicCounter { + public: + AtomicCounter() = default; + + int count() const { return count_.load(); } + + util::optional total() const { + int total = total_.load(); + if (total == -1) return {}; + return total; + } + + // return true if the counter is complete + bool Increment() { + DCHECK_NE(count_.load(), total_.load()); + int count = count_.fetch_add(1) + 1; + if (count != total_.load()) return false; + return DoneOnce(); + } + + // return true if the counter is complete + bool SetTotal(int total) { + total_.store(total); + if (count_.load() != total) return false; + return DoneOnce(); + } + + // return true if the counter has not already been completed + bool Cancel() { return DoneOnce(); } + + private: + // 
ensure there is only one true return from Increment(), SetTotal(), or Cancel() + bool DoneOnce() { + bool expected = false; + return complete_.compare_exchange_strong(expected, true); + } + + std::atomic count_{0}, total_{-1}; + std::atomic complete_{false}; +}; + +struct SinkNode : ExecNode { + SinkNode(ExecNode* input, std::string label, + AsyncGenerator>* generator) + : ExecNode(input->plan(), std::move(label), {input}, {"collected"}, {}, + /*num_outputs=*/0), + producer_(MakeProducer(generator)) {} + + static PushGenerator>::Producer MakeProducer( + AsyncGenerator>* out_gen) { + PushGenerator> gen; + auto out = gen.producer(); + *out_gen = std::move(gen); + return out; + } + + const char* kind_name() override { return "SinkNode"; } + + Status StartProducing() override { + finished_ = Future<>::Make(); + return Status::OK(); + } + + // sink nodes have no outputs from which to feel backpressure + [[noreturn]] static void NoOutputs() { + Unreachable("no outputs; this should never be called"); + } + [[noreturn]] void ResumeProducing(ExecNode* output) override { NoOutputs(); } + [[noreturn]] void PauseProducing(ExecNode* output) override { NoOutputs(); } + [[noreturn]] void StopProducing(ExecNode* output) override { NoOutputs(); } + + void StopProducing() override { + Finish(); + inputs_[0]->StopProducing(this); + } + + Future<> finished() override { return finished_; } + + void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) override { + DCHECK_EQ(input, inputs_[0]); + + bool did_push = producer_.Push(std::move(batch)); + if (!did_push) return; // producer_ was Closed already + + if (auto total = input_counter_.total()) { + DCHECK_LE(seq_num, *total); + } + + if (input_counter_.Increment()) { + Finish(); + } + } + + void ErrorReceived(ExecNode* input, Status error) override { + DCHECK_EQ(input, inputs_[0]); + + producer_.Push(std::move(error)); + + if (input_counter_.Cancel()) { + Finish(); + } + inputs_[0]->StopProducing(this); + } + + void 
InputFinished(ExecNode* input, int seq_stop) override { + if (input_counter_.SetTotal(seq_stop)) { + Finish(); + } + } + + private: + void Finish() { + if (producer_.Close()) { + finished_.MarkFinished(); + } + } + + AtomicCounter input_counter_; + Future<> finished_ = Future<>::MakeFinished(); + + PushGenerator>::Producer producer_; +}; + +AsyncGenerator> MakeSinkNode(ExecNode* input, + std::string label) { + AsyncGenerator> out; + (void)input->plan()->EmplaceNode(input, std::move(label), &out); + return out; +} + +std::shared_ptr MakeGeneratorReader( + std::shared_ptr schema, + std::function>()> gen, MemoryPool* pool) { + struct Impl : RecordBatchReader { + std::shared_ptr schema() const override { return schema_; } + + Status ReadNext(std::shared_ptr* record_batch) override { + ARROW_ASSIGN_OR_RAISE(auto batch, iterator_.Next()); + if (batch) { + ARROW_ASSIGN_OR_RAISE(*record_batch, batch->ToRecordBatch(schema_, pool_)); + } else { + *record_batch = IterationEnd>(); + } + return Status::OK(); + } + + MemoryPool* pool_; + std::shared_ptr schema_; + Iterator> iterator_; + }; + + auto out = std::make_shared(); + out->pool_ = pool; + out->schema_ = std::move(schema); + out->iterator_ = MakeGeneratorIterator(std::move(gen)); + return out; +} + +class ThreadIndexer { + public: + size_t operator()() { + auto id = std::this_thread::get_id(); + + std::unique_lock lock(mutex_); + const auto& id_index = *id_to_index_.emplace(id, id_to_index_.size()).first; + + return Check(id_index.second); + } + + static size_t Capacity() { + static size_t max_size = arrow::internal::ThreadPool::DefaultCapacity(); + return max_size; + } + + private: + size_t Check(size_t thread_index) { + DCHECK_LT(thread_index, Capacity()) << "thread index " << thread_index + << " is out of range [0, " << Capacity() << ")"; + + return thread_index; + } + + std::mutex mutex_; + std::unordered_map id_to_index_; +}; + +struct ScalarAggregateNode : ExecNode { + ScalarAggregateNode(ExecNode* input, 
std::string label, + std::shared_ptr output_schema, + std::vector kernels, + std::vector argument_indices, + std::vector>> states) + : ExecNode(input->plan(), std::move(label), {input}, {"target"}, + /*output_schema=*/std::move(output_schema), + /*num_outputs=*/1), + kernels_(std::move(kernels)), + argument_indices_(std::move(argument_indices)), + states_(std::move(states)) {} + + const char* kind_name() override { return "ScalarAggregateNode"; } + + Status DoConsume(const ExecBatch& batch, size_t thread_index) { + for (size_t i = 0; i < kernels_.size(); ++i) { + KernelContext batch_ctx{plan()->exec_context()}; + batch_ctx.SetState(states_[i][thread_index].get()); + + ExecBatch single_column_batch{{batch[argument_indices_[i]]}, batch.length}; + RETURN_NOT_OK(kernels_[i]->consume(&batch_ctx, single_column_batch)); + } + return Status::OK(); + } + + void InputReceived(ExecNode* input, int seq, ExecBatch batch) override { + DCHECK_EQ(input, inputs_[0]); + + auto thread_index = get_thread_index_(); + + if (ErrorIfNotOk(DoConsume(std::move(batch), thread_index))) return; + + if (input_counter_.Increment()) { + ErrorIfNotOk(Finish()); + } + } + + void ErrorReceived(ExecNode* input, Status error) override { + DCHECK_EQ(input, inputs_[0]); + outputs_[0]->ErrorReceived(this, std::move(error)); + } + + void InputFinished(ExecNode* input, int num_total) override { + DCHECK_EQ(input, inputs_[0]); + + if (input_counter_.SetTotal(num_total)) { + ErrorIfNotOk(Finish()); + } + } + + Status StartProducing() override { + finished_ = Future<>::Make(); + // Scalar aggregates will only output a single batch + outputs_[0]->InputFinished(this, 1); + return Status::OK(); + } + + void PauseProducing(ExecNode* output) override {} + + void ResumeProducing(ExecNode* output) override {} + + void StopProducing(ExecNode* output) override { + DCHECK_EQ(output, outputs_[0]); + StopProducing(); + } + + void StopProducing() override { + if (input_counter_.Cancel()) { + finished_.MarkFinished(); + } 
+ inputs_[0]->StopProducing(this); + } + + Future<> finished() override { return finished_; } + + private: + Status Finish() { + ExecBatch batch{{}, 1}; + batch.values.resize(kernels_.size()); + + for (size_t i = 0; i < kernels_.size(); ++i) { + KernelContext ctx{plan()->exec_context()}; + ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll( + kernels_[i], &ctx, std::move(states_[i]))); + RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i])); + } + + outputs_[0]->InputReceived(this, 0, std::move(batch)); + finished_.MarkFinished(); + return Status::OK(); + } + + Future<> finished_ = Future<>::MakeFinished(); + const std::vector kernels_; + const std::vector argument_indices_; + + std::vector>> states_; + + ThreadIndexer get_thread_index_; + AtomicCounter input_counter_; +}; + +Result MakeScalarAggregateNode(ExecNode* input, std::string label, + std::vector aggregates, + std::vector arguments, + std::vector out_field_names) { + if (aggregates.size() != arguments.size()) { + return Status::Invalid("Provided ", aggregates.size(), " aggregates but ", + arguments.size(), " arguments."); + } + + if (aggregates.size() != out_field_names.size()) { + return Status::Invalid("Provided ", aggregates.size(), " aggregates but ", + out_field_names.size(), " field names for the output."); + } + + auto exec_ctx = input->plan()->exec_context(); + + std::vector kernels(aggregates.size()); + std::vector>> states(kernels.size()); + FieldVector fields(kernels.size()); + std::vector argument_indices(kernels.size()); + + for (size_t i = 0; i < kernels.size(); ++i) { + if (!arguments[i].IsName()) { + return Status::NotImplemented("Non name field refs"); + } + ARROW_ASSIGN_OR_RAISE(auto match, + arguments[i].FindOneOrNone(*input->output_schema())); + argument_indices[i] = match[0]; + + ARROW_ASSIGN_OR_RAISE(auto function, + exec_ctx->func_registry()->GetFunction(aggregates[i].function)); + + if (function->kind() != Function::SCALAR_AGGREGATE) { + return 
Status::Invalid("Provided non ScalarAggregateFunction ", + aggregates[i].function); + } + + auto in_type = ValueDescr::Array(input->output_schema()->fields()[i]->type()); + + ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, function->DispatchExact({in_type})); + kernels[i] = static_cast(kernel); + + if (aggregates[i].options == nullptr) { + aggregates[i].options = function->default_options(); + } + + KernelContext kernel_ctx{exec_ctx}; + states[i].resize(ThreadIndexer::Capacity()); + RETURN_NOT_OK(Kernel::InitAll(&kernel_ctx, + KernelInitArgs{kernels[i], + { + in_type, + }, + aggregates[i].options}, + &states[i])); + + // pick one to resolve the kernel signature + kernel_ctx.SetState(states[i][0].get()); + ARROW_ASSIGN_OR_RAISE( + auto descr, kernels[i]->signature->out_type().Resolve(&kernel_ctx, {in_type})); + + fields[i] = field(std::move(out_field_names[i]), std::move(descr.type)); + } + + return input->plan()->EmplaceNode( + input, std::move(label), schema(std::move(fields)), std::move(kernels), + std::move(argument_indices), std::move(states)); +} + +namespace internal { + +Result> GetKernels( + ExecContext* ctx, const std::vector& aggregates, + const std::vector& in_descrs); + +Result>> InitKernels( + const std::vector& kernels, ExecContext* ctx, + const std::vector& aggregates, + const std::vector& in_descrs); + +Result ResolveKernels( + const std::vector& aggregates, + const std::vector& kernels, + const std::vector>& states, ExecContext* ctx, + const std::vector& descrs); + +} // namespace internal + +struct GroupByNode : ExecNode { + GroupByNode(ExecNode* input, std::string label, std::shared_ptr output_schema, + ExecContext* ctx, const std::vector&& key_field_ids, + const std::vector&& agg_src_field_ids, + const std::vector&& aggs, + const std::vector&& agg_kernels) + : ExecNode(input->plan(), std::move(label), {input}, {"groupby"}, + std::move(output_schema), /*num_outputs=*/1), + ctx_(ctx), + key_field_ids_(std::move(key_field_ids)), + 
agg_src_field_ids_(std::move(agg_src_field_ids)), + aggs_(std::move(aggs)), + agg_kernels_(std::move(agg_kernels)) {} + + const char* kind_name() override { return "GroupByNode"; } + + Status Consume(ExecBatch batch) { + size_t thread_index = get_thread_index_(); + if (thread_index >= local_states_.size()) { + return Status::IndexError("thread index ", thread_index, " is out of range [0, ", + local_states_.size(), ")"); + } + + auto state = &local_states_[thread_index]; + RETURN_NOT_OK(InitLocalStateIfNeeded(state)); + + // Create a batch with key columns + std::vector keys(key_field_ids_.size()); + for (size_t i = 0; i < key_field_ids_.size(); ++i) { + keys[i] = batch.values[key_field_ids_[i]]; + } + ARROW_ASSIGN_OR_RAISE(ExecBatch key_batch, ExecBatch::Make(keys)); + + // Create a batch with group ids + ARROW_ASSIGN_OR_RAISE(Datum id_batch, state->grouper->Consume(key_batch)); + + // Execute aggregate kernels + for (size_t i = 0; i < agg_kernels_.size(); ++i) { + KernelContext kernel_ctx{ctx_}; + kernel_ctx.SetState(state->agg_states[i].get()); + + ARROW_ASSIGN_OR_RAISE( + auto agg_batch, + ExecBatch::Make({batch.values[agg_src_field_ids_[i]], id_batch})); + + RETURN_NOT_OK(agg_kernels_[i]->resize(&kernel_ctx, state->grouper->num_groups())); + RETURN_NOT_OK(agg_kernels_[i]->consume(&kernel_ctx, agg_batch)); + } + + return Status::OK(); + } + + Status Merge() { + ThreadLocalState* state0 = &local_states_[0]; + for (size_t i = 1; i < local_states_.size(); ++i) { + ThreadLocalState* state = &local_states_[i]; + if (!state->grouper) { + continue; + } + + ARROW_ASSIGN_OR_RAISE(ExecBatch other_keys, state->grouper->GetUniques()); + ARROW_ASSIGN_OR_RAISE(Datum transposition, state0->grouper->Consume(other_keys)); + state->grouper.reset(); + + for (size_t i = 0; i < agg_kernels_.size(); ++i) { + KernelContext batch_ctx{ctx_}; + DCHECK(state0->agg_states[i]); + batch_ctx.SetState(state0->agg_states[i].get()); + + RETURN_NOT_OK(agg_kernels_[i]->resize(&batch_ctx, 
state0->grouper->num_groups())); + RETURN_NOT_OK(agg_kernels_[i]->merge(&batch_ctx, std::move(*state->agg_states[i]), + *transposition.array())); + state->agg_states[i].reset(); + } + } + return Status::OK(); + } + + Result Finalize() { + ThreadLocalState* state = &local_states_[0]; + + ExecBatch out_data{{}, state->grouper->num_groups()}; + out_data.values.resize(agg_kernels_.size() + key_field_ids_.size()); + + // Aggregate fields come before key fields to match the behavior of GroupBy function + for (size_t i = 0; i < agg_kernels_.size(); ++i) { + KernelContext batch_ctx{ctx_}; + batch_ctx.SetState(state->agg_states[i].get()); + RETURN_NOT_OK(agg_kernels_[i]->finalize(&batch_ctx, &out_data.values[i])); + state->agg_states[i].reset(); + } + + ARROW_ASSIGN_OR_RAISE(ExecBatch out_keys, state->grouper->GetUniques()); + std::move(out_keys.values.begin(), out_keys.values.end(), + out_data.values.begin() + agg_kernels_.size()); + state->grouper.reset(); + + if (output_counter_.SetTotal( + static_cast(BitUtil::CeilDiv(out_data.length, output_batch_size())))) { + // this will be hit if out_data.length == 0 + finished_.MarkFinished(); + } + return out_data; + } + + void OutputNthBatch(int n) { + // bail if StopProducing was called + if (finished_.is_finished()) return; + + int64_t batch_size = output_batch_size(); + outputs_[0]->InputReceived(this, n, out_data_.Slice(batch_size * n, batch_size)); + + if (output_counter_.Increment()) { + finished_.MarkFinished(); + } + } + + Status OutputResult() { + RETURN_NOT_OK(Merge()); + ARROW_ASSIGN_OR_RAISE(out_data_, Finalize()); + + int num_output_batches = *output_counter_.total(); + outputs_[0]->InputFinished(this, num_output_batches); + + auto executor = ctx_->executor(); + for (int i = 0; i < num_output_batches; ++i) { + if (executor) { + // bail if StopProducing was called + if (finished_.is_finished()) break; + + RETURN_NOT_OK(executor->Spawn([this, i] { OutputNthBatch(i); })); + } else { + OutputNthBatch(i); + } + } + + 
return Status::OK(); + } + + void InputReceived(ExecNode* input, int seq, ExecBatch batch) override { + // bail if StopProducing was called + if (finished_.is_finished()) return; + + DCHECK_EQ(input, inputs_[0]); + + if (ErrorIfNotOk(Consume(std::move(batch)))) return; + + if (input_counter_.Increment()) { + ErrorIfNotOk(OutputResult()); + } + } + + void ErrorReceived(ExecNode* input, Status error) override { + DCHECK_EQ(input, inputs_[0]); + + outputs_[0]->ErrorReceived(this, std::move(error)); + } + + void InputFinished(ExecNode* input, int num_total) override { + // bail if StopProducing was called + if (finished_.is_finished()) return; + + DCHECK_EQ(input, inputs_[0]); + + if (input_counter_.SetTotal(num_total)) { + ErrorIfNotOk(OutputResult()); + } + } + + Status StartProducing() override { + finished_ = Future<>::Make(); + + local_states_.resize(ThreadIndexer::Capacity()); + return Status::OK(); + } + + void PauseProducing(ExecNode* output) override {} + + void ResumeProducing(ExecNode* output) override {} + + void StopProducing(ExecNode* output) override { + DCHECK_EQ(output, outputs_[0]); + + if (input_counter_.Cancel()) { + finished_.MarkFinished(); + } else if (output_counter_.Cancel()) { + finished_.MarkFinished(); + } + inputs_[0]->StopProducing(this); + } + + void StopProducing() override { StopProducing(outputs_[0]); } + + Future<> finished() override { return finished_; } + + private: + struct ThreadLocalState { + std::unique_ptr grouper; + std::vector> agg_states; + }; + + ThreadLocalState* GetLocalState() { + size_t thread_index = get_thread_index_(); + return &local_states_[thread_index]; + } + + Status InitLocalStateIfNeeded(ThreadLocalState* state) { + // Get input schema + auto input_schema = inputs_[0]->output_schema(); + + if (state->grouper != nullptr) return Status::OK(); + + // Build vector of key field data types + std::vector key_descrs(key_field_ids_.size()); + for (size_t i = 0; i < key_field_ids_.size(); ++i) { + auto key_field_id = 
key_field_ids_[i]; + key_descrs[i] = ValueDescr(input_schema->field(key_field_id)->type()); + } + + // Construct grouper + ARROW_ASSIGN_OR_RAISE(state->grouper, internal::Grouper::Make(key_descrs, ctx_)); + + // Build vector of aggregate source field data types + std::vector agg_src_descrs(agg_kernels_.size()); + for (size_t i = 0; i < agg_kernels_.size(); ++i) { + auto agg_src_field_id = agg_src_field_ids_[i]; + agg_src_descrs[i] = + ValueDescr(input_schema->field(agg_src_field_id)->type(), ValueDescr::ARRAY); + } + + ARROW_ASSIGN_OR_RAISE( + state->agg_states, + internal::InitKernels(agg_kernels_, ctx_, aggs_, agg_src_descrs)); + + return Status::OK(); + } + + int output_batch_size() const { + int result = static_cast(ctx_->exec_chunksize()); + if (result < 0) { + result = 32 * 1024; + } + return result; + } + + ExecContext* ctx_; + Future<> finished_ = Future<>::MakeFinished(); + + const std::vector key_field_ids_; + const std::vector agg_src_field_ids_; + const std::vector aggs_; + const std::vector agg_kernels_; + + ThreadIndexer get_thread_index_; + AtomicCounter input_counter_, output_counter_; + + std::vector local_states_; + ExecBatch out_data_; +}; + +Result MakeGroupByNode(ExecNode* input, std::string label, + std::vector keys, + std::vector agg_srcs, + std::vector aggs) { + // Get input schema + auto input_schema = input->output_schema(); + + // Find input field indices for key fields + std::vector key_field_ids(keys.size()); + for (size_t i = 0; i < keys.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(auto match, FieldRef(keys[i]).FindOne(*input_schema)); + key_field_ids[i] = match[0]; + } + + // Find input field indices for aggregates + std::vector agg_src_field_ids(aggs.size()); + for (size_t i = 0; i < aggs.size(); ++i) { + ARROW_ASSIGN_OR_RAISE(auto match, FieldRef(agg_srcs[i]).FindOne(*input_schema)); + agg_src_field_ids[i] = match[0]; + } + + // Build vector of aggregate source field data types + DCHECK_EQ(agg_srcs.size(), aggs.size()); + std::vector 
agg_src_descrs(aggs.size()); + for (size_t i = 0; i < aggs.size(); ++i) { + auto agg_src_field_id = agg_src_field_ids[i]; + agg_src_descrs[i] = + ValueDescr(input_schema->field(agg_src_field_id)->type(), ValueDescr::ARRAY); + } + + auto ctx = input->plan()->exec_context(); + + // Construct aggregates + ARROW_ASSIGN_OR_RAISE(auto agg_kernels, + internal::GetKernels(ctx, aggs, agg_src_descrs)); + + ARROW_ASSIGN_OR_RAISE(auto agg_states, + internal::InitKernels(agg_kernels, ctx, aggs, agg_src_descrs)); + + ARROW_ASSIGN_OR_RAISE( + FieldVector agg_result_fields, + internal::ResolveKernels(aggs, agg_kernels, agg_states, ctx, agg_src_descrs)); + + // Build field vector for output schema + FieldVector output_fields{keys.size() + aggs.size()}; + + // Aggregate fields come before key fields to match the behavior of GroupBy function + for (size_t i = 0; i < aggs.size(); ++i) { + output_fields[i] = agg_result_fields[i]; + } + size_t base = aggs.size(); + for (size_t i = 0; i < keys.size(); ++i) { + int key_field_id = key_field_ids[i]; + output_fields[base + i] = input_schema->field(key_field_id); + } + + auto aggs_copy = aggs; + + return input->plan()->EmplaceNode( + input, std::move(label), schema(std::move(output_fields)), ctx, + std::move(key_field_ids), std::move(agg_src_field_ids), std::move(aggs), + std::move(agg_kernels)); +} + +Result GroupByUsingExecPlan(const std::vector& arguments, + const std::vector& keys, + const std::vector& aggregates, + bool use_threads, ExecContext* ctx) { + using arrow::compute::detail::ExecBatchIterator; + + FieldVector scan_fields(arguments.size() + keys.size()); + std::vector keys_str(keys.size()); + std::vector arguments_str(arguments.size()); + for (size_t i = 0; i < arguments.size(); ++i) { + arguments_str[i] = std::string("agg_") + std::to_string(i); + scan_fields[i] = field(arguments_str[i], arguments[i].type()); + } + for (size_t i = 0; i < keys.size(); ++i) { + keys_str[i] = std::string("key_") + std::to_string(i); + 
scan_fields[arguments.size() + i] = field(keys_str[i], keys[i].type()); + } + + std::vector scan_batches; + std::vector inputs; + for (const auto& argument : arguments) { + inputs.push_back(argument); + } + for (const auto& key : keys) { + inputs.push_back(key); + } + ARROW_ASSIGN_OR_RAISE(auto batch_iterator, + ExecBatchIterator::Make(inputs, ctx->exec_chunksize())); + ExecBatch batch; + while (batch_iterator->Next(&batch)) { + if (batch.length == 0) continue; + scan_batches.push_back(batch); + } + + ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make(ctx)); + auto source = MakeSourceNode( + plan.get(), "source", schema(std::move(scan_fields)), + MakeVectorGenerator(arrow::internal::MapVector( + [](ExecBatch batch) { return util::make_optional(std::move(batch)); }, + std::move(scan_batches)))); + + ARROW_ASSIGN_OR_RAISE( + auto gby, MakeGroupByNode(source, "gby", keys_str, arguments_str, aggregates)); + auto sink_gen = MakeSinkNode(gby, "sink"); + + RETURN_NOT_OK(plan->Validate()); + RETURN_NOT_OK(plan->StartProducing()); + + auto collected_fut = CollectAsyncGenerator(sink_gen); + + auto start_and_collect = + AllComplete({plan->finished(), Future<>(collected_fut)}) + .Then([collected_fut]() -> Result> { + ARROW_ASSIGN_OR_RAISE(auto collected, collected_fut.result()); + return ::arrow::internal::MapVector( + [](util::optional batch) { return std::move(*batch); }, + std::move(collected)); + }); + + std::vector output_batches = + start_and_collect.MoveResult().MoveValueUnsafe(); + + ArrayDataVector out_data(arguments.size() + keys.size()); + for (size_t i = 0; i < arguments.size() + keys.size(); ++i) { + std::vector> arrays(output_batches.size()); + for (size_t j = 0; j < output_batches.size(); ++j) { + arrays[j] = output_batches[j].values[i].make_array(); + } + ARROW_ASSIGN_OR_RAISE(auto concatenated_array, Concatenate(arrays)); + out_data[i] = concatenated_array->data(); + } + + int64_t length = out_data[0]->length; + return 
ArrayData::Make(struct_(gby->output_schema()->fields()), length, + {/*null_bitmap=*/nullptr}, std::move(out_data), + /*null_count=*/0); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/exec_plan.h b/cpp/src/arrow/compute/exec/exec_plan.h new file mode 100644 index 00000000000..fc3af92af4a --- /dev/null +++ b/cpp/src/arrow/compute/exec/exec_plan.h @@ -0,0 +1,307 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/type_fwd.h" +#include "arrow/util/macros.h" +#include "arrow/util/optional.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this { + public: + using NodeVector = std::vector; + + virtual ~ExecPlan() = default; + + ExecContext* exec_context() const { return exec_context_; } + + /// Make an empty exec plan + static Result> Make(ExecContext* = default_exec_context()); + + ExecNode* AddNode(std::unique_ptr node); + + template + Node* EmplaceNode(Args&&... 
args) { + std::unique_ptr node{new Node{std::forward(args)...}}; + auto out = node.get(); + AddNode(std::move(node)); + return out; + } + + /// The initial inputs + const NodeVector& sources() const; + + /// The final outputs + const NodeVector& sinks() const; + + Status Validate(); + + /// \brief Start producing on all nodes + /// + /// Nodes are started in reverse topological order, such that any node + /// is started before all of its inputs. + Status StartProducing(); + + /// \brief Stop producing on all nodes + /// + /// Nodes are stopped in topological order, such that any node + /// is stopped before all of its outputs. + void StopProducing(); + + /// \brief A future which will be marked finished when all nodes have stopped producing. + Future<> finished(); + + protected: + ExecContext* exec_context_; + explicit ExecPlan(ExecContext* exec_context) : exec_context_(exec_context) {} +}; + +class ARROW_EXPORT ExecNode { + public: + using NodeVector = std::vector; + + virtual ~ExecNode() = default; + + virtual const char* kind_name() = 0; + + // The number of inputs/outputs expected by this node + int num_inputs() const { return static_cast(inputs_.size()); } + int num_outputs() const { return num_outputs_; } + + /// This node's predecessors in the exec plan + const NodeVector& inputs() const { return inputs_; } + + /// \brief Labels identifying the function of each input. + const std::vector& input_labels() const { return input_labels_; } + + /// This node's successors in the exec plan + const NodeVector& outputs() const { return outputs_; } + + /// The datatypes for batches produced by this node + const std::shared_ptr& output_schema() const { return output_schema_; } + + /// This node's exec plan + ExecPlan* plan() { return plan_; } + + /// \brief An optional label, for display and debugging + /// + /// There is no guarantee that this value is non-empty or unique. 
+ const std::string& label() const { return label_; } + + Status Validate() const; + + /// Upstream API: + /// These functions are called by input nodes that want to inform this node + /// about an updated condition (a new input batch, an error, an impending + /// end of stream). + /// + /// Implementation rules: + /// - these may be called anytime after StartProducing() has succeeded + /// (and even during or after StopProducing()) + /// - these may be called concurrently + /// - these are allowed to call back into PauseProducing(), ResumeProducing() + /// and StopProducing() + + /// Transfer input batch to ExecNode + virtual void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) = 0; + + /// Signal error to ExecNode + virtual void ErrorReceived(ExecNode* input, Status error) = 0; + + /// Mark the inputs finished after the given number of batches. + /// + /// This may be called before all inputs are received. This simply fixes + /// the total number of incoming batches for an input, so that the ExecNode + /// knows when it has received all input, regardless of order. 
+ virtual void InputFinished(ExecNode* input, int seq_stop) = 0; + + /// Lifecycle API: + /// - start / stop to initiate and terminate production + /// - pause / resume to apply backpressure + /// + /// Implementation rules: + /// - StartProducing() should not recurse into the inputs, as it is + /// handled by ExecPlan::StartProducing() + /// - PauseProducing(), ResumeProducing(), StopProducing() may be called + /// concurrently (but only after StartProducing() has returned successfully) + /// - PauseProducing(), ResumeProducing(), StopProducing() may be called + /// by the downstream nodes' InputReceived(), ErrorReceived(), InputFinished() + /// methods + /// - StopProducing() should recurse into the inputs + /// - StopProducing() must be idempotent + + // XXX What happens if StartProducing() calls an output's InputReceived() + // synchronously, and InputReceived() decides to call back into StopProducing() + // (or PauseProducing()) because it received enough data? + // + // Right now, since synchronous calls happen in both directions (input to + // output and then output to input), a node must be careful to be reentrant + // against synchronous calls from its output, *and* also concurrent calls from + // other threads. The most reliable solution is to update the internal state + // first, and notify outputs only at the end. + // + // Alternate rules: + // - StartProducing(), ResumeProducing() can call synchronously into + // its outputs' consuming methods (InputReceived() etc.) 
+ // - InputReceived(), ErrorReceived(), InputFinished() can call asynchronously + // into its inputs' PauseProducing(), StopProducing() + // + // Alternate API: + // - InputReceived(), ErrorReceived(), InputFinished() return a ProductionHint + // enum: either None (default), PauseProducing, ResumeProducing, StopProducing + // - A method allows passing a ProductionHint asynchronously from an output node + // (replacing PauseProducing(), ResumeProducing(), StopProducing()) + + /// \brief Start producing + /// + /// This must only be called once. If this fails, then other lifecycle + /// methods must not be called. + /// + /// This is typically called automatically by ExecPlan::StartProducing(). + virtual Status StartProducing() = 0; + + /// \brief Pause producing temporarily + /// + /// This call is a hint that an output node is currently not willing + /// to receive data. + /// + /// This may be called any number of times after StartProducing() succeeds. + /// However, the node is still free to produce data (which may be difficult + /// to prevent anyway if data is produced using multiple threads). + virtual void PauseProducing(ExecNode* output) = 0; + + /// \brief Resume producing after a temporary pause + /// + /// This call is a hint that an output node is willing to receive data again. + /// + /// This may be called any number of times after StartProducing() succeeds. + /// This may also be called concurrently with PauseProducing(), which suggests + /// the implementation may use an atomic counter. + virtual void ResumeProducing(ExecNode* output) = 0; + + /// \brief Stop producing definitively to a single output + /// + /// This call is a hint that an output node has completed and is not willing + /// to receive any further data. 
+ virtual void StopProducing(ExecNode* output) = 0; + + /// \brief Stop producing definitively to all outputs + virtual void StopProducing() = 0; + + /// \brief A future which will be marked finished when this node has stopped producing. + virtual Future<> finished() = 0; + + protected: + ExecNode(ExecPlan* plan, std::string label, NodeVector inputs, + std::vector input_labels, std::shared_ptr output_schema, + int num_outputs); + + // A helper method to send an error status to all outputs. + // Returns true if the status was an error. + bool ErrorIfNotOk(Status status); + + ExecPlan* plan_; + std::string label_; + + NodeVector inputs_; + std::vector input_labels_; + + std::shared_ptr output_schema_; + int num_outputs_; + NodeVector outputs_; +}; + +/// \brief Adapt an AsyncGenerator as a source node +/// +/// plan->exec_context()->executor() is used to parallelize pushing to +/// outputs, if provided. +ARROW_EXPORT +ExecNode* MakeSourceNode(ExecPlan* plan, std::string label, + std::shared_ptr output_schema, + std::function>()>); + +/// \brief Add a sink node which forwards to an AsyncGenerator +/// +/// Emitted batches will not be ordered. +ARROW_EXPORT +std::function>()> MakeSinkNode(ExecNode* input, + std::string label); + +/// \brief Wrap an ExecBatch generator in a RecordBatchReader. +/// +/// The RecordBatchReader does not impose any ordering on emitted batches. +ARROW_EXPORT +std::shared_ptr MakeGeneratorReader( + std::shared_ptr, std::function>()>, + MemoryPool*); + +/// \brief Make a node which excludes some rows from batches passed through it +/// +/// The filter Expression will be evaluated against each batch which is pushed to +/// this node. Any rows for which the filter does not evaluate to `true` will be excluded +/// in the batch emitted by this node. +/// +/// If the filter is not already bound, it will be bound against the input's schema. 
+ARROW_EXPORT +Result MakeFilterNode(ExecNode* input, std::string label, Expression filter); + +/// \brief Make a node which executes expressions on input batches, producing new batches. +/// +/// Each expression will be evaluated against each batch which is pushed to +/// this node to produce a corresponding output column. +/// +/// If exprs are not already bound, they will be bound against the input's schema. +/// If names are not provided, the string representations of exprs will be used. +ARROW_EXPORT +Result MakeProjectNode(ExecNode* input, std::string label, + std::vector exprs, + std::vector names = {}); + +ARROW_EXPORT +Result MakeScalarAggregateNode(ExecNode* input, std::string label, + std::vector aggregates, + std::vector arguments, + std::vector out_field_names); + +/// \brief Make a node which groups input rows based on key fields and computes +/// aggregates for each group +ARROW_EXPORT +Result MakeGroupByNode(ExecNode* input, std::string label, + std::vector keys, + std::vector agg_srcs, + std::vector aggs); + +ARROW_EXPORT +Result GroupByUsingExecPlan(const std::vector& arguments, + const std::vector& keys, + const std::vector& aggregates, + bool use_threads, ExecContext* ctx); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/dataset/expression.cc b/cpp/src/arrow/compute/exec/expression.cc similarity index 77% rename from cpp/src/arrow/dataset/expression.cc rename to cpp/src/arrow/compute/exec/expression.cc index 627477b3038..4aab64a46a4 100644 --- a/cpp/src/arrow/dataset/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -15,19 +15,20 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/dataset/expression.h" +#include "arrow/compute/exec/expression.h" #include #include #include "arrow/chunked_array.h" #include "arrow/compute/api_vector.h" +#include "arrow/compute/exec/expression_internal.h" #include "arrow/compute/exec_internal.h" -#include "arrow/dataset/expression_internal.h" +#include "arrow/compute/function_internal.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" -#include "arrow/util/atomic_shared_ptr.h" +#include "arrow/util/hash_util.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/optional.h" @@ -39,9 +40,19 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -namespace dataset { +namespace compute { -Expression::Expression(Call call) : impl_(std::make_shared(std::move(call))) {} +void Expression::Call::ComputeHash() { + hash = std::hash{}(function_name); + for (const auto& arg : arguments) { + arrow::internal::hash_combine(hash, arg.hash()); + } +} + +Expression::Expression(Call call) { + call.ComputeHash(); + impl_ = std::make_shared(std::move(call)); +} Expression::Expression(Datum literal) : impl_(std::make_shared(std::move(literal))) {} @@ -52,7 +63,7 @@ Expression::Expression(Parameter parameter) Expression literal(Datum lit) { return Expression(std::move(lit)); } Expression field_ref(FieldRef ref) { - return Expression(Expression::Parameter{std::move(ref), {}}); + return Expression(Expression::Parameter{std::move(ref), ValueDescr{}, -1}); } Expression call(std::string function, std::vector arguments, @@ -66,8 +77,12 @@ Expression call(std::string function, std::vector arguments, const Datum* Expression::literal() const { return util::get_if(impl_.get()); } +const Expression::Parameter* Expression::parameter() const { + return util::get_if(impl_.get()); +} + const FieldRef* Expression::field_ref() const { - if (auto parameter = util::get_if(impl_.get())) { + if (auto parameter = 
this->parameter()) { return ¶meter->ref; } return nullptr; @@ -84,7 +99,7 @@ ValueDescr Expression::descr() const { return lit->descr(); } - if (auto parameter = util::get_if(impl_.get())) { + if (auto parameter = this->parameter()) { return parameter->descr; } @@ -151,7 +166,7 @@ std::string Expression::ToString() const { return binary(std::move(op)); } - if (auto options = GetProjectOptions(*call)) { + if (auto options = GetMakeStructOptions(*call)) { std::string out = "{"; auto argument = call->arguments.begin(); for (const auto& field_name : options->field_names) { @@ -167,41 +182,14 @@ std::string Expression::ToString() const { out += arg.ToString() + ", "; } - if (call->options == nullptr) { + if (call->options) { + out += call->options->ToString(); + out.resize(out.size() + 1); + } else { out.resize(out.size() - 1); - out.back() = ')'; - return out; } - - if (auto options = GetSetLookupOptions(*call)) { - DCHECK_EQ(options->value_set.kind(), Datum::ARRAY); - out += "value_set=" + options->value_set.make_array()->ToString(); - if (options->skip_nulls) { - out += ", skip_nulls"; - } - return out + ")"; - } - - if (auto options = GetCastOptions(*call)) { - if (options->to_type == nullptr) { - return out + "to_type=)"; - } - out += "to_type=" + options->to_type->ToString(); - if (options->allow_int_overflow) out += ", allow_int_overflow"; - if (options->allow_time_truncate) out += ", allow_time_truncate"; - if (options->allow_time_overflow) out += ", allow_time_overflow"; - if (options->allow_decimal_truncate) out += ", allow_decimal_truncate"; - if (options->allow_float_truncate) out += ", allow_float_truncate"; - if (options->allow_invalid_utf8) out += ", allow_invalid_utf8"; - return out + ")"; - } - - if (auto options = GetStrptimeOptions(*call)) { - return out + "format=" + options->format + - ", unit=" + internal::ToString(options->unit) + ")"; - } - - return out + "{NON-REPRESENTABLE OPTIONS})"; + out.back() = ')'; + return out; } void PrintTo(const 
Expression& expr, std::ostream* os) { @@ -241,41 +229,9 @@ bool Expression::Equals(const Expression& other) const { } if (call->options == other_call->options) return true; - - if (auto options = GetSetLookupOptions(*call)) { - auto other_options = GetSetLookupOptions(*other_call); - return options->value_set == other_options->value_set && - options->skip_nulls == other_options->skip_nulls; - } - - if (auto options = GetCastOptions(*call)) { - auto other_options = GetCastOptions(*other_call); - for (auto safety_opt : { - &compute::CastOptions::allow_int_overflow, - &compute::CastOptions::allow_time_truncate, - &compute::CastOptions::allow_time_overflow, - &compute::CastOptions::allow_decimal_truncate, - &compute::CastOptions::allow_float_truncate, - &compute::CastOptions::allow_invalid_utf8, - }) { - if (options->*safety_opt != other_options->*safety_opt) return false; - } - return options->to_type->Equals(other_options->to_type); + if (call->options && other_call->options) { + return call->options->Equals(other_call->options); } - - if (auto options = GetProjectOptions(*call)) { - auto other_options = GetProjectOptions(*other_call); - return options->field_names == other_options->field_names; - } - - if (auto options = GetStrptimeOptions(*call)) { - auto other_options = GetStrptimeOptions(*other_call); - return options->format == other_options->format && - options->unit == other_options->unit; - } - - ARROW_LOG(WARNING) << "comparing unknown FunctionOptions for function " - << call->function_name; return false; } @@ -293,20 +249,7 @@ size_t Expression::hash() const { return ref->hash(); } - auto call = CallNotNull(*this); - if (call->hash != nullptr) { - return call->hash->load(); - } - - size_t out = std::hash{}(call->function_name); - for (const auto& arg : call->arguments) { - out ^= arg.hash(); - } - - std::shared_ptr> expected = nullptr; - internal::atomic_compare_exchange_strong(&const_cast(call)->hash, &expected, - std::make_shared>(out)); - return out; + 
return CallNotNull(*this)->hash; } bool Expression::IsBound() const { @@ -427,10 +370,10 @@ Result BindNonRecursive(Expression::Call call, bool insert_implicit_ compute::KernelContext kernel_context(exec_context); if (call.kernel->init) { - call.kernel_state = - call.kernel->init(&kernel_context, {call.kernel, descrs, call.options.get()}); + ARROW_ASSIGN_OR_RAISE( + call.kernel_state, + call.kernel->init(&kernel_context, {call.kernel, descrs, call.options.get()})); - RETURN_NOT_OK(kernel_context.status()); kernel_context.SetState(call.kernel_state.get()); } @@ -440,76 +383,113 @@ Result BindNonRecursive(Expression::Call call, bool insert_implicit_ return Expression(std::move(call)); } -struct FieldPathGetDatumImpl { - template ()))> - Result operator()(const std::shared_ptr& ptr) { - return path_.Get(*ptr).template As(); - } - - template - Result operator()(const T&) { - return Status::NotImplemented("FieldPath::Get() into Datum ", datum_.ToString()); +template +Result BindImpl(Expression expr, const TypeOrSchema& in, + ValueDescr::Shape shape, compute::ExecContext* exec_context) { + if (exec_context == nullptr) { + compute::ExecContext exec_context; + return BindImpl(std::move(expr), in, shape, &exec_context); } - const Datum& datum_; - const FieldPath& path_; -}; + if (expr.literal()) return expr; -inline Result GetDatumField(const FieldRef& ref, const Datum& input) { - Datum field; + if (auto ref = expr.field_ref()) { + if (ref->IsNested()) { + return Status::NotImplemented("nested field references"); + } - FieldPath match; - if (auto type = input.type()) { - ARROW_ASSIGN_OR_RAISE(match, ref.FindOneOrNone(*type)); - } else if (auto schema = input.schema()) { - ARROW_ASSIGN_OR_RAISE(match, ref.FindOneOrNone(*schema)); - } else { - return Status::NotImplemented("retrieving fields from datum ", input.ToString()); - } + ARROW_ASSIGN_OR_RAISE(auto path, ref->FindOne(in)); - if (!match.empty()) { - ARROW_ASSIGN_OR_RAISE(field, - 
util::visit(FieldPathGetDatumImpl{input, match}, input.value)); + auto bound = *expr.parameter(); + bound.index = path[0]; + ARROW_ASSIGN_OR_RAISE(auto field, path.Get(in)); + bound.descr.type = field->type(); + bound.descr.shape = shape; + return Expression{std::move(bound)}; } - if (field == Datum{}) { - return Datum(std::make_shared()); + auto call = *CallNotNull(expr); + for (auto& argument : call.arguments) { + ARROW_ASSIGN_OR_RAISE(argument, + BindImpl(std::move(argument), in, shape, exec_context)); } - - return field; + return BindNonRecursive(std::move(call), + /*insert_implicit_casts=*/true, exec_context); } } // namespace -Result Expression::Bind(ValueDescr in, +Result Expression::Bind(const ValueDescr& in, compute::ExecContext* exec_context) const { - if (exec_context == nullptr) { - compute::ExecContext exec_context; - return Bind(std::move(in), &exec_context); - } + return BindImpl(*this, *in.type, in.shape, exec_context); +} - if (literal()) return *this; +Result Expression::Bind(const Schema& in_schema, + compute::ExecContext* exec_context) const { + return BindImpl(*this, in_schema, ValueDescr::ARRAY, exec_context); +} - if (auto ref = field_ref()) { - ARROW_ASSIGN_OR_RAISE(auto field, ref->GetOneOrNone(*in.type)); - auto descr = field ? ValueDescr{field->type(), in.shape} : ValueDescr::Scalar(null()); - return Expression{Parameter{*ref, std::move(descr)}}; +Result MakeExecBatch(const Schema& full_schema, const Datum& partial) { + ExecBatch out; + + if (partial.kind() == Datum::RECORD_BATCH) { + const auto& partial_batch = *partial.record_batch(); + out.length = partial_batch.num_rows(); + + for (const auto& field : full_schema.fields()) { + ARROW_ASSIGN_OR_RAISE(auto column, + FieldRef(field->name()).GetOneOrNone(partial_batch)); + + if (column) { + if (!column->type()->Equals(field->type())) { + // Referenced field was present but didn't have the expected type. + // This *should* be handled by readers, and will just be an error in the future. 
+ ARROW_ASSIGN_OR_RAISE( + auto converted, + compute::Cast(column, field->type(), compute::CastOptions::Safe())); + column = converted.make_array(); + } + out.values.emplace_back(std::move(column)); + } else { + out.values.emplace_back(MakeNullScalar(field->type())); + } + } + return out; } - auto call = *CallNotNull(*this); - for (auto& argument : call.arguments) { - ARROW_ASSIGN_OR_RAISE(argument, argument.Bind(in, exec_context)); + // wasteful but useful for testing: + if (partial.type()->id() == Type::STRUCT) { + if (partial.is_array()) { + ARROW_ASSIGN_OR_RAISE(auto partial_batch, + RecordBatch::FromStructArray(partial.make_array())); + + return MakeExecBatch(full_schema, partial_batch); + } + + if (partial.is_scalar()) { + ARROW_ASSIGN_OR_RAISE(auto partial_array, + MakeArrayFromScalar(*partial.scalar(), 1)); + ARROW_ASSIGN_OR_RAISE(auto out, MakeExecBatch(full_schema, partial_array)); + + for (Datum& value : out.values) { + if (value.is_scalar()) continue; + ARROW_ASSIGN_OR_RAISE(value, value.make_array()->GetScalar(0)); + } + return out; + } } - return BindNonRecursive(std::move(call), - /*insert_implicit_casts=*/true, exec_context); + + return Status::NotImplemented("MakeExecBatch from ", PrintDatum(partial)); } -Result Expression::Bind(const Schema& in_schema, - compute::ExecContext* exec_context) const { - return Bind(ValueDescr::Array(struct_(in_schema.fields())), exec_context); +Result ExecuteScalarExpression(const Expression& expr, const Schema& full_schema, + const Datum& partial_input, + compute::ExecContext* exec_context) { + ARROW_ASSIGN_OR_RAISE(auto input, MakeExecBatch(full_schema, partial_input)); + return ExecuteScalarExpression(expr, input, exec_context); } -Result ExecuteScalarExpression(const Expression& expr, const Datum& input, +Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& input, compute::ExecContext* exec_context) { if (exec_context == nullptr) { compute::ExecContext exec_context; @@ -527,15 +507,16 @@ Result 
ExecuteScalarExpression(const Expression& expr, const Datum& input if (auto lit = expr.literal()) return *lit; - if (auto ref = expr.field_ref()) { - ARROW_ASSIGN_OR_RAISE(Datum field, GetDatumField(*ref, input)); + if (auto param = expr.parameter()) { + if (param->descr.type->id() == Type::NA) { + return MakeNullScalar(null()); + } - if (field.descr() != expr.descr()) { - // Refernced field was present but didn't have the expected type. - // Should we just error here? For now, pay dispatch cost and just cast. - ARROW_ASSIGN_OR_RAISE( - field, - compute::Cast(field, expr.type(), compute::CastOptions::Safe(), exec_context)); + const Datum& field = input[param->index]; + if (!field.type()->Equals(param->descr.type)) { + return Status::Invalid("Referenced field ", expr.ToString(), " was ", + field.type()->ToString(), " but should have been ", + param->descr.type->ToString()); } return field; @@ -612,6 +593,17 @@ std::vector FieldsInExpression(const Expression& expr) { return fields; } +bool ExpressionHasFieldRefs(const Expression& expr) { + if (expr.literal()) return false; + + if (expr.field_ref()) return true; + + for (const Expression& arg : CallNotNull(expr)->arguments) { + if (ExpressionHasFieldRefs(arg)) return true; + } + return false; +} + Result FoldConstants(Expression expr) { return Modify( std::move(expr), [](Expression expr) { return expr; }, @@ -620,7 +612,7 @@ Result FoldConstants(Expression expr) { if (std::all_of(call->arguments.begin(), call->arguments.end(), [](const Expression& argument) { return argument.literal(); })) { // all arguments are literal; we can evaluate this subexpression *now* - static const Datum ignored_input = Datum{}; + static const ExecBatch ignored_input = ExecBatch{}; ARROW_ASSIGN_OR_RAISE(Datum constant, ExecuteScalarExpression(expr, ignored_input)); @@ -729,17 +721,16 @@ Status ExtractKnownFieldValuesImpl( } // namespace -Result> ExtractKnownFieldValues( +Result ExtractKnownFieldValues( const Expression& 
guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; - RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); + KnownFieldValues known_values; + RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map)); return known_values; } -Result ReplaceFieldsWithKnownValues( - const std::unordered_map& known_values, - Expression expr) { +Result ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values, + Expression expr) { if (!expr.IsBound()) { return Status::Invalid( "ReplaceFieldsWithKnownValues called on an unbound Expression"); @@ -749,8 +740,8 @@ Result ReplaceFieldsWithKnownValues( std::move(expr), [&known_values](Expression expr) -> Result { if (auto ref = expr.field_ref()) { - auto it = known_values.find(*ref); - if (it != known_values.end()) { + auto it = known_values.map.find(*ref); + if (it != known_values.map.end()) { Datum lit = it->second; if (lit.descr() == expr.descr()) return literal(std::move(lit)); // type mismatch, try casting the known value to the correct type @@ -952,8 +943,8 @@ Result SimplifyWithGuarantee(Expression expr, const Expression& guaranteed_true_predicate) { auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate); - std::unordered_map known_values; - RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values)); + KnownFieldValues known_values; + RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map)); ARROW_ASSIGN_OR_RAISE(expr, ReplaceFieldsWithKnownValues(known_values, std::move(expr))); @@ -980,92 +971,6 @@ Result SimplifyWithGuarantee(Expression expr, return expr; } -namespace { - -Result> FunctionOptionsToStructScalar( - const Expression::Call& call) { - if (call.options == nullptr) { - return nullptr; - } - - if (auto options = GetSetLookupOptions(call)) { - if (!options->value_set.is_array()) { - return 
Status::NotImplemented("chunked value_set"); - } - return StructScalar::Make( - { - std::make_shared(options->value_set.make_array()), - MakeScalar(options->skip_nulls), - }, - {"value_set", "skip_nulls"}); - } - - if (auto options = GetCastOptions(call)) { - return StructScalar::Make( - { - MakeNullScalar(options->to_type), - MakeScalar(options->allow_int_overflow), - MakeScalar(options->allow_time_truncate), - MakeScalar(options->allow_time_overflow), - MakeScalar(options->allow_decimal_truncate), - MakeScalar(options->allow_float_truncate), - MakeScalar(options->allow_invalid_utf8), - }, - { - "to_type_holder", - "allow_int_overflow", - "allow_time_truncate", - "allow_time_overflow", - "allow_decimal_truncate", - "allow_float_truncate", - "allow_invalid_utf8", - }); - } - - return Status::NotImplemented("conversion of options for ", call.function_name); -} - -Status FunctionOptionsFromStructScalar(const StructScalar* repr, Expression::Call* call) { - if (repr == nullptr) { - call->options = nullptr; - return Status::OK(); - } - - if (IsSetLookup(call->function_name)) { - ARROW_ASSIGN_OR_RAISE(auto value_set, repr->field("value_set")); - ARROW_ASSIGN_OR_RAISE(auto skip_nulls, repr->field("skip_nulls")); - call->options = std::make_shared( - checked_cast(*value_set).value, - checked_cast(*skip_nulls).value); - return Status::OK(); - } - - if (call->function_name == "cast") { - auto options = std::make_shared(); - ARROW_ASSIGN_OR_RAISE(auto to_type_holder, repr->field("to_type_holder")); - options->to_type = to_type_holder->type; - - int i = 1; - for (bool* opt : { - &options->allow_int_overflow, - &options->allow_time_truncate, - &options->allow_time_overflow, - &options->allow_decimal_truncate, - &options->allow_float_truncate, - &options->allow_invalid_utf8, - }) { - *opt = checked_cast(*repr->value[i++]).value; - } - - call->options = std::move(options); - return Status::OK(); - } - - return Status::NotImplemented("conversion of options for ", 
call->function_name); -} - -} // namespace - // Serialization is accomplished by converting expressions to KeyValueMetadata and storing // this in the schema of a RecordBatch. Embedded arrays and scalars are stored in its // columns. Finally, the RecordBatch is written to an IPC file. @@ -1107,7 +1012,8 @@ Result> Serialize(const Expression& expr) { } if (call->options) { - ARROW_ASSIGN_OR_RAISE(auto options_scalar, FunctionOptionsToStructScalar(*call)); + ARROW_ASSIGN_OR_RAISE(auto options_scalar, + internal::FunctionOptionsToStructScalar(*call->options)); ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*options_scalar)); metadata_->Append("options", std::move(value)); } @@ -1156,7 +1062,8 @@ Result Deserialize(std::shared_ptr buffer) { Result> GetScalar(const std::string& i) { int32_t column_index; - if (!internal::ParseValue(i.data(), i.length(), &column_index)) { + if (!::arrow::internal::ParseValue(i.data(), i.length(), + &column_index)) { return Status::Invalid("Couldn't parse column_index"); } if (column_index >= batch_.num_columns()) { @@ -1191,10 +1098,13 @@ Result Deserialize(std::shared_ptr buffer) { while (metadata().key(index_) != "end") { if (metadata().key(index_) == "options") { ARROW_ASSIGN_OR_RAISE(auto options_scalar, GetScalar(metadata().value(index_))); - auto expr = call(value, std::move(arguments)); - RETURN_NOT_OK(FunctionOptionsFromStructScalar( - checked_cast(options_scalar.get()), - const_cast(expr.call()))); + std::shared_ptr options; + if (options_scalar) { + ARROW_ASSIGN_OR_RAISE( + options, internal::FunctionOptionsFromStructScalar( + checked_cast(*options_scalar))); + } + auto expr = call(value, std::move(arguments), std::move(options)); index_ += 2; return expr; } @@ -1212,7 +1122,8 @@ Result Deserialize(std::shared_ptr buffer) { } Expression project(std::vector values, std::vector names) { - return call("project", std::move(values), compute::ProjectOptions{std::move(names)}); + return call("make_struct", std::move(values), + 
compute::MakeStructOptions{std::move(names)}); } Expression equal(Expression lhs, Expression rhs) { @@ -1271,13 +1182,5 @@ Expression or_(const std::vector& operands) { Expression not_(Expression operand) { return call("invert", {std::move(operand)}); } -Expression operator&&(Expression lhs, Expression rhs) { - return and_(std::move(lhs), std::move(rhs)); -} - -Expression operator||(Expression lhs, Expression rhs) { - return or_(std::move(lhs), std::move(rhs)); -} - -} // namespace dataset +} // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/expression.h b/cpp/src/arrow/compute/exec/expression.h new file mode 100644 index 00000000000..3810accf70a --- /dev/null +++ b/cpp/src/arrow/compute/exec/expression.h @@ -0,0 +1,269 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This API is EXPERIMENTAL. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/compute/type_fwd.h" +#include "arrow/datum.h" +#include "arrow/type_fwd.h" +#include "arrow/util/variant.h" + +namespace arrow { +namespace compute { + +/// An unbound expression which maps a single Datum to another Datum. +/// An expression is one of +/// - A literal Datum. 
+/// - A reference to a single (potentially nested) field of the input Datum. +/// - A call to a compute function, with arguments specified by other Expressions. +class ARROW_EXPORT Expression { + public: + struct Call { + std::string function_name; + std::vector arguments; + std::shared_ptr options; + // Cached hash value + size_t hash; + + // post-Bind properties: + std::shared_ptr function; + const Kernel* kernel = NULLPTR; + std::shared_ptr kernel_state; + ValueDescr descr; + + void ComputeHash(); + }; + + std::string ToString() const; + bool Equals(const Expression& other) const; + size_t hash() const; + struct Hash { + size_t operator()(const Expression& expr) const { return expr.hash(); } + }; + + /// Bind this expression to the given input type, looking up Kernels and field types. + /// Some expression simplification may be performed and implicit casts will be inserted. + /// Any state necessary for execution will be initialized and returned. + Result Bind(const ValueDescr& in, ExecContext* = NULLPTR) const; + Result Bind(const Schema& in_schema, ExecContext* = NULLPTR) const; + + // XXX someday + // Clone all KernelState in this bound expression. If any function referenced by this + // expression has mutable KernelState, it is not safe to execute or apply simplification + // passes to it (or copies of it!) from multiple threads. Cloning state produces new + // KernelStates where necessary to ensure that Expressions may be manipulated safely + // on multiple threads. + // Result CloneState() const; + // Status SetState(ExpressionState); + + /// Return true if all an expression's field references have explicit ValueDescr and all + /// of its functions' kernels are looked up. + bool IsBound() const; + + /// Return true if this expression is composed only of Scalar literals, field + /// references, and calls to ScalarFunctions. + bool IsScalarExpression() const; + + /// Return true if this expression is literal and entirely null. 
+ bool IsNullLiteral() const; + + /// Return true if this expression could evaluate to true. + bool IsSatisfiable() const; + + // XXX someday + // Result GetPipelines(); + + /// Access a Call or return nullptr if this expression is not a call + const Call* call() const; + /// Access a Datum or return nullptr if this expression is not a literal + const Datum* literal() const; + /// Access a FieldRef or return nullptr if this expression is not a field_ref + const FieldRef* field_ref() const; + + /// The type and shape to which this expression will evaluate + ValueDescr descr() const; + std::shared_ptr type() const { return descr().type; } + // XXX someday + // NullGeneralization::type nullable() const; + + struct Parameter { + FieldRef ref; + + // post-bind properties + ValueDescr descr; + int index; + }; + const Parameter* parameter() const; + + Expression() = default; + explicit Expression(Call call); + explicit Expression(Datum literal); + explicit Expression(Parameter parameter); + + private: + using Impl = util::Variant; + std::shared_ptr impl_; + + ARROW_EXPORT friend bool Identical(const Expression& l, const Expression& r); + + ARROW_EXPORT friend void PrintTo(const Expression&, std::ostream*); +}; + +inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); } +inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); } + +// Factories + +ARROW_EXPORT +Expression literal(Datum lit); + +template +Expression literal(Arg&& arg) { + return literal(Datum(std::forward(arg))); +} + +ARROW_EXPORT +Expression field_ref(FieldRef ref); + +ARROW_EXPORT +Expression call(std::string function, std::vector arguments, + std::shared_ptr options = NULLPTR); + +template ::value>::type> +Expression call(std::string function, std::vector arguments, + Options options) { + return call(std::move(function), std::move(arguments), + std::make_shared(std::move(options))); +} + +/// Assemble a list of all fields referenced by an 
Expression at any depth. +ARROW_EXPORT +std::vector FieldsInExpression(const Expression&); + +/// Check if the expression references any fields. +ARROW_EXPORT +bool ExpressionHasFieldRefs(const Expression&); + +/// Assemble a mapping from field references to known values. +struct ARROW_EXPORT KnownFieldValues; +ARROW_EXPORT +Result ExtractKnownFieldValues( + const Expression& guaranteed_true_predicate); + +/// \defgroup expression-passes Functions for modification of Expressions +/// +/// @{ +/// +/// These transform bound expressions. Some transforms utilize a guarantee, which is +/// provided as an Expression which is guaranteed to evaluate to true. The +/// guaranteed_true_predicate need not be bound, but canonicalization is currently +/// deferred to producers of guarantees. For example in order to be recognized as a +/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS +/// and literal RHS. Flipping the arguments, "is_in" with a one-long value_set, ... or +/// other semantically identical Expressions will not be recognized. + +/// Weak canonicalization which establishes guarantees for subsequent passes. Even +/// equivalent Expressions may result in different canonicalized expressions. +/// TODO this could be a strong canonicalization +ARROW_EXPORT +Result Canonicalize(Expression, ExecContext* = NULLPTR); + +/// Simplify Expressions based on literal arguments (for example, add(null, x) will always +/// be null so replace the call with a null literal). Includes early evaluation of all +/// calls whose arguments are entirely literal. +ARROW_EXPORT +Result FoldConstants(Expression); + +/// Simplify Expressions by replacing with known values of the fields which it references. +ARROW_EXPORT +Result ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values, + Expression); + +/// Simplify an expression by replacing subexpressions based on a guarantee: +/// a boolean expression which is guaranteed to evaluate to `true`. 
For example, this is +/// used to remove redundant function calls from a filter expression or to replace a +/// reference to a constant-value field with a literal. +ARROW_EXPORT +Result SimplifyWithGuarantee(Expression, + const Expression& guaranteed_true_predicate); + +/// @} + +// Execution + +/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a +/// RecordBatch which may have missing or incorrectly ordered columns. +/// Missing fields will be replaced with null scalars. +ARROW_EXPORT Result MakeExecBatch(const Schema& full_schema, + const Datum& partial); + +/// Execute a scalar expression against the provided state and input ExecBatch. This +/// expression must be bound. +ARROW_EXPORT +Result ExecuteScalarExpression(const Expression&, const ExecBatch& input, + ExecContext* = NULLPTR); + +/// Convenience function for invoking against a RecordBatch +ARROW_EXPORT +Result ExecuteScalarExpression(const Expression&, const Schema& full_schema, + const Datum& partial_input, ExecContext* = NULLPTR); + +// Serialization + +ARROW_EXPORT +Result> Serialize(const Expression&); + +ARROW_EXPORT +Result Deserialize(std::shared_ptr); + +// Convenience aliases for factories + +ARROW_EXPORT Expression project(std::vector values, + std::vector names); + +ARROW_EXPORT Expression equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression less(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression greater(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs); + +ARROW_EXPORT Expression is_null(Expression lhs); + +ARROW_EXPORT Expression is_valid(Expression lhs); + +ARROW_EXPORT Expression and_(Expression lhs, Expression rhs); +ARROW_EXPORT Expression and_(const std::vector&); +ARROW_EXPORT Expression or_(Expression lhs, Expression rhs); +ARROW_EXPORT 
Expression or_(const std::vector&); +ARROW_EXPORT Expression not_(Expression operand); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/dataset/expression_benchmark.cc b/cpp/src/arrow/compute/exec/expression_benchmark.cc similarity index 81% rename from cpp/src/arrow/dataset/expression_benchmark.cc rename to cpp/src/arrow/compute/exec/expression_benchmark.cc index 24870f38c14..1899b7caab6 100644 --- a/cpp/src/arrow/dataset/expression_benchmark.cc +++ b/cpp/src/arrow/compute/exec/expression_benchmark.cc @@ -18,23 +18,16 @@ #include "benchmark/benchmark.h" #include "arrow/compute/cast.h" -#include "arrow/dataset/expression.h" +#include "arrow/compute/exec/expression.h" #include "arrow/dataset/partition.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" namespace arrow { -namespace dataset { +namespace compute { -static Expression GetPartitionExpression(const std::string& path, bool infer_dictionary) { - auto options = HivePartitioningFactoryOptions(); - options.infer_dictionary = infer_dictionary; - auto factory = HivePartitioning::MakeFactory(options); - ASSIGN_OR_ABORT(auto schema, factory->Inspect({path})); - ASSIGN_OR_ABORT(auto partitioning, factory->Finish(schema)); - ASSIGN_OR_ABORT(auto expr, partitioning->Parse(path)); - return expr; -} +std::shared_ptr ninety_nine_dict = + DictionaryScalar::Make(MakeScalar(0), ArrayFromJSON(int64(), "[99]")); // A benchmark of SimplifyWithGuarantee using expressions arising from partitioning. static void SimplifyFilterWithGuarantee(benchmark::State& state, Expression filter, @@ -61,11 +54,15 @@ auto filter_cast_negative = auto filter_cast_positive = and_(equal(call("cast", {field_ref("a")}, to_int64), literal(99)), equal(call("cast", {field_ref("b")}, to_int64), literal(99))); -// A fully simplified partition expression. -auto guarantee = GetPartitionExpression("a=99/b=99", /*infer_dictionary=*/false); -// A partition expression that uses dictionaries, which are inferred by default. 
-auto guarantee_dictionary = - GetPartitionExpression("a=99/b=99", /*infer_dictionary=*/true); + +// An unencoded partition expression for "a=99/b=99". +auto guarantee = and_(equal(field_ref("a"), literal(int64_t(99))), + equal(field_ref("b"), literal(int64_t(99)))); + +// A partition expression for "a=99/b=99" that uses dictionaries (inferred by default). +auto guarantee_dictionary = and_(equal(field_ref("a"), literal(ninety_nine_dict)), + equal(field_ref("b"), literal(ninety_nine_dict))); + // Negative queries (partition expressions that fail the filter) BENCHMARK_CAPTURE(SimplifyFilterWithGuarantee, negative_filter_simple_guarantee_simple, filter_simple_negative, guarantee); @@ -87,5 +84,5 @@ BENCHMARK_CAPTURE(SimplifyFilterWithGuarantee, BENCHMARK_CAPTURE(SimplifyFilterWithGuarantee, positive_filter_cast_guarantee_dictionary, filter_cast_positive, guarantee_dictionary); -} // namespace dataset +} // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/dataset/expression_internal.h b/cpp/src/arrow/compute/exec/expression_internal.h similarity index 93% rename from cpp/src/arrow/dataset/expression_internal.h rename to cpp/src/arrow/compute/exec/expression_internal.h index 24e60377f5a..dc38924d932 100644 --- a/cpp/src/arrow/dataset/expression_internal.h +++ b/cpp/src/arrow/compute/exec/expression_internal.h @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/dataset/expression.h" +#include "arrow/compute/exec/expression.h" #include #include @@ -32,7 +32,11 @@ namespace arrow { using internal::checked_cast; -namespace dataset { +namespace compute { + +struct KnownFieldValues { + std::unordered_map map; +}; inline const Expression::Call* CallNotNull(const Expression& expr) { auto call = expr.call(); @@ -216,20 +220,10 @@ inline bool IsSetLookup(const std::string& function) { return function == "is_in" || function == "index_in"; } -inline const compute::SetLookupOptions* GetSetLookupOptions( +inline const compute::MakeStructOptions* GetMakeStructOptions( const Expression::Call& call) { - if (!IsSetLookup(call.function_name)) return nullptr; - return checked_cast(call.options.get()); -} - -inline const compute::ProjectOptions* GetProjectOptions(const Expression::Call& call) { - if (call.function_name != "project") return nullptr; - return checked_cast(call.options.get()); -} - -inline const compute::StrptimeOptions* GetStrptimeOptions(const Expression::Call& call) { - if (call.function_name != "strptime") return nullptr; - return checked_cast(call.options.get()); + if (call.function_name != "make_struct") return nullptr; + return checked_cast(call.options.get()); } /// A helper for unboxing an Expression composed of associative function calls. @@ -338,5 +332,5 @@ Result Modify(Expression expr, const PreVisit& pre, return post_call(std::move(expr), nullptr); } -} // namespace dataset +} // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/dataset/expression_test.cc b/cpp/src/arrow/compute/exec/expression_test.cc similarity index 84% rename from cpp/src/arrow/dataset/expression_test.cc rename to cpp/src/arrow/compute/exec/expression_test.cc index 2ab796b052f..b59f8762818 100644 --- a/cpp/src/arrow/dataset/expression_test.cc +++ b/cpp/src/arrow/compute/exec/expression_test.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/dataset/expression.h" +#include "arrow/compute/exec/expression.h" #include #include @@ -26,9 +26,9 @@ #include #include +#include "arrow/compute/exec/expression_internal.h" +#include "arrow/compute/function_internal.h" #include "arrow/compute/registry.h" -#include "arrow/dataset/expression_internal.h" -#include "arrow/dataset/test_util.h" #include "arrow/testing/gtest_util.h" using testing::HasSubstr; @@ -39,7 +39,24 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -namespace dataset { +namespace compute { + +const std::shared_ptr kBoringSchema = schema({ + field("bool", boolean()), + field("i8", int8()), + field("i32", int32()), + field("i32_req", int32(), /*nullable=*/false), + field("u32", uint32()), + field("i64", int64()), + field("f32", float32()), + field("f32_req", float32(), /*nullable=*/false), + field("f64", float64()), + field("date64", date64()), + field("str", utf8()), + field("dict_str", dictionary(int32(), utf8())), + field("dict_i32", dictionary(int32(), int32())), + field("ts_ns", timestamp(TimeUnit::NANO)), +}); #define EXPECT_OK ARROW_EXPECT_OK @@ -149,6 +166,56 @@ TEST(ExpressionUtils, StripOrderPreservingCasts) { Expect(cast(field_ref("i32"), uint64()), no_change); } +TEST(ExpressionUtils, MakeExecBatch) { + auto Expect = [](std::shared_ptr partial_batch) { + SCOPED_TRACE(partial_batch->ToString()); + ASSERT_OK_AND_ASSIGN(auto batch, MakeExecBatch(*kBoringSchema, partial_batch)); + + ASSERT_EQ(batch.num_values(), kBoringSchema->num_fields()); + for (int i = 0; i < kBoringSchema->num_fields(); ++i) { + const auto& field = *kBoringSchema->field(i); + + SCOPED_TRACE("Field#" + std::to_string(i) + " " + field.ToString()); + + EXPECT_TRUE(batch[i].type()->Equals(field.type())) + << "Incorrect type " << batch[i].type()->ToString(); + + ASSERT_OK_AND_ASSIGN(auto col, FieldRef(field.name()).GetOneOrNone(*partial_batch)); + + if (batch[i].is_scalar()) { + EXPECT_FALSE(batch[i].scalar()->is_valid) 
+ << "Non-null placeholder scalar was injected"; + + EXPECT_EQ(col, nullptr) + << "Placeholder scalar overwrote column " << col->ToString(); + } else { + AssertDatumsEqual(col, batch[i]); + } + } + }; + + auto GetField = [](std::string name) { return kBoringSchema->GetFieldByName(name); }; + + constexpr int64_t kNumRows = 3; + auto i32 = ArrayFromJSON(int32(), "[1, 2, 3]"); + auto f32 = ArrayFromJSON(float32(), "[1.5, 2.25, 3.125]"); + + // empty + Expect(RecordBatchFromJSON(kBoringSchema, "[]")); + + // subset + Expect(RecordBatch::Make(schema({GetField("i32"), GetField("f32")}), kNumRows, + {i32, f32})); + + // flipped subset + Expect(RecordBatch::Make(schema({GetField("f32"), GetField("i32")}), kNumRows, + {f32, i32})); + + auto duplicated_names = + RecordBatch::Make(schema({GetField("i32"), GetField("i32")}), kNumRows, {i32, i32}); + ASSERT_RAISES(Invalid, MakeExecBatch(*kBoringSchema, duplicated_names)); +} + TEST(Expression, ToString) { EXPECT_EQ(field_ref("alpha").ToString(), "alpha"); @@ -156,6 +223,7 @@ TEST(Expression, ToString) { EXPECT_EQ(literal("a").ToString(), "\"a\""); EXPECT_EQ(literal("a\nb").ToString(), "\"a\\nb\""); EXPECT_EQ(literal(std::make_shared()).ToString(), "null"); + EXPECT_EQ(literal(std::make_shared()).ToString(), "null"); EXPECT_EQ(literal(std::make_shared(Buffer::FromString("az"))).ToString(), "\"617A\""); @@ -167,17 +235,43 @@ TEST(Expression, ToString) { auto in_12 = call("index_in", {field_ref("beta")}, compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")}); - EXPECT_EQ(in_12.ToString(), "index_in(beta, value_set=[\n 1,\n 2\n])"); + EXPECT_EQ(in_12.ToString(), + "index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=false})"); EXPECT_EQ(and_(field_ref("a"), field_ref("b")).ToString(), "(a and b)"); EXPECT_EQ(or_(field_ref("a"), field_ref("b")).ToString(), "(a or b)"); EXPECT_EQ(not_(field_ref("a")).ToString(), "invert(a)"); - EXPECT_EQ(cast(field_ref("a"), int32()).ToString(), "cast(a, to_type=int32)"); - 
EXPECT_EQ(cast(field_ref("a"), nullptr).ToString(), - "cast(a, to_type=)"); - - struct WidgetifyOptions : compute::FunctionOptions { + EXPECT_EQ( + cast(field_ref("a"), int32()).ToString(), + "cast(a, {to_type=int32, allow_int_overflow=false, allow_time_truncate=false, " + "allow_time_overflow=false, allow_decimal_truncate=false, " + "allow_float_truncate=false, allow_invalid_utf8=false})"); + EXPECT_EQ( + cast(field_ref("a"), nullptr).ToString(), + "cast(a, {to_type=, allow_int_overflow=false, allow_time_truncate=false, " + "allow_time_overflow=false, allow_decimal_truncate=false, " + "allow_float_truncate=false, allow_invalid_utf8=false})"); + + class WidgetifyOptionsType : public FunctionOptionsType { + public: + static const FunctionOptionsType* GetInstance() { + static std::unique_ptr instance(new WidgetifyOptionsType()); + return instance.get(); + } + const char* type_name() const override { return "widgetify"; } + std::string Stringify(const FunctionOptions& options) const override { + return type_name(); + } + bool Compare(const FunctionOptions& options, + const FunctionOptions& other) const override { + return true; + } + }; + class WidgetifyOptions : public compute::FunctionOptions { + public: + explicit WidgetifyOptions(bool really = true) + : FunctionOptions(WidgetifyOptionsType::GetInstance()), really(really) {} bool really; }; @@ -185,7 +279,7 @@ TEST(Expression, ToString) { EXPECT_EQ(call("widgetify", {}).ToString(), "widgetif)"); EXPECT_EQ( call("widgetify", {literal(1)}, std::make_shared()).ToString(), - "widgetify(1, {NON-REPRESENTABLE OPTIONS})"); + "widgetify(1, widgetify)"); EXPECT_EQ(equal(field_ref("a"), literal(1)).ToString(), "(a == 1)"); EXPECT_EQ(less(field_ref("a"), literal(2)).ToString(), "(a < 2)"); @@ -340,6 +434,28 @@ TEST(Expression, FieldsInExpression) { {"a", "b", "c"}); } +TEST(Expression, ExpressionHasFieldRefs) { + EXPECT_FALSE(ExpressionHasFieldRefs(literal(true))); + + EXPECT_FALSE(ExpressionHasFieldRefs(call("add", 
{literal(1), literal(3)}))); + + EXPECT_TRUE(ExpressionHasFieldRefs(field_ref("a"))); + + EXPECT_TRUE(ExpressionHasFieldRefs(equal(field_ref("a"), literal(1)))); + + EXPECT_TRUE(ExpressionHasFieldRefs(equal(field_ref("a"), field_ref("b")))); + + EXPECT_TRUE(ExpressionHasFieldRefs( + or_(equal(field_ref("a"), literal(1)), equal(field_ref("a"), literal(2))))); + + EXPECT_TRUE(ExpressionHasFieldRefs( + or_(equal(field_ref("a"), literal(1)), equal(field_ref("b"), literal(2))))); + + EXPECT_TRUE(ExpressionHasFieldRefs(or_( + and_(not_(equal(field_ref("a"), literal(1))), equal(field_ref("b"), literal(2))), + not_(less(field_ref("c"), literal(3)))))); +} + TEST(Expression, BindLiteral) { for (Datum dat : { Datum(3), @@ -379,21 +495,18 @@ TEST(Expression, BindFieldRef) { ExpectBindsTo(field_ref("i32"), no_change, &expr); EXPECT_EQ(expr.descr(), ValueDescr::Array(int32())); - // if the field is not found, a null scalar will be emitted - ExpectBindsTo(field_ref("no such field"), no_change, &expr); - EXPECT_EQ(expr.descr(), ValueDescr::Scalar(null())); + // if the field is not found, an error will be raised + ASSERT_RAISES(Invalid, field_ref("no such field").Bind(*kBoringSchema)); // referencing a field by name is not supported if that name is not unique // in the input schema ASSERT_RAISES(Invalid, field_ref("alpha").Bind(Schema( {field("alpha", int32()), field("alpha", float32())}))); - // referencing nested fields is supported - ASSERT_OK_AND_ASSIGN(expr, - field_ref(FieldRef("a", "b")) - .Bind(Schema({field("a", struct_({field("b", int32())}))}))); - EXPECT_TRUE(expr.IsBound()); - EXPECT_EQ(expr.descr(), ValueDescr::Array(int32())); + // referencing nested fields is not supported + ASSERT_RAISES(NotImplemented, + field_ref(FieldRef("a", "b")) + .Bind(Schema({field("a", struct_({field("b", int32())}))}))); } TEST(Expression, BindCall) { @@ -459,7 +572,8 @@ TEST(Expression, ExecuteFieldRef) { auto expr = field_ref(ref); ASSERT_OK_AND_ASSIGN(expr, expr.Bind(in.descr())); - 
ASSERT_OK_AND_ASSIGN(Datum actual, ExecuteScalarExpression(expr, in)); + ASSERT_OK_AND_ASSIGN(Datum actual, + ExecuteScalarExpression(expr, Schema(in.type()->fields()), in)); AssertDatumsEqual(actual, expected, /*verbose=*/true); }; @@ -471,39 +585,45 @@ TEST(Expression, ExecuteFieldRef) { ])"), ArrayFromJSON(float64(), R"([6.125, 0.0, -1])")); - // more nested: - ExpectRefIs(FieldRef{"a", "a"}, - ArrayFromJSON(struct_({field("a", struct_({field("a", float64())}))}), R"([ - {"a": {"a": 6.125}}, - {"a": {"a": 0.0}}, - {"a": {"a": -1}} + ExpectRefIs("a", + ArrayFromJSON(struct_({ + field("a", float64()), + field("b", float64()), + }), + R"([ + {"a": 6.125, "b": 7.5}, + {"a": 0.0, "b": 2.125}, + {"a": -1, "b": 4.0} ])"), ArrayFromJSON(float64(), R"([6.125, 0.0, -1])")); - // absent fields are resolved as a null scalar: - ExpectRefIs(FieldRef{"b"}, ArrayFromJSON(struct_({field("a", float64())}), R"([ - {"a": 6.125}, - {"a": 0.0}, - {"a": -1} + ExpectRefIs("b", + ArrayFromJSON(struct_({ + field("a", float64()), + field("b", float64()), + }), + R"([ + {"a": 6.125, "b": 7.5}, + {"a": 0.0, "b": 2.125}, + {"a": -1, "b": 4.0} ])"), - MakeNullScalar(null())); - - // XXX this *should* fail in Bind but for now it will just error in - // ExecuteScalarExpression - ASSERT_OK_AND_ASSIGN(auto list_item, field_ref("item").Bind(list(int32()))); - EXPECT_RAISES_WITH_MESSAGE_THAT( - NotImplemented, HasSubstr("non-struct array"), - ExecuteScalarExpression(list_item, - ArrayFromJSON(list(int32()), "[[1,2], [], null, [5]]"))); + ArrayFromJSON(float64(), R"([7.5, 2.125, 4.0])")); } Result NaiveExecuteScalarExpression(const Expression& expr, const Datum& input) { - auto call = expr.call(); - if (call == nullptr) { - // already tested execution of field_ref, execution of literal is trivial - return ExecuteScalarExpression(expr, input); + if (auto lit = expr.literal()) { + return *lit; } + if (auto ref = expr.field_ref()) { + if (input.type()) { + return ref->GetOneOrNone(*input.make_array()); 
+ } + return ref->GetOneOrNone(*input.record_batch()); + } + + auto call = CallNotNull(expr); + std::vector arguments(call->arguments.size()); for (size_t i = 0; i < arguments.size(); ++i) { ARROW_ASSIGN_OR_RAISE(arguments[i], @@ -521,13 +641,16 @@ Result NaiveExecuteScalarExpression(const Expression& expr, const Datum& } void ExpectExecute(Expression expr, Datum in, Datum* actual_out = NULLPTR) { + std::shared_ptr schm; if (in.is_value()) { ASSERT_OK_AND_ASSIGN(expr, expr.Bind(in.descr())); + schm = schema(in.type()->fields()); } else { - ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*in.record_batch()->schema())); + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*in.schema())); + schm = in.schema(); } - ASSERT_OK_AND_ASSIGN(Datum actual, ExecuteScalarExpression(expr, in)); + ASSERT_OK_AND_ASSIGN(Datum actual, ExecuteScalarExpression(expr, *schm, in)); ASSERT_OK_AND_ASSIGN(Datum expected, NaiveExecuteScalarExpression(expr, in)); @@ -587,9 +710,9 @@ TEST(Expression, ExecuteDictionaryTransparent) { ASSERT_OK_AND_ASSIGN( expr, SimplifyWithGuarantee(expr, equal(field_ref("dict_str"), literal("eh")))); - ASSERT_OK_AND_ASSIGN( - auto res, - ExecuteScalarExpression(expr, ArrayFromJSON(struct_({field("i32", int32())}), R"([ + ASSERT_OK_AND_ASSIGN(auto res, ExecuteScalarExpression( + expr, *kBoringSchema, + ArrayFromJSON(struct_({field("i32", int32())}), R"([ {"i32": 0}, {"i32": 1}, {"i32": 2} @@ -707,7 +830,7 @@ TEST(Expression, ExtractKnownFieldValues) { void operator()(Expression guarantee, std::unordered_map expected) { ASSERT_OK_AND_ASSIGN(auto actual, ExtractKnownFieldValues(guarantee)); - EXPECT_THAT(actual, UnorderedElementsAreArray(expected)) + EXPECT_THAT(actual.map, UnorderedElementsAreArray(expected)) << " guarantee: " << guarantee.ToString(); } } ExpectKnown; @@ -759,8 +882,8 @@ TEST(Expression, ReplaceFieldsWithKnownValues) { Expression unbound_expected) { ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*kBoringSchema)); ASSERT_OK_AND_ASSIGN(auto expected, 
unbound_expected.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto replaced, - ReplaceFieldsWithKnownValues(known_values, expr)); + ASSERT_OK_AND_ASSIGN(auto replaced, ReplaceFieldsWithKnownValues( + KnownFieldValues{known_values}, expr)); EXPECT_EQ(replaced, expected); ExpectIdenticalIfUnchanged(replaced, expr); @@ -775,7 +898,7 @@ TEST(Expression, ReplaceFieldsWithKnownValues) { // NB: known_values will be cast ExpectReplacesTo(field_ref("i32"), {{"i32", Datum("3")}}, literal(3)); - ExpectReplacesTo(field_ref("b"), i32_is_3, field_ref("b")); + ExpectReplacesTo(field_ref("f32"), i32_is_3, field_ref("f32")); ExpectReplacesTo(equal(field_ref("i32"), literal(1)), i32_is_3, equal(literal(3), literal(1))); @@ -816,17 +939,16 @@ TEST(Expression, ReplaceFieldsWithKnownValues) { ExpectReplacesTo(is_valid(field_ref("str")), i32_valid_str_null, is_valid(null_literal(utf8()))); - ASSERT_OK_AND_ASSIGN(auto expr, field_ref("dict_str").Bind(*kBoringSchema)); Datum dict_i32{ DictionaryScalar::Make(MakeScalar(0), ArrayFromJSON(int32(), R"([3])"))}; - // Unsupported cast dictionary(int32(), int32()) -> dictionary(int32(), utf8()) - ASSERT_RAISES(NotImplemented, - ReplaceFieldsWithKnownValues({{"dict_str", dict_i32}}, expr)); - // Unsupported cast dictionary(int8(), utf8()) -> dictionary(int32(), utf8()) - dict_str = Datum{ - DictionaryScalar::Make(MakeScalar(0), ArrayFromJSON(utf8(), R"(["a"])"))}; - ASSERT_RAISES(NotImplemented, - ReplaceFieldsWithKnownValues({{"dict_str", dict_str}}, expr)); + // cast dictionary(int32(), int32()) -> dictionary(int32(), utf8()) + ExpectReplacesTo(field_ref("dict_str"), {{"dict_str", dict_i32}}, literal(dict_str)); + + // cast dictionary(int8(), utf8()) -> dictionary(int32(), utf8()) + auto dict_int8_str = Datum{ + DictionaryScalar::Make(MakeScalar(0), ArrayFromJSON(utf8(), R"(["3"])"))}; + ExpectReplacesTo(field_ref("dict_str"), {{"dict_str", dict_int8_str}}, + literal(dict_str)); } struct { @@ -1016,7 +1138,8 @@ TEST(Expression, 
SingleComparisonGuarantees) { {"i32"})); ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(Datum evaluated, ExecuteScalarExpression(filter, input)); + ASSERT_OK_AND_ASSIGN(Datum evaluated, + ExecuteScalarExpression(filter, *kBoringSchema, input)); // ensure that the simplified filter is as simplified as it could be // (this is always possible for single comparisons) @@ -1127,7 +1250,8 @@ TEST(Expression, Filter) { auto expected_mask = batch->column(0); ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(Datum mask, ExecuteScalarExpression(filter, batch)); + ASSERT_OK_AND_ASSIGN(Datum mask, + ExecuteScalarExpression(filter, *kBoringSchema, batch)); AssertDatumsEqual(expected_mask, mask); }; @@ -1220,7 +1344,8 @@ TEST(Projection, AugmentWithNull) { auto ExpectProject = [&](Expression proj, Datum expected) { ASSERT_OK_AND_ASSIGN(proj, proj.Bind(*kBoringSchema)); - ASSERT_OK_AND_ASSIGN(auto actual, ExecuteScalarExpression(proj, input)); + ASSERT_OK_AND_ASSIGN(auto actual, + ExecuteScalarExpression(proj, *kBoringSchema, input)); AssertDatumsEqual(Datum(expected), actual); }; @@ -1250,7 +1375,8 @@ TEST(Projection, AugmentWithKnownValues) { Expression guarantee) { ASSERT_OK_AND_ASSIGN(proj, proj.Bind(*kBoringSchema)); ASSERT_OK_AND_ASSIGN(proj, SimplifyWithGuarantee(proj, guarantee)); - ASSERT_OK_AND_ASSIGN(auto actual, ExecuteScalarExpression(proj, input)); + ASSERT_OK_AND_ASSIGN(auto actual, + ExecuteScalarExpression(proj, *kBoringSchema, input)); AssertDatumsEqual(Datum(expected), actual); }; @@ -1278,5 +1404,5 @@ TEST(Projection, AugmentWithKnownValues) { })); } -} // namespace dataset +} // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/dataset/forest_internal.h b/cpp/src/arrow/compute/exec/forest_internal.h similarity index 96% rename from cpp/src/arrow/dataset/forest_internal.h rename to cpp/src/arrow/compute/exec/forest_internal.h index 1a7b874065e..7b55a0aabf3 100644 --- 
a/cpp/src/arrow/dataset/forest_internal.h +++ b/cpp/src/arrow/compute/exec/forest_internal.h @@ -21,15 +21,16 @@ #include #include -#include "arrow/dataset/visibility.h" +#include "arrow/result.h" +#include "arrow/status.h" namespace arrow { -namespace dataset { +namespace compute { /// A Forest is a view of a sorted range which carries an ancestry relation in addition /// to an ordering relation: each element's descendants appear directly after it. /// This can be used to efficiently skip subtrees when iterating through the range. -class ARROW_DS_EXPORT Forest { +class Forest { public: Forest() = default; @@ -69,7 +70,7 @@ class ARROW_DS_EXPORT Forest { std::equal(it, it + size_, other.descendant_counts_->begin()); } - struct ARROW_DS_EXPORT Ref { + struct Ref { int num_descendants() const { return forest->descendant_counts_->at(i); } bool IsAncestorOf(const Ref& ref) const { @@ -120,5 +121,5 @@ class ARROW_DS_EXPORT Forest { std::shared_ptr> descendant_counts_; }; -} // namespace dataset +} // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_compare.cc b/cpp/src/arrow/compute/exec/key_compare.cc new file mode 100644 index 00000000000..7a5b0be9990 --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_compare.cc @@ -0,0 +1,268 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/exec/key_compare.h" + +#include +#include + +#include "arrow/compute/exec/util.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace compute { + +void KeyCompare::CompareRows(uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows, + uint16_t* out_sel_left_maybe_same, + const KeyEncoder::KeyRowArray& rows_left, + const KeyEncoder::KeyRowArray& rows_right) { + ARROW_DCHECK(rows_left.metadata().is_compatible(rows_right.metadata())); + + if (num_rows_to_compare == 0) { + *out_num_rows = 0; + return; + } + + // Allocate temporary byte and bit vectors + auto bytevector_holder = + util::TempVectorHolder(ctx->stack, num_rows_to_compare); + auto bitvector_holder = + util::TempVectorHolder(ctx->stack, num_rows_to_compare); + + uint8_t* match_bytevector = bytevector_holder.mutable_data(); + uint8_t* match_bitvector = bitvector_holder.mutable_data(); + + // All comparison functions called here will update match byte vector + // (AND it with comparison result) instead of overwriting it. 
+ memset(match_bytevector, 0xff, num_rows_to_compare); + + if (rows_left.metadata().is_fixed_length) { + CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + match_bytevector, ctx, rows_left.metadata().fixed_length, + rows_left.data(1), rows_right.data(1)); + } else { + CompareVaryingLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + match_bytevector, ctx, rows_left.data(2), rows_right.data(2), + rows_left.offsets(), rows_right.offsets()); + } + + // CompareFixedLength can be used to compare nulls as well + bool nulls_present = rows_left.has_any_nulls(ctx) || rows_right.has_any_nulls(ctx); + if (nulls_present) { + CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map, + match_bytevector, ctx, + rows_left.metadata().null_masks_bytes_per_row, + rows_left.null_masks(), rows_right.null_masks()); + } + + util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, match_bytevector, + match_bitvector); + if (sel_left_maybe_null) { + int out_num_rows_int; + util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare, + match_bitvector, sel_left_maybe_null, + &out_num_rows_int, out_sel_left_maybe_same); + *out_num_rows = out_num_rows_int; + } else { + int out_num_rows_int; + util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare, + match_bitvector, &out_num_rows_int, + out_sel_left_maybe_same); + *out_num_rows = out_num_rows_int; + } +} + +void KeyCompare::CompareFixedLength(uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, + KeyEncoder::KeyEncoderContext* ctx, + uint32_t fixed_length, const uint8_t* rows_left, + const uint8_t* rows_right) { + bool use_selection = (sel_left_maybe_null != nullptr); + + uint32_t num_rows_already_processed = 0; + +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2() && !use_selection) { + // Choose between up-to-8B length, up-to-16B length 
and any size versions + if (fixed_length <= 8) { + num_rows_already_processed = CompareFixedLength_UpTo8B_avx2( + num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length, + rows_left, rows_right); + } else if (fixed_length <= 16) { + num_rows_already_processed = CompareFixedLength_UpTo16B_avx2( + num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length, + rows_left, rows_right); + } else { + num_rows_already_processed = + CompareFixedLength_avx2(num_rows_to_compare, left_to_right_map, + match_bytevector, fixed_length, rows_left, rows_right); + } + } +#endif + + typedef void (*CompareFixedLengthImp_t)(uint32_t, uint32_t, const uint16_t*, + const uint32_t*, uint8_t*, uint32_t, + const uint8_t*, const uint8_t*); + static const CompareFixedLengthImp_t CompareFixedLengthImp_fn[] = { + CompareFixedLengthImp, CompareFixedLengthImp, + CompareFixedLengthImp, CompareFixedLengthImp, + CompareFixedLengthImp, CompareFixedLengthImp}; + int dispatch_const = (use_selection ? 3 : 0) + + ((fixed_length <= 8) ? 0 : ((fixed_length <= 16) ? 
1 : 2)); + CompareFixedLengthImp_fn[dispatch_const]( + num_rows_already_processed, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, match_bytevector, fixed_length, rows_left, rows_right); +} + +template +void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed, + uint32_t num_rows, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, uint32_t length, + const uint8_t* rows_left, + const uint8_t* rows_right) { + // Key length (for encoded key) has to be non-zero + ARROW_DCHECK(length > 0); + + // Non-zero length guarantees no underflow + int32_t num_loops_less_one = (static_cast(length) + 7) / 8 - 1; + + // Length remaining in last loop can only be zero for input length equal to zero + uint32_t length_remaining_last_loop = length - num_loops_less_one * 8; + uint64_t tail_mask = (~0ULL) >> (8 * (8 - length_remaining_last_loop)); + + for (uint32_t id_input = num_rows_already_processed; id_input < num_rows; ++id_input) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[id_input] : id_input; + uint32_t irow_right = left_to_right_map[irow_left]; + uint32_t begin_left = length * irow_left; + uint32_t begin_right = length * irow_right; + const uint64_t* key_left_ptr = + reinterpret_cast(rows_left + begin_left); + const uint64_t* key_right_ptr = + reinterpret_cast(rows_right + begin_right); + uint64_t result_or = 0ULL; + int32_t istripe = 0; + + // Specializations for keys up to 8 bytes and between 9 and 16 bytes to + // avoid internal loop over words in the value for short ones. + // + // Template argument 0 means arbitrarily many 64-bit words, + // 1 means up to 1 and 2 means up to 2. 
+ // + if (num_64bit_words == 0) { + for (; istripe < num_loops_less_one; ++istripe) { + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); + result_or |= (key_left ^ key_right); + } + } else if (num_64bit_words == 2) { + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); + result_or |= (key_left ^ key_right); + ++istripe; + } + + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); + result_or |= (tail_mask & (key_left ^ key_right)); + + int result = (result_or == 0 ? 0xff : 0); + match_bytevector[id_input] &= result; + } +} + +void KeyCompare::CompareVaryingLength(uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, + KeyEncoder::KeyEncoderContext* ctx, + const uint8_t* rows_left, const uint8_t* rows_right, + const uint32_t* offsets_left, + const uint32_t* offsets_right) { + bool use_selection = (sel_left_maybe_null != nullptr); + +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2() && !use_selection) { + CompareVaryingLength_avx2(num_rows_to_compare, left_to_right_map, match_bytevector, + rows_left, rows_right, offsets_left, offsets_right); + } else { +#endif + if (use_selection) { + CompareVaryingLengthImp(num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, match_bytevector, rows_left, + rows_right, offsets_left, offsets_right); + } else { + CompareVaryingLengthImp(num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, match_bytevector, rows_left, + rows_right, offsets_left, offsets_right); + } +#if defined(ARROW_HAVE_AVX2) + } +#endif +} + +template +void KeyCompare::CompareVaryingLengthImp( + uint32_t num_rows, const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, uint8_t* match_bytevector, + const uint8_t* rows_left, const 
uint8_t* rows_right, const uint32_t* offsets_left, + const uint32_t* offsets_right) { + static const uint64_t tail_masks[] = { + 0x0000000000000000ULL, 0x00000000000000ffULL, 0x000000000000ffffULL, + 0x0000000000ffffffULL, 0x00000000ffffffffULL, 0x000000ffffffffffULL, + 0x0000ffffffffffffULL, 0x00ffffffffffffffULL, 0xffffffffffffffffULL}; + for (uint32_t i = 0; i < num_rows; ++i) { + uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i; + uint32_t irow_right = left_to_right_map[irow_left]; + uint32_t begin_left = offsets_left[irow_left]; + uint32_t begin_right = offsets_right[irow_right]; + uint32_t length_left = offsets_left[irow_left + 1] - begin_left; + uint32_t length_right = offsets_right[irow_right + 1] - begin_right; + uint32_t length = std::min(length_left, length_right); + const uint64_t* key_left_ptr = + reinterpret_cast(rows_left + begin_left); + const uint64_t* key_right_ptr = + reinterpret_cast(rows_right + begin_right); + uint64_t result_or = 0; + int32_t istripe; + // length can be zero + for (istripe = 0; istripe < (static_cast(length) + 7) / 8 - 1; ++istripe) { + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); + result_or |= (key_left ^ key_right); + } + + uint32_t length_remaining = length - static_cast(istripe) * 8; + uint64_t tail_mask = tail_masks[length_remaining]; + + uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]); + uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]); + result_or |= (tail_mask & (key_left ^ key_right)); + + int result = (result_or == 0 ? 
0xff : 0); + match_bytevector[i] &= result; + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_compare.h b/cpp/src/arrow/compute/exec/key_compare.h new file mode 100644 index 00000000000..1dffabb884b --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_compare.h @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/exec/key_encode.h" +#include "arrow/compute/exec/util.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" + +namespace arrow { +namespace compute { + +class KeyCompare { + public: + // Returns a single 16-bit selection vector of rows that failed comparison. + // If there is input selection on the left, the resulting selection is a filtered image + // of input selection. 
+ static void CompareRows(uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows, + uint16_t* out_sel_left_maybe_same, + const KeyEncoder::KeyRowArray& rows_left, + const KeyEncoder::KeyRowArray& rows_right); + + private: + static void CompareFixedLength(uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, + KeyEncoder::KeyEncoderContext* ctx, + uint32_t fixed_length, const uint8_t* rows_left, + const uint8_t* rows_right); + static void CompareVaryingLength(uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, + KeyEncoder::KeyEncoderContext* ctx, + const uint8_t* rows_left, const uint8_t* rows_right, + const uint32_t* offsets_left, + const uint32_t* offsets_right); + + // Second template argument is 0, 1 or 2. + // 0 means arbitrarily many 64-bit words, 1 means up to 1 and 2 means up to 2. 
+ template + static void CompareFixedLengthImp(uint32_t num_rows_already_processed, + uint32_t num_rows, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, uint32_t length, + const uint8_t* rows_left, const uint8_t* rows_right); + template + static void CompareVaryingLengthImp(uint32_t num_rows, + const uint16_t* sel_left_maybe_null, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, const uint8_t* rows_left, + const uint8_t* rows_right, + const uint32_t* offsets_left, + const uint32_t* offsets_right); + +#if defined(ARROW_HAVE_AVX2) + + static uint32_t CompareFixedLength_UpTo8B_avx2( + uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, + uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right); + static uint32_t CompareFixedLength_UpTo16B_avx2( + uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, + uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right); + static uint32_t CompareFixedLength_avx2(uint32_t num_rows, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, uint32_t length, + const uint8_t* rows_left, + const uint8_t* rows_right); + static void CompareVaryingLength_avx2( + uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, + const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left, + const uint32_t* offsets_right); + +#endif +}; + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_compare_avx2.cc b/cpp/src/arrow/compute/exec/key_compare_avx2.cc new file mode 100644 index 00000000000..6abdf6c3c3a --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_compare_avx2.cc @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/exec/key_compare.h" +#include "arrow/util/bit_util.h" + +namespace arrow { +namespace compute { + +#if defined(ARROW_HAVE_AVX2) + +uint32_t KeyCompare::CompareFixedLength_UpTo8B_avx2( + uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, + uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right) { + ARROW_DCHECK(length <= 8); + __m256i offset_left = _mm256_setr_epi64x(0, length, length * 2, length * 3); + __m256i offset_left_incr = _mm256_set1_epi64x(length * 4); + __m256i mask = _mm256_set1_epi64x(~0ULL >> (8 * (8 - length))); + + constexpr uint32_t unroll = 4; + for (uint32_t i = 0; i < num_rows / unroll; ++i) { + auto key_left = _mm256_i64gather_epi64( + reinterpret_cast(rows_left), offset_left, 1); + offset_left = _mm256_add_epi64(offset_left, offset_left_incr); + __m128i offset_right = + _mm_loadu_si128(reinterpret_cast(left_to_right_map) + i); + offset_right = _mm_mullo_epi32(offset_right, _mm_set1_epi32(length)); + + auto key_right = _mm256_i32gather_epi64( + reinterpret_cast(rows_right), offset_right, 1); + uint32_t cmp = _mm256_movemask_epi8(_mm256_cmpeq_epi64( + _mm256_and_si256(key_left, mask), _mm256_and_si256(key_right, mask))); + reinterpret_cast(match_bytevector)[i] &= cmp; + } + + uint32_t num_rows_processed = num_rows - (num_rows % unroll); + return num_rows_processed; +} + +uint32_t 
KeyCompare::CompareFixedLength_UpTo16B_avx2( + uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, + uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right) { + ARROW_DCHECK(length <= 16); + + constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; + constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; + + __m256i mask = + _mm256_cmpgt_epi8(_mm256_set1_epi8(length), + _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, + kByteSequence0To7, kByteSequence8To15)); + const uint8_t* key_left_ptr = rows_left; + + constexpr uint32_t unroll = 2; + for (uint32_t i = 0; i < num_rows / unroll; ++i) { + auto key_left = _mm256_inserti128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128(reinterpret_cast(key_left_ptr))), + _mm_loadu_si128(reinterpret_cast(key_left_ptr + length)), 1); + key_left_ptr += length * 2; + auto key_right = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast( + rows_right + length * left_to_right_map[2 * i]))), + _mm_loadu_si128(reinterpret_cast( + rows_right + length * left_to_right_map[2 * i + 1])), + 1); + __m256i cmp = _mm256_cmpeq_epi64(_mm256_and_si256(key_left, mask), + _mm256_and_si256(key_right, mask)); + cmp = _mm256_and_si256(cmp, _mm256_shuffle_epi32(cmp, 0xee)); // 0b11101110 + cmp = _mm256_permute4x64_epi64(cmp, 0x08); // 0b00001000 + reinterpret_cast(match_bytevector)[i] &= + (_mm256_movemask_epi8(cmp) & 0xffff); + } + + uint32_t num_rows_processed = num_rows - (num_rows % unroll); + return num_rows_processed; +} + +uint32_t KeyCompare::CompareFixedLength_avx2(uint32_t num_rows, + const uint32_t* left_to_right_map, + uint8_t* match_bytevector, uint32_t length, + const uint8_t* rows_left, + const uint8_t* rows_right) { + ARROW_DCHECK(length > 0); + + constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; + constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; + constexpr uint64_t kByteSequence16To23 = 0x1716151413121110ULL; + 
constexpr uint64_t kByteSequence24To31 = 0x1f1e1d1c1b1a1918ULL; + + // Non-zero length guarantees no underflow + int32_t num_loops_less_one = (static_cast(length) + 31) / 32 - 1; + + __m256i tail_mask = + _mm256_cmpgt_epi8(_mm256_set1_epi8(length - num_loops_less_one * 32), + _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, + kByteSequence16To23, kByteSequence24To31)); + + for (uint32_t irow_left = 0; irow_left < num_rows; ++irow_left) { + uint32_t irow_right = left_to_right_map[irow_left]; + uint32_t begin_left = length * irow_left; + uint32_t begin_right = length * irow_right; + const __m256i* key_left_ptr = + reinterpret_cast(rows_left + begin_left); + const __m256i* key_right_ptr = + reinterpret_cast(rows_right + begin_right); + __m256i result_or = _mm256_setzero_si256(); + int32_t i; + // length cannot be zero + for (i = 0; i < num_loops_less_one; ++i) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + } + + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256( + result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + int result = _mm256_testz_si256(result_or, result_or) * 0xff; + match_bytevector[irow_left] &= result; + } + + uint32_t num_rows_processed = num_rows; + return num_rows_processed; +} + +void KeyCompare::CompareVaryingLength_avx2( + uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector, + const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left, + const uint32_t* offsets_right) { + for (uint32_t irow_left = 0; irow_left < num_rows; ++irow_left) { + uint32_t irow_right = left_to_right_map[irow_left]; + uint32_t begin_left = offsets_left[irow_left]; + uint32_t begin_right = offsets_right[irow_right]; + uint32_t 
length_left = offsets_left[irow_left + 1] - begin_left; + uint32_t length_right = offsets_right[irow_right + 1] - begin_right; + uint32_t length = std::min(length_left, length_right); + auto key_left_ptr = reinterpret_cast(rows_left + begin_left); + auto key_right_ptr = reinterpret_cast(rows_right + begin_right); + __m256i result_or = _mm256_setzero_si256(); + int32_t i; + // length can be zero + for (i = 0; i < (static_cast(length) + 31) / 32 - 1; ++i) { + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256(result_or, _mm256_xor_si256(key_left, key_right)); + } + + constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; + constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; + constexpr uint64_t kByteSequence16To23 = 0x1716151413121110ULL; + constexpr uint64_t kByteSequence24To31 = 0x1f1e1d1c1b1a1918ULL; + + __m256i tail_mask = + _mm256_cmpgt_epi8(_mm256_set1_epi8(length - i * 32), + _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, + kByteSequence16To23, kByteSequence24To31)); + + __m256i key_left = _mm256_loadu_si256(key_left_ptr + i); + __m256i key_right = _mm256_loadu_si256(key_right_ptr + i); + result_or = _mm256_or_si256( + result_or, _mm256_and_si256(tail_mask, _mm256_xor_si256(key_left, key_right))); + int result = _mm256_testz_si256(result_or, result_or) * 0xff; + match_bytevector[irow_left] &= result; + } +} + +#endif + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_encode.cc b/cpp/src/arrow/compute/exec/key_encode.cc new file mode 100644 index 00000000000..de79558f2c2 --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_encode.cc @@ -0,0 +1,1649 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/exec/key_encode.h" + +#include + +#include + +#include "arrow/compute/exec/util.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace compute { + +KeyEncoder::KeyRowArray::KeyRowArray() + : pool_(nullptr), rows_capacity_(0), bytes_capacity_(0) {} + +Status KeyEncoder::KeyRowArray::Init(MemoryPool* pool, const KeyRowMetadata& metadata) { + pool_ = pool; + metadata_ = metadata; + + DCHECK(!null_masks_ && !offsets_ && !rows_); + + constexpr int64_t rows_capacity = 8; + constexpr int64_t bytes_capacity = 1024; + + // Null masks + ARROW_ASSIGN_OR_RAISE(auto null_masks, + AllocateResizableBuffer(size_null_masks(rows_capacity), pool_)); + null_masks_ = std::move(null_masks); + memset(null_masks_->mutable_data(), 0, size_null_masks(rows_capacity)); + + // Offsets and rows + if (!metadata.is_fixed_length) { + ARROW_ASSIGN_OR_RAISE(auto offsets, + AllocateResizableBuffer(size_offsets(rows_capacity), pool_)); + offsets_ = std::move(offsets); + memset(offsets_->mutable_data(), 0, size_offsets(rows_capacity)); + reinterpret_cast(offsets_->mutable_data())[0] = 0; + + ARROW_ASSIGN_OR_RAISE( + auto rows, + AllocateResizableBuffer(size_rows_varying_length(bytes_capacity), pool_)); + rows_ = std::move(rows); + memset(rows_->mutable_data(), 0, size_rows_varying_length(bytes_capacity)); + bytes_capacity_ = 
size_rows_varying_length(bytes_capacity) - padding_for_vectors; + } else { + ARROW_ASSIGN_OR_RAISE( + auto rows, AllocateResizableBuffer(size_rows_fixed_length(rows_capacity), pool_)); + rows_ = std::move(rows); + memset(rows_->mutable_data(), 0, size_rows_fixed_length(rows_capacity)); + bytes_capacity_ = size_rows_fixed_length(rows_capacity) - padding_for_vectors; + } + + update_buffer_pointers(); + + rows_capacity_ = rows_capacity; + + num_rows_ = 0; + num_rows_for_has_any_nulls_ = 0; + has_any_nulls_ = false; + + return Status::OK(); +} + +void KeyEncoder::KeyRowArray::Clean() { + num_rows_ = 0; + num_rows_for_has_any_nulls_ = 0; + has_any_nulls_ = false; + + if (!metadata_.is_fixed_length) { + reinterpret_cast(offsets_->mutable_data())[0] = 0; + } +} + +int64_t KeyEncoder::KeyRowArray::size_null_masks(int64_t num_rows) { + return num_rows * metadata_.null_masks_bytes_per_row + padding_for_vectors; +} + +int64_t KeyEncoder::KeyRowArray::size_offsets(int64_t num_rows) { + return (num_rows + 1) * sizeof(uint32_t) + padding_for_vectors; +} + +int64_t KeyEncoder::KeyRowArray::size_rows_fixed_length(int64_t num_rows) { + return num_rows * metadata_.fixed_length + padding_for_vectors; +} + +int64_t KeyEncoder::KeyRowArray::size_rows_varying_length(int64_t num_bytes) { + return num_bytes + padding_for_vectors; +} + +void KeyEncoder::KeyRowArray::update_buffer_pointers() { + buffers_[0] = mutable_buffers_[0] = null_masks_->mutable_data(); + if (metadata_.is_fixed_length) { + buffers_[1] = mutable_buffers_[1] = rows_->mutable_data(); + buffers_[2] = mutable_buffers_[2] = nullptr; + } else { + buffers_[1] = mutable_buffers_[1] = offsets_->mutable_data(); + buffers_[2] = mutable_buffers_[2] = rows_->mutable_data(); + } +} + +Status KeyEncoder::KeyRowArray::ResizeFixedLengthBuffers(int64_t num_extra_rows) { + if (rows_capacity_ >= num_rows_ + num_extra_rows) { + return Status::OK(); + } + + int64_t rows_capacity_new = std::max(static_cast(1), 2 * rows_capacity_); + while 
(rows_capacity_new < num_rows_ + num_extra_rows) { + rows_capacity_new *= 2; + } + + // Null masks + RETURN_NOT_OK(null_masks_->Resize(size_null_masks(rows_capacity_new), false)); + memset(null_masks_->mutable_data() + size_null_masks(rows_capacity_), 0, + size_null_masks(rows_capacity_new) - size_null_masks(rows_capacity_)); + + // Either offsets or rows + if (!metadata_.is_fixed_length) { + RETURN_NOT_OK(offsets_->Resize(size_offsets(rows_capacity_new), false)); + memset(offsets_->mutable_data() + size_offsets(rows_capacity_), 0, + size_offsets(rows_capacity_new) - size_offsets(rows_capacity_)); + } else { + RETURN_NOT_OK(rows_->Resize(size_rows_fixed_length(rows_capacity_new), false)); + memset(rows_->mutable_data() + size_rows_fixed_length(rows_capacity_), 0, + size_rows_fixed_length(rows_capacity_new) - + size_rows_fixed_length(rows_capacity_)); + bytes_capacity_ = size_rows_fixed_length(rows_capacity_new) - padding_for_vectors; + } + + update_buffer_pointers(); + + rows_capacity_ = rows_capacity_new; + + return Status::OK(); +} + +Status KeyEncoder::KeyRowArray::ResizeOptionalVaryingLengthBuffer( + int64_t num_extra_bytes) { + int64_t num_bytes = offsets()[num_rows_]; + if (bytes_capacity_ >= num_bytes + num_extra_bytes || metadata_.is_fixed_length) { + return Status::OK(); + } + + int64_t bytes_capacity_new = std::max(static_cast(1), 2 * bytes_capacity_); + while (bytes_capacity_new < num_bytes + num_extra_bytes) { + bytes_capacity_new *= 2; + } + + RETURN_NOT_OK(rows_->Resize(size_rows_varying_length(bytes_capacity_new), false)); + memset(rows_->mutable_data() + size_rows_varying_length(bytes_capacity_), 0, + size_rows_varying_length(bytes_capacity_new) - + size_rows_varying_length(bytes_capacity_)); + + update_buffer_pointers(); + + bytes_capacity_ = bytes_capacity_new; + + return Status::OK(); +} + +Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from, + uint32_t num_rows_to_append, + const uint16_t* source_row_ids) { + 
DCHECK(metadata_.is_compatible(from.metadata())); + + RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append)); + + if (!metadata_.is_fixed_length) { + // Varying-length rows + auto from_offsets = reinterpret_cast(from.offsets_->data()); + auto to_offsets = reinterpret_cast(offsets_->mutable_data()); + uint32_t total_length = to_offsets[num_rows_]; + uint32_t total_length_to_append = 0; + for (uint32_t i = 0; i < num_rows_to_append; ++i) { + uint16_t row_id = source_row_ids[i]; + uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; + total_length_to_append += length; + to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append; + } + + RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append)); + + const uint8_t* src = from.rows_->data(); + uint8_t* dst = rows_->mutable_data() + total_length; + for (uint32_t i = 0; i < num_rows_to_append; ++i) { + uint16_t row_id = source_row_ids[i]; + uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; + auto src64 = reinterpret_cast(src + from_offsets[row_id]); + auto dst64 = reinterpret_cast(dst); + for (uint32_t j = 0; j < (length + 7) / 8; ++j) { + dst64[j] = src64[j]; + } + dst += length; + } + } else { + // Fixed-length rows + const uint8_t* src = from.rows_->data(); + uint8_t* dst = rows_->mutable_data() + num_rows_ * metadata_.fixed_length; + for (uint32_t i = 0; i < num_rows_to_append; ++i) { + uint16_t row_id = source_row_ids[i]; + uint32_t length = metadata_.fixed_length; + auto src64 = reinterpret_cast(src + length * row_id); + auto dst64 = reinterpret_cast(dst); + for (uint32_t j = 0; j < (length + 7) / 8; ++j) { + dst64[j] = src64[j]; + } + dst += length; + } + } + + // Null masks + uint32_t byte_length = metadata_.null_masks_bytes_per_row; + uint64_t dst_byte_offset = num_rows_ * byte_length; + const uint8_t* src_base = from.null_masks_->data(); + uint8_t* dst_base = null_masks_->mutable_data(); + for (uint32_t i = 0; i < num_rows_to_append; ++i) { + uint32_t 
row_id = source_row_ids[i]; + int64_t src_byte_offset = row_id * byte_length; + const uint8_t* src = src_base + src_byte_offset; + uint8_t* dst = dst_base + dst_byte_offset; + for (uint32_t ibyte = 0; ibyte < byte_length; ++ibyte) { + dst[ibyte] = src[ibyte]; + } + dst_byte_offset += byte_length; + } + + num_rows_ += num_rows_to_append; + + return Status::OK(); +} + +Status KeyEncoder::KeyRowArray::AppendEmpty(uint32_t num_rows_to_append, + uint32_t num_extra_bytes_to_append) { + RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append)); + RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append)); + num_rows_ += num_rows_to_append; + if (metadata_.row_alignment > 1 || metadata_.string_alignment > 1) { + memset(rows_->mutable_data(), 0, bytes_capacity_); + } + return Status::OK(); +} + +bool KeyEncoder::KeyRowArray::has_any_nulls(const KeyEncoderContext* ctx) const { + if (has_any_nulls_) { + return true; + } + if (num_rows_for_has_any_nulls_ < num_rows_) { + auto size_per_row = metadata().null_masks_bytes_per_row; + has_any_nulls_ = !util::BitUtil::are_all_bytes_zero( + ctx->hardware_flags, null_masks() + size_per_row * num_rows_for_has_any_nulls_, + static_cast(size_per_row * (num_rows_ - num_rows_for_has_any_nulls_))); + num_rows_for_has_any_nulls_ = num_rows_; + } + return has_any_nulls_; +} + +KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata, + const KeyColumnArray& left, + const KeyColumnArray& right, + int buffer_id_to_replace) { + metadata_ = metadata; + length_ = left.length(); + for (int i = 0; i < max_buffers_; ++i) { + buffers_[i] = left.buffers_[i]; + mutable_buffers_[i] = left.mutable_buffers_[i]; + } + buffers_[buffer_id_to_replace] = right.buffers_[buffer_id_to_replace]; + mutable_buffers_[buffer_id_to_replace] = right.mutable_buffers_[buffer_id_to_replace]; + bit_offset_[0] = left.bit_offset_[0]; + bit_offset_[1] = left.bit_offset_[1]; + if (buffer_id_to_replace < max_buffers_ - 1) { + 
bit_offset_[buffer_id_to_replace] = right.bit_offset_[buffer_id_to_replace]; + } +} + +KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata, + int64_t length, const uint8_t* buffer0, + const uint8_t* buffer1, const uint8_t* buffer2, + int bit_offset0, int bit_offset1) { + metadata_ = metadata; + length_ = length; + buffers_[0] = buffer0; + buffers_[1] = buffer1; + buffers_[2] = buffer2; + mutable_buffers_[0] = mutable_buffers_[1] = mutable_buffers_[2] = nullptr; + bit_offset_[0] = bit_offset0; + bit_offset_[1] = bit_offset1; +} + +KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata, + int64_t length, uint8_t* buffer0, + uint8_t* buffer1, uint8_t* buffer2, + int bit_offset0, int bit_offset1) { + metadata_ = metadata; + length_ = length; + buffers_[0] = mutable_buffers_[0] = buffer0; + buffers_[1] = mutable_buffers_[1] = buffer1; + buffers_[2] = mutable_buffers_[2] = buffer2; + bit_offset_[0] = bit_offset0; + bit_offset_[1] = bit_offset1; +} + +KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnArray& from, int64_t start, + int64_t length) { + metadata_ = from.metadata_; + length_ = length; + uint32_t fixed_size = + !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length; + + buffers_[0] = + from.buffers_[0] ? from.buffers_[0] + (from.bit_offset_[0] + start) / 8 : nullptr; + mutable_buffers_[0] = from.mutable_buffers_[0] + ? from.mutable_buffers_[0] + (from.bit_offset_[0] + start) / 8 + : nullptr; + bit_offset_[0] = (from.bit_offset_[0] + start) % 8; + + if (fixed_size == 0) { + buffers_[1] = + from.buffers_[1] ? from.buffers_[1] + (from.bit_offset_[1] + start) / 8 : nullptr; + mutable_buffers_[1] = from.mutable_buffers_[1] ? from.mutable_buffers_[1] + + (from.bit_offset_[1] + start) / 8 + : nullptr; + bit_offset_[1] = (from.bit_offset_[1] + start) % 8; + } else { + buffers_[1] = from.buffers_[1] ? 
from.buffers_[1] + start * fixed_size : nullptr; + mutable_buffers_[1] = from.mutable_buffers_[1] + ? from.mutable_buffers_[1] + start * fixed_size + : nullptr; + bit_offset_[1] = 0; + } + + buffers_[2] = from.buffers_[2]; + mutable_buffers_[2] = from.mutable_buffers_[2]; +} + +KeyEncoder::KeyColumnArray KeyEncoder::TransformBoolean::ArrayReplace( + const KeyColumnArray& column, const KeyColumnArray& temp) { + // Make sure that the temp buffer is large enough + DCHECK(temp.length() >= column.length() && temp.metadata().is_fixed_length && + temp.metadata().fixed_length >= sizeof(uint8_t)); + KeyColumnMetadata metadata; + metadata.is_fixed_length = true; + metadata.fixed_length = sizeof(uint8_t); + constexpr int buffer_index = 1; + KeyColumnArray result = KeyColumnArray(metadata, column, temp, buffer_index); + return result; +} + +void KeyEncoder::TransformBoolean::PreEncode(const KeyColumnArray& input, + KeyColumnArray* output, + KeyEncoderContext* ctx) { + // Make sure that metadata and lengths are compatible. + DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length); + DCHECK(output->metadata().fixed_length == 1 && input.metadata().fixed_length == 0); + DCHECK(output->length() == input.length()); + constexpr int buffer_index = 1; + DCHECK(input.data(buffer_index) != nullptr); + DCHECK(output->mutable_data(buffer_index) != nullptr); + util::BitUtil::bits_to_bytes( + ctx->hardware_flags, static_cast(input.length()), input.data(buffer_index), + output->mutable_data(buffer_index), input.bit_offset(buffer_index)); +} + +void KeyEncoder::TransformBoolean::PostDecode(const KeyColumnArray& input, + KeyColumnArray* output, + KeyEncoderContext* ctx) { + // Make sure that metadata and lengths are compatible. 
+ DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length); + DCHECK(output->metadata().fixed_length == 0 && input.metadata().fixed_length == 1); + DCHECK(output->length() == input.length()); + constexpr int buffer_index = 1; + DCHECK(input.data(buffer_index) != nullptr); + DCHECK(output->mutable_data(buffer_index) != nullptr); + + util::BitUtil::bytes_to_bits( + ctx->hardware_flags, static_cast(input.length()), input.data(buffer_index), + output->mutable_data(buffer_index), output->bit_offset(buffer_index)); +} + +bool KeyEncoder::EncoderInteger::IsBoolean(const KeyColumnMetadata& metadata) { + return metadata.is_fixed_length && metadata.fixed_length == 0; +} + +bool KeyEncoder::EncoderInteger::UsesTransform(const KeyColumnArray& column) { + return IsBoolean(column.metadata()); +} + +KeyEncoder::KeyColumnArray KeyEncoder::EncoderInteger::ArrayReplace( + const KeyColumnArray& column, const KeyColumnArray& temp) { + if (IsBoolean(column.metadata())) { + return TransformBoolean::ArrayReplace(column, temp); + } + return column; +} + +void KeyEncoder::EncoderInteger::PreEncode(const KeyColumnArray& input, + KeyColumnArray* output, + KeyEncoderContext* ctx) { + if (IsBoolean(input.metadata())) { + TransformBoolean::PreEncode(input, output, ctx); + } +} + +void KeyEncoder::EncoderInteger::PostDecode(const KeyColumnArray& input, + KeyColumnArray* output, + KeyEncoderContext* ctx) { + if (IsBoolean(output->metadata())) { + TransformBoolean::PostDecode(input, output, ctx); + } +} + +void KeyEncoder::EncoderInteger::Encode(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col, KeyEncoderContext* ctx, + KeyColumnArray* temp) { + KeyColumnArray col_prep; + if (UsesTransform(col)) { + col_prep = ArrayReplace(col, *temp); + PreEncode(col, &col_prep, ctx); + } else { + col_prep = col; + } + + const auto num_rows = static_cast(col.length()); + + // When we have a single fixed length column we can just do memcpy + if 
(rows->metadata().is_fixed_length && + rows->metadata().fixed_length == col.metadata().fixed_length) { + DCHECK_EQ(offset_within_row, 0); + uint32_t row_size = col.metadata().fixed_length; + memcpy(rows->mutable_data(1), col.data(1), num_rows * row_size); + } else if (rows->metadata().is_fixed_length) { + uint32_t row_size = rows->metadata().fixed_length; + uint8_t* row_base = rows->mutable_data(1) + offset_within_row; + const uint8_t* col_base = col_prep.data(1); + switch (col_prep.metadata().fixed_length) { + case 1: + for (uint32_t i = 0; i < num_rows; ++i) { + row_base[i * row_size] = col_base[i]; + } + break; + case 2: + for (uint32_t i = 0; i < num_rows; ++i) { + *reinterpret_cast(row_base + i * row_size) = + reinterpret_cast(col_base)[i]; + } + break; + case 4: + for (uint32_t i = 0; i < num_rows; ++i) { + *reinterpret_cast(row_base + i * row_size) = + reinterpret_cast(col_base)[i]; + } + break; + case 8: + for (uint32_t i = 0; i < num_rows; ++i) { + *reinterpret_cast(row_base + i * row_size) = + reinterpret_cast(col_base)[i]; + } + break; + default: + DCHECK(false); + } + } else { + const uint32_t* row_offsets = rows->offsets(); + uint8_t* row_base = rows->mutable_data(2) + offset_within_row; + const uint8_t* col_base = col_prep.data(1); + switch (col_prep.metadata().fixed_length) { + case 1: + for (uint32_t i = 0; i < num_rows; ++i) { + row_base[row_offsets[i]] = col_base[i]; + } + break; + case 2: + for (uint32_t i = 0; i < num_rows; ++i) { + *reinterpret_cast(row_base + row_offsets[i]) = + reinterpret_cast(col_base)[i]; + } + break; + case 4: + for (uint32_t i = 0; i < num_rows; ++i) { + *reinterpret_cast(row_base + row_offsets[i]) = + reinterpret_cast(col_base)[i]; + } + break; + case 8: + for (uint32_t i = 0; i < num_rows; ++i) { + *reinterpret_cast(row_base + row_offsets[i]) = + reinterpret_cast(col_base)[i]; + } + break; + default: + DCHECK(false); + } + } +} + +void KeyEncoder::EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows, + uint32_t 
offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col, + KeyEncoderContext* ctx, KeyColumnArray* temp) { + KeyColumnArray col_prep; + if (UsesTransform(*col)) { + col_prep = ArrayReplace(*col, *temp); + } else { + col_prep = *col; + } + + // When we have a single fixed length column we can just do memcpy + if (rows.metadata().is_fixed_length && + col_prep.metadata().fixed_length == rows.metadata().fixed_length) { + DCHECK_EQ(offset_within_row, 0); + uint32_t row_size = rows.metadata().fixed_length; + memcpy(col_prep.mutable_data(1), rows.data(1) + start_row * row_size, + num_rows * row_size); + } else if (rows.metadata().is_fixed_length) { + uint32_t row_size = rows.metadata().fixed_length; + const uint8_t* row_base = rows.data(1) + start_row * row_size; + row_base += offset_within_row; + uint8_t* col_base = col_prep.mutable_data(1); + switch (col_prep.metadata().fixed_length) { + case 1: + for (uint32_t i = 0; i < num_rows; ++i) { + col_base[i] = row_base[i * row_size]; + } + break; + case 2: + for (uint32_t i = 0; i < num_rows; ++i) { + reinterpret_cast(col_base)[i] = + *reinterpret_cast(row_base + i * row_size); + } + break; + case 4: + for (uint32_t i = 0; i < num_rows; ++i) { + reinterpret_cast(col_base)[i] = + *reinterpret_cast(row_base + i * row_size); + } + break; + case 8: + for (uint32_t i = 0; i < num_rows; ++i) { + reinterpret_cast(col_base)[i] = + *reinterpret_cast(row_base + i * row_size); + } + break; + default: + DCHECK(false); + } + } else { + const uint32_t* row_offsets = rows.offsets() + start_row; + const uint8_t* row_base = rows.data(2); + row_base += offset_within_row; + uint8_t* col_base = col_prep.mutable_data(1); + switch (col_prep.metadata().fixed_length) { + case 1: + for (uint32_t i = 0; i < num_rows; ++i) { + col_base[i] = row_base[row_offsets[i]]; + } + break; + case 2: + for (uint32_t i = 0; i < num_rows; ++i) { + reinterpret_cast(col_base)[i] = + *reinterpret_cast(row_base + row_offsets[i]); + } + break; + case 4: + for 
(uint32_t i = 0; i < num_rows; ++i) { + reinterpret_cast(col_base)[i] = + *reinterpret_cast(row_base + row_offsets[i]); + } + break; + case 8: + for (uint32_t i = 0; i < num_rows; ++i) { + reinterpret_cast(col_base)[i] = + *reinterpret_cast(row_base + row_offsets[i]); + } + break; + default: + DCHECK(false); + } + } + + if (UsesTransform(*col)) { + PostDecode(col_prep, col, ctx); + } +} + +bool KeyEncoder::EncoderBinary::IsInteger(const KeyColumnMetadata& metadata) { + bool is_fixed_length = metadata.is_fixed_length; + auto size = metadata.fixed_length; + return is_fixed_length && + (size == 0 || size == 1 || size == 2 || size == 4 || size == 8); +} + +void KeyEncoder::EncoderBinary::Encode(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col, KeyEncoderContext* ctx, + KeyColumnArray* temp) { + if (IsInteger(col.metadata())) { + EncoderInteger::Encode(offset_within_row, rows, col, ctx, temp); + } else { + KeyColumnArray col_prep; + if (EncoderInteger::UsesTransform(col)) { + col_prep = EncoderInteger::ArrayReplace(col, *temp); + EncoderInteger::PreEncode(col, &col_prep, ctx); + } else { + col_prep = col; + } + + bool is_row_fixed_length = rows->metadata().is_fixed_length; + +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2()) { + EncodeHelper_avx2(is_row_fixed_length, offset_within_row, rows, col); + } else { +#endif + if (is_row_fixed_length) { + EncodeImp(offset_within_row, rows, col); + } else { + EncodeImp(offset_within_row, rows, col); + } +#if defined(ARROW_HAVE_AVX2) + } +#endif + } + + DCHECK(temp->metadata().is_fixed_length); + DCHECK(temp->length() * temp->metadata().fixed_length >= + col.length() * static_cast(sizeof(uint16_t))); + + KeyColumnArray temp16bit(KeyColumnMetadata(true, sizeof(uint16_t)), col.length(), + nullptr, temp->mutable_data(1), nullptr); + ColumnMemsetNulls(offset_within_row, rows, col, ctx, &temp16bit, 0xae); +} + +void KeyEncoder::EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, + uint32_t 
offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col, + KeyEncoderContext* ctx, KeyColumnArray* temp) { + if (IsInteger(col->metadata())) { + EncoderInteger::Decode(start_row, num_rows, offset_within_row, rows, col, ctx, temp); + } else { + KeyColumnArray col_prep; + if (EncoderInteger::UsesTransform(*col)) { + col_prep = EncoderInteger::ArrayReplace(*col, *temp); + } else { + col_prep = *col; + } + + bool is_row_fixed_length = rows.metadata().is_fixed_length; + +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2()) { + DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows, + col); + } else { +#endif + if (is_row_fixed_length) { + DecodeImp(start_row, num_rows, offset_within_row, rows, col); + } else { + DecodeImp(start_row, num_rows, offset_within_row, rows, col); + } +#if defined(ARROW_HAVE_AVX2) + } +#endif + + if (EncoderInteger::UsesTransform(*col)) { + EncoderInteger::PostDecode(col_prep, col, ctx); + } + } +} + +template +void KeyEncoder::EncoderBinary::EncodeImp(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col) { + EncodeDecodeHelper( + 0, static_cast(col.length()), offset_within_row, rows, rows, &col, + nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) { + auto dst64 = reinterpret_cast(dst); + auto src64 = reinterpret_cast(src); + uint32_t istripe; + for (istripe = 0; istripe < length / 8; ++istripe) { + dst64[istripe] = util::SafeLoad(src64 + istripe); + } + if ((length % 8) > 0) { + uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length)); + dst64[istripe] = (dst64[istripe] & ~mask_last) | + (util::SafeLoad(src64 + istripe) & mask_last); + } + }); +} + +template +void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col) { + EncodeDecodeHelper( + start_row, num_rows, offset_within_row, &rows, nullptr, col, col, + [](uint8_t* dst, const uint8_t* src, int64_t length) { + 
for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) { + auto dst64 = reinterpret_cast(dst); + auto src64 = reinterpret_cast(src); + util::SafeStore(dst64 + istripe, src64[istripe]); + } + }); +} + +void KeyEncoder::EncoderBinary::ColumnMemsetNulls( + uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col, + KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) { + using ColumnMemsetNullsImp_t = void (*)(uint32_t, KeyRowArray*, const KeyColumnArray&, + KeyEncoderContext*, KeyColumnArray*, uint8_t); + static const ColumnMemsetNullsImp_t ColumnMemsetNullsImp_fn[] = { + ColumnMemsetNullsImp, ColumnMemsetNullsImp, + ColumnMemsetNullsImp, ColumnMemsetNullsImp, + ColumnMemsetNullsImp, ColumnMemsetNullsImp, + ColumnMemsetNullsImp, ColumnMemsetNullsImp, + ColumnMemsetNullsImp, ColumnMemsetNullsImp}; + uint32_t col_width = col.metadata().fixed_length; + int dispatch_const = + (rows->metadata().is_fixed_length ? 5 : 0) + + (col_width == 1 ? 0 + : col_width == 2 ? 1 : col_width == 4 ? 2 : col_width == 8 ? 
3 : 4); + ColumnMemsetNullsImp_fn[dispatch_const](offset_within_row, rows, col, ctx, + temp_vector_16bit, byte_value); +} + +template +void KeyEncoder::EncoderBinary::ColumnMemsetNullsImp( + uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col, + KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) { + // Nothing to do when there are no nulls + if (!col.data(0)) { + return; + } + + const auto num_rows = static_cast(col.length()); + + // Temp vector needs space for the required number of rows + DCHECK(temp_vector_16bit->length() >= num_rows); + DCHECK(temp_vector_16bit->metadata().is_fixed_length && + temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t)); + auto temp_vector = reinterpret_cast(temp_vector_16bit->mutable_data(1)); + + // Bit vector to index vector of null positions + int num_selected; + util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, static_cast(col.length()), + col.data(0), &num_selected, temp_vector, + col.bit_offset(0)); + + for (int i = 0; i < num_selected; ++i) { + uint32_t row_id = temp_vector[i]; + + // Target binary field pointer + uint8_t* dst; + if (is_row_fixed_length) { + dst = rows->mutable_data(1) + rows->metadata().fixed_length * row_id; + } else { + dst = rows->mutable_data(2) + rows->offsets()[row_id]; + } + dst += offset_within_row; + + if (col_width == 1) { + *dst = byte_value; + } else if (col_width == 2) { + *reinterpret_cast(dst) = + (static_cast(byte_value) * static_cast(0x0101)); + } else if (col_width == 4) { + *reinterpret_cast(dst) = + (static_cast(byte_value) * static_cast(0x01010101)); + } else if (col_width == 8) { + *reinterpret_cast(dst) = + (static_cast(byte_value) * 0x0101010101010101ULL); + } else { + uint64_t value = (static_cast(byte_value) * 0x0101010101010101ULL); + uint32_t col_width_actual = col.metadata().fixed_length; + uint32_t j; + for (j = 0; j < col_width_actual / 8; ++j) { + reinterpret_cast(dst)[j] = value; + } + int tail = col_width_actual 
% 8; + if (tail) { + uint64_t mask = ~0ULL >> (8 * (8 - tail)); + reinterpret_cast(dst)[j] = + (reinterpret_cast(dst)[j] & ~mask) | (value & mask); + } + } + } +} + +void KeyEncoder::EncoderBinaryPair::Encode(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col1, + const KeyColumnArray& col2, + KeyEncoderContext* ctx, KeyColumnArray* temp1, + KeyColumnArray* temp2) { + DCHECK(CanProcessPair(col1.metadata(), col2.metadata())); + + KeyColumnArray col_prep[2]; + if (EncoderInteger::UsesTransform(col1)) { + col_prep[0] = EncoderInteger::ArrayReplace(col1, *temp1); + EncoderInteger::PreEncode(col1, &(col_prep[0]), ctx); + } else { + col_prep[0] = col1; + } + if (EncoderInteger::UsesTransform(col2)) { + col_prep[1] = EncoderInteger::ArrayReplace(col2, *temp2); + EncoderInteger::PreEncode(col2, &(col_prep[1]), ctx); + } else { + col_prep[1] = col2; + } + + uint32_t col_width1 = col_prep[0].metadata().fixed_length; + uint32_t col_width2 = col_prep[1].metadata().fixed_length; + int log_col_width1 = + col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0; + int log_col_width2 = + col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 
1 : 0; + + bool is_row_fixed_length = rows->metadata().is_fixed_length; + + const auto num_rows = static_cast(col1.length()); + uint32_t num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2() && col_width1 == col_width2) { + num_processed = EncodeHelper_avx2(is_row_fixed_length, col_width1, offset_within_row, + rows, col_prep[0], col_prep[1]); + } +#endif + if (num_processed < num_rows) { + using EncodeImp_t = void (*)(uint32_t, uint32_t, KeyRowArray*, const KeyColumnArray&, + const KeyColumnArray&); + static const EncodeImp_t EncodeImp_fn[] = { + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp, + EncodeImp, EncodeImp}; + int dispatch_const = (log_col_width2 << 2) | log_col_width1; + dispatch_const += (is_row_fixed_length ? 
16 : 0); + EncodeImp_fn[dispatch_const](num_processed, offset_within_row, rows, col_prep[0], + col_prep[1]); + } +} + +template +void KeyEncoder::EncoderBinaryPair::EncodeImp(uint32_t num_rows_to_skip, + uint32_t offset_within_row, + KeyRowArray* rows, + const KeyColumnArray& col1, + const KeyColumnArray& col2) { + const uint8_t* src_A = col1.data(1); + const uint8_t* src_B = col2.data(1); + + const auto num_rows = static_cast(col1.length()); + + uint32_t fixed_length = rows->metadata().fixed_length; + const uint32_t* offsets; + uint8_t* dst_base; + if (is_row_fixed_length) { + dst_base = rows->mutable_data(1) + offset_within_row; + offsets = nullptr; + } else { + dst_base = rows->mutable_data(2) + offset_within_row; + offsets = rows->offsets(); + } + + using col1_type_const = typename std::add_const::type; + using col2_type_const = typename std::add_const::type; + + if (is_row_fixed_length) { + uint8_t* dst = dst_base + num_rows_to_skip * fixed_length; + for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) { + *reinterpret_cast(dst) = reinterpret_cast(src_A)[i]; + *reinterpret_cast(dst + sizeof(col1_type)) = + reinterpret_cast(src_B)[i]; + dst += fixed_length; + } + } else { + for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) { + uint8_t* dst = dst_base + offsets[i]; + *reinterpret_cast(dst) = reinterpret_cast(src_A)[i]; + *reinterpret_cast(dst + sizeof(col1_type)) = + reinterpret_cast(src_B)[i]; + } + } +} + +void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col1, + KeyColumnArray* col2, KeyEncoderContext* ctx, + KeyColumnArray* temp1, KeyColumnArray* temp2) { + DCHECK(CanProcessPair(col1->metadata(), col2->metadata())); + + KeyColumnArray col_prep[2]; + if (EncoderInteger::UsesTransform(*col1)) { + col_prep[0] = EncoderInteger::ArrayReplace(*col1, *temp1); + } else { + col_prep[0] = *col1; + } + if (EncoderInteger::UsesTransform(*col2)) { + col_prep[1] 
= EncoderInteger::ArrayReplace(*col2, *temp2); + } else { + col_prep[1] = *col2; + } + + uint32_t col_width1 = col_prep[0].metadata().fixed_length; + uint32_t col_width2 = col_prep[1].metadata().fixed_length; + int log_col_width1 = + col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0; + int log_col_width2 = + col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0; + + bool is_row_fixed_length = rows.metadata().is_fixed_length; + + uint32_t num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2() && col_width1 == col_width2) { + num_processed = + DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows, + offset_within_row, rows, &col_prep[0], &col_prep[1]); + } +#endif + if (num_processed < num_rows) { + using DecodeImp_t = void (*)(uint32_t, uint32_t, uint32_t, uint32_t, + const KeyRowArray&, KeyColumnArray*, KeyColumnArray*); + static const DecodeImp_t DecodeImp_fn[] = { + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp, + DecodeImp, DecodeImp}; + int dispatch_const = + (log_col_width2 << 2) | log_col_width1 | (is_row_fixed_length ? 
16 : 0); + DecodeImp_fn[dispatch_const](num_processed, start_row, num_rows, offset_within_row, + rows, &(col_prep[0]), &(col_prep[1])); + } + + if (EncoderInteger::UsesTransform(*col1)) { + EncoderInteger::PostDecode(col_prep[0], col1, ctx); + } + if (EncoderInteger::UsesTransform(*col2)) { + EncoderInteger::PostDecode(col_prep[1], col2, ctx); + } +} + +template +void KeyEncoder::EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip, + uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, + const KeyRowArray& rows, + KeyColumnArray* col1, + KeyColumnArray* col2) { + DCHECK(rows.length() >= start_row + num_rows); + DCHECK(col1->length() == num_rows && col2->length() == num_rows); + + uint8_t* dst_A = col1->mutable_data(1); + uint8_t* dst_B = col2->mutable_data(1); + + uint32_t fixed_length = rows.metadata().fixed_length; + const uint32_t* offsets; + const uint8_t* src_base; + if (is_row_fixed_length) { + src_base = rows.data(1) + fixed_length * start_row + offset_within_row; + offsets = nullptr; + } else { + src_base = rows.data(2) + offset_within_row; + offsets = rows.offsets() + start_row; + } + + using col1_type_const = typename std::add_const::type; + using col2_type_const = typename std::add_const::type; + + if (is_row_fixed_length) { + const uint8_t* src = src_base + num_rows_to_skip * fixed_length; + for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) { + reinterpret_cast(dst_A)[i] = *reinterpret_cast(src); + reinterpret_cast(dst_B)[i] = + *reinterpret_cast(src + sizeof(col1_type)); + src += fixed_length; + } + } else { + for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) { + const uint8_t* src = src_base + offsets[i]; + reinterpret_cast(dst_A)[i] = *reinterpret_cast(src); + reinterpret_cast(dst_B)[i] = + *reinterpret_cast(src + sizeof(col1_type)); + } + } +} + +void KeyEncoder::EncoderOffsets::Encode(KeyRowArray* rows, + const std::vector& varbinary_cols, + KeyEncoderContext* ctx) { + DCHECK(!varbinary_cols.empty()); + + // Rows and 
columns must all be varying-length + DCHECK(!rows->metadata().is_fixed_length); + for (const auto& col : varbinary_cols) { + DCHECK(!col.metadata().is_fixed_length); + } + + const auto num_rows = static_cast(varbinary_cols[0].length()); + + uint32_t num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + // Whether any of the columns has non-zero starting bit offset for non-nulls bit vector + bool has_bit_offset = false; + + // The space in columns must be exactly equal to a space for offsets in rows + DCHECK(rows->length() == num_rows); + for (const auto& col : varbinary_cols) { + DCHECK(col.length() == num_rows); + if (col.bit_offset(0) != 0) { + has_bit_offset = true; + } + } + + if (ctx->has_avx2() && !has_bit_offset) { + // Create a temp vector sized based on the number of columns + auto temp_buffer_holder = util::TempVectorHolder( + ctx->stack, static_cast(varbinary_cols.size()) * 8); + auto temp_buffer_32B_per_col = KeyColumnArray( + KeyColumnMetadata(true, sizeof(uint32_t)), varbinary_cols.size() * 8, nullptr, + reinterpret_cast(temp_buffer_holder.mutable_data()), nullptr); + + num_processed = EncodeImp_avx2(rows, varbinary_cols, &temp_buffer_32B_per_col); + } +#endif + if (num_processed < num_rows) { + EncodeImp(num_processed, rows, varbinary_cols); + } +} + +void KeyEncoder::EncoderOffsets::EncodeImp( + uint32_t num_rows_already_processed, KeyRowArray* rows, + const std::vector& varbinary_cols) { + DCHECK_GT(varbinary_cols.size(), 0); + + int row_alignment = rows->metadata().row_alignment; + int string_alignment = rows->metadata().string_alignment; + + uint32_t* row_offsets = rows->mutable_offsets(); + uint8_t* row_values = rows->mutable_data(2); + const auto num_rows = static_cast(varbinary_cols[0].length()); + + if (num_rows_already_processed == 0) { + row_offsets[0] = 0; + } + + uint32_t row_offset = row_offsets[num_rows_already_processed]; + for (uint32_t i = num_rows_already_processed; i < num_rows; ++i) { + uint32_t* varbinary_end = + 
rows->metadata().varbinary_end_array(row_values + row_offset); + + // Zero out lengths for nulls. + // Add lengths of all columns to get row size. + // Store varbinary field ends while summing their lengths. + + uint32_t offset_within_row = rows->metadata().fixed_length; + + for (size_t col = 0; col < varbinary_cols.size(); ++col) { + const uint32_t* col_offsets = varbinary_cols[col].offsets(); + uint32_t col_length = col_offsets[i + 1] - col_offsets[i]; + + const int bit_offset = varbinary_cols[col].bit_offset(0); + + const uint8_t* non_nulls = varbinary_cols[col].data(0); + if (non_nulls && BitUtil::GetBit(non_nulls, bit_offset + i) == 0) { + col_length = 0; + } + + offset_within_row += + KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment); + offset_within_row += col_length; + + varbinary_end[col] = offset_within_row; + } + + offset_within_row += + KeyRowMetadata::padding_for_alignment(offset_within_row, row_alignment); + row_offset += offset_within_row; + row_offsets[i + 1] = row_offset; + } +} + +void KeyEncoder::EncoderOffsets::Decode( + uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows, + std::vector* varbinary_cols, + const std::vector& varbinary_cols_base_offset, KeyEncoderContext* ctx) { + DCHECK(!varbinary_cols->empty()); + DCHECK(varbinary_cols->size() == varbinary_cols_base_offset.size()); + + DCHECK(!rows.metadata().is_fixed_length); + DCHECK(rows.length() >= start_row + num_rows); + for (const auto& col : *varbinary_cols) { + // Rows and columns must all be varying-length + DCHECK(!col.metadata().is_fixed_length); + // The space in columns must be exactly equal to a subset of rows selected + DCHECK(col.length() == num_rows); + } + + // Offsets of varbinary columns data within each encoded row are stored + // in the same encoded row as an array of 32-bit integers. + // This array follows immediately the data of fixed-length columns. + // There is one element for each varying-length column. 
+ // The Nth element is the sum of all the lengths of varbinary columns data in + // that row, up to and including Nth varbinary column. + + const uint32_t* row_offsets = rows.offsets() + start_row; + + // Set the base offset for each column + for (size_t col = 0; col < varbinary_cols->size(); ++col) { + uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets(); + col_offsets[0] = varbinary_cols_base_offset[col]; + } + + int string_alignment = rows.metadata().string_alignment; + + for (uint32_t i = 0; i < num_rows; ++i) { + // Find the beginning of cumulative lengths array for next row + const uint8_t* row = rows.data(2) + row_offsets[i]; + const uint32_t* varbinary_ends = rows.metadata().varbinary_end_array(row); + + // Update the offset of each column + uint32_t offset_within_row = rows.metadata().fixed_length; + for (size_t col = 0; col < varbinary_cols->size(); ++col) { + offset_within_row += + KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment); + uint32_t length = varbinary_ends[col] - offset_within_row; + offset_within_row = varbinary_ends[col]; + uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets(); + col_offsets[i + 1] = col_offsets[i] + length; + } + } +} + +void KeyEncoder::EncoderVarBinary::Encode(uint32_t varbinary_col_id, KeyRowArray* rows, + const KeyColumnArray& col, + KeyEncoderContext* ctx) { +#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2()) { + EncodeHelper_avx2(varbinary_col_id, rows, col); + } else { +#endif + if (varbinary_col_id == 0) { + EncodeImp(varbinary_col_id, rows, col); + } else { + EncodeImp(varbinary_col_id, rows, col); + } +#if defined(ARROW_HAVE_AVX2) + } +#endif +} + +void KeyEncoder::EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, + uint32_t varbinary_col_id, + const KeyRowArray& rows, KeyColumnArray* col, + KeyEncoderContext* ctx) { + // Output column varbinary buffer needs an extra 32B + // at the end in avx2 version and 8B otherwise. 
+#if defined(ARROW_HAVE_AVX2) + if (ctx->has_avx2()) { + DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col); + } else { +#endif + if (varbinary_col_id == 0) { + DecodeImp(start_row, num_rows, varbinary_col_id, rows, col); + } else { + DecodeImp(start_row, num_rows, varbinary_col_id, rows, col); + } +#if defined(ARROW_HAVE_AVX2) + } +#endif +} + +template +void KeyEncoder::EncoderVarBinary::EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows, + const KeyColumnArray& col) { + EncodeDecodeHelper( + 0, static_cast(col.length()), varbinary_col_id, rows, rows, &col, nullptr, + [](uint8_t* dst, const uint8_t* src, int64_t length) { + auto dst64 = reinterpret_cast(dst); + auto src64 = reinterpret_cast(src); + uint32_t istripe; + for (istripe = 0; istripe < length / 8; ++istripe) { + dst64[istripe] = util::SafeLoad(src64 + istripe); + } + if ((length % 8) > 0) { + uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length)); + dst64[istripe] = (dst64[istripe] & ~mask_last) | + (util::SafeLoad(src64 + istripe) & mask_last); + } + }); +} + +template +void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_rows, + uint32_t varbinary_col_id, + const KeyRowArray& rows, + KeyColumnArray* col) { + EncodeDecodeHelper( + start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col, + [](uint8_t* dst, const uint8_t* src, int64_t length) { + for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) { + auto dst64 = reinterpret_cast(dst); + auto src64 = reinterpret_cast(src); + util::SafeStore(dst64 + istripe, src64[istripe]); + } + }); +} + +void KeyEncoder::EncoderNulls::Encode(KeyRowArray* rows, + const std::vector& cols, + KeyEncoderContext* ctx, + KeyColumnArray* temp_vector_16bit) { + DCHECK_GT(cols.size(), 0); + const auto num_rows = static_cast(rows->length()); + + // All input columns should have the same number of rows. + // They may or may not have non-nulls bit-vectors allocated. 
+ for (const auto& col : cols) { + DCHECK(col.length() == num_rows); + } + + // Temp vector needs space for the required number of rows + DCHECK(temp_vector_16bit->length() >= num_rows); + DCHECK(temp_vector_16bit->metadata().is_fixed_length && + temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t)); + + uint8_t* null_masks = rows->null_masks(); + uint32_t null_masks_bytes_per_row = rows->metadata().null_masks_bytes_per_row; + memset(null_masks, 0, null_masks_bytes_per_row * num_rows); + for (size_t col = 0; col < cols.size(); ++col) { + const uint8_t* non_nulls = cols[col].data(0); + if (!non_nulls) { + continue; + } + int bit_offset = cols[col].bit_offset(0); + DCHECK_LT(bit_offset, 8); + int num_selected; + util::BitUtil::bits_to_indexes( + 0, ctx->hardware_flags, num_rows, non_nulls, &num_selected, + reinterpret_cast(temp_vector_16bit->mutable_data(1)), bit_offset); + for (int i = 0; i < num_selected; ++i) { + uint16_t row_id = reinterpret_cast(temp_vector_16bit->data(1))[i]; + int64_t null_masks_bit_id = row_id * null_masks_bytes_per_row * 8 + col; + BitUtil::SetBit(null_masks, null_masks_bit_id); + } + } +} + +void KeyEncoder::EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows, + const KeyRowArray& rows, + std::vector* cols) { + // Every output column needs to have a space for exactly the required number + // of rows. It also needs to have non-nulls bit-vector allocated and mutable. 
+ DCHECK_GT(cols->size(), 0); + for (auto& col : *cols) { + DCHECK(col.length() == num_rows); + DCHECK(col.mutable_data(0)); + } + + const uint8_t* null_masks = rows.null_masks(); + uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row; + for (size_t col = 0; col < cols->size(); ++col) { + uint8_t* non_nulls = (*cols)[col].mutable_data(0); + const int bit_offset = (*cols)[col].bit_offset(0); + DCHECK_LT(bit_offset, 8); + non_nulls[0] |= 0xff << (bit_offset); + if (bit_offset + num_rows > 8) { + int bits_in_first_byte = 8 - bit_offset; + memset(non_nulls + 1, 0xff, BitUtil::BytesForBits(num_rows - bits_in_first_byte)); + } + for (uint32_t row = 0; row < num_rows; ++row) { + uint32_t null_masks_bit_id = + (start_row + row) * null_masks_bytes_per_row * 8 + static_cast(col); + bool is_set = BitUtil::GetBit(null_masks, null_masks_bit_id); + if (is_set) { + BitUtil::ClearBit(non_nulls, bit_offset + row); + } + } + } +} + +uint32_t KeyEncoder::KeyRowMetadata::num_varbinary_cols() const { + uint32_t result = 0; + for (auto column_metadata : column_metadatas) { + if (!column_metadata.is_fixed_length) { + ++result; + } + } + return result; +} + +bool KeyEncoder::KeyRowMetadata::is_compatible(const KeyRowMetadata& other) const { + if (other.num_cols() != num_cols()) { + return false; + } + if (row_alignment != other.row_alignment || + string_alignment != other.string_alignment) { + return false; + } + for (size_t i = 0; i < column_metadatas.size(); ++i) { + if (column_metadatas[i].is_fixed_length != + other.column_metadatas[i].is_fixed_length) { + return false; + } + if (column_metadatas[i].fixed_length != other.column_metadatas[i].fixed_length) { + return false; + } + } + return true; +} + +void KeyEncoder::KeyRowMetadata::FromColumnMetadataVector( + const std::vector& cols, int in_row_alignment, + int in_string_alignment) { + column_metadatas.resize(cols.size()); + for (size_t i = 0; i < cols.size(); ++i) { + column_metadatas[i] = cols[i]; + } + + 
const auto num_cols = static_cast(cols.size()); + + // Sort columns. + // Columns are sorted based on the size in bytes of their fixed-length part. + // For the varying-length column, the fixed-length part is the 32-bit field storing + // cumulative length of varying-length fields. + // The rules are: + // a) Boolean column, marked with fixed-length 0, is considered to have fixed-length + // part of 1 byte. b) Columns with fixed-length part being power of 2 or multiple of row + // alignment precede other columns. They are sorted among themselves based on size of + // fixed-length part. c) Fixed-length columns precede varying-length columns when both + // have the same size fixed-length part. + column_order.resize(num_cols); + for (uint32_t i = 0; i < num_cols; ++i) { + column_order[i] = i; + } + std::sort( + column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) { + bool is_left_pow2 = + !cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1; + bool is_right_pow2 = !cols[right].is_fixed_length || + ARROW_POPCOUNT64(cols[right].fixed_length) <= 1; + bool is_left_fixedlen = cols[left].is_fixed_length; + bool is_right_fixedlen = cols[right].is_fixed_length; + uint32_t width_left = + cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t); + uint32_t width_right = + cols[right].is_fixed_length ? 
cols[right].fixed_length : sizeof(uint32_t); + if (is_left_pow2 != is_right_pow2) { + return is_left_pow2; + } + if (!is_left_pow2) { + return left < right; + } + if (width_left != width_right) { + return width_left > width_right; + } + if (is_left_fixedlen != is_right_fixedlen) { + return is_left_fixedlen; + } + return left < right; + }); + + row_alignment = in_row_alignment; + string_alignment = in_string_alignment; + varbinary_end_array_offset = 0; + + column_offsets.resize(num_cols); + uint32_t num_varbinary_cols = 0; + uint32_t offset_within_row = 0; + for (uint32_t i = 0; i < num_cols; ++i) { + const KeyColumnMetadata& col = cols[column_order[i]]; + offset_within_row += + KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment, col); + column_offsets[i] = offset_within_row; + if (!col.is_fixed_length) { + if (num_varbinary_cols == 0) { + varbinary_end_array_offset = offset_within_row; + } + DCHECK(column_offsets[i] - varbinary_end_array_offset == + num_varbinary_cols * sizeof(uint32_t)); + ++num_varbinary_cols; + offset_within_row += sizeof(uint32_t); + } else { + // Boolean column is a bit-vector, which is indicated by + // setting fixed length in column metadata to zero. + // It will be stored as a byte in output row. + if (col.fixed_length == 0) { + offset_within_row += 1; + } else { + offset_within_row += col.fixed_length; + } + } + } + + is_fixed_length = (num_varbinary_cols == 0); + fixed_length = + offset_within_row + + KeyRowMetadata::padding_for_alignment( + offset_within_row, num_varbinary_cols == 0 ? row_alignment : string_alignment); + + // We set the number of bytes per row storing null masks of individual key columns + // to be a power of two. This is not required. It could be also set to the minimal + // number of bytes required for a given number of bits (one bit per column). 
+ null_masks_bytes_per_row = 1; + while (static_cast(null_masks_bytes_per_row * 8) < num_cols) { + null_masks_bytes_per_row *= 2; + } +} + +void KeyEncoder::Init(const std::vector& cols, KeyEncoderContext* ctx, + int row_alignment, int string_alignment) { + ctx_ = ctx; + row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment); + uint32_t num_cols = row_metadata_.num_cols(); + uint32_t num_varbinary_cols = row_metadata_.num_varbinary_cols(); + batch_all_cols_.resize(num_cols); + batch_varbinary_cols_.resize(num_varbinary_cols); + batch_varbinary_cols_base_offsets_.resize(num_varbinary_cols); +} + +void KeyEncoder::PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows, + const std::vector& cols_in) { + const auto num_cols = static_cast(cols_in.size()); + DCHECK(batch_all_cols_.size() == num_cols); + + uint32_t num_varbinary_visited = 0; + for (uint32_t i = 0; i < num_cols; ++i) { + const KeyColumnArray& col = cols_in[row_metadata_.column_order[i]]; + KeyColumnArray col_window(col, start_row, num_rows); + batch_all_cols_[i] = col_window; + if (!col.metadata().is_fixed_length) { + DCHECK(num_varbinary_visited < batch_varbinary_cols_.size()); + // If start row is zero, then base offset of varbinary column is also zero. 
+ if (start_row == 0) { + batch_varbinary_cols_base_offsets_[num_varbinary_visited] = 0; + } else { + batch_varbinary_cols_base_offsets_[num_varbinary_visited] = + col.offsets()[start_row]; + } + batch_varbinary_cols_[num_varbinary_visited++] = col_window; + } + } +} + +Status KeyEncoder::PrepareOutputForEncode(int64_t start_row, int64_t num_rows, + KeyRowArray* rows, + const std::vector& all_cols) { + int64_t num_bytes_required = 0; + + int64_t fixed_part = row_metadata_.fixed_length * num_rows; + int64_t var_part = 0; + for (const auto& col : all_cols) { + if (!col.metadata().is_fixed_length) { + DCHECK(col.length() >= start_row + num_rows); + const uint32_t* offsets = col.offsets(); + var_part += offsets[start_row + num_rows] - offsets[start_row]; + // Include maximum padding that can be added to align the start of varbinary fields. + var_part += num_rows * row_metadata_.string_alignment; + } + } + // Include maximum padding that can be added to align the start of the rows. + if (!row_metadata_.is_fixed_length) { + fixed_part += row_metadata_.row_alignment * num_rows; + } + num_bytes_required = fixed_part + var_part; + + rows->Clean(); + RETURN_NOT_OK(rows->AppendEmpty(static_cast(num_rows), + static_cast(num_bytes_required))); + + return Status::OK(); +} + +void KeyEncoder::Encode(int64_t start_row, int64_t num_rows, KeyRowArray* rows, + const std::vector& cols) { + // Prepare column array vectors + PrepareKeyColumnArrays(start_row, num_rows, cols); + + // Create two temp vectors with 16-bit elements + auto temp_buffer_holder_A = + util::TempVectorHolder(ctx_->stack, static_cast(num_rows)); + auto temp_buffer_A = KeyColumnArray( + KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr, + reinterpret_cast(temp_buffer_holder_A.mutable_data()), nullptr); + auto temp_buffer_holder_B = + util::TempVectorHolder(ctx_->stack, static_cast(num_rows)); + auto temp_buffer_B = KeyColumnArray( + KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr, + 
reinterpret_cast(temp_buffer_holder_B.mutable_data()), nullptr); + + bool is_row_fixed_length = row_metadata_.is_fixed_length; + if (!is_row_fixed_length) { + // This call will generate and fill in data for both: + // - offsets to the entire encoded arrays + // - offsets for individual varbinary fields within each row + EncoderOffsets::Encode(rows, batch_varbinary_cols_, ctx_); + + for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) { + // Memcpy varbinary fields into precomputed in the previous step + // positions in the output row buffer. + EncoderVarBinary::Encode(static_cast(i), rows, batch_varbinary_cols_[i], + ctx_); + } + } + + // Process fixed length columns + const auto num_cols = static_cast(batch_all_cols_.size()); + for (uint32_t i = 0; i < num_cols;) { + if (!batch_all_cols_[i].metadata().is_fixed_length) { + i += 1; + continue; + } + bool can_process_pair = + (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length && + EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(), + batch_all_cols_[i + 1].metadata()); + if (!can_process_pair) { + EncoderBinary::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i], + ctx_, &temp_buffer_A); + i += 1; + } else { + EncoderBinaryPair::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i], + batch_all_cols_[i + 1], ctx_, &temp_buffer_A, + &temp_buffer_B); + i += 2; + } + } + + // Process nulls + EncoderNulls::Encode(rows, batch_all_cols_, ctx_, &temp_buffer_A); +} + +void KeyEncoder::DecodeFixedLengthBuffers(int64_t start_row_input, + int64_t start_row_output, int64_t num_rows, + const KeyRowArray& rows, + std::vector* cols) { + // Prepare column array vectors + PrepareKeyColumnArrays(start_row_output, num_rows, *cols); + + // Create two temp vectors with 16-bit elements + auto temp_buffer_holder_A = + util::TempVectorHolder(ctx_->stack, static_cast(num_rows)); + auto temp_buffer_A = KeyColumnArray( + KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, 
nullptr, + reinterpret_cast(temp_buffer_holder_A.mutable_data()), nullptr); + auto temp_buffer_holder_B = + util::TempVectorHolder(ctx_->stack, static_cast(num_rows)); + auto temp_buffer_B = KeyColumnArray( + KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr, + reinterpret_cast(temp_buffer_holder_B.mutable_data()), nullptr); + + bool is_row_fixed_length = row_metadata_.is_fixed_length; + if (!is_row_fixed_length) { + EncoderOffsets::Decode(static_cast(start_row_input), + static_cast(num_rows), rows, &batch_varbinary_cols_, + batch_varbinary_cols_base_offsets_, ctx_); + } + + // Process fixed length columns + const auto num_cols = static_cast(batch_all_cols_.size()); + for (uint32_t i = 0; i < num_cols;) { + if (!batch_all_cols_[i].metadata().is_fixed_length) { + i += 1; + continue; + } + bool can_process_pair = + (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length && + EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(), + batch_all_cols_[i + 1].metadata()); + if (!can_process_pair) { + EncoderBinary::Decode(static_cast(start_row_input), + static_cast(num_rows), + row_metadata_.column_offsets[i], rows, &batch_all_cols_[i], + ctx_, &temp_buffer_A); + i += 1; + } else { + EncoderBinaryPair::Decode( + static_cast(start_row_input), static_cast(num_rows), + row_metadata_.column_offsets[i], rows, &batch_all_cols_[i], + &batch_all_cols_[i + 1], ctx_, &temp_buffer_A, &temp_buffer_B); + i += 2; + } + } + + // Process nulls + EncoderNulls::Decode(static_cast(start_row_input), + static_cast(num_rows), rows, &batch_all_cols_); +} + +void KeyEncoder::DecodeVaryingLengthBuffers(int64_t start_row_input, + int64_t start_row_output, int64_t num_rows, + const KeyRowArray& rows, + std::vector* cols) { + // Prepare column array vectors + PrepareKeyColumnArrays(start_row_output, num_rows, *cols); + + bool is_row_fixed_length = row_metadata_.is_fixed_length; + if (!is_row_fixed_length) { + for (size_t i = 0; i < batch_varbinary_cols_.size(); 
++i) { + // Memcpy varbinary fields into precomputed in the previous step + // positions in the output row buffer. + EncoderVarBinary::Decode(static_cast(start_row_input), + static_cast(num_rows), static_cast(i), + rows, &batch_varbinary_cols_[i], ctx_); + } + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_encode.h b/cpp/src/arrow/compute/exec/key_encode.h new file mode 100644 index 00000000000..e5397b9dfd4 --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_encode.h @@ -0,0 +1,635 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/compute/exec/util.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/bit_util.h" + +namespace arrow { +namespace compute { + +class KeyColumnMetadata; + +/// Converts between key representation as a collection of arrays for +/// individual columns and another representation as a single array of rows +/// combining data from all columns into one value. +/// This conversion is reversible. 
+/// Row-oriented storage is beneficial when there is a need for random access +/// of individual rows and at the same time all included columns are likely to +/// be accessed together, as in the case of hash table key. +class KeyEncoder { + public: + struct KeyEncoderContext { + bool has_avx2() const { + return (hardware_flags & arrow::internal::CpuInfo::AVX2) > 0; + } + int64_t hardware_flags; + util::TempVectorStack* stack; + }; + + /// Description of a storage format of a single key column as needed + /// for the purpose of row encoding. + struct KeyColumnMetadata { + KeyColumnMetadata() = default; + KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in) + : is_fixed_length(is_fixed_length_in), fixed_length(fixed_length_in) {} + /// Is column storing a varying-length binary, using offsets array + /// to find a beginning of a value, or is it a fixed-length binary. + bool is_fixed_length; + /// For a fixed-length binary column: number of bytes per value. + /// Zero has a special meaning, indicating a bit vector with one bit per value. + /// For a varying-length binary column: number of bytes per offset. + uint32_t fixed_length; + }; + + /// Description of a storage format for rows produced by encoder. + struct KeyRowMetadata { + /// Is row a varying-length binary, using offsets array to find a beginning of a row, + /// or is it a fixed-length binary. + bool is_fixed_length; + + /// For a fixed-length binary row, common size of rows in bytes, + /// rounded up to the multiple of alignment. + /// + /// For a varying-length binary, size of all encoded fixed-length key columns, + /// including lengths of varying-length columns, rounded up to the multiple of string + /// alignment. + uint32_t fixed_length; + + /// Offset within a row to the array of 32-bit offsets within a row of + /// ends of varbinary fields. + /// Used only when the row is not fixed-length, zero for fixed-length row. + /// There are N elements for N varbinary fields. 
+ /// Each element is the offset within a row of the first byte after + /// the corresponding varbinary field bytes in that row. + /// If varbinary fields begin at aligned addresses, than the end of the previous + /// varbinary field needs to be rounded up according to the specified alignment + /// to obtain the beginning of the next varbinary field. + /// The first varbinary field starts at offset specified by fixed_length, + /// which should already be aligned. + uint32_t varbinary_end_array_offset; + + /// Fixed number of bytes per row that are used to encode null masks. + /// Null masks indicate for a single row which of its key columns are null. + /// Nth bit in the sequence of bytes assigned to a row represents null + /// information for Nth field according to the order in which they are encoded. + int null_masks_bytes_per_row; + + /// Power of 2. Every row will start at the offset aligned to that number of bytes. + int row_alignment; + + /// Power of 2. Must be no greater than row alignment. + /// Every non-power-of-2 binary field and every varbinary field bytes + /// will start aligned to that number of bytes. + int string_alignment; + + /// Metadata of encoded columns in their original order. + std::vector column_metadatas; + + /// Order in which fields are encoded. + std::vector column_order; + + /// Offsets within a row to fields in their encoding order. + std::vector column_offsets; + + /// Rounding up offset to the nearest multiple of alignment value. + /// Alignment must be a power of 2. + static inline uint32_t padding_for_alignment(uint32_t offset, + int required_alignment) { + ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1); + return static_cast((-static_cast(offset)) & + (required_alignment - 1)); + } + + /// Rounding up offset to the beginning of next column, + /// chosing required alignment based on the data type of that column. 
+ static inline uint32_t padding_for_alignment(uint32_t offset, int string_alignment, + const KeyColumnMetadata& col_metadata) { + if (!col_metadata.is_fixed_length || + ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) { + return 0; + } else { + return padding_for_alignment(offset, string_alignment); + } + } + + /// Returns an array of offsets within a row of ends of varbinary fields. + inline const uint32_t* varbinary_end_array(const uint8_t* row) const { + ARROW_DCHECK(!is_fixed_length); + return reinterpret_cast(row + varbinary_end_array_offset); + } + inline uint32_t* varbinary_end_array(uint8_t* row) const { + ARROW_DCHECK(!is_fixed_length); + return reinterpret_cast(row + varbinary_end_array_offset); + } + + /// Returns the offset within the row and length of the first varbinary field. + inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t* offset, + uint32_t* length) const { + ARROW_DCHECK(!is_fixed_length); + *offset = fixed_length; + *length = varbinary_end_array(row)[0] - fixed_length; + } + + /// Returns the offset within the row and length of the second and further varbinary + /// fields. 
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int varbinary_id, + uint32_t* out_offset, + uint32_t* out_length) const { + ARROW_DCHECK(!is_fixed_length); + ARROW_DCHECK(varbinary_id > 0); + const uint32_t* varbinary_end = varbinary_end_array(row); + uint32_t offset = varbinary_end[varbinary_id - 1]; + offset += padding_for_alignment(offset, string_alignment); + *out_offset = offset; + *out_length = varbinary_end[varbinary_id] - offset; + } + + uint32_t encoded_field_order(uint32_t icol) const { return column_order[icol]; } + + uint32_t encoded_field_offset(uint32_t icol) const { return column_offsets[icol]; } + + uint32_t num_cols() const { return static_cast(column_metadatas.size()); } + + uint32_t num_varbinary_cols() const; + + void FromColumnMetadataVector(const std::vector& cols, + int in_row_alignment, int in_string_alignment); + + bool is_compatible(const KeyRowMetadata& other) const; + }; + + class KeyRowArray { + public: + KeyRowArray(); + Status Init(MemoryPool* pool, const KeyRowMetadata& metadata); + void Clean(); + Status AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append); + Status AppendSelectionFrom(const KeyRowArray& from, uint32_t num_rows_to_append, + const uint16_t* source_row_ids); + const KeyRowMetadata& metadata() const { return metadata_; } + int64_t length() const { return num_rows_; } + const uint8_t* data(int i) const { + ARROW_DCHECK(i >= 0 && i <= max_buffers_); + return buffers_[i]; + } + uint8_t* mutable_data(int i) { + ARROW_DCHECK(i >= 0 && i <= max_buffers_); + return mutable_buffers_[i]; + } + const uint32_t* offsets() const { return reinterpret_cast(data(1)); } + uint32_t* mutable_offsets() { return reinterpret_cast(mutable_data(1)); } + const uint8_t* null_masks() const { return null_masks_->data(); } + uint8_t* null_masks() { return null_masks_->mutable_data(); } + + bool has_any_nulls(const KeyEncoderContext* ctx) const; + + private: + Status ResizeFixedLengthBuffers(int64_t 
num_extra_rows); + Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes); + + int64_t size_null_masks(int64_t num_rows); + int64_t size_offsets(int64_t num_rows); + int64_t size_rows_fixed_length(int64_t num_rows); + int64_t size_rows_varying_length(int64_t num_bytes); + void update_buffer_pointers(); + + static constexpr int64_t padding_for_vectors = 64; + MemoryPool* pool_; + KeyRowMetadata metadata_; + /// Buffers can only expand during lifetime and never shrink. + std::unique_ptr null_masks_; + std::unique_ptr offsets_; + std::unique_ptr rows_; + static constexpr int max_buffers_ = 3; + const uint8_t* buffers_[max_buffers_]; + uint8_t* mutable_buffers_[max_buffers_]; + int64_t num_rows_; + int64_t rows_capacity_; + int64_t bytes_capacity_; + + // Mutable to allow lazy evaluation + mutable int64_t num_rows_for_has_any_nulls_; + mutable bool has_any_nulls_; + }; + + /// A lightweight description of an array representing one of key columns. + class KeyColumnArray { + public: + KeyColumnArray() = default; + /// Create as a mix of buffers according to the mask from two descriptions + /// (Nth bit is set to 0 if Nth buffer from the first input + /// should be used and is set to 1 otherwise). + /// Metadata is inherited from the first input. + KeyColumnArray(const KeyColumnMetadata& metadata, const KeyColumnArray& left, + const KeyColumnArray& right, int buffer_id_to_replace); + /// Create for reading + KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, + const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* buffer2, + int bit_offset0 = 0, int bit_offset1 = 0); + /// Create for writing + KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, uint8_t* buffer0, + uint8_t* buffer1, uint8_t* buffer2, int bit_offset0 = 0, + int bit_offset1 = 0); + /// Create as a window view of original description that is offset + /// by a given number of rows. 
+ /// The number of rows used in offset must be divisible by 8 + /// in order to not split bit vectors within a single byte. + KeyColumnArray(const KeyColumnArray& from, int64_t start, int64_t length); + uint8_t* mutable_data(int i) { + ARROW_DCHECK(i >= 0 && i <= max_buffers_); + return mutable_buffers_[i]; + } + const uint8_t* data(int i) const { + ARROW_DCHECK(i >= 0 && i <= max_buffers_); + return buffers_[i]; + } + uint32_t* mutable_offsets() { return reinterpret_cast(mutable_data(1)); } + const uint32_t* offsets() const { return reinterpret_cast(data(1)); } + const KeyColumnMetadata& metadata() const { return metadata_; } + int64_t length() const { return length_; } + int bit_offset(int i) const { + ARROW_DCHECK(i >= 0 && i < max_buffers_); + return bit_offset_[i]; + } + + private: + static constexpr int max_buffers_ = 3; + const uint8_t* buffers_[max_buffers_]; + uint8_t* mutable_buffers_[max_buffers_]; + KeyColumnMetadata metadata_; + int64_t length_; + // Starting bit offset within the first byte (between 0 and 7) + // to be used when accessing buffers that store bit vectors. + int bit_offset_[max_buffers_ - 1]; + }; + + void Init(const std::vector& cols, KeyEncoderContext* ctx, + int row_alignment, int string_alignment); + + const KeyRowMetadata& row_metadata() { return row_metadata_; } + + /// Find out the required sizes of all buffers output buffers for encoding + /// (including varying-length buffers). + /// Use that information to resize provided row array so that it can fit + /// encoded data. + Status PrepareOutputForEncode(int64_t start_input_row, int64_t num_input_rows, + KeyRowArray* rows, + const std::vector& all_cols); + + /// Encode a window of column oriented data into the entire output + /// row oriented storage. + /// The output buffers for encoding need to be correctly sized before + /// starting encoding. 
+ void Encode(int64_t start_input_row, int64_t num_input_rows, KeyRowArray* rows, + const std::vector& cols); + + /// Decode a window of row oriented data into a corresponding + /// window of column oriented storage. + /// The output buffers need to be correctly allocated and sized before + /// calling each method. + /// For that reason decoding is split into two functions. + /// The output of the first one, that processes everything except for + /// varying length buffers, can be used to find out required varying + /// length buffers sizes. + void DecodeFixedLengthBuffers(int64_t start_row_input, int64_t start_row_output, + int64_t num_rows, const KeyRowArray& rows, + std::vector* cols); + + void DecodeVaryingLengthBuffers(int64_t start_row_input, int64_t start_row_output, + int64_t num_rows, const KeyRowArray& rows, + std::vector* cols); + + private: + /// Prepare column array vectors. + /// Output column arrays represent a range of input column arrays + /// specified by starting row and number of rows. 
+ /// Three vectors are generated: + /// - all columns + /// - fixed-length columns only + /// - varying-length columns only + void PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows, + const std::vector& cols_in); + + class TransformBoolean { + public: + static KeyColumnArray ArrayReplace(const KeyColumnArray& column, + const KeyColumnArray& temp); + static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output, + KeyEncoderContext* ctx); + static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output, + KeyEncoderContext* ctx); + }; + + class EncoderInteger { + public: + static void Encode(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col, KeyEncoderContext* ctx, + KeyColumnArray* temp); + static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col, + KeyEncoderContext* ctx, KeyColumnArray* temp); + static bool UsesTransform(const KeyColumnArray& column); + static KeyColumnArray ArrayReplace(const KeyColumnArray& column, + const KeyColumnArray& temp); + static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output, + KeyEncoderContext* ctx); + static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output, + KeyEncoderContext* ctx); + + private: + static bool IsBoolean(const KeyColumnMetadata& metadata); + }; + + class EncoderBinary { + public: + static void Encode(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col, KeyEncoderContext* ctx, + KeyColumnArray* temp); + static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col, + KeyEncoderContext* ctx, KeyColumnArray* temp); + static bool IsInteger(const KeyColumnMetadata& metadata); + + private: + template + static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, + const KeyRowArray* rows_const, + KeyRowArray* 
rows_mutable_maybe_null, + const KeyColumnArray* col_const, + KeyColumnArray* col_mutable_maybe_null, + COPY_FN copy_fn); + template + static void EncodeImp(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col); + template + static void DecodeImp(uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, const KeyRowArray& rows, + KeyColumnArray* col); +#if defined(ARROW_HAVE_AVX2) + static void EncodeHelper_avx2(bool is_row_fixed_length, uint32_t offset_within_row, + KeyRowArray* rows, const KeyColumnArray& col); + static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, + uint32_t num_rows, uint32_t offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col); + template + static void EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col); + template + static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, const KeyRowArray& rows, + KeyColumnArray* col); +#endif + static void ColumnMemsetNulls(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col, KeyEncoderContext* ctx, + KeyColumnArray* temp_vector_16bit, uint8_t byte_value); + template + static void ColumnMemsetNullsImp(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col, KeyEncoderContext* ctx, + KeyColumnArray* temp_vector_16bit, + uint8_t byte_value); + }; + + class EncoderBinaryPair { + public: + static bool CanProcessPair(const KeyColumnMetadata& col1, + const KeyColumnMetadata& col2) { + return EncoderBinary::IsInteger(col1) && EncoderBinary::IsInteger(col2); + } + static void Encode(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col1, const KeyColumnArray& col2, + KeyEncoderContext* ctx, KeyColumnArray* temp1, + KeyColumnArray* temp2); + static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col1, + KeyColumnArray* col2, 
KeyEncoderContext* ctx, + KeyColumnArray* temp1, KeyColumnArray* temp2); + + private: + template + static void EncodeImp(uint32_t num_rows_to_skip, uint32_t offset_within_row, + KeyRowArray* rows, const KeyColumnArray& col1, + const KeyColumnArray& col2); + template + static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, + uint32_t num_rows, uint32_t offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col1, + KeyColumnArray* col2); +#if defined(ARROW_HAVE_AVX2) + static uint32_t EncodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width, + uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col1, + const KeyColumnArray& col2); + static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width, + uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, const KeyRowArray& rows, + KeyColumnArray* col1, KeyColumnArray* col2); + template + static uint32_t EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows, + const KeyColumnArray& col1, + const KeyColumnArray& col2); + template + static uint32_t DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, const KeyRowArray& rows, + KeyColumnArray* col1, KeyColumnArray* col2); +#endif + }; + + class EncoderOffsets { + public: + // In order not to repeat work twice, + // encoding combines in a single pass computing of: + // a) row offsets for varying-length rows + // b) within each new row, the cumulative length array + // of varying-length values within a row. 
+ static void Encode(KeyRowArray* rows, + const std::vector& varbinary_cols, + KeyEncoderContext* ctx); + static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows, + std::vector* varbinary_cols, + const std::vector& varbinary_cols_base_offset, + KeyEncoderContext* ctx); + + private: + static void EncodeImp(uint32_t num_rows_already_processed, KeyRowArray* rows, + const std::vector& varbinary_cols); +#if defined(ARROW_HAVE_AVX2) + static uint32_t EncodeImp_avx2(KeyRowArray* rows, + const std::vector& varbinary_cols, + KeyColumnArray* temp_buffer_32B_per_col); +#endif + }; + + class EncoderVarBinary { + public: + static void Encode(uint32_t varbinary_col_id, KeyRowArray* rows, + const KeyColumnArray& col, KeyEncoderContext* ctx); + static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, + const KeyRowArray& rows, KeyColumnArray* col, + KeyEncoderContext* ctx); + + private: + template + static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows, + uint32_t varbinary_col_id, + const KeyRowArray* rows_const, + KeyRowArray* rows_mutable_maybe_null, + const KeyColumnArray* col_const, + KeyColumnArray* col_mutable_maybe_null, + COPY_FN copy_fn); + template + static void EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows, + const KeyColumnArray& col); + template + static void DecodeImp(uint32_t start_row, uint32_t num_rows, + uint32_t varbinary_col_id, const KeyRowArray& rows, + KeyColumnArray* col); +#if defined(ARROW_HAVE_AVX2) + static void EncodeHelper_avx2(uint32_t varbinary_col_id, KeyRowArray* rows, + const KeyColumnArray& col); + static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows, + uint32_t varbinary_col_id, const KeyRowArray& rows, + KeyColumnArray* col); + template + static void EncodeImp_avx2(uint32_t varbinary_col_id, KeyRowArray* rows, + const KeyColumnArray& col); + template + static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, + uint32_t 
varbinary_col_id, const KeyRowArray& rows, + KeyColumnArray* col); +#endif + }; + + class EncoderNulls { + public: + static void Encode(KeyRowArray* rows, const std::vector& cols, + KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit); + static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows, + std::vector* cols); + }; + + KeyEncoderContext* ctx_; + + // Data initialized once, based on data types of key columns + KeyRowMetadata row_metadata_; + + // Data initialized for each input batch. + // All elements are ordered according to the order of encoded fields in a row. + std::vector batch_all_cols_; + std::vector batch_varbinary_cols_; + std::vector batch_varbinary_cols_base_offsets_; +}; + +template +inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper( + uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, + const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null, + const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null, + COPY_FN copy_fn) { + ARROW_DCHECK(col_const && col_const->metadata().is_fixed_length); + uint32_t col_width = col_const->metadata().fixed_length; + + if (is_row_fixed_length) { + uint32_t row_width = rows_const->metadata().fixed_length; + for (uint32_t i = 0; i < num_rows; ++i) { + const uint8_t* src; + uint8_t* dst; + if (is_encoding) { + src = col_const->data(1) + col_width * i; + dst = rows_mutable_maybe_null->mutable_data(1) + row_width * (start_row + i) + + offset_within_row; + } else { + src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row; + dst = col_mutable_maybe_null->mutable_data(1) + col_width * i; + } + copy_fn(dst, src, col_width); + } + } else { + const uint32_t* row_offsets = rows_const->offsets(); + for (uint32_t i = 0; i < num_rows; ++i) { + const uint8_t* src; + uint8_t* dst; + if (is_encoding) { + src = col_const->data(1) + col_width * i; + dst = rows_mutable_maybe_null->mutable_data(2) + row_offsets[start_row + i] + + 
offset_within_row; + } else { + src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row; + dst = col_mutable_maybe_null->mutable_data(1) + col_width * i; + } + copy_fn(dst, src, col_width); + } + } +} + +template +inline void KeyEncoder::EncoderVarBinary::EncodeDecodeHelper( + uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, + const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null, + const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null, + COPY_FN copy_fn) { + // Column and rows need to be varying length + ARROW_DCHECK(!rows_const->metadata().is_fixed_length && + !col_const->metadata().is_fixed_length); + + const uint32_t* row_offsets_for_batch = rows_const->offsets() + start_row; + const uint32_t* col_offsets = col_const->offsets(); + + uint32_t col_offset_next = col_offsets[0]; + for (uint32_t i = 0; i < num_rows; ++i) { + uint32_t col_offset = col_offset_next; + col_offset_next = col_offsets[i + 1]; + + uint32_t row_offset = row_offsets_for_batch[i]; + const uint8_t* row = rows_const->data(2) + row_offset; + + uint32_t offset_within_row; + uint32_t length; + if (first_varbinary_col) { + rows_const->metadata().first_varbinary_offset_and_length(row, &offset_within_row, + &length); + } else { + rows_const->metadata().nth_varbinary_offset_and_length(row, varbinary_col_id, + &offset_within_row, &length); + } + + row_offset += offset_within_row; + + const uint8_t* src; + uint8_t* dst; + if (is_encoding) { + src = col_const->data(2) + col_offset; + dst = rows_mutable_maybe_null->mutable_data(2) + row_offset; + } else { + src = rows_const->data(2) + row_offset; + dst = col_mutable_maybe_null->mutable_data(2) + col_offset; + } + copy_fn(dst, src, length); + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_encode_avx2.cc b/cpp/src/arrow/compute/exec/key_encode_avx2.cc new file mode 100644 index 00000000000..d875412cf88 --- /dev/null +++ 
b/cpp/src/arrow/compute/exec/key_encode_avx2.cc @@ -0,0 +1,545 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/exec/key_encode.h" + +namespace arrow { +namespace compute { + +#if defined(ARROW_HAVE_AVX2) + +inline __m256i set_first_n_bytes_avx2(int n) { + constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; + constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; + constexpr uint64_t kByteSequence16To23 = 0x1716151413121110ULL; + constexpr uint64_t kByteSequence24To31 = 0x1f1e1d1c1b1a1918ULL; + + return _mm256_cmpgt_epi8(_mm256_set1_epi8(n), + _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, + kByteSequence16To23, kByteSequence24To31)); +} + +inline __m256i inclusive_prefix_sum_32bit_avx2(__m256i x) { + x = _mm256_add_epi32( + x, _mm256_permutevar8x32_epi32( + _mm256_andnot_si256(_mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0xffffffff), x), + _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6))); + x = _mm256_add_epi32( + x, _mm256_permute4x64_epi64( + _mm256_andnot_si256( + _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0xffffffff, 0xffffffff), x), + 0x93)); // 0b10010011 + x = _mm256_add_epi32( + x, _mm256_permute4x64_epi64( + _mm256_andnot_si256( + _mm256_setr_epi32(0, 0, 
0, 0, 0, 0, 0xffffffff, 0xffffffff), x), + 0x4f)); // 0b01001111 + return x; +} + +void KeyEncoder::EncoderBinary::EncodeHelper_avx2(bool is_row_fixed_length, + uint32_t offset_within_row, + KeyRowArray* rows, + const KeyColumnArray& col) { + if (is_row_fixed_length) { + EncodeImp_avx2(offset_within_row, rows, col); + } else { + EncodeImp_avx2(offset_within_row, rows, col); + } +} + +template +void KeyEncoder::EncoderBinary::EncodeImp_avx2(uint32_t offset_within_row, + KeyRowArray* rows, + const KeyColumnArray& col) { + EncodeDecodeHelper( + 0, static_cast(col.length()), offset_within_row, rows, rows, &col, + nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) { + __m256i* dst256 = reinterpret_cast<__m256i*>(dst); + const __m256i* src256 = reinterpret_cast(src); + uint32_t istripe; + for (istripe = 0; istripe < length / 32; ++istripe) { + _mm256_storeu_si256(dst256 + istripe, _mm256_loadu_si256(src256 + istripe)); + } + if ((length % 32) > 0) { + __m256i mask = set_first_n_bytes_avx2(length % 32); + _mm256_storeu_si256( + dst256 + istripe, + _mm256_blendv_epi8(_mm256_loadu_si256(dst256 + istripe), + _mm256_loadu_si256(src256 + istripe), mask)); + } + }); +} + +void KeyEncoder::EncoderBinary::DecodeHelper_avx2(bool is_row_fixed_length, + uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, + const KeyRowArray& rows, + KeyColumnArray* col) { + if (is_row_fixed_length) { + DecodeImp_avx2(start_row, num_rows, offset_within_row, rows, col); + } else { + DecodeImp_avx2(start_row, num_rows, offset_within_row, rows, col); + } +} + +template +void KeyEncoder::EncoderBinary::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, + const KeyRowArray& rows, + KeyColumnArray* col) { + EncodeDecodeHelper( + start_row, num_rows, offset_within_row, &rows, nullptr, col, col, + [](uint8_t* dst, const uint8_t* src, int64_t length) { + for (uint32_t istripe = 0; istripe < (length + 31) / 32; ++istripe) { + __m256i* dst256 = 
reinterpret_cast<__m256i*>(dst); + const __m256i* src256 = reinterpret_cast(src); + _mm256_storeu_si256(dst256 + istripe, _mm256_loadu_si256(src256 + istripe)); + } + }); +} + +uint32_t KeyEncoder::EncoderBinaryPair::EncodeHelper_avx2( + bool is_row_fixed_length, uint32_t col_width, uint32_t offset_within_row, + KeyRowArray* rows, const KeyColumnArray& col1, const KeyColumnArray& col2) { + using EncodeImp_avx2_t = + uint32_t (*)(uint32_t, KeyRowArray*, const KeyColumnArray&, const KeyColumnArray&); + static const EncodeImp_avx2_t EncodeImp_avx2_fn[] = { + EncodeImp_avx2, EncodeImp_avx2, EncodeImp_avx2, + EncodeImp_avx2, EncodeImp_avx2, EncodeImp_avx2, + EncodeImp_avx2, EncodeImp_avx2, + }; + int log_col_width = col_width == 8 ? 3 : col_width == 4 ? 2 : col_width == 2 ? 1 : 0; + int dispatch_const = (is_row_fixed_length ? 4 : 0) + log_col_width; + return EncodeImp_avx2_fn[dispatch_const](offset_within_row, rows, col1, col2); +} + +template +uint32_t KeyEncoder::EncoderBinaryPair::EncodeImp_avx2(uint32_t offset_within_row, + KeyRowArray* rows, + const KeyColumnArray& col1, + const KeyColumnArray& col2) { + uint32_t num_rows = static_cast(col1.length()); + ARROW_DCHECK(col_width == 1 || col_width == 2 || col_width == 4 || col_width == 8); + + const uint8_t* col_vals_A = col1.data(1); + const uint8_t* col_vals_B = col2.data(1); + uint8_t* row_vals = is_row_fixed_length ? 
rows->mutable_data(1) : rows->mutable_data(2); + + constexpr int unroll = 32 / col_width; + + uint32_t num_processed = num_rows / unroll * unroll; + + for (uint32_t i = 0; i < num_rows / unroll; ++i) { + __m256i col_A = _mm256_loadu_si256(reinterpret_cast(col_vals_A) + i); + __m256i col_B = _mm256_loadu_si256(reinterpret_cast(col_vals_B) + i); + __m256i r0, r1; + if (col_width == 1) { + // results in 16-bit outputs in the order: 0..7, 16..23 + r0 = _mm256_unpacklo_epi8(col_A, col_B); + // results in 16-bit outputs in the order: 8..15, 24..31 + r1 = _mm256_unpackhi_epi8(col_A, col_B); + } else if (col_width == 2) { + // results in 32-bit outputs in the order: 0..3, 8..11 + r0 = _mm256_unpacklo_epi16(col_A, col_B); + // results in 32-bit outputs in the order: 4..7, 12..15 + r1 = _mm256_unpackhi_epi16(col_A, col_B); + } else if (col_width == 4) { + // results in 64-bit outputs in the order: 0..1, 4..5 + r0 = _mm256_unpacklo_epi32(col_A, col_B); + // results in 64-bit outputs in the order: 2..3, 6..7 + r1 = _mm256_unpackhi_epi32(col_A, col_B); + } else if (col_width == 8) { + // results in 128-bit outputs in the order: 0, 2 + r0 = _mm256_unpacklo_epi64(col_A, col_B); + // results in 128-bit outputs in the order: 1, 3 + r1 = _mm256_unpackhi_epi64(col_A, col_B); + } + col_A = _mm256_permute2x128_si256(r0, r1, 0x20); + col_B = _mm256_permute2x128_si256(r0, r1, 0x31); + if (col_width == 8) { + __m128i *dst0, *dst1, *dst2, *dst3; + if (is_row_fixed_length) { + uint32_t fixed_length = rows->metadata().fixed_length; + uint8_t* dst = row_vals + offset_within_row + fixed_length * i * unroll; + dst0 = reinterpret_cast<__m128i*>(dst); + dst1 = reinterpret_cast<__m128i*>(dst + fixed_length); + dst2 = reinterpret_cast<__m128i*>(dst + fixed_length * 2); + dst3 = reinterpret_cast<__m128i*>(dst + fixed_length * 3); + } else { + const uint32_t* row_offsets = rows->offsets() + i * unroll; + uint8_t* dst = row_vals + offset_within_row; + dst0 = reinterpret_cast<__m128i*>(dst + 
row_offsets[0]); + dst1 = reinterpret_cast<__m128i*>(dst + row_offsets[1]); + dst2 = reinterpret_cast<__m128i*>(dst + row_offsets[2]); + dst3 = reinterpret_cast<__m128i*>(dst + row_offsets[3]); + } + _mm_storeu_si128(dst0, _mm256_castsi256_si128(r0)); + _mm_storeu_si128(dst1, _mm256_castsi256_si128(r1)); + _mm_storeu_si128(dst2, _mm256_extracti128_si256(r0, 1)); + _mm_storeu_si128(dst3, _mm256_extracti128_si256(r1, 1)); + + } else { + uint8_t buffer[64]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(buffer), col_A); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(buffer) + 1, col_B); + + if (is_row_fixed_length) { + uint32_t fixed_length = rows->metadata().fixed_length; + uint8_t* dst = row_vals + offset_within_row + fixed_length * i * unroll; + for (int j = 0; j < unroll; ++j) { + if (col_width == 1) { + *reinterpret_cast(dst + fixed_length * j) = + reinterpret_cast(buffer)[j]; + } else if (col_width == 2) { + *reinterpret_cast(dst + fixed_length * j) = + reinterpret_cast(buffer)[j]; + } else if (col_width == 4) { + *reinterpret_cast(dst + fixed_length * j) = + reinterpret_cast(buffer)[j]; + } + } + } else { + const uint32_t* row_offsets = rows->offsets() + i * unroll; + uint8_t* dst = row_vals + offset_within_row; + for (int j = 0; j < unroll; ++j) { + if (col_width == 1) { + *reinterpret_cast(dst + row_offsets[j]) = + reinterpret_cast(buffer)[j]; + } else if (col_width == 2) { + *reinterpret_cast(dst + row_offsets[j]) = + reinterpret_cast(buffer)[j]; + } else if (col_width == 4) { + *reinterpret_cast(dst + row_offsets[j]) = + reinterpret_cast(buffer)[j]; + } + } + } + } + } + + return num_processed; +} + +uint32_t KeyEncoder::EncoderBinaryPair::DecodeHelper_avx2( + bool is_row_fixed_length, uint32_t col_width, uint32_t start_row, uint32_t num_rows, + uint32_t offset_within_row, const KeyRowArray& rows, KeyColumnArray* col1, + KeyColumnArray* col2) { + using DecodeImp_avx2_t = + uint32_t (*)(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, + 
const KeyRowArray& rows, KeyColumnArray* col1, KeyColumnArray* col2); + static const DecodeImp_avx2_t DecodeImp_avx2_fn[] = { + DecodeImp_avx2, DecodeImp_avx2, DecodeImp_avx2, + DecodeImp_avx2, DecodeImp_avx2, DecodeImp_avx2, + DecodeImp_avx2, DecodeImp_avx2}; + int log_col_width = col_width == 8 ? 3 : col_width == 4 ? 2 : col_width == 2 ? 1 : 0; + int dispatch_const = log_col_width | (is_row_fixed_length ? 4 : 0); + return DecodeImp_avx2_fn[dispatch_const](start_row, num_rows, offset_within_row, rows, + col1, col2); +} + +template +uint32_t KeyEncoder::EncoderBinaryPair::DecodeImp_avx2( + uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, + const KeyRowArray& rows, KeyColumnArray* col1, KeyColumnArray* col2) { + ARROW_DCHECK(col_width == 1 || col_width == 2 || col_width == 4 || col_width == 8); + + uint8_t* col_vals_A = col1->mutable_data(1); + uint8_t* col_vals_B = col2->mutable_data(1); + + uint32_t fixed_length = rows.metadata().fixed_length; + const uint32_t* offsets; + const uint8_t* src_base; + if (is_row_fixed_length) { + src_base = rows.data(1) + fixed_length * start_row + offset_within_row; + offsets = nullptr; + } else { + src_base = rows.data(2) + offset_within_row; + offsets = rows.offsets() + start_row; + } + + constexpr int unroll = 32 / col_width; + + uint32_t num_processed = num_rows / unroll * unroll; + + if (col_width == 8) { + for (uint32_t i = 0; i < num_rows / unroll; ++i) { + const __m128i *src0, *src1, *src2, *src3; + if (is_row_fixed_length) { + const uint8_t* src = src_base + (i * unroll) * fixed_length; + src0 = reinterpret_cast(src); + src1 = reinterpret_cast(src + fixed_length); + src2 = reinterpret_cast(src + fixed_length * 2); + src3 = reinterpret_cast(src + fixed_length * 3); + } else { + const uint32_t* row_offsets = offsets + i * unroll; + const uint8_t* src = src_base; + src0 = reinterpret_cast(src + row_offsets[0]); + src1 = reinterpret_cast(src + row_offsets[1]); + src2 = reinterpret_cast(src + row_offsets[2]); + 
src3 = reinterpret_cast(src + row_offsets[3]); + } + + __m256i r0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128(src0)), + _mm_loadu_si128(src1), 1); + __m256i r1 = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128(src2)), + _mm_loadu_si128(src3), 1); + + r0 = _mm256_permute4x64_epi64(r0, 0xd8); // 0b11011000 + r1 = _mm256_permute4x64_epi64(r1, 0xd8); + + // First 128-bit lanes from both inputs + __m256i c1 = _mm256_permute2x128_si256(r0, r1, 0x20); + // Second 128-bit lanes from both inputs + __m256i c2 = _mm256_permute2x128_si256(r0, r1, 0x31); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(col_vals_A) + i, c1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(col_vals_B) + i, c2); + } + } else { + uint8_t buffer[64]; + for (uint32_t i = 0; i < num_rows / unroll; ++i) { + if (is_row_fixed_length) { + const uint8_t* src = src_base + (i * unroll) * fixed_length; + for (int j = 0; j < unroll; ++j) { + if (col_width == 1) { + reinterpret_cast(buffer)[j] = + *reinterpret_cast(src + fixed_length * j); + } else if (col_width == 2) { + reinterpret_cast(buffer)[j] = + *reinterpret_cast(src + fixed_length * j); + } else if (col_width == 4) { + reinterpret_cast(buffer)[j] = + *reinterpret_cast(src + fixed_length * j); + } + } + } else { + const uint32_t* row_offsets = offsets + i * unroll; + const uint8_t* src = src_base; + for (int j = 0; j < unroll; ++j) { + if (col_width == 1) { + reinterpret_cast(buffer)[j] = + *reinterpret_cast(src + row_offsets[j]); + } else if (col_width == 2) { + reinterpret_cast(buffer)[j] = + *reinterpret_cast(src + row_offsets[j]); + } else if (col_width == 4) { + reinterpret_cast(buffer)[j] = + *reinterpret_cast(src + row_offsets[j]); + } + } + } + + __m256i r0 = _mm256_loadu_si256(reinterpret_cast(buffer)); + __m256i r1 = _mm256_loadu_si256(reinterpret_cast(buffer) + 1); + + constexpr uint64_t kByteSequence_0_2_4_6_8_10_12_14 = 0x0e0c0a0806040200ULL; + constexpr uint64_t kByteSequence_1_3_5_7_9_11_13_15 = 
0x0f0d0b0907050301ULL; + constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL; + constexpr uint64_t kByteSequence_2_3_6_7_10_11_14_15 = 0x0f0e0b0a07060302ULL; + + if (col_width == 1) { + // Collect every second byte next to each other + const __m256i shuffle_const = _mm256_setr_epi64x( + kByteSequence_0_2_4_6_8_10_12_14, kByteSequence_1_3_5_7_9_11_13_15, + kByteSequence_0_2_4_6_8_10_12_14, kByteSequence_1_3_5_7_9_11_13_15); + r0 = _mm256_shuffle_epi8(r0, shuffle_const); + r1 = _mm256_shuffle_epi8(r1, shuffle_const); + // 0b11011000 swapping second and third 64-bit lane + r0 = _mm256_permute4x64_epi64(r0, 0xd8); + r1 = _mm256_permute4x64_epi64(r1, 0xd8); + } else if (col_width == 2) { + // Collect every second 16-bit word next to each other + const __m256i shuffle_const = _mm256_setr_epi64x( + kByteSequence_0_1_4_5_8_9_12_13, kByteSequence_2_3_6_7_10_11_14_15, + kByteSequence_0_1_4_5_8_9_12_13, kByteSequence_2_3_6_7_10_11_14_15); + r0 = _mm256_shuffle_epi8(r0, shuffle_const); + r1 = _mm256_shuffle_epi8(r1, shuffle_const); + // 0b11011000 swapping second and third 64-bit lane + r0 = _mm256_permute4x64_epi64(r0, 0xd8); + r1 = _mm256_permute4x64_epi64(r1, 0xd8); + } else if (col_width == 4) { + // Collect every second 32-bit word next to each other + const __m256i permute_const = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + r0 = _mm256_permutevar8x32_epi32(r0, permute_const); + r1 = _mm256_permutevar8x32_epi32(r1, permute_const); + } + + // First 128-bit lanes from both inputs + __m256i c1 = _mm256_permute2x128_si256(r0, r1, 0x20); + // Second 128-bit lanes from both inputs + __m256i c2 = _mm256_permute2x128_si256(r0, r1, 0x31); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(col_vals_A) + i, c1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(col_vals_B) + i, c2); + } + } + + return num_processed; +} + +uint32_t KeyEncoder::EncoderOffsets::EncodeImp_avx2( + KeyRowArray* rows, const std::vector& varbinary_cols, + KeyColumnArray* 
temp_buffer_32B_per_col) { + ARROW_DCHECK(temp_buffer_32B_per_col->metadata().is_fixed_length && + temp_buffer_32B_per_col->metadata().fixed_length == + static_cast(sizeof(uint32_t)) && + temp_buffer_32B_per_col->length() >= + static_cast(varbinary_cols.size()) * 8); + ARROW_DCHECK(varbinary_cols.size() > 0); + + int row_alignment = rows->metadata().row_alignment; + int string_alignment = rows->metadata().string_alignment; + + uint32_t* row_offsets = rows->mutable_offsets(); + uint8_t* row_values = rows->mutable_data(2); + uint32_t num_rows = static_cast(varbinary_cols[0].length()); + + constexpr int unroll = 8; + uint32_t num_processed = num_rows / unroll * unroll; + uint32_t* temp_varbinary_ends = + reinterpret_cast(temp_buffer_32B_per_col->mutable_data(1)); + + row_offsets[0] = 0; + + __m256i row_offset = _mm256_setzero_si256(); + for (uint32_t i = 0; i < num_rows / unroll; ++i) { + // Zero out lengths for nulls. + // Add lengths of all columns to get row size. + // Store in temp buffer varbinary field ends while summing their lengths. 
+ + __m256i offset_within_row = _mm256_set1_epi32(rows->metadata().fixed_length); + + for (size_t col = 0; col < varbinary_cols.size(); ++col) { + const uint32_t* col_offsets = varbinary_cols[col].offsets(); + __m256i col_length = _mm256_sub_epi32( + _mm256_loadu_si256(reinterpret_cast(col_offsets + 1) + i), + _mm256_loadu_si256(reinterpret_cast(col_offsets + 0) + i)); + + const uint8_t* non_nulls = varbinary_cols[col].data(0); + if (non_nulls && non_nulls[i] != 0xff) { + // Zero out lengths for values that are not null + const __m256i individual_bits = + _mm256_setr_epi32(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); + __m256i null_mask = _mm256_cmpeq_epi32( + _mm256_setzero_si256(), + _mm256_and_si256(_mm256_set1_epi32(non_nulls[i]), individual_bits)); + col_length = _mm256_andnot_si256(null_mask, col_length); + } + + __m256i padding = + _mm256_and_si256(_mm256_sub_epi32(_mm256_setzero_si256(), offset_within_row), + _mm256_set1_epi32(string_alignment - 1)); + offset_within_row = _mm256_add_epi32(offset_within_row, padding); + offset_within_row = _mm256_add_epi32(offset_within_row, col_length); + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(temp_varbinary_ends) + col, + offset_within_row); + } + + __m256i padding = + _mm256_and_si256(_mm256_sub_epi32(_mm256_setzero_si256(), offset_within_row), + _mm256_set1_epi32(row_alignment - 1)); + offset_within_row = _mm256_add_epi32(offset_within_row, padding); + + // Inclusive prefix sum of 32-bit elements + __m256i row_offset_delta = inclusive_prefix_sum_32bit_avx2(offset_within_row); + row_offset = _mm256_add_epi32( + _mm256_permutevar8x32_epi32(row_offset, _mm256_set1_epi32(7)), row_offset_delta); + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(row_offsets + 1) + i, row_offset); + + // Output varbinary ends for all fields in each row + for (size_t col = 0; col < varbinary_cols.size(); ++col) { + for (uint32_t row = 0; row < unroll; ++row) { + uint32_t* dst = rows->metadata().varbinary_end_array( + row_values + 
row_offsets[i * unroll + row]) + + col; + const uint32_t* src = temp_varbinary_ends + (col * unroll + row); + *dst = *src; + } + } + } + + return num_processed; +} + +void KeyEncoder::EncoderVarBinary::EncodeHelper_avx2(uint32_t varbinary_col_id, + KeyRowArray* rows, + const KeyColumnArray& col) { + if (varbinary_col_id == 0) { + EncodeImp_avx2(varbinary_col_id, rows, col); + } else { + EncodeImp_avx2(varbinary_col_id, rows, col); + } +} + +template +void KeyEncoder::EncoderVarBinary::EncodeImp_avx2(uint32_t varbinary_col_id, + KeyRowArray* rows, + const KeyColumnArray& col) { + EncodeDecodeHelper( + 0, static_cast(col.length()), varbinary_col_id, rows, rows, &col, nullptr, + [](uint8_t* dst, const uint8_t* src, int64_t length) { + __m256i* dst256 = reinterpret_cast<__m256i*>(dst); + const __m256i* src256 = reinterpret_cast(src); + uint32_t istripe; + for (istripe = 0; istripe < length / 32; ++istripe) { + _mm256_storeu_si256(dst256 + istripe, _mm256_loadu_si256(src256 + istripe)); + } + if ((length % 32) > 0) { + __m256i mask = set_first_n_bytes_avx2(length % 32); + _mm256_storeu_si256( + dst256 + istripe, + _mm256_blendv_epi8(_mm256_loadu_si256(dst256 + istripe), + _mm256_loadu_si256(src256 + istripe), mask)); + } + }); +} + +void KeyEncoder::EncoderVarBinary::DecodeHelper_avx2(uint32_t start_row, + uint32_t num_rows, + uint32_t varbinary_col_id, + const KeyRowArray& rows, + KeyColumnArray* col) { + if (varbinary_col_id == 0) { + DecodeImp_avx2(start_row, num_rows, varbinary_col_id, rows, col); + } else { + DecodeImp_avx2(start_row, num_rows, varbinary_col_id, rows, col); + } +} + +template +void KeyEncoder::EncoderVarBinary::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, + uint32_t varbinary_col_id, + const KeyRowArray& rows, + KeyColumnArray* col) { + EncodeDecodeHelper( + start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col, + [](uint8_t* dst, const uint8_t* src, int64_t length) { + for (uint32_t istripe = 0; istripe < (length + 31) / 32; 
++istripe) { + __m256i* dst256 = reinterpret_cast<__m256i*>(dst); + const __m256i* src256 = reinterpret_cast(src); + _mm256_storeu_si256(dst256 + istripe, _mm256_loadu_si256(src256 + istripe)); + } + }); +} + +#endif + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_hash.cc b/cpp/src/arrow/compute/exec/key_hash.cc new file mode 100644 index 00000000000..081411e708e --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_hash.cc @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/compute/exec/key_hash.h" + +#include + +#include +#include + +#include "arrow/compute/exec/util.h" + +namespace arrow { +namespace compute { + +inline uint32_t Hashing::avalanche_helper(uint32_t acc) { + acc ^= (acc >> 15); + acc *= PRIME32_2; + acc ^= (acc >> 13); + acc *= PRIME32_3; + acc ^= (acc >> 16); + return acc; +} + +void Hashing::avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes) { + uint32_t processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + int tail = num_keys % 8; + avalanche_avx2(num_keys - tail, hashes); + processed = num_keys - tail; + } +#endif + for (uint32_t i = processed; i < num_keys; ++i) { + hashes[i] = avalanche_helper(hashes[i]); + } +} + +inline uint32_t Hashing::combine_accumulators(const uint32_t acc1, const uint32_t acc2, + const uint32_t acc3, const uint32_t acc4) { + return ROTL(acc1, 1) + ROTL(acc2, 7) + ROTL(acc3, 12) + ROTL(acc4, 18); +} + +inline void Hashing::helper_8B(uint32_t key_length, uint32_t num_keys, + const uint8_t* keys, uint32_t* hashes) { + ARROW_DCHECK(key_length <= 8); + uint64_t mask = ~0ULL >> (8 * (8 - key_length)); + constexpr uint64_t multiplier = 14029467366897019727ULL; + uint32_t offset = 0; + for (uint32_t ikey = 0; ikey < num_keys; ++ikey) { + uint64_t x = *reinterpret_cast(keys + offset); + x &= mask; + hashes[ikey] = static_cast(BYTESWAP(x * multiplier)); + offset += key_length; + } +} + +inline void Hashing::helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys, + uint32_t& acc1, uint32_t& acc2, uint32_t& acc3, + uint32_t& acc4) { + uint64_t v1 = reinterpret_cast(keys + offset)[0]; + // We do not need to mask v1, because we will not process a stripe + // unless at least 9 bytes of it are part of the key. 
+ uint64_t v2 = reinterpret_cast(keys + offset)[1]; + v2 &= mask_hi; + uint32_t x1 = static_cast(v1); + uint32_t x2 = static_cast(v1 >> 32); + uint32_t x3 = static_cast(v2); + uint32_t x4 = static_cast(v2 >> 32); + acc1 += x1 * PRIME32_2; + acc1 = ROTL(acc1, 13) * PRIME32_1; + acc2 += x2 * PRIME32_2; + acc2 = ROTL(acc2, 13) * PRIME32_1; + acc3 += x3 * PRIME32_2; + acc3 = ROTL(acc3, 13) * PRIME32_1; + acc4 += x4 * PRIME32_2; + acc4 = ROTL(acc4, 13) * PRIME32_1; +} + +void Hashing::helper_stripes(int64_t hardware_flags, uint32_t num_keys, + uint32_t key_length, const uint8_t* keys, uint32_t* hash) { + uint32_t processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + int tail = num_keys % 2; + helper_stripes_avx2(num_keys - tail, key_length, keys, hash); + processed = num_keys - tail; + } +#endif + + // If length modulo stripe length is less than or equal 8, round down to the nearest 16B + // boundary (8B ending will be processed in a separate function), otherwise round up. + const uint32_t num_stripes = (key_length + 7) / 16; + uint64_t mask_hi = + ~0ULL >> + (8 * ((num_stripes * 16 > key_length) ? 
num_stripes * 16 - key_length : 0)); + + for (uint32_t i = processed; i < num_keys; ++i) { + uint32_t acc1, acc2, acc3, acc4; + acc1 = static_cast( + (static_cast(PRIME32_1) + static_cast(PRIME32_2)) & + 0xffffffff); + acc2 = PRIME32_2; + acc3 = 0; + acc4 = static_cast(-static_cast(PRIME32_1)); + uint32_t offset = i * key_length; + for (uint32_t stripe = 0; stripe < num_stripes - 1; ++stripe) { + helper_stripe(offset, ~0ULL, keys, acc1, acc2, acc3, acc4); + offset += 16; + } + helper_stripe(offset, mask_hi, keys, acc1, acc2, acc3, acc4); + hash[i] = combine_accumulators(acc1, acc2, acc3, acc4); + } +} + +inline uint32_t Hashing::helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys, + uint32_t acc) { + uint64_t v = reinterpret_cast(keys + offset)[0]; + v &= mask; + uint32_t x1 = static_cast(v); + uint32_t x2 = static_cast(v >> 32); + acc += x1 * PRIME32_3; + acc = ROTL(acc, 17) * PRIME32_4; + acc += x2 * PRIME32_3; + acc = ROTL(acc, 17) * PRIME32_4; + return acc; +} + +void Hashing::helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length, + const uint8_t* keys, uint32_t* hash) { + uint32_t processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + int tail = num_keys % 8; + helper_tails_avx2(num_keys - tail, key_length, keys, hash); + processed = num_keys - tail; + } +#endif + uint64_t mask = ~0ULL >> (8 * (((key_length % 8) == 0) ? 
0 : 8 - (key_length % 8))); + uint32_t offset = key_length / 16 * 16; + offset += processed * key_length; + for (uint32_t i = processed; i < num_keys; ++i) { + hash[i] = helper_tail(offset, mask, keys, hash[i]); + offset += key_length; + } +} + +void Hashing::hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key, + const uint8_t* keys, uint32_t* hashes) { + ARROW_DCHECK(length_key > 0); + + if (length_key <= 8) { + helper_8B(length_key, num_keys, keys, hashes); + return; + } + helper_stripes(hardware_flags, num_keys, length_key, keys, hashes); + if ((length_key % 16) > 0 && (length_key % 16) <= 8) { + helper_tails(hardware_flags, num_keys, length_key, keys, hashes); + } + avalanche(hardware_flags, num_keys, hashes); +} + +void Hashing::hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc) { + for (uint32_t i = 0; i < length / 16; ++i) { + for (int j = 0; j < 4; ++j) { + uint32_t lane = reinterpret_cast(key)[i * 4 + j]; + acc[j] += (lane * PRIME32_2); + acc[j] = ROTL(acc[j], 13); + acc[j] *= PRIME32_1; + } + } + + int tail = length % 16; + if (tail) { + uint64_t last_stripe[2]; + const uint64_t* last_stripe_base = + reinterpret_cast(key + length - (length % 16)); + last_stripe[0] = last_stripe_base[0]; + uint64_t mask = ~0ULL >> (8 * ((length + 7) / 8 * 8 - length)); + if (tail <= 8) { + last_stripe[1] = 0; + last_stripe[0] &= mask; + } else { + last_stripe[1] = last_stripe_base[1]; + last_stripe[1] &= mask; + } + for (int j = 0; j < 4; ++j) { + uint32_t lane = reinterpret_cast(last_stripe)[j]; + acc[j] += (lane * PRIME32_2); + acc[j] = ROTL(acc[j], 13); + acc[j] *= PRIME32_1; + } + } +} + +void Hashing::hash_varlen(int64_t hardware_flags, uint32_t num_rows, + const uint32_t* offsets, const uint8_t* concatenated_keys, + uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row + uint32_t* hashes) { +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + hash_varlen_avx2(num_rows, offsets, 
concatenated_keys, temp_buffer, hashes); + } else { +#endif + for (uint32_t i = 0; i < num_rows; ++i) { + uint32_t acc[4]; + acc[0] = static_cast( + (static_cast(PRIME32_1) + static_cast(PRIME32_2)) & + 0xffffffff); + acc[1] = PRIME32_2; + acc[2] = 0; + acc[3] = static_cast(-static_cast(PRIME32_1)); + uint32_t length = offsets[i + 1] - offsets[i]; + hash_varlen_helper(length, concatenated_keys + offsets[i], acc); + hashes[i] = combine_accumulators(acc[0], acc[1], acc[2], acc[3]); + } + avalanche(hardware_flags, num_rows, hashes); +#if defined(ARROW_HAVE_AVX2) + } +#endif +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_hash.h b/cpp/src/arrow/compute/exec/key_hash.h new file mode 100644 index 00000000000..7f8ab5185cc --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_hash.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#if defined(ARROW_HAVE_AVX2) +#include +#endif + +#include + +#include "arrow/compute/exec/util.h" + +namespace arrow { +namespace compute { + +// Implementations are based on xxh3 32-bit algorithm description from: +// https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md +// +class Hashing { + public: + static void hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key, + const uint8_t* keys, uint32_t* hashes); + + static void hash_varlen(int64_t hardware_flags, uint32_t num_rows, + const uint32_t* offsets, const uint8_t* concatenated_keys, + uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row + uint32_t* hashes); + + private: + static const uint32_t PRIME32_1 = 0x9E3779B1; // 0b10011110001101110111100110110001 + static const uint32_t PRIME32_2 = 0x85EBCA77; // 0b10000101111010111100101001110111 + static const uint32_t PRIME32_3 = 0xC2B2AE3D; // 0b11000010101100101010111000111101 + static const uint32_t PRIME32_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111 + static const uint32_t PRIME32_5 = 0x165667B1; // 0b00010110010101100110011110110001 + + // Avalanche + static inline uint32_t avalanche_helper(uint32_t acc); +#if defined(ARROW_HAVE_AVX2) + static void avalanche_avx2(uint32_t num_keys, uint32_t* hashes); +#endif + static void avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes); + + // Accumulator combine + static inline uint32_t combine_accumulators(const uint32_t acc1, const uint32_t acc2, + const uint32_t acc3, const uint32_t acc4); +#if defined(ARROW_HAVE_AVX2) + static inline uint64_t combine_accumulators_avx2(__m256i acc); +#endif + + // Helpers + static inline void helper_8B(uint32_t key_length, uint32_t num_keys, + const uint8_t* keys, uint32_t* hashes); + static inline void helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys, + uint32_t& acc1, uint32_t& acc2, uint32_t& acc3, + uint32_t& acc4); + static inline uint32_t helper_tail(uint32_t offset, uint64_t 
mask, const uint8_t* keys, + uint32_t acc); +#if defined(ARROW_HAVE_AVX2) + static void helper_stripes_avx2(uint32_t num_keys, uint32_t key_length, + const uint8_t* keys, uint32_t* hash); + static void helper_tails_avx2(uint32_t num_keys, uint32_t key_length, + const uint8_t* keys, uint32_t* hash); +#endif + static void helper_stripes(int64_t hardware_flags, uint32_t num_keys, + uint32_t key_length, const uint8_t* keys, uint32_t* hash); + static void helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length, + const uint8_t* keys, uint32_t* hash); + + static void hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc); +#if defined(ARROW_HAVE_AVX2) + static void hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets, + const uint8_t* concatenated_keys, + uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row + uint32_t* hashes); +#endif +}; + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_hash_avx2.cc b/cpp/src/arrow/compute/exec/key_hash_avx2.cc new file mode 100644 index 00000000000..b58db015088 --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_hash_avx2.cc @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/compute/exec/key_hash.h" + +namespace arrow { +namespace compute { + +#if defined(ARROW_HAVE_AVX2) + +void Hashing::avalanche_avx2(uint32_t num_keys, uint32_t* hashes) { + constexpr int unroll = 8; + ARROW_DCHECK(num_keys % unroll == 0); + for (uint32_t i = 0; i < num_keys / unroll; ++i) { + __m256i hash = _mm256_loadu_si256(reinterpret_cast(hashes) + i); + hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15)); + hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2)); + hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 13)); + hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_3)); + hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 16)); + _mm256_storeu_si256((reinterpret_cast<__m256i*>(hashes)) + i, hash); + } +} + +inline uint64_t Hashing::combine_accumulators_avx2(__m256i acc) { + acc = _mm256_or_si256( + _mm256_sllv_epi32(acc, _mm256_setr_epi32(1, 7, 12, 18, 1, 7, 12, 18)), + _mm256_srlv_epi32(acc, _mm256_setr_epi32(32 - 1, 32 - 7, 32 - 12, 32 - 18, 32 - 1, + 32 - 7, 32 - 12, 32 - 18))); + acc = _mm256_add_epi32(acc, _mm256_shuffle_epi32(acc, 0xee)); // 0b11101110 + acc = _mm256_add_epi32(acc, _mm256_srli_epi64(acc, 32)); + acc = _mm256_permutevar8x32_epi32(acc, _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0)); + uint64_t result = _mm256_extract_epi64(acc, 0); + return result; +} + +void Hashing::helper_stripes_avx2(uint32_t num_keys, uint32_t key_length, + const uint8_t* keys, uint32_t* hash) { + constexpr int unroll = 2; + ARROW_DCHECK(num_keys % unroll == 0); + + constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; + constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; + + const __m256i mask_last_stripe = + (key_length % 16) <= 8 + ? 
_mm256_set1_epi8(static_cast(0xffU)) + : _mm256_cmpgt_epi8(_mm256_set1_epi8(key_length % 16), + _mm256_setr_epi64x(kByteSequence0To7, kByteSequence8To15, + kByteSequence0To7, kByteSequence8To15)); + + // If length modulo stripe length is less than or equal 8, round down to the nearest 16B + // boundary (8B ending will be processed in a separate function), otherwise round up. + const uint32_t num_stripes = (key_length + 7) / 16; + for (uint32_t i = 0; i < num_keys / unroll; ++i) { + __m256i acc = _mm256_setr_epi32( + static_cast((static_cast(PRIME32_1) + PRIME32_2) & + 0xffffffff), + PRIME32_2, 0, static_cast(-static_cast(PRIME32_1)), + static_cast((static_cast(PRIME32_1) + PRIME32_2) & + 0xffffffff), + PRIME32_2, 0, static_cast(-static_cast(PRIME32_1))); + auto key0 = reinterpret_cast(keys + key_length * 2 * i); + auto key1 = reinterpret_cast(keys + key_length * 2 * i + key_length); + for (uint32_t stripe = 0; stripe < num_stripes - 1; ++stripe) { + auto key_stripe = + _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128(key0 + stripe)), + _mm_loadu_si128(key1 + stripe), 1); + acc = _mm256_add_epi32( + acc, _mm256_mullo_epi32(key_stripe, _mm256_set1_epi32(PRIME32_2))); + acc = _mm256_or_si256(_mm256_slli_epi32(acc, 13), _mm256_srli_epi32(acc, 32 - 13)); + acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_1)); + } + auto key_stripe = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128(key0 + num_stripes - 1)), + _mm_loadu_si128(key1 + num_stripes - 1), 1); + key_stripe = _mm256_and_si256(key_stripe, mask_last_stripe); + acc = _mm256_add_epi32(acc, + _mm256_mullo_epi32(key_stripe, _mm256_set1_epi32(PRIME32_2))); + acc = _mm256_or_si256(_mm256_slli_epi32(acc, 13), _mm256_srli_epi32(acc, 32 - 13)); + acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_1)); + uint64_t result = combine_accumulators_avx2(acc); + reinterpret_cast(hash)[i] = result; + } +} + +void Hashing::helper_tails_avx2(uint32_t num_keys, uint32_t key_length, + const 
uint8_t* keys, uint32_t* hash) { + constexpr int unroll = 8; + ARROW_DCHECK(num_keys % unroll == 0); + auto keys_i64 = reinterpret_cast(keys); + + // Process between 1 and 8 last bytes of each key, starting from 16B boundary. + // The caller needs to make sure that there are no more than 8 bytes to process after + // that 16B boundary. + uint32_t first_offset = key_length - (key_length % 16); + __m256i mask = _mm256_set1_epi64x((~0ULL) >> (8 * (8 - (key_length % 16)))); + __m256i offset = + _mm256_setr_epi32(0, key_length, key_length * 2, key_length * 3, key_length * 4, + key_length * 5, key_length * 6, key_length * 7); + offset = _mm256_add_epi32(offset, _mm256_set1_epi32(first_offset)); + __m256i offset_incr = _mm256_set1_epi32(key_length * 8); + + for (uint32_t i = 0; i < num_keys / unroll; ++i) { + auto v1 = _mm256_i32gather_epi64(keys_i64, _mm256_castsi256_si128(offset), 1); + auto v2 = _mm256_i32gather_epi64(keys_i64, _mm256_extracti128_si256(offset, 1), 1); + v1 = _mm256_and_si256(v1, mask); + v2 = _mm256_and_si256(v2, mask); + v1 = _mm256_permutevar8x32_epi32(v1, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + v2 = _mm256_permutevar8x32_epi32(v2, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); + auto x1 = _mm256_permute2x128_si256(v1, v2, 0x20); + auto x2 = _mm256_permute2x128_si256(v1, v2, 0x31); + __m256i acc = _mm256_loadu_si256((reinterpret_cast(hash)) + i); + + acc = _mm256_add_epi32(acc, _mm256_mullo_epi32(x1, _mm256_set1_epi32(PRIME32_3))); + acc = _mm256_or_si256(_mm256_slli_epi32(acc, 17), _mm256_srli_epi32(acc, 32 - 17)); + acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_4)); + + acc = _mm256_add_epi32(acc, _mm256_mullo_epi32(x2, _mm256_set1_epi32(PRIME32_3))); + acc = _mm256_or_si256(_mm256_slli_epi32(acc, 17), _mm256_srli_epi32(acc, 32 - 17)); + acc = _mm256_mullo_epi32(acc, _mm256_set1_epi32(PRIME32_4)); + + _mm256_storeu_si256((reinterpret_cast<__m256i*>(hash)) + i, acc); + + offset = _mm256_add_epi32(offset, offset_incr); + } +} + +void 
Hashing::hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets, + const uint8_t* concatenated_keys, + uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row + uint32_t* hashes) { + constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; + constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; + + const __m128i sequence = _mm_set_epi64x(kByteSequence8To15, kByteSequence0To7); + const __m128i acc_init = _mm_setr_epi32( + static_cast((static_cast(PRIME32_1) + PRIME32_2) & 0xffffffff), + PRIME32_2, 0, static_cast(-static_cast(PRIME32_1))); + + // Variable length keys are always processed as a sequence of 16B stripes, + // with the last stripe, if extending past the end of the key, having extra bytes set to + // 0 on the fly. + for (uint32_t ikey = 0; ikey < num_rows; ++ikey) { + uint32_t begin = offsets[ikey]; + uint32_t end = offsets[ikey + 1]; + uint32_t length = end - begin; + const uint8_t* base = concatenated_keys + begin; + + __m128i acc = acc_init; + + uint32_t i; + for (i = 0; i < (length - 1) / 16; ++i) { + __m128i key_stripe = _mm_loadu_si128(reinterpret_cast(base) + i); + acc = _mm_add_epi32(acc, _mm_mullo_epi32(key_stripe, _mm_set1_epi32(PRIME32_2))); + acc = _mm_or_si128(_mm_slli_epi32(acc, 13), _mm_srli_epi32(acc, 32 - 13)); + acc = _mm_mullo_epi32(acc, _mm_set1_epi32(PRIME32_1)); + } + __m128i key_stripe = _mm_loadu_si128(reinterpret_cast(base) + i); + __m128i mask = _mm_cmpgt_epi8(_mm_set1_epi8(((length - 1) % 16) + 1), sequence); + key_stripe = _mm_and_si128(key_stripe, mask); + acc = _mm_add_epi32(acc, _mm_mullo_epi32(key_stripe, _mm_set1_epi32(PRIME32_2))); + acc = _mm_or_si128(_mm_slli_epi32(acc, 13), _mm_srli_epi32(acc, 32 - 13)); + acc = _mm_mullo_epi32(acc, _mm_set1_epi32(PRIME32_1)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(temp_buffer) + ikey, acc); + } + + // Combine accumulators and perform avalanche + constexpr int unroll = 8; + for (uint32_t i = 0; i < num_rows / unroll; ++i) { + __m256i accA = + 
_mm256_loadu_si256(reinterpret_cast(temp_buffer) + 4 * i + 0); + __m256i accB = + _mm256_loadu_si256(reinterpret_cast(temp_buffer) + 4 * i + 1); + __m256i accC = + _mm256_loadu_si256(reinterpret_cast(temp_buffer) + 4 * i + 2); + __m256i accD = + _mm256_loadu_si256(reinterpret_cast(temp_buffer) + 4 * i + 3); + // Transpose 2x 4x4 32-bit matrices + __m256i r0 = _mm256_unpacklo_epi32(accA, accB); + __m256i r1 = _mm256_unpackhi_epi32(accA, accB); + __m256i r2 = _mm256_unpacklo_epi32(accC, accD); + __m256i r3 = _mm256_unpackhi_epi32(accC, accD); + accA = _mm256_unpacklo_epi64(r0, r2); + accB = _mm256_unpackhi_epi64(r0, r2); + accC = _mm256_unpacklo_epi64(r1, r3); + accD = _mm256_unpackhi_epi64(r1, r3); + // _rotl(accA, 1) + // _rotl(accB, 7) + // _rotl(accC, 12) + // _rotl(accD, 18) + accA = _mm256_or_si256(_mm256_slli_epi32(accA, 1), _mm256_srli_epi32(accA, 32 - 1)); + accB = _mm256_or_si256(_mm256_slli_epi32(accB, 7), _mm256_srli_epi32(accB, 32 - 7)); + accC = _mm256_or_si256(_mm256_slli_epi32(accC, 12), _mm256_srli_epi32(accC, 32 - 12)); + accD = _mm256_or_si256(_mm256_slli_epi32(accD, 18), _mm256_srli_epi32(accD, 32 - 18)); + accA = _mm256_add_epi32(_mm256_add_epi32(accA, accB), _mm256_add_epi32(accC, accD)); + // avalanche + __m256i hash = accA; + hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15)); + hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2)); + hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 13)); + hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_3)); + hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 16)); + // Store. + // At this point, because of way 2x 4x4 transposition was done, output hashes are in + // order: 0, 2, 4, 6, 1, 3, 5, 7. Bring back the original order. 
+ _mm256_storeu_si256( + reinterpret_cast<__m256i*>(hashes) + i, + _mm256_permutevar8x32_epi32(hash, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7))); + } + // Process the tail of up to 7 hashes + for (uint32_t i = num_rows - num_rows % unroll; i < num_rows; ++i) { + uint32_t* temp_buffer_base = temp_buffer + i * 4; + uint32_t acc = ROTL(temp_buffer_base[0], 1) + ROTL(temp_buffer_base[1], 7) + + ROTL(temp_buffer_base[2], 12) + ROTL(temp_buffer_base[3], 18); + + // avalanche + acc ^= (acc >> 15); + acc *= PRIME32_2; + acc ^= (acc >> 13); + acc *= PRIME32_3; + acc ^= (acc >> 16); + + hashes[i] = acc; + } +} + +#endif + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_map.cc b/cpp/src/arrow/compute/exec/key_map.cc new file mode 100644 index 00000000000..ac47c04403c --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_map.cc @@ -0,0 +1,610 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/compute/exec/key_map.h" + +#include <memory.h> + +#include <algorithm> +#include <cstdint> + +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" + +namespace arrow { + +using BitUtil::CountLeadingZeros; + +namespace compute { + +constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL; + +// Search status bytes inside a block of 8 slots (64-bit word). +// Try to find a slot that contains a 7-bit stamp matching the one provided. +// There are three possible outcomes: +// 1. A matching slot is found. +// -> Return its index between 0 and 7 and set match found flag. +// 2. A matching slot is not found and there is an empty slot in the block. +// -> Return the index of the first empty slot and clear match found flag. +// 3. A matching slot is not found and there are no empty slots in the block. +// -> Return 8 as the output slot index and clear match found flag. +// +// Optionally an index of the first slot to start the search from can be specified. +// In this case slots before it will be ignored. +// +template <bool use_start_slot> +inline void SwissTable::search_block(uint64_t block, int stamp, int start_slot, + int* out_slot, int* out_match_found) { + // Filled slot bytes have the highest bit set to 0 and empty slots are equal to 0x80. + uint64_t block_high_bits = block & kHighBitOfEachByte; + + // Replicate 7-bit stamp to all non-empty slots, leaving zeroes for empty slots. + uint64_t stamp_pattern = stamp * ((block_high_bits ^ kHighBitOfEachByte) >> 7); + + // If we xor this pattern with block status bytes we get in individual bytes: + // a) 0x00, for filled slots matching the stamp, + // b) 0x00 < x < 0x80, for filled slots not matching the stamp, + // c) 0x80, for empty slots.
+ uint64_t block_xor_pattern = block ^ stamp_pattern; + + // If we then add 0x7f to every byte, we get: + // a) 0x7F + // b) 0x80 <= x < 0xFF + // c) 0xFF + uint64_t match_base = block_xor_pattern + ~kHighBitOfEachByte; + + // The highest bit now tells us if we have a match (0) or not (1). + // We will negate the bits so that match is represented by a set bit. + uint64_t matches = ~match_base; + + // Clear 7 non-relevant bits in each byte. + // Also clear bytes that correspond to slots that we were supposed to + // skip due to provided start slot index. + // Note: the highest byte corresponds to the first slot. + if (use_start_slot) { + matches &= kHighBitOfEachByte >> (8 * start_slot); + } else { + matches &= kHighBitOfEachByte; + } + + // We get 0 if there are no matches + *out_match_found = (matches == 0 ? 0 : 1); + + // Now if we or with the highest bits of the block and scan zero bits in reverse, + // we get 8x slot index that we were looking for. + // This formula works in all three cases a), b) and c). + *out_slot = static_cast(CountLeadingZeros(matches | block_high_bits) >> 3); +} + +// This call follows the call to search_block. +// The input slot index is the output returned by it, which is a value from 0 to 8, +// with 8 indicating that both: no match was found and there were no empty slots. +// +// If the slot corresponds to a non-empty slot return a group id associated with it. +// Otherwise return any group id from any of the slots or +// zero, which is the default value stored in empty slots. +// +inline uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot, + uint64_t group_id_mask) { + // Input slot can be equal to 8, in which case we need to output any valid group id + // value, so we take the one from slot 0 in the block. + int clamped_slot = slot & 7; + + // Group id values for all 8 slots in the block are bit-packed and follow the status + // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. 
In + // that case we can extract group id using aligned 64-bit word access. + int num_groupid_bits = static_cast(ARROW_POPCOUNT64(group_id_mask)); + ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || + num_groupid_bits == 32 || num_groupid_bits == 64); + + int bit_offset = clamped_slot * num_groupid_bits; + const uint64_t* group_id_bytes = + reinterpret_cast(block_ptr) + 1 + (bit_offset >> 6); + uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask; + + return group_id; +} + +// Return global slot id (the index including the information about the block) +// where the search should continue if the first comparison fails. +// This function always follows search_block and receives the slot id returned by it. +// +inline uint64_t SwissTable::next_slot_to_visit(uint64_t block_index, int slot, + int match_found) { + // The result should be taken modulo the number of all slots in all blocks, + // but here we allow it to take a value one above the last slot index. + // Modulo operation is postponed to later. + return block_index * 8 + slot + match_found; +} + +// Implements first (fast-path, optimistic) lookup. +// Searches for a match only within the start block and +// trying only the first slot with a matching stamp. +// +// Comparison callback needed for match verification is done outside of this function. +// Match bit vector filled by it only indicates finding a matching stamp in a slot. +// +template +void SwissTable::lookup_1(const uint16_t* selection, const int num_keys, + const uint32_t* hashes, uint8_t* out_match_bitvector, + uint32_t* out_groupids, uint32_t* out_slot_ids) { + // Clear the output bit vector + memset(out_match_bitvector, 0, (num_keys + 7) / 8); + + // Based on the size of the table, prepare bit number constants. 
+ uint32_t stamp_mask = (1 << bits_stamp_) - 1; + int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); + uint32_t groupid_mask = (1 << num_groupid_bits) - 1; + + for (int i = 0; i < num_keys; ++i) { + int id; + if (use_selection) { + id = util::SafeLoad(&selection[i]); + } else { + id = i; + } + + // Extract from hash: block index and stamp + // + uint32_t hash = hashes[id]; + uint32_t iblock = hash >> (bits_hash_ - bits_stamp_ - log_blocks_); + uint32_t stamp = iblock & stamp_mask; + iblock >>= bits_stamp_; + + uint32_t num_block_bytes = num_groupid_bits + 8; + const uint8_t* blockbase = reinterpret_cast(blocks_) + + static_cast(iblock) * num_block_bytes; + uint64_t block = util::SafeLoadAs(blockbase); + + // Call helper functions to obtain the output triplet: + // - match (of a stamp) found flag + // - group id for key comparison + // - slot to resume search from in case of no match or false positive + int match_found; + int islot_in_block; + search_block(block, stamp, 0, &islot_in_block, &match_found); + uint64_t groupid = extract_group_id(blockbase, islot_in_block, groupid_mask); + ARROW_DCHECK(groupid < num_inserted_ || num_inserted_ == 0); + uint64_t islot = next_slot_to_visit(iblock, islot_in_block, match_found); + + out_match_bitvector[id / 8] |= match_found << (id & 7); + util::SafeStore(&out_groupids[id], static_cast(groupid)); + util::SafeStore(&out_slot_ids[id], static_cast(islot)); + } +} + +// How many groups we can keep in the hash table without the need for resizing. +// When we reach this limit, we need to break processing of any further rows and resize. +// +uint64_t SwissTable::num_groups_for_resize() const { + // Resize small hash tables when 50% full (up to 12KB). + // Resize large hash tables when 75% full. 
+ constexpr int log_blocks_small_ = 9; + uint64_t num_slots = 1ULL << (log_blocks_ + 3); + if (log_blocks_ <= log_blocks_small_) { + return num_slots / 2; + } else { + return num_slots * 3 / 4; + } +} + +uint64_t SwissTable::wrap_global_slot_id(uint64_t global_slot_id) { + uint64_t global_slot_id_mask = (1 << (log_blocks_ + 3)) - 1; + return global_slot_id & global_slot_id_mask; +} + +// Run a single round of slot search - comparison / insert - filter unprocessed. +// Update selection vector to reflect which items have been processed. +// Ids in selection vector do not have to be sorted. +// +Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected, + uint16_t* inout_selection, bool* out_need_resize, + uint32_t* out_group_ids, uint32_t* inout_next_slot_ids) { + auto num_groups_limit = num_groups_for_resize(); + ARROW_DCHECK(num_inserted_ < num_groups_limit); + + // Temporary arrays are of limited size. + // The input needs to be split into smaller portions if it exceeds that limit. 
+ // + ARROW_DCHECK(*inout_num_selected <= static_cast(1 << log_minibatch_)); + + // We will split input row ids into three categories: + // - needing to visit next block [0] + // - needing comparison [1] + // - inserted [2] + // + auto ids_inserted_buf = + util::TempVectorHolder(temp_stack_, *inout_num_selected); + auto ids_for_comparison_buf = + util::TempVectorHolder(temp_stack_, *inout_num_selected); + constexpr int category_nomatch = 0; + constexpr int category_cmp = 1; + constexpr int category_inserted = 2; + int num_ids[3]; + num_ids[0] = num_ids[1] = num_ids[2] = 0; + uint16_t* ids[3]{inout_selection, ids_for_comparison_buf.mutable_data(), + ids_inserted_buf.mutable_data()}; + auto push_id = [&num_ids, &ids](int category, int id) { + util::SafeStore(&ids[category][num_ids[category]++], static_cast(id)); + }; + + uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); + uint64_t groupid_mask = (1ULL << num_groupid_bits) - 1; + constexpr uint64_t stamp_mask = 0x7f; + uint64_t num_block_bytes = (8 + num_groupid_bits); + + uint32_t num_processed; + for (num_processed = 0; + // Second condition in for loop: + // We need to break processing and have the caller of this function + // resize hash table if we reach the limit of the number of groups present. 
+ num_processed < *inout_num_selected && + num_inserted_ + num_ids[category_inserted] < num_groups_limit; + ++num_processed) { + // row id in original batch + int id = util::SafeLoad(&inout_selection[num_processed]); + + uint64_t slot_id = wrap_global_slot_id(util::SafeLoad(&inout_next_slot_ids[id])); + uint64_t block_id = slot_id >> 3; + uint32_t hash = hashes[id]; + uint8_t* blockbase = blocks_ + num_block_bytes * block_id; + uint64_t block = *reinterpret_cast(blockbase); + uint64_t stamp = (hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask; + int start_slot = (slot_id & 7); + + bool isempty = (blockbase[7 - start_slot] == 0x80); + if (isempty) { + // If we reach the empty slot we insert key for new group + + blockbase[7 - start_slot] = static_cast(stamp); + uint32_t group_id = num_inserted_ + num_ids[category_inserted]; + int groupid_bit_offset = static_cast(start_slot * num_groupid_bits); + + // We assume here that the number of bits is rounded up to 8, 16, 32 or 64. + // In that case we can insert group id value using aligned 64-bit word access. + ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || + num_groupid_bits == 32 || num_groupid_bits == 64); + uint64_t* ptr = + &reinterpret_cast(blockbase + 8)[groupid_bit_offset >> 6]; + util::SafeStore(ptr, util::SafeLoad(ptr) | (static_cast(group_id) + << (groupid_bit_offset & 63))); + + hashes_[slot_id] = hash; + util::SafeStore(&out_group_ids[id], group_id); + push_id(category_inserted, id); + } else { + // We search for a slot with a matching stamp within a single block. + // We append row id to the appropriate sequence of ids based on + // whether the match has been found or not. 
+ + int new_match_found; + int new_slot; + search_block(block, static_cast(stamp), start_slot, &new_slot, + &new_match_found); + auto new_groupid = + static_cast(extract_group_id(blockbase, new_slot, groupid_mask)); + ARROW_DCHECK(new_groupid < num_inserted_ + num_ids[category_inserted]); + new_slot = + static_cast(next_slot_to_visit(block_id, new_slot, new_match_found)); + util::SafeStore(&inout_next_slot_ids[id], new_slot); + util::SafeStore(&out_group_ids[id], new_groupid); + push_id(new_match_found, id); + } + } + + // Copy keys for newly inserted rows using callback + RETURN_NOT_OK(append_impl_(num_ids[category_inserted], ids[category_inserted])); + num_inserted_ += num_ids[category_inserted]; + + // Evaluate comparisons and append ids of rows that failed it to the non-match set. + uint32_t num_not_equal; + equal_impl_(num_ids[category_cmp], ids[category_cmp], out_group_ids, &num_not_equal, + ids[category_nomatch] + num_ids[category_nomatch]); + num_ids[category_nomatch] += num_not_equal; + + // Append ids of any unprocessed entries if we aborted processing due to the need + // to resize. + if (num_processed < *inout_num_selected) { + memmove(ids[category_nomatch] + num_ids[category_nomatch], + inout_selection + num_processed, + sizeof(uint16_t) * (*inout_num_selected - num_processed)); + num_ids[category_nomatch] += (*inout_num_selected - num_processed); + } + + *out_need_resize = (num_inserted_ == num_groups_limit); + *inout_num_selected = num_ids[category_nomatch]; + return Status::OK(); +} + +// Use hashes and callbacks to find group ids for already existing keys and +// to insert and report newly assigned group ids for new keys. +// +Status SwissTable::map(const int num_keys, const uint32_t* hashes, + uint32_t* out_groupids) { + // Temporary buffers have limited size. + // Caller is responsible for splitting larger input arrays into smaller chunks. 
+ ARROW_DCHECK(num_keys <= (1 << log_minibatch_)); + + // Allocate temporary buffers with a lifetime of this function + auto match_bitvector_buf = util::TempVectorHolder<uint8_t>(temp_stack_, num_keys); + uint8_t* match_bitvector = match_bitvector_buf.mutable_data(); + auto slot_ids_buf = util::TempVectorHolder<uint32_t>(temp_stack_, num_keys); + uint32_t* slot_ids = slot_ids_buf.mutable_data(); + auto ids_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys); + uint16_t* ids = ids_buf.mutable_data(); + uint32_t num_ids; + + // First-pass processing. + // Optimistically use simplified lookup involving only a start block to find + // a single group id candidate for every input. +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) { + if (log_blocks_ <= 4) { + int tail = num_keys % 32; + int delta = num_keys - tail; + lookup_1_avx2_x32(num_keys - tail, hashes, match_bitvector, out_groupids, slot_ids); + lookup_1_avx2_x8(tail, hashes + delta, match_bitvector + delta / 8, + out_groupids + delta, slot_ids + delta); + } else { + lookup_1_avx2_x8(num_keys, hashes, match_bitvector, out_groupids, slot_ids); + } + } else { +#endif + lookup_1<false>(nullptr, num_keys, hashes, match_bitvector, out_groupids, slot_ids); +#if defined(ARROW_HAVE_AVX2) + } +#endif + + int64_t num_matches = + arrow::internal::CountSetBits(match_bitvector, /*offset=*/0, num_keys); + + // After the first-pass processing count rows with matches (based on stamp comparison) + // and decide based on their percentage whether to call dense or sparse comparison + // function. Dense comparison means evaluating it for all inputs, even if the matching + // stamp was not found. It may be cheaper to evaluate comparison for all inputs if the + // extra cost of filtering is higher than the wasted processing of rows with no match. + // + // Dense comparison can only be used if there is at least one inserted key, + // because otherwise there is no key to compare to.
+ // + if (num_inserted_ > 0 && num_matches > 0 && num_matches > 3 * num_keys / 4) { + // Dense comparisons + equal_impl_(num_keys, nullptr, out_groupids, &num_ids, ids); + } else { + // Sparse comparisons that involve filtering the input set of keys + auto ids_cmp_buf = util::TempVectorHolder(temp_stack_, num_keys); + uint16_t* ids_cmp = ids_cmp_buf.mutable_data(); + int num_ids_result; + util::BitUtil::bits_split_indexes(hardware_flags_, num_keys, match_bitvector, + &num_ids_result, ids, ids_cmp); + num_ids = num_ids_result; + uint32_t num_not_equal; + equal_impl_(num_keys - num_ids, ids_cmp, out_groupids, &num_not_equal, ids + num_ids); + num_ids += num_not_equal; + } + + do { + // A single round of slow-pass (robust) lookup or insert. + // A single round ends with either a single comparison verifying the match candidate + // or inserting a new key. A single round of slow-pass may return early if we reach + // the limit of the number of groups due to inserts of new keys. In that case we need + // to resize and recalculating starting global slot ids for new bigger hash table. + bool out_of_capacity; + RETURN_NOT_OK( + lookup_2(hashes, &num_ids, ids, &out_of_capacity, out_groupids, slot_ids)); + if (out_of_capacity) { + RETURN_NOT_OK(grow_double()); + // Reset start slot ids for still unprocessed input keys. 
+ // + for (uint32_t i = 0; i < num_ids; ++i) { + // First slot in the new starting block + const int16_t id = util::SafeLoad(&ids[i]); + util::SafeStore(&slot_ids[id], (hashes[id] >> (bits_hash_ - log_blocks_)) * 8); + } + } + } while (num_ids > 0); + + return Status::OK(); +} + +Status SwissTable::grow_double() { + // Before and after metadata + int num_group_id_bits_before = num_groupid_bits_from_log_blocks(log_blocks_); + int num_group_id_bits_after = num_groupid_bits_from_log_blocks(log_blocks_ + 1); + uint64_t group_id_mask_before = ~0ULL >> (64 - num_group_id_bits_before); + int log_blocks_before = log_blocks_; + int log_blocks_after = log_blocks_ + 1; + uint64_t block_size_before = (8 + num_group_id_bits_before); + uint64_t block_size_after = (8 + num_group_id_bits_after); + uint64_t block_size_total_before = (block_size_before << log_blocks_before) + padding_; + uint64_t block_size_total_after = (block_size_after << log_blocks_after) + padding_; + uint64_t hashes_size_total_before = + (bits_hash_ / 8 * (1 << (log_blocks_before + 3))) + padding_; + uint64_t hashes_size_total_after = + (bits_hash_ / 8 * (1 << (log_blocks_after + 3))) + padding_; + constexpr uint32_t stamp_mask = (1 << bits_stamp_) - 1; + + // Allocate new buffers + uint8_t* blocks_new; + RETURN_NOT_OK(pool_->Allocate(block_size_total_after, &blocks_new)); + memset(blocks_new, 0, block_size_total_after); + uint8_t* hashes_new_8B; + uint32_t* hashes_new; + RETURN_NOT_OK(pool_->Allocate(hashes_size_total_after, &hashes_new_8B)); + hashes_new = reinterpret_cast(hashes_new_8B); + + // First pass over all old blocks. + // Reinsert entries that were not in the overflow block + // (block other than selected by hash bits corresponding to the entry). 
+ for (int i = 0; i < (1 << log_blocks_); ++i) { + // How many full slots in this block + uint8_t* block_base = blocks_ + i * block_size_before; + uint8_t* double_block_base_new = blocks_new + 2 * i * block_size_after; + uint64_t block = *reinterpret_cast(block_base); + + auto full_slots = + static_cast(CountLeadingZeros(block & kHighBitOfEachByte) >> 3); + int full_slots_new[2]; + full_slots_new[0] = full_slots_new[1] = 0; + util::SafeStore(double_block_base_new, kHighBitOfEachByte); + util::SafeStore(double_block_base_new + block_size_after, kHighBitOfEachByte); + + for (int j = 0; j < full_slots; ++j) { + uint64_t slot_id = i * 8 + j; + uint32_t hash = hashes_[slot_id]; + uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after); + bool is_overflow_entry = ((block_id_new >> 1) != static_cast(i)); + if (is_overflow_entry) { + continue; + } + + int ihalf = block_id_new & 1; + uint8_t stamp_new = + hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask; + uint64_t group_id_bit_offs = j * num_group_id_bits_before; + uint64_t group_id = + (util::SafeLoadAs(block_base + 8 + (group_id_bit_offs >> 3)) >> + (group_id_bit_offs & 7)) & + group_id_mask_before; + + uint64_t slot_id_new = i * 16 + ihalf * 8 + full_slots_new[ihalf]; + hashes_new[slot_id_new] = hash; + uint8_t* block_base_new = double_block_base_new + ihalf * block_size_after; + block_base_new[7 - full_slots_new[ihalf]] = stamp_new; + int group_id_bit_offs_new = full_slots_new[ihalf] * num_group_id_bits_after; + uint64_t* ptr = + reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)); + util::SafeStore(ptr, + util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7))); + full_slots_new[ihalf]++; + } + } + + // Second pass over all old blocks. + // Reinsert entries that were in an overflow block. 
+ for (int i = 0; i < (1 << log_blocks_); ++i) { + // How many full slots in this block + uint8_t* block_base = blocks_ + i * block_size_before; + uint64_t block = util::SafeLoadAs(block_base); + int full_slots = static_cast(CountLeadingZeros(block & kHighBitOfEachByte) >> 3); + + for (int j = 0; j < full_slots; ++j) { + uint64_t slot_id = i * 8 + j; + uint32_t hash = hashes_[slot_id]; + uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after); + bool is_overflow_entry = ((block_id_new >> 1) != static_cast(i)); + if (!is_overflow_entry) { + continue; + } + + uint64_t group_id_bit_offs = j * num_group_id_bits_before; + uint64_t group_id = + (util::SafeLoadAs(block_base + 8 + (group_id_bit_offs >> 3)) >> + (group_id_bit_offs & 7)) & + group_id_mask_before; + uint8_t stamp_new = + hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask; + + uint8_t* block_base_new = blocks_new + block_id_new * block_size_after; + uint64_t block_new = util::SafeLoadAs(block_base_new); + int full_slots_new = + static_cast(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3); + while (full_slots_new == 8) { + block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1); + block_base_new = blocks_new + block_id_new * block_size_after; + block_new = util::SafeLoadAs(block_base_new); + full_slots_new = + static_cast(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3); + } + + hashes_new[block_id_new * 8 + full_slots_new] = hash; + block_base_new[7 - full_slots_new] = stamp_new; + int group_id_bit_offs_new = full_slots_new * num_group_id_bits_after; + uint64_t* ptr = + reinterpret_cast(block_base_new + 8 + (group_id_bit_offs_new >> 3)); + util::SafeStore(ptr, + util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7))); + } + } + + pool_->Free(blocks_, block_size_total_before); + pool_->Free(reinterpret_cast(hashes_), hashes_size_total_before); + log_blocks_ = log_blocks_after; + blocks_ = blocks_new; + hashes_ = hashes_new; + + return Status::OK(); +} + 
+Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool, + util::TempVectorStack* temp_stack, int log_minibatch, + EqualImpl equal_impl, AppendImpl append_impl) { + hardware_flags_ = hardware_flags; + pool_ = pool; + temp_stack_ = temp_stack; + log_minibatch_ = log_minibatch; + equal_impl_ = equal_impl; + append_impl_ = append_impl; + + log_blocks_ = 0; + int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); + num_inserted_ = 0; + + const uint64_t block_bytes = 8 + num_groupid_bits; + const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_; + RETURN_NOT_OK(pool_->Allocate(slot_bytes, &blocks_)); + + // Make sure group ids are initially set to zero for all slots. + memset(blocks_, 0, slot_bytes); + + // Initialize all status bytes to represent an empty slot. + for (uint64_t i = 0; i < (static_cast(1) << log_blocks_); ++i) { + util::SafeStore(blocks_ + i * block_bytes, kHighBitOfEachByte); + } + + uint64_t num_slots = 1ULL << (log_blocks_ + 3); + const uint64_t hash_size = sizeof(uint32_t); + const uint64_t hash_bytes = hash_size * num_slots + padding_; + uint8_t* hashes8; + RETURN_NOT_OK(pool_->Allocate(hash_bytes, &hashes8)); + hashes_ = reinterpret_cast(hashes8); + + return Status::OK(); +} + +void SwissTable::cleanup() { + if (blocks_) { + int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); + const uint64_t block_bytes = 8 + num_groupid_bits; + const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_; + pool_->Free(blocks_, slot_bytes); + blocks_ = nullptr; + } + if (hashes_) { + uint64_t num_slots = 1ULL << (log_blocks_ + 3); + const uint64_t hash_size = sizeof(uint32_t); + const uint64_t hash_bytes = hash_size * num_slots + padding_; + pool_->Free(reinterpret_cast(hashes_), hash_bytes); + hashes_ = nullptr; + } + log_blocks_ = 0; + num_inserted_ = 0; +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_map.h b/cpp/src/arrow/compute/exec/key_map.h new 
file mode 100644 index 00000000000..8c472736ec4 --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_map.h @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/exec/util.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" + +namespace arrow { +namespace compute { + +class SwissTable { + public: + SwissTable() = default; + ~SwissTable() { cleanup(); } + + using EqualImpl = + std::function; + using AppendImpl = std::function; + + Status init(int64_t hardware_flags, MemoryPool* pool, util::TempVectorStack* temp_stack, + int log_minibatch, EqualImpl equal_impl, AppendImpl append_impl); + void cleanup(); + + Status map(const int ckeys, const uint32_t* hashes, uint32_t* outgroupids); + + private: + // Lookup helpers + + /// \brief Scan bytes in block in reverse and stop as soon + /// as a position of interest is found. + /// + /// Positions of interest: + /// a) slot with a matching stamp is encountered, + /// b) first empty slot is encountered, + /// c) we reach the end of the block. 
+ /// + /// \param[in] block 8 byte block of hash table + /// \param[in] stamp 7 bits of hash used as a stamp + /// \param[in] start_slot Index of the first slot in the block to start search from. We + /// assume that this index always points to a non-empty slot, equivalently + /// that it comes before any empty slots. (Used only by one template + /// variant.) + /// \param[out] out_slot index corresponding to the discovered position of interest (8 + /// represents end of block). + /// \param[out] out_match_found an integer flag (0 or 1) indicating if we found a + /// matching stamp. + template + inline void search_block(uint64_t block, int stamp, int start_slot, int* out_slot, + int* out_match_found); + + /// \brief Extract group id for a given slot in a given block. + /// + /// Group ids follow in memory after 64-bit block data. + /// Maximum number of groups inserted is equal to the number + /// of all slots in all blocks, which is 8 * the number of blocks. + /// Group ids are bit packed using that maximum to determine the necessary number of + /// bits. + inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot, + uint64_t group_id_mask); + + inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found); + + inline void insert(uint8_t* block_base, uint64_t slot_id, uint32_t hash, uint8_t stamp, + uint32_t group_id); + + inline uint64_t num_groups_for_resize() const; + + inline uint64_t wrap_global_slot_id(uint64_t global_slot_id); + + // First hash table access + // Find first match in the start block if exists. + // Possible cases: + // 1. Stamp match in a block + // 2. No stamp match in a block, no empty buckets in a block + // 3. 
No stamp match in a block, empty buckets in a block + // + template + void lookup_1(const uint16_t* selection, const int num_keys, const uint32_t* hashes, + uint8_t* out_match_bitvector, uint32_t* out_group_ids, + uint32_t* out_slot_ids); +#if defined(ARROW_HAVE_AVX2) + void lookup_1_avx2_x8(const int num_hashes, const uint32_t* hashes, + uint8_t* out_match_bitvector, uint32_t* out_group_ids, + uint32_t* out_next_slot_ids); + void lookup_1_avx2_x32(const int num_hashes, const uint32_t* hashes, + uint8_t* out_match_bitvector, uint32_t* out_group_ids, + uint32_t* out_next_slot_ids); +#endif + + // Completing hash table lookup post first access + Status lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected, + uint16_t* inout_selection, bool* out_need_resize, + uint32_t* out_group_ids, uint32_t* out_next_slot_ids); + + // Resize small hash tables when 50% full (up to 8KB). + // Resize large hash tables when 75% full. + Status grow_double(); + + static int num_groupid_bits_from_log_blocks(int log_blocks) { + int required_bits = log_blocks + 3; + return required_bits <= 8 ? 8 + : required_bits <= 16 ? 16 : required_bits <= 32 ? 32 : 64; + } + + // Use 32-bit hash for now + static constexpr int bits_hash_ = 32; + + // Number of hash bits stored in slots in a block. + // The highest bits of hash determine block id. + // The next set of highest bits is a "stamp" stored in a slot in a block. + static constexpr int bits_stamp_ = 7; + + // Padding bytes added at the end of buffers for ease of SIMD access + static constexpr int padding_ = 64; + + int log_minibatch_; + // Base 2 log of the number of blocks + int log_blocks_ = 0; + // Number of keys inserted into hash table + uint32_t num_inserted_ = 0; + + // Data for blocks. + // Each block has 8 status bytes for 8 slots, followed by 8 bit packed group ids for + // these slots. In 8B status word, the order of bytes is reversed. Group ids are in + // normal order. There is 64B padding at the end. 
+ // + // 0 byte - 7 bucket | 1. byte - 6 bucket | ... + // --------------------------------------------------- + // | Empty bit* | Empty bit | + // --------------------------------------------------- + // | 7-bit hash | 7-bit hash | + // --------------------------------------------------- + // * Empty bucket has value 0x80. Non-empty bucket has highest bit set to 0. + // + uint8_t* blocks_; + + // Array of hashes of values inserted into slots. + // Undefined if the corresponding slot is empty. + // There is 64B padding at the end. + uint32_t* hashes_; + + int64_t hardware_flags_; + MemoryPool* pool_; + util::TempVectorStack* temp_stack_; + + EqualImpl equal_impl_; + AppendImpl append_impl_; +}; + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_map_avx2.cc b/cpp/src/arrow/compute/exec/key_map_avx2.cc new file mode 100644 index 00000000000..a2efb4d1bb9 --- /dev/null +++ b/cpp/src/arrow/compute/exec/key_map_avx2.cc @@ -0,0 +1,407 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/compute/exec/key_map.h" + +namespace arrow { +namespace compute { + +#if defined(ARROW_HAVE_AVX2) + +// Why it is OK to round up number of rows internally: +// All of the buffers: hashes, out_match_bitvector, out_group_ids, out_next_slot_ids +// are temporary buffers of group id mapping. +// Temporary buffers are buffers that live only within the boundaries of a single +// minibatch. Temporary buffers add 64B at the end, so that SIMD code does not have to +// worry about reading and writing outside of the end of the buffer up to 64B. If the +// hashes array contains garbage after the last element, it cannot cause computation to +// fail, since any random data is a valid hash for the purpose of lookup. +// +// This is more or less translation of equivalent scalar code, adjusted for a different +// instruction set (e.g. missing leading zero count instruction). +// +void SwissTable::lookup_1_avx2_x8(const int num_hashes, const uint32_t* hashes, + uint8_t* out_match_bitvector, uint32_t* out_group_ids, + uint32_t* out_next_slot_ids) { + // Number of inputs processed together in a loop + constexpr int unroll = 8; + + const int num_group_id_bits = num_groupid_bits_from_log_blocks(log_blocks_); + uint32_t group_id_mask = ~static_cast(0) >> (32 - num_group_id_bits); + const __m256i* vhash_ptr = reinterpret_cast(hashes); + const __m256i vstamp_mask = _mm256_set1_epi32((1 << bits_stamp_) - 1); + + // TODO: explain why it is ok to process hashes outside of buffer boundaries + for (int i = 0; i < ((num_hashes + unroll - 1) / unroll); ++i) { + constexpr uint64_t kEachByteIs8 = 0x0808080808080808ULL; + constexpr uint64_t kByteSequenceOfPowersOf2 = 0x8040201008040201ULL; + + // Calculate block index and hash stamp for a byte in a block + // + __m256i vhash = _mm256_loadu_si256(vhash_ptr + i); + __m256i vblock_id = _mm256_srlv_epi32( + vhash, _mm256_set1_epi32(bits_hash_ - bits_stamp_ - log_blocks_)); + __m256i vstamp = _mm256_and_si256(vblock_id, 
vstamp_mask); + vblock_id = _mm256_srli_epi32(vblock_id, bits_stamp_); + + // We now split inputs and process 4 at a time, + // in order to process 64-bit blocks + // + __m256i vblock_offset = + _mm256_mullo_epi32(vblock_id, _mm256_set1_epi32(num_group_id_bits + 8)); + __m256i voffset_A = _mm256_and_si256(vblock_offset, _mm256_set1_epi64x(0xffffffff)); + __m256i vstamp_A = _mm256_and_si256(vstamp, _mm256_set1_epi64x(0xffffffff)); + __m256i voffset_B = _mm256_srli_epi64(vblock_offset, 32); + __m256i vstamp_B = _mm256_srli_epi64(vstamp, 32); + + auto blocks_i64 = reinterpret_cast(blocks_); + auto vblock_A = _mm256_i64gather_epi64(blocks_i64, voffset_A, 1); + auto vblock_B = _mm256_i64gather_epi64(blocks_i64, voffset_B, 1); + __m256i vblock_highbits_A = + _mm256_cmpeq_epi8(vblock_A, _mm256_set1_epi8(static_cast(0x80))); + __m256i vblock_highbits_B = + _mm256_cmpeq_epi8(vblock_B, _mm256_set1_epi8(static_cast(0x80))); + __m256i vbyte_repeat_pattern = + _mm256_setr_epi64x(0ULL, kEachByteIs8, 0ULL, kEachByteIs8); + vstamp_A = _mm256_shuffle_epi8( + vstamp_A, _mm256_or_si256(vbyte_repeat_pattern, vblock_highbits_A)); + vstamp_B = _mm256_shuffle_epi8( + vstamp_B, _mm256_or_si256(vbyte_repeat_pattern, vblock_highbits_B)); + __m256i vmatches_A = _mm256_cmpeq_epi8(vblock_A, vstamp_A); + __m256i vmatches_B = _mm256_cmpeq_epi8(vblock_B, vstamp_B); + __m256i vmatch_found = _mm256_andnot_si256( + _mm256_blend_epi32(_mm256_cmpeq_epi64(vmatches_A, _mm256_setzero_si256()), + _mm256_cmpeq_epi64(vmatches_B, _mm256_setzero_si256()), + 0xaa), // 0b10101010 + _mm256_set1_epi8(static_cast(0xff))); + vmatches_A = + _mm256_sad_epu8(_mm256_and_si256(_mm256_or_si256(vmatches_A, vblock_highbits_A), + _mm256_set1_epi64x(kByteSequenceOfPowersOf2)), + _mm256_setzero_si256()); + vmatches_B = + _mm256_sad_epu8(_mm256_and_si256(_mm256_or_si256(vmatches_B, vblock_highbits_B), + _mm256_set1_epi64x(kByteSequenceOfPowersOf2)), + _mm256_setzero_si256()); + __m256i vmatches = _mm256_or_si256(vmatches_A, 
_mm256_slli_epi64(vmatches_B, 32)); + + // We are now back to processing 8 at a time. + // Each lane contains 8-bit bit vector marking slots that are matches. + // We need to find leading zeroes count for all slots. + // + // Emulating lzcnt in lowest bytes of 32-bit elements + __m256i vgt = _mm256_cmpgt_epi32(_mm256_set1_epi32(16), vmatches); + __m256i vnext_slot_id = + _mm256_blendv_epi8(_mm256_srli_epi32(vmatches, 4), + _mm256_and_si256(vmatches, _mm256_set1_epi32(0x0f)), vgt); + vnext_slot_id = _mm256_shuffle_epi8( + _mm256_setr_epi8(4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 2, 2, 1, 1, + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), + vnext_slot_id); + vnext_slot_id = + _mm256_add_epi32(_mm256_and_si256(vnext_slot_id, _mm256_set1_epi32(0xff)), + _mm256_and_si256(vgt, _mm256_set1_epi32(4))); + + // Lookup group ids + // + __m256i vgroupid_bit_offset = + _mm256_mullo_epi32(_mm256_and_si256(vnext_slot_id, _mm256_set1_epi32(7)), + _mm256_set1_epi32(num_group_id_bits)); + + // This only works for up to 25 bits per group id, since it uses 32-bit gather + // TODO: make sure this will never get called when there are more than 2^25 groups. 
+ __m256i vgroupid = + _mm256_add_epi32(_mm256_srli_epi32(vgroupid_bit_offset, 3), + _mm256_add_epi32(vblock_offset, _mm256_set1_epi32(8))); + vgroupid = _mm256_i32gather_epi32(reinterpret_cast(blocks_), vgroupid, 1); + vgroupid = _mm256_srlv_epi32( + vgroupid, _mm256_and_si256(vgroupid_bit_offset, _mm256_set1_epi32(7))); + vgroupid = _mm256_and_si256(vgroupid, _mm256_set1_epi32(group_id_mask)); + + // Convert slot id relative to the block to slot id relative to the beginnning of the + // table + // + vnext_slot_id = _mm256_add_epi32( + _mm256_add_epi32(vnext_slot_id, + _mm256_and_si256(vmatch_found, _mm256_set1_epi32(1))), + _mm256_slli_epi32(vblock_id, 3)); + + // Convert match found vector from 32-bit elements to bit vector + out_match_bitvector[i] = _pext_u32(_mm256_movemask_epi8(vmatch_found), + 0x11111111); // 0b00010001 repeated 4x + _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_group_ids) + i, vgroupid); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_next_slot_ids) + i, vnext_slot_id); + } +} + +// Take a set of 16 64-bit elements, +// Output one AVX2 register per byte (0 to 7), containing a sequence of 16 bytes, +// one from each input 64-bit word, all from the same position in 64-bit word. +// 16 bytes are replicated in lower and upper half of each output register. +// +inline void split_bytes_avx2(__m256i word0, __m256i word1, __m256i word2, __m256i word3, + __m256i& byte0, __m256i& byte1, __m256i& byte2, + __m256i& byte3, __m256i& byte4, __m256i& byte5, + __m256i& byte6, __m256i& byte7) { + __m256i word01lo = _mm256_unpacklo_epi8( + word0, word1); // {a0, e0, a1, e1, ... a7, e7, c0, g0, c1, g1, ... c7, g7} + __m256i word23lo = _mm256_unpacklo_epi8( + word2, word3); // {i0, m0, i1, m1, ... i7, m7, k0, o0, k1, o1, ... k7, o7} + __m256i word01hi = _mm256_unpackhi_epi8( + word0, word1); // {b0, f0, b1, f1, ... b7, f1, d0, h0, d1, h1, ... d7, h7} + __m256i word23hi = _mm256_unpackhi_epi8( + word2, word3); // {j0, n0, j1, n1, ... 
j7, n7, l0, p0, l1, p1, ... l7, p7} + + __m256i a = + _mm256_unpacklo_epi16(word01lo, word01hi); // {a0, e0, b0, f0, ... a3, e3, b3, f3, + // c0, g0, d0, h0, ... c3, g3, d3, h3} + __m256i b = + _mm256_unpacklo_epi16(word23lo, word23hi); // {i0, m0, j0, n0, ... i3, m3, j3, n3, + // k0, o0, l0, p0, ... k3, o3, l3, p3} + __m256i c = + _mm256_unpackhi_epi16(word01lo, word01hi); // {a4, e4, b4, f4, ... a7, e7, b7, f7, + // c4, g4, d4, h4, ... c7, g7, d7, h7} + __m256i d = + _mm256_unpackhi_epi16(word23lo, word23hi); // {i4, m4, j4, n4, ... i7, m7, j7, n7, + // k4, o4, l4, p4, ... k7, o7, l7, p7} + + __m256i byte01 = _mm256_unpacklo_epi32( + a, b); // {a0, e0, b0, f0, i0, m0, j0, n0, a1, e1, b1, f1, i1, m1, j1, n1, c0, g0, + // d0, h0, k0, o0, l0, p0, ...} + __m256i shuffle_const = + _mm256_setr_epi8(0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15, 0, 2, 8, 10, + 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15); + byte01 = _mm256_permute4x64_epi64( + byte01, 0xd8); // 11011000 b - swapping middle two 64-bit elements + byte01 = _mm256_shuffle_epi8(byte01, shuffle_const); + __m256i byte23 = _mm256_unpackhi_epi32(a, b); + byte23 = _mm256_permute4x64_epi64(byte23, 0xd8); + byte23 = _mm256_shuffle_epi8(byte23, shuffle_const); + __m256i byte45 = _mm256_unpacklo_epi32(c, d); + byte45 = _mm256_permute4x64_epi64(byte45, 0xd8); + byte45 = _mm256_shuffle_epi8(byte45, shuffle_const); + __m256i byte67 = _mm256_unpackhi_epi32(c, d); + byte67 = _mm256_permute4x64_epi64(byte67, 0xd8); + byte67 = _mm256_shuffle_epi8(byte67, shuffle_const); + + byte0 = _mm256_permute4x64_epi64(byte01, 0x44); // 01000100 b + byte1 = _mm256_permute4x64_epi64(byte01, 0xee); // 11101110 b + byte2 = _mm256_permute4x64_epi64(byte23, 0x44); // 01000100 b + byte3 = _mm256_permute4x64_epi64(byte23, 0xee); // 11101110 b + byte4 = _mm256_permute4x64_epi64(byte45, 0x44); // 01000100 b + byte5 = _mm256_permute4x64_epi64(byte45, 0xee); // 11101110 b + byte6 = _mm256_permute4x64_epi64(byte67, 0x44); // 01000100 b + byte7 = 
_mm256_permute4x64_epi64(byte67, 0xee); // 11101110 b +} + +// This one can only process a multiple of 32 values. +// The caller needs to process the remaining tail, if the input is not divisible by 32, +// using a different method. +// TODO: Explain the idea behind storing arrays in SIMD registers. +// Explain why it is faster with SIMD than using memory loads. +void SwissTable::lookup_1_avx2_x32(const int num_hashes, const uint32_t* hashes, + uint8_t* out_match_bitvector, uint32_t* out_group_ids, + uint32_t* out_next_slot_ids) { + constexpr int unroll = 32; + + // There is a limit on the number of input blocks, + // because we want to store all their data in a set of AVX2 registers. + ARROW_DCHECK(log_blocks_ <= 4); + + // Remember that block bytes and group id bytes are in opposite orders in memory of hash + // table. We put them in the same order. + __m256i vblock_byte0, vblock_byte1, vblock_byte2, vblock_byte3, vblock_byte4, + vblock_byte5, vblock_byte6, vblock_byte7; + __m256i vgroupid_byte0, vgroupid_byte1, vgroupid_byte2, vgroupid_byte3, vgroupid_byte4, + vgroupid_byte5, vgroupid_byte6, vgroupid_byte7; + // What we output if there is no match in the block + __m256i vslot_empty_or_end; + + constexpr uint32_t k4ByteSequence_0_4_8_12 = 0x0c080400; + constexpr uint32_t k4ByteSequence_1_5_9_13 = 0x0d090501; + constexpr uint32_t k4ByteSequence_2_6_10_14 = 0x0e0a0602; + constexpr uint32_t k4ByteSequence_3_7_11_15 = 0x0f0b0703; + constexpr uint64_t kEachByteIs1 = 0x0101010101010101ULL; + constexpr uint64_t kByteSequence7DownTo0 = 0x0001020304050607ULL; + constexpr uint64_t kByteSequence15DownTo8 = 0x08090A0B0C0D0E0FULL; + + // Bit unpack group ids into 1B. + // Assemble the sequence of block bytes. 
+ uint64_t block_bytes[16]; + uint64_t groupid_bytes[16]; + const int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); + uint64_t bit_unpack_mask = ((1 << num_groupid_bits) - 1) * kEachByteIs1; + for (int i = 0; i < (1 << log_blocks_); ++i) { + uint64_t in_groupids = + *reinterpret_cast(blocks_ + (8 + num_groupid_bits) * i + 8); + uint64_t in_blockbytes = + *reinterpret_cast(blocks_ + (8 + num_groupid_bits) * i); + groupid_bytes[i] = _pdep_u64(in_groupids, bit_unpack_mask); + block_bytes[i] = in_blockbytes; + } + + // Split a sequence of 64-bit words into SIMD vectors holding individual bytes + __m256i vblock_words0 = + _mm256_loadu_si256(reinterpret_cast(block_bytes) + 0); + __m256i vblock_words1 = + _mm256_loadu_si256(reinterpret_cast(block_bytes) + 1); + __m256i vblock_words2 = + _mm256_loadu_si256(reinterpret_cast(block_bytes) + 2); + __m256i vblock_words3 = + _mm256_loadu_si256(reinterpret_cast(block_bytes) + 3); + // Reverse the bytes in blocks + __m256i vshuffle_const = + _mm256_setr_epi64x(kByteSequence7DownTo0, kByteSequence15DownTo8, + kByteSequence7DownTo0, kByteSequence15DownTo8); + vblock_words0 = _mm256_shuffle_epi8(vblock_words0, vshuffle_const); + vblock_words1 = _mm256_shuffle_epi8(vblock_words1, vshuffle_const); + vblock_words2 = _mm256_shuffle_epi8(vblock_words2, vshuffle_const); + vblock_words3 = _mm256_shuffle_epi8(vblock_words3, vshuffle_const); + split_bytes_avx2(vblock_words0, vblock_words1, vblock_words2, vblock_words3, + vblock_byte0, vblock_byte1, vblock_byte2, vblock_byte3, vblock_byte4, + vblock_byte5, vblock_byte6, vblock_byte7); + split_bytes_avx2( + _mm256_loadu_si256(reinterpret_cast(groupid_bytes) + 0), + _mm256_loadu_si256(reinterpret_cast(groupid_bytes) + 1), + _mm256_loadu_si256(reinterpret_cast(groupid_bytes) + 2), + _mm256_loadu_si256(reinterpret_cast(groupid_bytes) + 3), + vgroupid_byte0, vgroupid_byte1, vgroupid_byte2, vgroupid_byte3, vgroupid_byte4, + vgroupid_byte5, vgroupid_byte6, vgroupid_byte7); + + // 
Calculate the slot to output when there is no match in a block. + // It will be the index of the first empty slot or 8 (the number of slots in block) + // if there are no empty slots. + vslot_empty_or_end = _mm256_set1_epi8(8); + { + __m256i vis_empty; +#define CMP(VBLOCKBYTE, BYTENUM) \ + vis_empty = \ + _mm256_cmpeq_epi8(VBLOCKBYTE, _mm256_set1_epi8(static_cast(0x80))); \ + vslot_empty_or_end = \ + _mm256_blendv_epi8(vslot_empty_or_end, _mm256_set1_epi8(BYTENUM), vis_empty); + CMP(vblock_byte7, 7); + CMP(vblock_byte6, 6); + CMP(vblock_byte5, 5); + CMP(vblock_byte4, 4); + CMP(vblock_byte3, 3); + CMP(vblock_byte2, 2); + CMP(vblock_byte1, 1); + CMP(vblock_byte0, 0); +#undef CMP + } + + const int block_id_mask = (1 << log_blocks_) - 1; + + for (int i = 0; i < num_hashes / unroll; ++i) { + __m256i vhash0 = + _mm256_loadu_si256(reinterpret_cast(hashes) + 4 * i + 0); + __m256i vhash1 = + _mm256_loadu_si256(reinterpret_cast(hashes) + 4 * i + 1); + __m256i vhash2 = + _mm256_loadu_si256(reinterpret_cast(hashes) + 4 * i + 2); + __m256i vhash3 = + _mm256_loadu_si256(reinterpret_cast(hashes) + 4 * i + 3); + + // We will get input in byte lanes in the order: [0, 8, 16, 24, 1, 9, 17, 25, 2, 10, + // 18, 26, ...] 
+ vhash0 = _mm256_or_si256(_mm256_srli_epi32(vhash0, 16), + _mm256_and_si256(vhash2, _mm256_set1_epi32(0xffff0000))); + vhash1 = _mm256_or_si256(_mm256_srli_epi32(vhash1, 16), + _mm256_and_si256(vhash3, _mm256_set1_epi32(0xffff0000))); + __m256i vstamp_A = _mm256_and_si256( + _mm256_srlv_epi32(vhash0, _mm256_set1_epi32(16 - log_blocks_ - 7)), + _mm256_set1_epi16(0x7f)); + __m256i vstamp_B = _mm256_and_si256( + _mm256_srlv_epi32(vhash1, _mm256_set1_epi32(16 - log_blocks_ - 7)), + _mm256_set1_epi16(0x7f)); + __m256i vstamp = _mm256_or_si256(vstamp_A, _mm256_slli_epi16(vstamp_B, 8)); + __m256i vblock_id_A = + _mm256_and_si256(_mm256_srlv_epi32(vhash0, _mm256_set1_epi32(16 - log_blocks_)), + _mm256_set1_epi16(block_id_mask)); + __m256i vblock_id_B = + _mm256_and_si256(_mm256_srlv_epi32(vhash1, _mm256_set1_epi32(16 - log_blocks_)), + _mm256_set1_epi16(block_id_mask)); + __m256i vblock_id = _mm256_or_si256(vblock_id_A, _mm256_slli_epi16(vblock_id_B, 8)); + + // Visit all block bytes in reverse order (overwriting data on multiple matches) + __m256i vmatch_found = _mm256_setzero_si256(); + __m256i vslot_id = _mm256_shuffle_epi8(vslot_empty_or_end, vblock_id); + __m256i vgroup_id = _mm256_setzero_si256(); +#define CMP(VBLOCK_BYTE, VGROUPID_BYTE, BYTENUM) \ + { \ + __m256i vcmp = \ + _mm256_cmpeq_epi8(_mm256_shuffle_epi8(VBLOCK_BYTE, vblock_id), vstamp); \ + vmatch_found = _mm256_or_si256(vmatch_found, vcmp); \ + vgroup_id = _mm256_blendv_epi8(vgroup_id, \ + _mm256_shuffle_epi8(VGROUPID_BYTE, vblock_id), vcmp); \ + vslot_id = _mm256_blendv_epi8(vslot_id, _mm256_set1_epi8(BYTENUM + 1), vcmp); \ + } + CMP(vblock_byte7, vgroupid_byte7, 7); + CMP(vblock_byte6, vgroupid_byte6, 6); + CMP(vblock_byte5, vgroupid_byte5, 5); + CMP(vblock_byte4, vgroupid_byte4, 4); + CMP(vblock_byte3, vgroupid_byte3, 3); + CMP(vblock_byte2, vgroupid_byte2, 2); + CMP(vblock_byte1, vgroupid_byte1, 1); + CMP(vblock_byte0, vgroupid_byte0, 0); +#undef CMP + + vslot_id = _mm256_add_epi8(vslot_id, 
_mm256_slli_epi32(vblock_id, 3)); + // So far the output is in the order: [0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, ...] + vmatch_found = _mm256_shuffle_epi8( + vmatch_found, + _mm256_setr_epi32(k4ByteSequence_0_4_8_12, k4ByteSequence_1_5_9_13, + k4ByteSequence_2_6_10_14, k4ByteSequence_3_7_11_15, + k4ByteSequence_0_4_8_12, k4ByteSequence_1_5_9_13, + k4ByteSequence_2_6_10_14, k4ByteSequence_3_7_11_15)); + // Now it is: [0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, | 4, 5, 6, 7, + // 12, 13, 14, 15, ...] + vmatch_found = _mm256_permutevar8x32_epi32(vmatch_found, + _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7)); + + reinterpret_cast(out_match_bitvector)[i] = + _mm256_movemask_epi8(vmatch_found); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_group_ids) + 4 * i + 0, + _mm256_and_si256(vgroup_id, _mm256_set1_epi32(0xff))); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(out_group_ids) + 4 * i + 1, + _mm256_and_si256(_mm256_srli_epi32(vgroup_id, 8), _mm256_set1_epi32(0xff))); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(out_group_ids) + 4 * i + 2, + _mm256_and_si256(_mm256_srli_epi32(vgroup_id, 16), _mm256_set1_epi32(0xff))); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(out_group_ids) + 4 * i + 3, + _mm256_and_si256(_mm256_srli_epi32(vgroup_id, 24), _mm256_set1_epi32(0xff))); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(out_next_slot_ids) + 4 * i + 0, + _mm256_and_si256(vslot_id, _mm256_set1_epi32(0xff))); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(out_next_slot_ids) + 4 * i + 1, + _mm256_and_si256(_mm256_srli_epi32(vslot_id, 8), _mm256_set1_epi32(0xff))); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(out_next_slot_ids) + 4 * i + 2, + _mm256_and_si256(_mm256_srli_epi32(vslot_id, 16), _mm256_set1_epi32(0xff))); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(out_next_slot_ids) + 4 * i + 3, + _mm256_and_si256(_mm256_srli_epi32(vslot_id, 24), _mm256_set1_epi32(0xff))); + } +} + +#endif + +} // namespace compute +} 
// namespace arrow diff --git a/cpp/src/arrow/compute/exec/plan_test.cc b/cpp/src/arrow/compute/exec/plan_test.cc new file mode 100644 index 00000000000..101257f5de8 --- /dev/null +++ b/cpp/src/arrow/compute/exec/plan_test.cc @@ -0,0 +1,585 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/compute/exec/expression.h" +#include "arrow/compute/exec/test_util.h" +#include "arrow/record_batch.h" +#include "arrow/testing/future_util.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" +#include "arrow/testing/random.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/logging.h" +#include "arrow/util/thread_pool.h" +#include "arrow/util/vector.h" + +using testing::ElementsAre; +using testing::HasSubstr; +using testing::Optional; +using testing::UnorderedElementsAreArray; + +namespace arrow { + +namespace compute { + +TEST(ExecPlanConstruction, Empty) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + ASSERT_THAT(plan->Validate(), Raises(StatusCode::Invalid)); +} + +TEST(ExecPlanConstruction, SingleNode) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + auto node = MakeDummyNode(plan.get(), "dummy", /*inputs=*/{}, /*num_outputs=*/0); + ASSERT_OK(plan->Validate()); + ASSERT_THAT(plan->sources(), ElementsAre(node)); + ASSERT_THAT(plan->sinks(), ElementsAre(node)); + + ASSERT_OK_AND_ASSIGN(plan, ExecPlan::Make()); + node = MakeDummyNode(plan.get(), "dummy", /*inputs=*/{}, /*num_outputs=*/1); + // Output not bound + ASSERT_THAT(plan->Validate(), Raises(StatusCode::Invalid)); +} + +TEST(ExecPlanConstruction, SourceSink) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + auto source = MakeDummyNode(plan.get(), "source", /*inputs=*/{}, /*num_outputs=*/1); + auto sink = MakeDummyNode(plan.get(), "sink", /*inputs=*/{source}, /*num_outputs=*/0); + + ASSERT_OK(plan->Validate()); + EXPECT_THAT(plan->sources(), ElementsAre(source)); + EXPECT_THAT(plan->sinks(), ElementsAre(sink)); +} + +TEST(ExecPlanConstruction, MultipleNode) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto source1 = MakeDummyNode(plan.get(), "source1", /*inputs=*/{}, /*num_outputs=*/2); + + auto 
source2 = MakeDummyNode(plan.get(), "source2", /*inputs=*/{}, /*num_outputs=*/1); + + auto process1 = + MakeDummyNode(plan.get(), "process1", /*inputs=*/{source1}, /*num_outputs=*/2); + + auto process2 = MakeDummyNode(plan.get(), "process1", /*inputs=*/{source1, source2}, + /*num_outputs=*/1); + + auto process3 = + MakeDummyNode(plan.get(), "process3", /*inputs=*/{process1, process2, process1}, + /*num_outputs=*/1); + + auto sink = MakeDummyNode(plan.get(), "sink", /*inputs=*/{process3}, /*num_outputs=*/0); + + ASSERT_OK(plan->Validate()); + ASSERT_THAT(plan->sources(), ElementsAre(source1, source2)); + ASSERT_THAT(plan->sinks(), ElementsAre(sink)); +} + +struct StartStopTracker { + std::vector started, stopped; + + StartProducingFunc start_producing_func(Status st = Status::OK()) { + return [this, st](ExecNode* node) { + started.push_back(node->label()); + return st; + }; + } + + StopProducingFunc stop_producing_func() { + return [this](ExecNode* node) { stopped.push_back(node->label()); }; + } +}; + +TEST(ExecPlan, DummyStartProducing) { + StartStopTracker t; + + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto source1 = MakeDummyNode(plan.get(), "source1", /*inputs=*/{}, /*num_outputs=*/2, + t.start_producing_func(), t.stop_producing_func()); + + auto source2 = MakeDummyNode(plan.get(), "source2", /*inputs=*/{}, /*num_outputs=*/1, + t.start_producing_func(), t.stop_producing_func()); + + auto process1 = + MakeDummyNode(plan.get(), "process1", /*inputs=*/{source1}, /*num_outputs=*/2, + t.start_producing_func(), t.stop_producing_func()); + + auto process2 = + MakeDummyNode(plan.get(), "process2", /*inputs=*/{process1, source2}, + /*num_outputs=*/1, t.start_producing_func(), t.stop_producing_func()); + + auto process3 = + MakeDummyNode(plan.get(), "process3", /*inputs=*/{process1, source1, process2}, + /*num_outputs=*/1, t.start_producing_func(), t.stop_producing_func()); + + MakeDummyNode(plan.get(), "sink", /*inputs=*/{process3}, /*num_outputs=*/0, + 
t.start_producing_func(), t.stop_producing_func()); + + ASSERT_OK(plan->Validate()); + ASSERT_EQ(t.started.size(), 0); + ASSERT_EQ(t.stopped.size(), 0); + + ASSERT_OK(plan->StartProducing()); + // Note that any correct reverse topological order may do + ASSERT_THAT(t.started, ElementsAre("sink", "process3", "process2", "process1", + "source2", "source1")); + + plan->StopProducing(); + ASSERT_THAT(plan->finished(), Finishes(Ok())); + // Note that any correct topological order may do + ASSERT_THAT(t.stopped, ElementsAre("source1", "source2", "process1", "process2", + "process3", "sink")); + + ASSERT_THAT(plan->StartProducing(), + Raises(StatusCode::Invalid, HasSubstr("restarted"))); +} + +TEST(ExecPlan, DummyStartProducingError) { + StartStopTracker t; + + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + auto source1 = MakeDummyNode( + plan.get(), "source1", /*num_inputs=*/{}, /*num_outputs=*/2, + t.start_producing_func(Status::NotImplemented("zzz")), t.stop_producing_func()); + + auto source2 = + MakeDummyNode(plan.get(), "source2", /*num_inputs=*/{}, /*num_outputs=*/1, + t.start_producing_func(), t.stop_producing_func()); + + auto process1 = MakeDummyNode( + plan.get(), "process1", /*num_inputs=*/{source1}, /*num_outputs=*/2, + t.start_producing_func(Status::IOError("xxx")), t.stop_producing_func()); + + auto process2 = + MakeDummyNode(plan.get(), "process2", /*num_inputs=*/{process1, source2}, + /*num_outputs=*/1, t.start_producing_func(), t.stop_producing_func()); + + auto process3 = + MakeDummyNode(plan.get(), "process3", /*num_inputs=*/{process1, source1, process2}, + /*num_outputs=*/1, t.start_producing_func(), t.stop_producing_func()); + + MakeDummyNode(plan.get(), "sink", /*num_inputs=*/{process3}, /*num_outputs=*/0, + t.start_producing_func(), t.stop_producing_func()); + + ASSERT_OK(plan->Validate()); + ASSERT_EQ(t.started.size(), 0); + ASSERT_EQ(t.stopped.size(), 0); + + // `process1` raises IOError + ASSERT_THAT(plan->StartProducing(), 
Raises(StatusCode::IOError)); + ASSERT_THAT(t.started, ElementsAre("sink", "process3", "process2", "process1")); + // Nodes that started successfully were stopped in reverse order + ASSERT_THAT(t.stopped, ElementsAre("process2", "process3", "sink")); +} + +namespace { + +struct BatchesWithSchema { + std::vector batches; + std::shared_ptr schema; +}; + +Result MakeTestSourceNode(ExecPlan* plan, std::string label, + BatchesWithSchema batches_with_schema, bool parallel, + bool slow) { + DCHECK_GT(batches_with_schema.batches.size(), 0); + + auto opt_batches = ::arrow::internal::MapVector( + [](ExecBatch batch) { return util::make_optional(std::move(batch)); }, + std::move(batches_with_schema.batches)); + + AsyncGenerator> gen; + + if (parallel) { + // emulate batches completing initial decode-after-scan on a cpu thread + ARROW_ASSIGN_OR_RAISE( + gen, MakeBackgroundGenerator(MakeVectorIterator(std::move(opt_batches)), + ::arrow::internal::GetCpuThreadPool())); + + // ensure that callbacks are not executed immediately on a background thread + gen = MakeTransferredGenerator(std::move(gen), ::arrow::internal::GetCpuThreadPool()); + } else { + gen = MakeVectorGenerator(std::move(opt_batches)); + } + + if (slow) { + gen = MakeMappedGenerator(std::move(gen), [](const util::optional& batch) { + SleepABit(); + return batch; + }); + } + + return MakeSourceNode(plan, label, std::move(batches_with_schema.schema), + std::move(gen)); +} + +Future> StartAndCollect( + ExecPlan* plan, AsyncGenerator> gen) { + RETURN_NOT_OK(plan->Validate()); + RETURN_NOT_OK(plan->StartProducing()); + + auto collected_fut = CollectAsyncGenerator(gen); + + return AllComplete({plan->finished(), Future<>(collected_fut)}) + .Then([collected_fut]() -> Result> { + ARROW_ASSIGN_OR_RAISE(auto collected, collected_fut.result()); + return ::arrow::internal::MapVector( + [](util::optional batch) { return std::move(*batch); }, + std::move(collected)); + }); +} + +BatchesWithSchema MakeBasicBatches() { + 
BatchesWithSchema out; + out.batches = { + ExecBatchFromJSON({int32(), boolean()}, "[[null, true], [4, false]]"), + ExecBatchFromJSON({int32(), boolean()}, "[[5, null], [6, false], [7, false]]")}; + out.schema = schema({field("i32", int32()), field("bool", boolean())}); + return out; +} + +BatchesWithSchema MakeRandomBatches(const std::shared_ptr& schema, + int num_batches = 10, int batch_size = 4) { + BatchesWithSchema out; + + random::RandomArrayGenerator rng(42); + out.batches.resize(num_batches); + + for (int i = 0; i < num_batches; ++i) { + out.batches[i] = ExecBatch(*rng.BatchOf(schema->fields(), batch_size)); + // add a tag scalar to ensure the batches are unique + out.batches[i].values.emplace_back(i); + } + return out; +} +} // namespace + +TEST(ExecPlanExecution, SourceSink) { + for (bool slow : {false, true}) { + SCOPED_TRACE(slow ? "slowed" : "unslowed"); + + for (bool parallel : {false, true}) { + SCOPED_TRACE(parallel ? "parallel" : "single threaded"); + + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto basic_data = MakeBasicBatches(); + + ASSERT_OK_AND_ASSIGN(auto source, MakeTestSourceNode(plan.get(), "source", + basic_data, parallel, slow)); + + auto sink_gen = MakeSinkNode(source, "sink"); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray(basic_data.batches)))); + } + } +} + +TEST(ExecPlanExecution, SourceSinkError) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto basic_data = MakeBasicBatches(); + auto it = basic_data.batches.begin(); + AsyncGenerator> gen = + [&]() -> Result> { + if (it == basic_data.batches.end()) { + return Status::Invalid("Artificial error"); + } + return util::make_optional(*it++); + }; + + auto source = MakeSourceNode(plan.get(), "source", {}, gen); + auto sink_gen = MakeSinkNode(source, "sink"); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(Raises(StatusCode::Invalid, HasSubstr("Artificial")))); +} + +TEST(ExecPlanExecution, 
StressSourceSink) { + for (bool slow : {false, true}) { + SCOPED_TRACE(slow ? "slowed" : "unslowed"); + + for (bool parallel : {false, true}) { + SCOPED_TRACE(parallel ? "parallel" : "single threaded"); + + int num_batches = slow && !parallel ? 30 : 300; + + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto random_data = MakeRandomBatches( + schema({field("a", int32()), field("b", boolean())}), num_batches); + + ASSERT_OK_AND_ASSIGN(auto source, MakeTestSourceNode(plan.get(), "source", + random_data, parallel, slow)); + + auto sink_gen = MakeSinkNode(source, "sink"); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray(random_data.batches)))); + } + } +} + +TEST(ExecPlanExecution, StressSourceSinkStopped) { + for (bool slow : {false, true}) { + SCOPED_TRACE(slow ? "slowed" : "unslowed"); + + for (bool parallel : {false, true}) { + SCOPED_TRACE(parallel ? "parallel" : "single threaded"); + + int num_batches = slow && !parallel ? 
30 : 300; + + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto random_data = MakeRandomBatches( + schema({field("a", int32()), field("b", boolean())}), num_batches); + + ASSERT_OK_AND_ASSIGN(auto source, MakeTestSourceNode(plan.get(), "source", + random_data, parallel, slow)); + + auto sink_gen = MakeSinkNode(source, "sink"); + + ASSERT_OK(plan->Validate()); + ASSERT_OK(plan->StartProducing()); + + EXPECT_THAT(sink_gen(), Finishes(ResultWith(Optional(random_data.batches[0])))); + + plan->StopProducing(); + ASSERT_THAT(plan->finished(), Finishes(Ok())); + } + } +} + +TEST(ExecPlanExecution, SourceFilterSink) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto basic_data = MakeBasicBatches(); + + ASSERT_OK_AND_ASSIGN(auto source, + MakeTestSourceNode(plan.get(), "source", basic_data, + /*parallel=*/false, /*slow=*/false)); + + ASSERT_OK_AND_ASSIGN( + auto filter, MakeFilterNode(source, "filter", equal(field_ref("i32"), literal(6)))); + + auto sink_gen = MakeSinkNode(filter, "sink"); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray( + {ExecBatchFromJSON({int32(), boolean()}, "[]"), + ExecBatchFromJSON({int32(), boolean()}, "[[6, false]]")})))); +} + +TEST(ExecPlanExecution, SourceProjectSink) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto basic_data = MakeBasicBatches(); + + ASSERT_OK_AND_ASSIGN(auto source, + MakeTestSourceNode(plan.get(), "source", basic_data, + /*parallel=*/false, /*slow=*/false)); + + std::vector exprs{ + not_(field_ref("bool")), + call("add", {field_ref("i32"), literal(1)}), + }; + for (auto& expr : exprs) { + ASSERT_OK_AND_ASSIGN(expr, expr.Bind(*basic_data.schema)); + } + + ASSERT_OK_AND_ASSIGN(auto projection, + MakeProjectNode(source, "project", exprs, {"!bool", "i32 + 1"})); + + auto sink_gen = MakeSinkNode(projection, "sink"); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray( + 
{ExecBatchFromJSON({boolean(), int32()}, "[[false, null], [true, 5]]"), + ExecBatchFromJSON({boolean(), int32()}, + "[[null, 6], [true, 7], [true, 8]]")})))); +} + +namespace { + +BatchesWithSchema MakeGroupableBatches(int multiplicity = 1) { + BatchesWithSchema out; + + out.batches = {ExecBatchFromJSON({int32(), utf8()}, R"([ + [12, "alfa"], + [7, "beta"], + [3, "alfa"] + ])"), + ExecBatchFromJSON({int32(), utf8()}, R"([ + [-2, "alfa"], + [-1, "gama"], + [3, "alfa"] + ])"), + ExecBatchFromJSON({int32(), utf8()}, R"([ + [5, "gama"], + [3, "beta"], + [-8, "alfa"] + ])")}; + + size_t batch_count = out.batches.size(); + for (int repeat = 1; repeat < multiplicity; ++repeat) { + for (size_t i = 0; i < batch_count; ++i) { + out.batches.push_back(out.batches[i]); + } + } + + out.schema = schema({field("i32", int32()), field("str", utf8())}); + + return out; +} +} // namespace + +TEST(ExecPlanExecution, SourceGroupedSum) { + for (bool parallel : {false, true}) { + SCOPED_TRACE(parallel ? "parallel/merged" : "serial"); + + auto input = MakeGroupableBatches(/*multiplicity=*/parallel ? 100 : 1); + + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + ASSERT_OK_AND_ASSIGN(auto source, + MakeTestSourceNode(plan.get(), "source", input, + /*parallel=*/parallel, /*slow=*/false)); + ASSERT_OK_AND_ASSIGN( + auto gby, MakeGroupByNode(source, "gby", /*keys=*/{"str"}, /*targets=*/{"i32"}, + {{"hash_sum", nullptr}})); + auto sink_gen = MakeSinkNode(gby, "sink"); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray({ExecBatchFromJSON( + {int64(), utf8()}, + parallel ? R"([[800, "alfa"], [1000, "beta"], [400, "gama"]])" + : R"([[8, "alfa"], [10, "beta"], [4, "gama"]])")})))); + } +} + +TEST(ExecPlanExecution, SourceFilterProjectGroupedSumFilter) { + for (bool parallel : {false, true}) { + SCOPED_TRACE(parallel ? "parallel/merged" : "serial"); + + int batch_multiplicity = parallel ? 
100 : 1; + auto input = MakeGroupableBatches(/*multiplicity=*/batch_multiplicity); + + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + ASSERT_OK_AND_ASSIGN(auto source, + MakeTestSourceNode(plan.get(), "source", input, + /*parallel=*/parallel, /*slow=*/false)); + ASSERT_OK_AND_ASSIGN( + auto filter, + MakeFilterNode(source, "filter", greater_equal(field_ref("i32"), literal(0)))); + + ASSERT_OK_AND_ASSIGN( + auto projection, + MakeProjectNode(filter, "project", + { + field_ref("str"), + call("multiply", {field_ref("i32"), literal(2)}), + })); + + ASSERT_OK_AND_ASSIGN(auto gby, MakeGroupByNode(projection, "gby", /*keys=*/{"str"}, + /*targets=*/{"multiply(i32, 2)"}, + {{"hash_sum", nullptr}})); + + ASSERT_OK_AND_ASSIGN( + auto having, + MakeFilterNode(gby, "having", + greater(field_ref("hash_sum"), literal(10 * batch_multiplicity)))); + + auto sink_gen = MakeSinkNode(having, "sink"); + + ASSERT_THAT(StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray({ExecBatchFromJSON( + {int64(), utf8()}, parallel ? 
R"([[3600, "alfa"], [2000, "beta"]])" + : R"([[36, "alfa"], [20, "beta"]])")})))); + } +} + +TEST(ExecPlanExecution, SourceScalarAggSink) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + auto basic_data = MakeBasicBatches(); + + ASSERT_OK_AND_ASSIGN(auto source, + MakeTestSourceNode(plan.get(), "source", basic_data, + /*parallel=*/false, /*slow=*/false)); + + ASSERT_OK_AND_ASSIGN( + auto scalar_agg, + MakeScalarAggregateNode(source, "scalar_agg", {{"sum", nullptr}, {"any", nullptr}}, + /*targets=*/{"i32", "bool"}, + /*out_field_names=*/{"sum(i32)", "any(bool)"})); + + auto sink_gen = MakeSinkNode(scalar_agg, "sink"); + + ASSERT_THAT( + StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray({ + ExecBatchFromJSON({ValueDescr::Scalar(int64()), ValueDescr::Scalar(boolean())}, + "[[22, true]]"), + })))); +} + +TEST(ExecPlanExecution, ScalarSourceScalarAggSink) { + ASSERT_OK_AND_ASSIGN(auto plan, ExecPlan::Make()); + + BatchesWithSchema basic_data; + basic_data.batches = { + ExecBatchFromJSON({ValueDescr::Scalar(int32()), ValueDescr::Scalar(int32()), + ValueDescr::Scalar(int32())}, + "[[5, 5, 5], [5, 5, 5], [5, 5, 5]]"), + ExecBatchFromJSON({int32(), int32(), int32()}, + "[[5, 5, 5], [6, 6, 6], [7, 7, 7]]")}; + basic_data.schema = + schema({field("a", int32()), field("b", int32()), field("c", int32())}); + + ASSERT_OK_AND_ASSIGN(auto source, + MakeTestSourceNode(plan.get(), "source", basic_data, + /*parallel=*/false, /*slow=*/false)); + + ASSERT_OK_AND_ASSIGN( + auto scalar_agg, + MakeScalarAggregateNode(source, "scalar_agg", + {{"count", nullptr}, {"sum", nullptr}, {"mean", nullptr}}, + {"a", "b", "c"}, {"sum a", "sum b", "sum c"})); + + auto sink_gen = MakeSinkNode(scalar_agg, "sink"); + + ASSERT_THAT( + StartAndCollect(plan.get(), sink_gen), + Finishes(ResultWith(UnorderedElementsAreArray({ + ExecBatchFromJSON({ValueDescr::Scalar(int64()), ValueDescr::Scalar(int64()), + ValueDescr::Scalar(float64())}, + "[[6, 33, 5.5]]"), + 
})))); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/subtree_internal.h b/cpp/src/arrow/compute/exec/subtree_internal.h new file mode 100644 index 00000000000..72d419df225 --- /dev/null +++ b/cpp/src/arrow/compute/exec/subtree_internal.h @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/compute/exec/expression.h" +#include "arrow/util/optional.h" + +namespace arrow { +namespace compute { +// Helper class for efficiently detecting subtrees given expressions. +// +// Using fragment partition expressions as an example: +// Partition expressions are broken into conjunction members and each member dictionary +// encoded to impose a sortable ordering. In addition, subtrees are generated which span +// groups of fragments and nested subtrees. After encoding each fragment is guaranteed to +// be a descendant of at least one subtree. 
For example, given fragments in a +// HivePartitioning with paths: +// +// /num=0/al=eh/dat.par +// /num=0/al=be/dat.par +// /num=1/al=eh/dat.par +// /num=1/al=be/dat.par +// +// The following subtrees will be introduced: +// +// /num=0/ +// /num=0/al=eh/ +// /num=0/al=eh/dat.par +// /num=0/al=be/ +// /num=0/al=be/dat.par +// /num=1/ +// /num=1/al=eh/ +// /num=1/al=eh/dat.par +// /num=1/al=be/ +// /num=1/al=be/dat.par +struct SubtreeImpl { + // Each unique conjunction member is mapped to an integer. + using expression_code = char32_t; + // Partition expressions are mapped to strings of codes; strings give us lexicographic + // ordering (and potentially useful optimizations). + using expression_codes = std::basic_string; + // An encoded guarantee (if index is set) or subtree. + struct Encoded { + // An external index identifying the corresponding object (e.g. a Fragment) of the + // guarantee. + util::optional index; + // An encoded expression representing a guarantee. + expression_codes guarantee; + }; + + std::unordered_map + expr_to_code_; + std::vector code_to_expr_; + std::unordered_set subtree_exprs_; + + // Encode a subexpression (returning the existing code if possible). + expression_code GetOrInsert(const compute::Expression& expr) { + auto next_code = static_cast(expr_to_code_.size()); + auto it_success = expr_to_code_.emplace(expr, next_code); + + if (it_success.second) { + code_to_expr_.push_back(expr); + } + return it_success.first->second; + } + + // Encode an expression (recursively breaking up conjunction members if possible). 
+ void EncodeConjunctionMembers(const compute::Expression& expr, + expression_codes* codes) { + if (auto call = expr.call()) { + if (call->function_name == "and_kleene") { + // expr is a conjunction, encode its arguments + EncodeConjunctionMembers(call->arguments[0], codes); + EncodeConjunctionMembers(call->arguments[1], codes); + return; + } + } + // expr is not a conjunction, encode it whole + codes->push_back(GetOrInsert(expr)); + } + + // Convert an encoded subtree or guarantee back into an expression. + compute::Expression GetSubtreeExpression(const Encoded& encoded_subtree) { + // Filters will already be simplified by all of a subtree's ancestors, so + // we only need to simplify the filter by the trailing conjunction member + // of each subtree. + return code_to_expr_[encoded_subtree.guarantee.back()]; + } + + // Insert subtrees for each component of an encoded partition expression. + void GenerateSubtrees(expression_codes guarantee, std::vector* encoded) { + while (!guarantee.empty()) { + if (subtree_exprs_.insert(guarantee).second) { + Encoded encoded_subtree{/*index=*/util::nullopt, guarantee}; + encoded->push_back(std::move(encoded_subtree)); + } + guarantee.resize(guarantee.size() - 1); + } + } + + // Encode a guarantee, and generate subtrees for it as well. 
+ void EncodeOneGuarantee(int index, const Expression& guarantee, + std::vector* encoded) { + Encoded encoded_guarantee{index, {}}; + EncodeConjunctionMembers(guarantee, &encoded_guarantee.guarantee); + GenerateSubtrees(encoded_guarantee.guarantee, encoded); + encoded->push_back(std::move(encoded_guarantee)); + } + + template + std::vector EncodeGuarantees(const GetGuarantee& get, int count) { + std::vector encoded; + for (int i = 0; i < count; ++i) { + EncodeOneGuarantee(i, get(i), &encoded); + } + return encoded; + } + + // Comparator for sort + struct ByGuarantee { + bool operator()(const Encoded& l, const Encoded& r) { + const auto cmp = l.guarantee.compare(r.guarantee); + if (cmp != 0) { + return cmp < 0; + } + // Equal guarantees; sort encodings with indices after encodings without + return (l.index ? 1 : 0) < (r.index ? 1 : 0); + } + }; + + // Comparator for building a Forest + struct IsAncestor { + const std::vector encoded; + + bool operator()(int l, int r) const { + if (encoded[l].index) { + // Leaf-level object (e.g. a Fragment): not an ancestor. + return false; + } + + const auto& ancestor = encoded[l].guarantee; + const auto& descendant = encoded[r].guarantee; + + if (descendant.size() >= ancestor.size()) { + return std::equal(ancestor.begin(), ancestor.end(), descendant.begin()); + } + return false; + } + }; +}; + +inline bool operator==(const SubtreeImpl::Encoded& l, const SubtreeImpl::Encoded& r) { + return l.index == r.index && l.guarantee == r.guarantee; +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/subtree_test.cc b/cpp/src/arrow/compute/exec/subtree_test.cc new file mode 100644 index 00000000000..97213104454 --- /dev/null +++ b/cpp/src/arrow/compute/exec/subtree_test.cc @@ -0,0 +1,377 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include +#include + +#include "arrow/compute/exec/forest_internal.h" +#include "arrow/compute/exec/subtree_internal.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/string_view.h" + +namespace arrow { +namespace compute { + +using testing::ContainerEq; + +// Tests of subtree pruning + +// Don't depend on FileSystem - port just enough to be useful here +struct FileInfo { + bool is_dir; + std::string path; + + bool operator==(const FileInfo& other) const { + return is_dir == other.is_dir && path == other.path; + } + + static FileInfo Dir(std::string path) { return FileInfo{true, std::move(path)}; } + + static FileInfo File(std::string path) { return FileInfo{false, std::move(path)}; } + + static bool ByPath(const FileInfo& l, const FileInfo& r) { return l.path < r.path; } +}; + +struct TestPathTree { + FileInfo info; + std::vector subtrees; + + explicit TestPathTree(std::string file_path) + : info(FileInfo::File(std::move(file_path))) {} + + TestPathTree(std::string dir_path, std::vector subtrees) + : info(FileInfo::Dir(std::move(dir_path))), subtrees(std::move(subtrees)) {} + + TestPathTree(Forest::Ref ref, const std::vector& infos) : info(infos[ref.i]) { + const Forest& forest = *ref.forest; + + int begin = ref.i + 
1; + int end = begin + ref.num_descendants(); + + for (int i = begin; i < end; ++i) { + subtrees.emplace_back(forest[i], infos); + i += forest[i].num_descendants(); + } + } + + bool operator==(const TestPathTree& other) const { + return info == other.info && subtrees == other.subtrees; + } + + std::string ToString() const { + auto out = "\n" + info.path; + if (info.is_dir) out += "/"; + + for (const auto& subtree : subtrees) { + out += subtree.ToString(); + } + return out; + } + + friend std::ostream& operator<<(std::ostream& os, const TestPathTree& tree) { + return os << tree.ToString(); + } +}; + +using PT = TestPathTree; + +util::string_view RemoveTrailingSlash(util::string_view key) { + while (!key.empty() && key.back() == '/') { + key.remove_suffix(1); + } + return key; +} +bool IsAncestorOf(util::string_view ancestor, util::string_view descendant) { + // See filesystem/path_util.h + ancestor = RemoveTrailingSlash(ancestor); + if (ancestor == "") return true; + descendant = RemoveTrailingSlash(descendant); + if (!descendant.starts_with(ancestor)) return false; + descendant.remove_prefix(ancestor.size()); + if (descendant.empty()) return true; + return descendant.front() == '/'; +} + +Forest MakeForest(std::vector* infos) { + std::sort(infos->begin(), infos->end(), FileInfo::ByPath); + + return Forest(static_cast(infos->size()), [&](int i, int j) { + return IsAncestorOf(infos->at(i).path, infos->at(j).path); + }); +} + +void ExpectForestIs(std::vector infos, std::vector expected_roots) { + auto forest = MakeForest(&infos); + + std::vector actual_roots; + ASSERT_OK(forest.Visit( + [&](Forest::Ref ref) -> Result { + actual_roots.emplace_back(ref, infos); + return false; // only vist roots + }, + [](Forest::Ref) {})); + + // visit expected and assert equality + EXPECT_THAT(actual_roots, ContainerEq(expected_roots)); +} + +TEST(Forest, Basic) { + ExpectForestIs({}, {}); + + ExpectForestIs({FileInfo::File("aa")}, {PT("aa")}); + ExpectForestIs({FileInfo::Dir("AA")}, 
{PT("AA", {})}); + ExpectForestIs({FileInfo::Dir("AA"), FileInfo::File("AA/aa")}, + {PT("AA", {PT("AA/aa")})}); + ExpectForestIs({FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0")}, + {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})})}); + + // Missing parent can still find ancestor. + ExpectForestIs({FileInfo::Dir("AA"), FileInfo::File("AA/BB/bb")}, + {PT("AA", {PT("AA/BB/bb")})}); + + // Ancestors should link to parent regardless of ordering. + ExpectForestIs({FileInfo::File("AA/aa"), FileInfo::Dir("AA")}, + {PT("AA", {PT("AA/aa")})}); + + // Multiple roots are supported. + ExpectForestIs({FileInfo::File("aa"), FileInfo::File("bb")}, {PT("aa"), PT("bb")}); + ExpectForestIs({FileInfo::File("00"), FileInfo::Dir("AA"), FileInfo::File("AA/aa"), + FileInfo::File("BB/bb")}, + {PT("00"), PT("AA", {PT("AA/aa")}), PT("BB/bb")}); + ExpectForestIs({FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0"), + FileInfo::Dir("CC"), FileInfo::Dir("CC/BB"), FileInfo::File("CC/BB/0")}, + {PT("AA", {PT("AA/BB", {PT("AA/BB/0")})}), + PT("CC", {PT("CC/BB", {PT("CC/BB/0")})})}); +} + +TEST(Forest, HourlyETL) { + // This test mimics a scenario where an ETL dumps hourly files in a structure + // `$year/$month/$day/$hour/*.parquet`. 
+ constexpr int64_t kYears = 3; + constexpr int64_t kMonthsPerYear = 12; + constexpr int64_t kDaysPerMonth = 31; + constexpr int64_t kHoursPerDay = 24; + constexpr int64_t kFilesPerHour = 2; + + // Avoid constructing strings + std::vector numbers{kDaysPerMonth + 1}; + for (size_t i = 0; i < numbers.size(); i++) { + numbers[i] = std::to_string(i); + if (numbers[i].size() == 1) { + numbers[i] = "0" + numbers[i]; + } + } + + auto join = [](const std::vector& path) { + if (path.empty()) return std::string(""); + std::string result = path[0]; + for (const auto& part : path) { + result += '/'; + result += part; + } + return result; + }; + + std::vector infos; + + std::vector forest; + for (int64_t year = 0; year < kYears; year++) { + auto year_str = std::to_string(year + 2000); + auto year_dir = FileInfo::Dir(year_str); + infos.push_back(year_dir); + + std::vector months; + for (int64_t month = 0; month < kMonthsPerYear; month++) { + auto month_str = join({year_str, numbers[month + 1]}); + auto month_dir = FileInfo::Dir(month_str); + infos.push_back(month_dir); + + std::vector days; + for (int64_t day = 0; day < kDaysPerMonth; day++) { + auto day_str = join({month_str, numbers[day + 1]}); + auto day_dir = FileInfo::Dir(day_str); + infos.push_back(day_dir); + + std::vector hours; + for (int64_t hour = 0; hour < kHoursPerDay; hour++) { + auto hour_str = join({day_str, numbers[hour]}); + auto hour_dir = FileInfo::Dir(hour_str); + infos.push_back(hour_dir); + + std::vector files; + for (int64_t file = 0; file < kFilesPerHour; file++) { + auto file_str = join({hour_str, numbers[file] + ".parquet"}); + auto file_fd = FileInfo::File(file_str); + infos.push_back(file_fd); + files.emplace_back(file_str); + } + + auto hour_pt = PT(hour_str, std::move(files)); + hours.push_back(hour_pt); + } + + auto day_pt = PT(day_str, std::move(hours)); + days.push_back(day_pt); + } + + auto month_pt = PT(month_str, std::move(days)); + months.push_back(month_pt); + } + + auto year_pt = 
PT(year_str, std::move(months)); + forest.push_back(year_pt); + } + + ExpectForestIs(infos, forest); +} + +TEST(Forest, Visit) { + using Infos = std::vector; + + for (auto infos : + {Infos{}, Infos{FileInfo::Dir("A"), FileInfo::File("A/a")}, + Infos{FileInfo::Dir("AA"), FileInfo::Dir("AA/BB"), FileInfo::File("AA/BB/0"), + FileInfo::Dir("CC"), FileInfo::Dir("CC/BB"), FileInfo::File("CC/BB/0")}}) { + ASSERT_TRUE(std::is_sorted(infos.begin(), infos.end(), FileInfo::ByPath)); + + auto forest = MakeForest(&infos); + + auto ignore_post = [](Forest::Ref) {}; + + // noop is fine + ASSERT_OK( + forest.Visit([](Forest::Ref) -> Result { return false; }, ignore_post)); + + // Should propagate failure + if (forest.size() != 0) { + ASSERT_RAISES( + Invalid, + forest.Visit([](Forest::Ref) -> Result { return Status::Invalid(""); }, + ignore_post)); + } + + // Ensure basic visit of all nodes + int i = 0; + ASSERT_OK(forest.Visit( + [&](Forest::Ref ref) -> Result { + EXPECT_EQ(ref.i, i); + ++i; + return true; + }, + ignore_post)); + + // Visit only directories + Infos actual_dirs; + ASSERT_OK(forest.Visit( + [&](Forest::Ref ref) -> Result { + if (!infos[ref.i].is_dir) { + return false; + } + actual_dirs.push_back(infos[ref.i]); + return true; + }, + ignore_post)); + + Infos expected_dirs; + for (const auto& info : infos) { + if (info.is_dir) { + expected_dirs.push_back(info); + } + } + EXPECT_THAT(actual_dirs, ContainerEq(expected_dirs)); + } +} + +TEST(Subtree, EncodeExpression) { + SubtreeImpl tree; + ASSERT_EQ(0, tree.GetOrInsert(equal(field_ref("a"), literal("1")))); + // Should be idempotent + ASSERT_EQ(0, tree.GetOrInsert(equal(field_ref("a"), literal("1")))); + ASSERT_EQ(equal(field_ref("a"), literal("1")), tree.code_to_expr_[0]); + + SubtreeImpl::expression_codes codes; + auto conj = + and_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2"))); + tree.EncodeConjunctionMembers(conj, &codes); + ASSERT_EQ(SubtreeImpl::expression_codes({0, 1}), codes); + + 
codes.clear(); + conj = or_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2"))); + tree.EncodeConjunctionMembers(conj, &codes); + ASSERT_EQ(SubtreeImpl::expression_codes({2}), codes); +} + +TEST(Subtree, GetSubtreeExpression) { + SubtreeImpl tree; + const auto expr_a = equal(field_ref("a"), literal("1")); + const auto expr_b = equal(field_ref("b"), literal("2")); + const auto code_a = tree.GetOrInsert(expr_a); + const auto code_b = tree.GetOrInsert(expr_b); + ASSERT_EQ(expr_a, + tree.GetSubtreeExpression(SubtreeImpl::Encoded{util::nullopt, {code_a}})); + ASSERT_EQ(expr_b, tree.GetSubtreeExpression( + SubtreeImpl::Encoded{util::nullopt, {code_a, code_b}})); +} + +class FakeFragment { + public: + explicit FakeFragment(Expression partition_expression) + : partition_expression_(partition_expression) {} + const Expression& partition_expression() const { return partition_expression_; } + + private: + Expression partition_expression_; +}; + +TEST(Subtree, EncodeFragments) { + const auto expr_a = + and_(equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2"))); + const auto expr_b = + and_(equal(field_ref("a"), literal("2")), equal(field_ref("b"), literal("3"))); + std::vector> fragments; + fragments.push_back(std::make_shared(expr_a)); + fragments.push_back(std::make_shared(expr_b)); + + SubtreeImpl tree; + auto encoded = tree.EncodeGuarantees( + [&](int index) { return fragments[index]->partition_expression(); }, + static_cast(fragments.size())); + EXPECT_THAT( + tree.code_to_expr_, + ContainerEq(std::vector{ + equal(field_ref("a"), literal("1")), equal(field_ref("b"), literal("2")), + equal(field_ref("a"), literal("2")), equal(field_ref("b"), literal("3"))})); + EXPECT_THAT( + encoded, + testing::UnorderedElementsAreArray({ + SubtreeImpl::Encoded{util::make_optional(0), + SubtreeImpl::expression_codes({0, 1})}, + SubtreeImpl::Encoded{util::make_optional(1), + SubtreeImpl::expression_codes({2, 3})}, + 
SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0})}, + SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2})}, + SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({0, 1})}, + SubtreeImpl::Encoded{util::nullopt, SubtreeImpl::expression_codes({2, 3})}, + })); +} +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/test_util.cc b/cpp/src/arrow/compute/exec/test_util.cc new file mode 100644 index 00000000000..b47d6087c0b --- /dev/null +++ b/cpp/src/arrow/compute/exec/test_util.cc @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/compute/exec/test_util.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/datum.h" +#include "arrow/record_batch.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/async_generator.h" +#include "arrow/util/iterator.h" +#include "arrow/util/logging.h" +#include "arrow/util/optional.h" +#include "arrow/util/vector.h" + +namespace arrow { + +using internal::Executor; + +namespace compute { +namespace { + +struct DummyNode : ExecNode { + DummyNode(ExecPlan* plan, std::string label, NodeVector inputs, int num_outputs, + StartProducingFunc start_producing, StopProducingFunc stop_producing) + : ExecNode(plan, std::move(label), std::move(inputs), {}, dummy_schema(), + num_outputs), + start_producing_(std::move(start_producing)), + stop_producing_(std::move(stop_producing)) { + input_labels_.resize(inputs_.size()); + for (size_t i = 0; i < input_labels_.size(); ++i) { + input_labels_[i] = std::to_string(i); + } + } + + const char* kind_name() override { return "Dummy"; } + + void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) override {} + + void ErrorReceived(ExecNode* input, Status error) override {} + + void InputFinished(ExecNode* input, int seq_stop) override {} + + Status StartProducing() override { + if (start_producing_) { + RETURN_NOT_OK(start_producing_(this)); + } + started_ = true; + return Status::OK(); + } + + void PauseProducing(ExecNode* output) override { + ASSERT_GE(num_outputs(), 0) << "Sink nodes should not experience backpressure"; + AssertIsOutput(output); + } + + void ResumeProducing(ExecNode* output) override { + ASSERT_GE(num_outputs(), 0) << "Sink nodes should not experience backpressure"; + AssertIsOutput(output); + } + + void StopProducing(ExecNode* output) override { + EXPECT_GE(num_outputs(), 0) << "Sink nodes should 
not experience backpressure"; + AssertIsOutput(output); + } + + void StopProducing() override { + if (started_) { + for (const auto& input : inputs_) { + input->StopProducing(this); + } + if (stop_producing_) { + stop_producing_(this); + } + } + } + + Future<> finished() override { return Future<>::MakeFinished(); } + + private: + void AssertIsOutput(ExecNode* output) { + auto it = std::find(outputs_.begin(), outputs_.end(), output); + ASSERT_NE(it, outputs_.end()); + } + + std::shared_ptr dummy_schema() const { + return schema({field("dummy", null())}); + } + + StartProducingFunc start_producing_; + StopProducingFunc stop_producing_; + std::unordered_set requested_stop_; + bool started_ = false; +}; + +} // namespace + +ExecNode* MakeDummyNode(ExecPlan* plan, std::string label, std::vector inputs, + int num_outputs, StartProducingFunc start_producing, + StopProducingFunc stop_producing) { + return plan->EmplaceNode(plan, std::move(label), std::move(inputs), + num_outputs, std::move(start_producing), + std::move(stop_producing)); +} + +ExecBatch ExecBatchFromJSON(const std::vector& descrs, + util::string_view json) { + auto fields = ::arrow::internal::MapVector( + [](const ValueDescr& descr) { return field("", descr.type); }, descrs); + + ExecBatch batch{*RecordBatchFromJSON(schema(std::move(fields)), json)}; + + auto value_it = batch.values.begin(); + for (const auto& descr : descrs) { + if (descr.shape == ValueDescr::SCALAR) { + if (batch.length == 0) { + *value_it = MakeNullScalar(value_it->type()); + } else { + *value_it = value_it->make_array()->GetScalar(0).ValueOrDie(); + } + } + ++value_it; + } + + return batch; +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/test_util.h b/cpp/src/arrow/compute/exec/test_util.h new file mode 100644 index 00000000000..faa395bab78 --- /dev/null +++ b/cpp/src/arrow/compute/exec/test_util.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/exec/exec_plan.h" +#include "arrow/testing/visibility.h" +#include "arrow/util/string_view.h" + +namespace arrow { +namespace compute { + +using StartProducingFunc = std::function; +using StopProducingFunc = std::function; + +// Make a dummy node that has no execution behaviour +ARROW_TESTING_EXPORT +ExecNode* MakeDummyNode(ExecPlan* plan, std::string label, std::vector inputs, + int num_outputs, StartProducingFunc = {}, StopProducingFunc = {}); + +ARROW_TESTING_EXPORT +ExecBatch ExecBatchFromJSON(const std::vector& descrs, + util::string_view json); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc new file mode 100644 index 00000000000..a44676c2f0d --- /dev/null +++ b/cpp/src/arrow/compute/exec/util.cc @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/exec/util.h" + +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" + +namespace arrow { + +using BitUtil::CountTrailingZeros; + +namespace util { + +inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index, + int* num_indexes, uint16_t* indexes) { + int n = *num_indexes; + while (word) { + indexes[n++] = base_index + static_cast(CountTrailingZeros(word)); + word &= word - 1; + } + *num_indexes = n; +} + +inline void BitUtil::bits_filter_indexes_helper(uint64_t word, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes) { + int n = *num_indexes; + while (word) { + indexes[n++] = input_indexes[CountTrailingZeros(word)]; + word &= word - 1; + } + *num_indexes = n; +} + +template +void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes) { + // 64 bits at a time + constexpr int unroll = 64; + int tail = num_bits % unroll; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + if (filter_input_indexes) { + bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, + num_indexes, indexes); + } else { + bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes); + } + } else { +#endif + 
*num_indexes = 0; + for (int i = 0; i < num_bits / unroll; ++i) { + uint64_t word = util::SafeLoad(&reinterpret_cast(bits)[i]); + if (bit_to_search == 0) { + word = ~word; + } + if (filter_input_indexes) { + bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes); + } else { + bits_to_indexes_helper(word, i * 64, num_indexes, indexes); + } + } +#if defined(ARROW_HAVE_AVX2) + } +#endif + // Optionally process the last partial word with masking out bits outside range + if (tail) { + uint64_t word = + util::SafeLoad(&reinterpret_cast(bits)[num_bits / unroll]); + if (bit_to_search == 0) { + word = ~word; + } + word &= ~0ULL >> (64 - tail); + if (filter_input_indexes) { + bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes, + indexes); + } else { + bits_to_indexes_helper(word, num_bits - tail, num_indexes, indexes); + } + } +} + +void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, int* num_indexes, + uint16_t* indexes, int bit_offset) { + bits += bit_offset / 8; + bit_offset %= 8; + if (bit_offset != 0) { + int num_indexes_head = 0; + uint64_t bits_head = + util::SafeLoad(reinterpret_cast(bits)) >> bit_offset; + int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); + bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte, + reinterpret_cast(&bits_head), &num_indexes_head, + indexes); + int num_indexes_tail = 0; + if (num_bits > bits_in_first_byte) { + bits_to_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte, + bits + 1, &num_indexes_tail, indexes + num_indexes_head); + } + *num_indexes = num_indexes_head + num_indexes_tail; + return; + } + + if (bit_to_search == 0) { + bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr, + num_indexes, indexes); + } else { + ARROW_DCHECK(bit_to_search == 1); + bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr, + num_indexes, indexes); + } +} 
+ +void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, int* num_indexes, + uint16_t* indexes, int bit_offset) { + bits += bit_offset / 8; + bit_offset %= 8; + if (bit_offset != 0) { + int num_indexes_head = 0; + uint64_t bits_head = + util::SafeLoad(reinterpret_cast(bits)) >> bit_offset; + int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); + bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte, + reinterpret_cast(&bits_head), input_indexes, + &num_indexes_head, indexes); + int num_indexes_tail = 0; + if (num_bits > bits_in_first_byte) { + bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte, + bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail, + indexes + num_indexes_head); + } + *num_indexes = num_indexes_head + num_indexes_tail; + return; + } + + if (bit_to_search == 0) { + bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes, + num_indexes, indexes); + } else { + ARROW_DCHECK(bit_to_search == 1); + bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes, + num_indexes, indexes); + } +} + +void BitUtil::bits_split_indexes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, int* num_indexes_bit0, + uint16_t* indexes_bit0, uint16_t* indexes_bit1, + int bit_offset) { + bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0, + bit_offset); + int num_indexes_bit1; + bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1, + bit_offset); +} + +void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, uint8_t* bytes, int bit_offset) { + bits += bit_offset / 8; + bit_offset %= 8; + if (bit_offset != 0) { + uint64_t bits_head = + util::SafeLoad(reinterpret_cast(bits)) >> bit_offset; + int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); + 
bits_to_bytes(hardware_flags, bits_in_first_byte, + reinterpret_cast(&bits_head), bytes); + if (num_bits > bits_in_first_byte) { + bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1, + bytes + bits_in_first_byte); + } + return; + } + + int num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + // The function call below processes whole 32 bit chunks together. + num_processed = num_bits - (num_bits % 32); + bits_to_bytes_avx2(num_processed, bits, bytes); + } +#endif + // Processing 8 bits at a time + constexpr int unroll = 8; + for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) { + uint8_t bits_next = bits[i]; + // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart + // from the previous. + uint64_t unpacked = static_cast(bits_next & 0xfe) * + ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | + (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); + unpacked |= (bits_next & 1); + unpacked &= 0x0101010101010101ULL; + unpacked *= 255; + util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); + } +} + +void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits, + const uint8_t* bytes, uint8_t* bits, int bit_offset) { + bits += bit_offset / 8; + bit_offset %= 8; + if (bit_offset != 0) { + uint64_t bits_head; + int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); + bytes_to_bits(hardware_flags, bits_in_first_byte, bytes, + reinterpret_cast(&bits_head)); + uint8_t mask = (1 << bit_offset) - 1; + *bits = static_cast((*bits & mask) | (bits_head << bit_offset)); + + if (num_bits > bits_in_first_byte) { + bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte, + bytes + bits_in_first_byte, bits + 1); + } + return; + } + + int num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + // The function call below processes whole 32 bit chunks together. 
+ num_processed = num_bits - (num_bits % 32); + bytes_to_bits_avx2(num_processed, bytes, bits); + } +#endif + // Process 8 bits at a time + constexpr int unroll = 8; + for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) { + uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); + bytes_next &= 0x0101010101010101ULL; + bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes + bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes + bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte + bits[i] = static_cast(bytes_next & 0xff); + } +} + +bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, + uint32_t num_bytes) { +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + return are_all_bytes_zero_avx2(bytes, num_bytes); + } +#endif + uint64_t result_or = 0; + uint32_t i; + for (i = 0; i < num_bytes / 8; ++i) { + uint64_t x = util::SafeLoad(&reinterpret_cast(bytes)[i]); + result_or |= x; + } + if (num_bytes % 8 > 0) { + uint64_t tail = 0; + result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8); + } + return result_or == 0; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h new file mode 100644 index 00000000000..d8248ceacab --- /dev/null +++ b/cpp/src/arrow/compute/exec/util.h @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/cpu_info.h" +#include "arrow/util/logging.h" + +#if defined(__clang__) || defined(__GNUC__) +#define BYTESWAP(x) __builtin_bswap64(x) +#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#elif defined(_MSC_VER) +#include +#define BYTESWAP(x) _byteswap_uint64(x) +#define ROTL(x, n) _rotl((x), (n)) +#endif + +namespace arrow { +namespace util { + +// Some platforms typedef int64_t as long int instead of long long int, +// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics +// which need long long. +// We use the cast to the type below in these intrinsics to make the code +// compile in all cases. +// +using int64_for_gather_t = const long long int; // NOLINT runtime-int + +/// Storage used to allocate temporary vectors of a batch size. +/// Temporary vectors should resemble allocating temporary variables on the stack +/// but in the context of vectorized processing where we need to store a vector of +/// temporaries instead of a single value. 
+class TempVectorStack { + template + friend class TempVectorHolder; + + public: + Status Init(MemoryPool* pool, int64_t size) { + num_vectors_ = 0; + top_ = 0; + buffer_size_ = size; + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); + buffer_ = std::move(buffer); + return Status::OK(); + } + + private: + int64_t PaddedAllocationSize(int64_t num_bytes) { + // Round up allocation size to multiple of 8 bytes + // to avoid returning temp vectors with unaligned address. + // + // Also add padding at the end to facilitate loads and stores + // using SIMD when number of vector elements is not divisible + // by the number of SIMD lanes. + // + return ::arrow::BitUtil::RoundUp(num_bytes, sizeof(int64_t)) + padding; + } + void alloc(uint32_t num_bytes, uint8_t** data, int* id) { + int64_t old_top = top_; + top_ += PaddedAllocationSize(num_bytes); + // Stack overflow check + ARROW_DCHECK(top_ <= buffer_size_); + *data = buffer_->mutable_data() + old_top; + *id = num_vectors_++; + } + void release(int id, uint32_t num_bytes) { + ARROW_DCHECK(num_vectors_ == id + 1); + int64_t size = PaddedAllocationSize(num_bytes); + ARROW_DCHECK(top_ >= size); + top_ -= size; + --num_vectors_; + } + static constexpr int64_t padding = 64; + int num_vectors_; + int64_t top_; + std::unique_ptr buffer_; + int64_t buffer_size_; +}; + +template +class TempVectorHolder { + friend class TempVectorStack; + + public: + ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } + T* mutable_data() { return reinterpret_cast(data_); } + TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { + stack_ = stack; + num_elements_ = num_elements; + stack_->alloc(num_elements * sizeof(T), &data_, &id_); + } + + private: + TempVectorStack* stack_; + uint8_t* data_; + int id_; + uint32_t num_elements_; +}; + +class BitUtil { + public: + static void bits_to_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, int* 
num_indexes, + uint16_t* indexes, int bit_offset = 0); + + static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, int* num_indexes, + uint16_t* indexes, int bit_offset = 0); + + // Input and output indexes may be pointing to the same data (in-place filtering). + static void bits_split_indexes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, int* num_indexes_bit0, + uint16_t* indexes_bit0, uint16_t* indexes_bit1, + int bit_offset = 0); + + // Bit 1 is replaced with byte 0xFF. + static void bits_to_bytes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); + + // Return highest bit of each byte. + static void bytes_to_bits(int64_t hardware_flags, const int num_bits, + const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); + + static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, + uint32_t num_bytes); + + private: + inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index, + int* num_indexes, uint16_t* indexes); + inline static void bits_filter_indexes_helper(uint64_t word, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); + template + static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); + +#if defined(ARROW_HAVE_AVX2) + static void bits_to_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, int* num_indexes, + uint16_t* indexes); + static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); + template + static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, + int* num_indexes, uint16_t* indexes); + template + static void bits_filter_indexes_imp_avx2(const int 
num_bits, const uint8_t* bits, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); + static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes); + static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits); + static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); +#endif +}; + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec/util_avx2.cc b/cpp/src/arrow/compute/exec/util_avx2.cc new file mode 100644 index 00000000000..8cf0104db46 --- /dev/null +++ b/cpp/src/arrow/compute/exec/util_avx2.cc @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/compute/exec/util.h" +#include "arrow/util/bit_util.h" + +namespace arrow { +namespace util { + +#if defined(ARROW_HAVE_AVX2) + +void BitUtil::bits_to_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, int* num_indexes, + uint16_t* indexes) { + if (bit_to_search == 0) { + bits_to_indexes_imp_avx2<0>(num_bits, bits, num_indexes, indexes); + } else { + ARROW_DCHECK(bit_to_search == 1); + bits_to_indexes_imp_avx2<1>(num_bits, bits, num_indexes, indexes); + } +} + +template +void BitUtil::bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, + int* num_indexes, uint16_t* indexes) { + // 64 bits at a time + constexpr int unroll = 64; + + // The caller takes care of processing the remaining bits at the end outside of the + // multiples of 64 + ARROW_DCHECK(num_bits % unroll == 0); + + constexpr uint64_t kEachByteIs1 = 0X0101010101010101ULL; + constexpr uint64_t kEachByteIs8 = 0x0808080808080808ULL; + constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; + + uint8_t byte_indexes[64]; + const uint64_t incr = kEachByteIs8; + const uint64_t mask = kByteSequence0To7; + *num_indexes = 0; + for (int i = 0; i < num_bits / unroll; ++i) { + uint64_t word = reinterpret_cast(bits)[i]; + if (bit_to_search == 0) { + word = ~word; + } + uint64_t base = 0; + int num_indexes_loop = 0; + while (word) { + uint64_t byte_indexes_next = + _pext_u64(mask, _pdep_u64(word, kEachByteIs1) * 0xff) + base; + *reinterpret_cast(byte_indexes + num_indexes_loop) = byte_indexes_next; + base += incr; + num_indexes_loop += static_cast(arrow::BitUtil::PopCount(word & 0xff)); + word >>= 8; + } + // Unpack indexes to 16-bits and either add the base of i * 64 or shuffle input + // indexes + for (int j = 0; j < (num_indexes_loop + 15) / 16; ++j) { + __m256i output = _mm256_cvtepi8_epi16( + _mm_loadu_si128(reinterpret_cast(byte_indexes) + j)); + output = _mm256_add_epi16(output, _mm256_set1_epi16(i * 64)); + 
_mm256_storeu_si256(((__m256i*)(indexes + *num_indexes)) + j, output); + } + *num_indexes += num_indexes_loop; + } +} + +void BitUtil::bits_filter_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes) { + if (bit_to_search == 0) { + bits_filter_indexes_imp_avx2<0>(num_bits, bits, input_indexes, num_indexes, indexes); + } else { + bits_filter_indexes_imp_avx2<1>(num_bits, bits, input_indexes, num_indexes, indexes); + } +} + +template +void BitUtil::bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, + int* out_num_indexes, uint16_t* indexes) { + // 64 bits at a time + constexpr int unroll = 64; + + // The caller takes care of processing the remaining bits at the end outside of the + // multiples of 64 + ARROW_DCHECK(num_bits % unroll == 0); + + constexpr uint64_t kRepeatedBitPattern0001 = 0x1111111111111111ULL; + constexpr uint64_t k4BitSequence0To15 = 0xfedcba9876543210ULL; + constexpr uint64_t kByteSequence_0_0_1_1_2_2_3_3 = 0x0303020201010000ULL; + constexpr uint64_t kByteSequence_4_4_5_5_6_6_7_7 = 0x0707060605050404ULL; + constexpr uint64_t kByteSequence_0_2_4_6_8_10_12_14 = 0x0e0c0a0806040200ULL; + constexpr uint64_t kByteSequence_1_3_5_7_9_11_13_15 = 0x0f0d0b0907050301ULL; + constexpr uint64_t kByteSequence_0_8_1_9_2_10_3_11 = 0x0b030a0209010800ULL; + constexpr uint64_t kByteSequence_4_12_5_13_6_14_7_15 = 0x0f070e060d050c04ULL; + + const uint64_t mask = k4BitSequence0To15; + int num_indexes = 0; + for (int i = 0; i < num_bits / unroll; ++i) { + uint64_t word = reinterpret_cast(bits)[i]; + if (bit_to_search == 0) { + word = ~word; + } + + int loop_id = 0; + while (word) { + uint64_t indexes_4bit = + _pext_u64(mask, _pdep_u64(word, kRepeatedBitPattern0001) * 0xf); + // Unpack 4 bit indexes to 8 bits + __m256i indexes_8bit = _mm256_set1_epi64x(indexes_4bit); + indexes_8bit = _mm256_shuffle_epi8( + indexes_8bit, + 
_mm256_setr_epi64x(kByteSequence_0_0_1_1_2_2_3_3, kByteSequence_4_4_5_5_6_6_7_7, + kByteSequence_0_0_1_1_2_2_3_3, + kByteSequence_4_4_5_5_6_6_7_7)); + indexes_8bit = _mm256_blendv_epi8( + _mm256_and_si256(indexes_8bit, _mm256_set1_epi8(0x0f)), + _mm256_and_si256(_mm256_srli_epi32(indexes_8bit, 4), _mm256_set1_epi8(0x0f)), + _mm256_set1_epi16(static_cast(0xff00))); + __m256i input = + _mm256_loadu_si256(((const __m256i*)input_indexes) + 4 * i + loop_id); + // Shuffle bytes to get low bytes in the first 128-bit lane and high bytes in the + // second + input = _mm256_shuffle_epi8( + input, _mm256_setr_epi64x( + kByteSequence_0_2_4_6_8_10_12_14, kByteSequence_1_3_5_7_9_11_13_15, + kByteSequence_0_2_4_6_8_10_12_14, kByteSequence_1_3_5_7_9_11_13_15)); + input = _mm256_permute4x64_epi64(input, 0xd8); // 0b11011000 + // Apply permutation + __m256i output = _mm256_shuffle_epi8(input, indexes_8bit); + // Move low and high bytes across 128-bit lanes to assemble back 16-bit indexes. + // (This is the reverse of the byte permutation we did on the input) + output = _mm256_permute4x64_epi64(output, + 0xd8); // The reverse of swapping 2nd and 3rd + // 64-bit element is the same permutation + output = _mm256_shuffle_epi8(output, + _mm256_setr_epi64x(kByteSequence_0_8_1_9_2_10_3_11, + kByteSequence_4_12_5_13_6_14_7_15, + kByteSequence_0_8_1_9_2_10_3_11, + kByteSequence_4_12_5_13_6_14_7_15)); + _mm256_storeu_si256((__m256i*)(indexes + num_indexes), output); + num_indexes += static_cast(arrow::BitUtil::PopCount(word & 0xffff)); + word >>= 16; + ++loop_id; + } + } + + *out_num_indexes = num_indexes; +} + +void BitUtil::bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, + uint8_t* bytes) { + constexpr int unroll = 32; + + constexpr uint64_t kEachByteIs1 = 0x0101010101010101ULL; + constexpr uint64_t kEachByteIs2 = 0x0202020202020202ULL; + constexpr uint64_t kEachByteIs3 = 0x0303030303030303ULL; + constexpr uint64_t kByteSequencePowersOf2 = 0x8040201008040201ULL; + + // 
Processing 32 bits at a time + for (int i = 0; i < num_bits / unroll; ++i) { + __m256i unpacked = _mm256_set1_epi32(reinterpret_cast(bits)[i]); + unpacked = _mm256_shuffle_epi8( + unpacked, _mm256_setr_epi64x(0ULL, kEachByteIs1, kEachByteIs2, kEachByteIs3)); + __m256i bits_in_bytes = _mm256_set1_epi64x(kByteSequencePowersOf2); + unpacked = + _mm256_cmpeq_epi8(bits_in_bytes, _mm256_and_si256(unpacked, bits_in_bytes)); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(bytes) + i, unpacked); + } +} + +void BitUtil::bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, + uint8_t* bits) { + constexpr int unroll = 32; + // Processing 32 bits at a time + for (int i = 0; i < num_bits / unroll; ++i) { + reinterpret_cast(bits)[i] = _mm256_movemask_epi8( + _mm256_loadu_si256(reinterpret_cast(bytes) + i)); + } +} + +bool BitUtil::are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) { + __m256i result_or = _mm256_setzero_si256(); + uint32_t i; + for (i = 0; i < num_bytes / 32; ++i) { + __m256i x = _mm256_loadu_si256(reinterpret_cast(bytes) + i); + result_or = _mm256_or_si256(result_or, x); + } + uint32_t result_or32 = _mm256_movemask_epi8(result_or); + if (num_bytes % 32 > 0) { + uint64_t tail[4] = {0, 0, 0, 0}; + result_or32 |= memcmp(bytes + i * 32, tail, num_bytes % 32); + } + return result_or32 == 0; +} + +#endif // ARROW_HAVE_AVX2 + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index e9bd57596b5..2c145dadaeb 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -31,6 +31,7 @@ #include "arrow/compute/exec.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/function.h" +#include "arrow/compute/function_internal.h" #include "arrow/compute/kernel.h" #include "arrow/compute/registry.h" #include "arrow/memory_pool.h" @@ -50,6 +51,10 @@ using internal::checked_cast; namespace compute { namespace detail { +using 
::arrow::internal::BitmapEquals; +using ::arrow::internal::CopyBitmap; +using ::arrow::internal::CountSetBits; + TEST(ExecContext, BasicWorkings) { { ExecContext ctx; @@ -58,13 +63,13 @@ TEST(ExecContext, BasicWorkings) { ASSERT_EQ(std::numeric_limits::max(), ctx.exec_chunksize()); ASSERT_TRUE(ctx.use_threads()); - ASSERT_EQ(internal::CpuInfo::GetInstance(), ctx.cpu_info()); + ASSERT_EQ(arrow::internal::CpuInfo::GetInstance(), ctx.cpu_info()); } // Now, let's customize all the things LoggingMemoryPool my_pool(default_memory_pool()); std::unique_ptr custom_reg = FunctionRegistry::Make(); - ExecContext ctx(&my_pool, custom_reg.get()); + ExecContext ctx(&my_pool, /*executor=*/nullptr, custom_reg.get()); ASSERT_EQ(custom_reg.get(), ctx.func_registry()); ASSERT_EQ(&my_pool, ctx.memory_pool()); @@ -277,9 +282,9 @@ TEST_F(TestPropagateNulls, SingleValueWithNulls) { ASSERT_EQ(arr->Slice(offset)->null_count(), output.GetNullCount()); - ASSERT_TRUE(internal::BitmapEquals(output.buffers[0]->data(), output.offset, - sliced->null_bitmap_data(), sliced->offset(), - output.length)); + ASSERT_TRUE(BitmapEquals(output.buffers[0]->data(), output.offset, + sliced->null_bitmap_data(), sliced->offset(), + output.length)); AssertValidityZeroExtraBits(output); }; @@ -372,8 +377,8 @@ TEST_F(TestPropagateNulls, IntersectsNulls) { const auto& out_buffer = *output.buffers[0]; - ASSERT_TRUE(internal::BitmapEquals(out_buffer.data(), output_offset, ex_bitmap, - /*ex_offset=*/0, length)); + ASSERT_TRUE(BitmapEquals(out_buffer.data(), output_offset, ex_bitmap, + /*ex_offset=*/0, length)); // Now check that the rest of the bits in out_buffer are still 0 AssertValidityZeroExtraBits(output); @@ -537,7 +542,7 @@ TEST_F(TestExecBatchIterator, ZeroLengthInputs) { // ---------------------------------------------------------------------- // Scalar function execution -void ExecCopy(KernelContext*, const ExecBatch& batch, Datum* out) { +Status ExecCopy(KernelContext*, const ExecBatch& batch, Datum* out) { 
DCHECK_EQ(1, batch.num_values()); const auto& type = checked_cast(*batch[0].type()); int value_size = type.bit_width() / 8; @@ -547,27 +552,27 @@ void ExecCopy(KernelContext*, const ExecBatch& batch, Datum* out) { uint8_t* dst = out_arr->buffers[1]->mutable_data() + out_arr->offset * value_size; const uint8_t* src = arg0.buffers[1]->data() + arg0.offset * value_size; std::memcpy(dst, src, batch.length * value_size); + return Status::OK(); } -void ExecComputedBitmap(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecComputedBitmap(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // Propagate nulls not used. Check that the out bitmap isn't the same already // as the input bitmap const ArrayData& arg0 = *batch[0].array(); ArrayData* out_arr = out->mutable_array(); - if (internal::CountSetBits(arg0.buffers[0]->data(), arg0.offset, batch.length) > 0) { + if (CountSetBits(arg0.buffers[0]->data(), arg0.offset, batch.length) > 0) { // Check that the bitmap has not been already copied over - DCHECK(!internal::BitmapEquals(arg0.buffers[0]->data(), arg0.offset, - out_arr->buffers[0]->data(), out_arr->offset, - batch.length)); + DCHECK(!BitmapEquals(arg0.buffers[0]->data(), arg0.offset, + out_arr->buffers[0]->data(), out_arr->offset, batch.length)); } - internal::CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, - out_arr->buffers[0]->mutable_data(), out_arr->offset); - ExecCopy(ctx, batch, out); + CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, + out_arr->buffers[0]->mutable_data(), out_arr->offset); + return ExecCopy(ctx, batch, out); } -void ExecNoPreallocatedData(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecNoPreallocatedData(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // Validity preallocated, but not the data ArrayData* out_arr = out->mutable_array(); DCHECK_EQ(0, out_arr->offset); @@ -575,26 +580,44 @@ void ExecNoPreallocatedData(KernelContext* ctx, const ExecBatch& batch, 
Datum* o int value_size = type.bit_width() / 8; Status s = (ctx->Allocate(out_arr->length * value_size).Value(&out_arr->buffers[1])); DCHECK_OK(s); - ExecCopy(ctx, batch, out); + return ExecCopy(ctx, batch, out); } -void ExecNoPreallocatedAnything(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecNoPreallocatedAnything(KernelContext* ctx, const ExecBatch& batch, + Datum* out) { // Neither validity nor data preallocated ArrayData* out_arr = out->mutable_array(); DCHECK_EQ(0, out_arr->offset); Status s = (ctx->AllocateBitmap(out_arr->length).Value(&out_arr->buffers[0])); DCHECK_OK(s); const ArrayData& arg0 = *batch[0].array(); - internal::CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, - out_arr->buffers[0]->mutable_data(), /*offset=*/0); + CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, + out_arr->buffers[0]->mutable_data(), /*offset=*/0); // Reuse the kernel that allocates the data - ExecNoPreallocatedData(ctx, batch, out); + return ExecNoPreallocatedData(ctx, batch, out); } -struct ExampleOptions : public FunctionOptions { +class ExampleOptionsType : public FunctionOptionsType { + public: + static const FunctionOptionsType* GetInstance() { + static std::unique_ptr instance(new ExampleOptionsType()); + return instance.get(); + } + const char* type_name() const override { return "example"; } + std::string Stringify(const FunctionOptions& options) const override { + return type_name(); + } + bool Compare(const FunctionOptions& options, + const FunctionOptions& other) const override { + return true; + } +}; +class ExampleOptions : public FunctionOptions { + public: + explicit ExampleOptions(std::shared_ptr value) + : FunctionOptions(ExampleOptionsType::GetInstance()), value(std::move(value)) {} std::shared_ptr value; - explicit ExampleOptions(std::shared_ptr value) : value(std::move(value)) {} }; struct ExampleState : public KernelState { @@ -602,12 +625,13 @@ struct ExampleState : public KernelState { explicit 
ExampleState(std::shared_ptr value) : value(std::move(value)) {} }; -std::unique_ptr InitStateful(KernelContext*, const KernelInitArgs& args) { +Result> InitStateful(KernelContext*, + const KernelInitArgs& args) { auto func_options = static_cast(args.options); return std::unique_ptr(new ExampleState{func_options->value}); } -void ExecStateful(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecStateful(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // We take the value from the state and multiply the data in batch[0] with it ExampleState* state = static_cast(ctx->state()); int32_t multiplier = checked_cast(*state->value).value; @@ -619,12 +643,14 @@ void ExecStateful(KernelContext* ctx, const ExecBatch& batch, Datum* out) { for (int64_t i = 0; i < arg0.length; ++i) { dst[i] = arg0_data[i] * multiplier; } + return Status::OK(); } -void ExecAddInt32(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecAddInt32(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const Int32Scalar& arg0 = batch[0].scalar_as(); const Int32Scalar& arg1 = batch[1].scalar_as(); out->value = std::make_shared(arg0.value + arg1.value); + return Status::OK(); } class TestCallScalarFunction : public TestComputeInternals { diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index c8fc8b8dec0..05d14d03b16 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -21,10 +21,13 @@ #include #include +#include "arrow/compute/api_scalar.h" #include "arrow/compute/cast.h" #include "arrow/compute/exec.h" #include "arrow/compute/exec_internal.h" +#include "arrow/compute/function_internal.h" #include "arrow/compute/kernels/common.h" +#include "arrow/compute/registry.h" #include "arrow/datum.h" #include "arrow/util/cpu_info.h" @@ -33,6 +36,38 @@ namespace arrow { using internal::checked_cast; namespace compute { +Result> FunctionOptionsType::Serialize( + const FunctionOptions&) const { + 
return Status::NotImplemented("Serialize for ", type_name()); +} + +Result> FunctionOptionsType::Deserialize( + const Buffer& buffer) const { + return Status::NotImplemented("Deserialize for ", type_name()); +} + +std::string FunctionOptions::ToString() const { return options_type()->Stringify(*this); } + +bool FunctionOptions::Equals(const FunctionOptions& other) const { + if (this == &other) return true; + if (options_type() != other.options_type()) return false; + return options_type()->Compare(*this, other); +} + +Result> FunctionOptions::Serialize() const { + return options_type()->Serialize(*this); +} + +Result> FunctionOptions::Deserialize( + const std::string& type_name, const Buffer& buffer) { + ARROW_ASSIGN_OR_RAISE(auto options, + GetFunctionRegistry()->GetFunctionOptionsType(type_name)); + return options->Deserialize(buffer); +} + +void PrintTo(const FunctionOptions& options, std::ostream* os) { + *os << options.ToString(); +} static const FunctionDoc kEmptyFunctionDoc{}; @@ -179,8 +214,7 @@ Result Function::Execute(const std::vector& args, KernelContext kernel_ctx{ctx}; if (kernel->init) { - state = kernel->init(&kernel_ctx, {kernel, inputs, options}); - RETURN_NOT_OK(kernel_ctx.status()); + ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options})); kernel_ctx.SetState(state.get()); } @@ -211,8 +245,9 @@ Status Function::Validate() const { if (arity_.is_varargs && arg_count == arity_.num_args + 1) { return Status::OK(); } - return Status::Invalid("In function '", name_, - "': ", "number of argument names != function arity"); + return Status::Invalid( + "In function '", name_, + "': ", "number of argument names for function documentation != function arity"); } return Status::OK(); } diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 9a3e1c1852f..bd854bbb28e 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -29,6 +29,7 @@ #include "arrow/datum.h" #include 
"arrow/result.h" #include "arrow/status.h" +#include "arrow/util/compare.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -39,12 +40,50 @@ namespace compute { /// /// @{ +/// \brief Extension point for defining options outside libarrow (but +/// still within this project). +class ARROW_EXPORT FunctionOptionsType { + public: + virtual ~FunctionOptionsType() = default; + + virtual const char* type_name() const = 0; + virtual std::string Stringify(const FunctionOptions&) const = 0; + virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; + virtual Result> Serialize(const FunctionOptions&) const; + virtual Result> Deserialize( + const Buffer& buffer) const; +}; + /// \brief Base class for specifying options configuring a function's behavior, /// such as error handling. -struct ARROW_EXPORT FunctionOptions { +class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { + public: virtual ~FunctionOptions() = default; + + const FunctionOptionsType* options_type() const { return options_type_; } + const char* type_name() const { return options_type()->type_name(); } + + bool Equals(const FunctionOptions& other) const; + using util::EqualityComparable::Equals; + using util::EqualityComparable::operator==; + using util::EqualityComparable::operator!=; + std::string ToString() const; + /// \brief Serialize an options struct to a buffer. + Result> Serialize() const; + /// \brief Deserialize an options struct from a buffer. + /// Note: this will only look for `type_name` in the default FunctionRegistry; + /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then + /// call FunctionOptionsType::Deserialize(). 
+ static Result> Deserialize( + const std::string& type_name, const Buffer& buffer); + + protected: + explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} + const FunctionOptionsType* options_type_; }; +ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); + /// \brief Contains the number of required arguments for the function. /// /// Naming conventions taken from https://en.wikipedia.org/wiki/Arity. diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index 5dc305bdd89..a29a766be79 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -19,6 +19,7 @@ #include "arrow/array/array_base.h" #include "arrow/compute/api.h" +#include "arrow/compute/exec_internal.h" #include "arrow/memory_pool.h" #include "arrow/scalar.h" #include "arrow/testing/gtest_util.h" @@ -78,16 +79,17 @@ void BM_CastDispatchBaseline(benchmark::State& state) { ExecContext exec_context; KernelContext kernel_context(&exec_context); - auto cast_state = - cast_kernel->init(&kernel_context, {cast_kernel, {double_type}, &cast_options}); - ABORT_NOT_OK(kernel_context.status()); + auto cast_state = cast_kernel + ->init(&kernel_context, + KernelInitArgs{cast_kernel, {double_type}, &cast_options}) + .ValueOrDie(); kernel_context.SetState(cast_state.get()); for (auto _ : state) { Datum timestamp_scalar = MakeNullScalar(double_type); for (Datum int_scalar : int_scalars) { - exec(&kernel_context, {{std::move(int_scalar)}, 1}, ×tamp_scalar); - ABORT_NOT_OK(kernel_context.status()); + ABORT_NOT_OK( + exec(&kernel_context, {{std::move(int_scalar)}, 1}, ×tamp_scalar)); } benchmark::DoNotOptimize(timestamp_scalar); } @@ -164,8 +166,7 @@ void BM_ExecuteScalarKernelOnScalar(benchmark::State& state) { int64_t total = 0; for (const auto& scalar : scalars) { Datum result{MakeNullScalar(int64())}; - exec(&kernel_context, ExecBatch{{scalar}, /*length=*/1}, &result); - 
ABORT_NOT_OK(kernel_context.status()); + ABORT_NOT_OK(exec(&kernel_context, ExecBatch{{scalar}, /*length=*/1}, &result)); total += result.scalar()->is_valid; } benchmark::DoNotOptimize(total); @@ -174,11 +175,44 @@ void BM_ExecuteScalarKernelOnScalar(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * N); } +void BM_ExecBatchIterator(benchmark::State& state) { + // Measure overhead related to splitting ExecBatch into smaller ExecBatches + // for parallelism or more optimal CPU cache affinity + random::RandomArrayGenerator rag(kSeed); + + const int64_t length = 1 << 20; + const int num_fields = 32; + + std::vector args(num_fields); + for (int i = 0; i < num_fields; ++i) { + args[i] = rag.Int64(length, 0, 100)->data(); + } + + const int64_t blocksize = state.range(0); + for (auto _ : state) { + std::unique_ptr it = + *detail::ExecBatchIterator::Make(args, blocksize); + ExecBatch batch; + while (it->Next(&batch)) { + for (int i = 0; i < num_fields; ++i) { + auto data = batch.values[i].array()->buffers[1]->data(); + benchmark::DoNotOptimize(data); + } + } + benchmark::DoNotOptimize(batch); + } + // Provides comparability across blocksizes by looking at the iterations per + // second. So 1000 iterations/second means that input splitting associated + // with ExecBatchIterator takes up 1ms every time. 
+ state.SetItemsProcessed(state.iterations()); +} + BENCHMARK(BM_CastDispatch); BENCHMARK(BM_CastDispatchBaseline); BENCHMARK(BM_AddDispatch); BENCHMARK(BM_ExecuteScalarFunctionOnScalar); BENCHMARK(BM_ExecuteScalarKernelOnScalar); +BENCHMARK(BM_ExecBatchIterator)->RangeMultiplier(4)->Range(1024, 64 * 1024); } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/function_internal.cc b/cpp/src/arrow/compute/function_internal.cc new file mode 100644 index 00000000000..0a926e0a39c --- /dev/null +++ b/cpp/src/arrow/compute/function_internal.cc @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/compute/function_internal.h" + +#include "arrow/array/util.h" +#include "arrow/compute/function.h" +#include "arrow/compute/registry.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/record_batch.h" +#include "arrow/scalar.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { +namespace compute { +namespace internal { +using ::arrow::internal::checked_cast; + +constexpr char kTypeNameField[] = "_type_name"; + +Result> FunctionOptionsToStructScalar( + const FunctionOptions& options) { + std::vector field_names; + std::vector> values; + const auto* options_type = + dynamic_cast(options.options_type()); + if (!options_type) { + return Status::NotImplemented("serializing ", options.type_name(), + " to StructScalar"); + } + RETURN_NOT_OK(options_type->ToStructScalar(options, &field_names, &values)); + field_names.push_back(kTypeNameField); + const char* options_name = options.type_name(); + values.emplace_back( + new BinaryScalar(Buffer::Wrap(options_name, std::strlen(options_name)))); + return StructScalar::Make(std::move(values), std::move(field_names)); +} + +Result> FunctionOptionsFromStructScalar( + const StructScalar& scalar) { + ARROW_ASSIGN_OR_RAISE(auto type_name_holder, scalar.field(kTypeNameField)); + const std::string type_name = + checked_cast(*type_name_holder).value->ToString(); + ARROW_ASSIGN_OR_RAISE(auto raw_options_type, + GetFunctionRegistry()->GetFunctionOptionsType(type_name)); + const auto* options_type = checked_cast(raw_options_type); + return options_type->FromStructScalar(scalar); +} + +Result> GenericOptionsType::Serialize( + const FunctionOptions& options) const { + ARROW_ASSIGN_OR_RAISE(auto scalar, FunctionOptionsToStructScalar(options)); + ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*scalar, 1)); + auto batch = + RecordBatch::Make(schema({field("", array->type())}), /*num_rows=*/1, {array}); + ARROW_ASSIGN_OR_RAISE(auto stream, 
io::BufferOutputStream::Create()); + ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema())); + RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); + RETURN_NOT_OK(writer->Close()); + return stream->Finish(); +} + +Result> GenericOptionsType::Deserialize( + const Buffer& buffer) const { + return DeserializeFunctionOptions(buffer); +} + +Result> DeserializeFunctionOptions( + const Buffer& buffer) { + io::BufferReader stream(buffer); + ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream)); + ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0)); + if (batch->num_rows() != 1) { + return Status::Invalid( + "serialized FunctionOptions's batch repr was not a single row - had ", + batch->num_rows()); + } + if (batch->num_columns() != 1) { + return Status::Invalid( + "serialized FunctionOptions's batch repr was not a single column - had ", + batch->num_columns()); + } + auto column = batch->column(0); + if (column->type()->id() != Type::STRUCT) { + return Status::Invalid( + "serialized FunctionOptions's batch repr was not a struct column - was ", + column->type()->ToString()); + } + ARROW_ASSIGN_OR_RAISE(auto raw_scalar, + checked_cast(*column).GetScalar(0)); + auto scalar = checked_cast(*raw_scalar); + return FunctionOptionsFromStructScalar(scalar); +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/function_internal.h b/cpp/src/arrow/compute/function_internal.h new file mode 100644 index 00000000000..fdd7f09ba1f --- /dev/null +++ b/cpp/src/arrow/compute/function_internal.h @@ -0,0 +1,626 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/function.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/reflection_internal.h" +#include "arrow/util/string.h" +#include "arrow/util/visibility.h" + +namespace arrow { +struct Scalar; +struct StructScalar; +using ::arrow::internal::checked_cast; + +namespace internal { +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "SortOrder"; } + static std::string value_name(compute::SortOrder value) { + switch (value) { + case compute::SortOrder::Ascending: + return "Ascending"; + case compute::SortOrder::Descending: + return "Descending"; + } + return ""; + } +}; +} // namespace internal + +namespace compute { +namespace internal { + +using arrow::internal::EnumTraits; +using arrow::internal::has_enum_traits; + +template ::type> +Result ValidateEnumValue(CType raw) { + for (auto valid : EnumTraits::values()) { + if (raw == static_cast(valid)) { + return static_cast(raw); + } + } + return Status::Invalid("Invalid value for ", EnumTraits::name(), ": ", raw); +} + +class 
GenericOptionsType : public FunctionOptionsType { + public: + Result> Serialize(const FunctionOptions&) const override; + Result> Deserialize( + const Buffer& buffer) const override; + virtual Status ToStructScalar(const FunctionOptions& options, + std::vector* field_names, + std::vector>* values) const = 0; + virtual Result> FromStructScalar( + const StructScalar& scalar) const = 0; +}; + +ARROW_EXPORT +Result> FunctionOptionsToStructScalar( + const FunctionOptions&); +ARROW_EXPORT +Result> FunctionOptionsFromStructScalar( + const StructScalar&); +ARROW_EXPORT +Result> DeserializeFunctionOptions(const Buffer& buffer); + +template +static inline enable_if_t::value, std::string> GenericToString( + const T& value) { + std::stringstream ss; + ss << value; + return ss.str(); +} + +static inline std::string GenericToString(bool value) { return value ? "true" : "false"; } + +static inline std::string GenericToString(const std::string& value) { + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); +} + +template +static inline enable_if_t::value, std::string> GenericToString( + const T value) { + return EnumTraits::value_name(value); +} + +template +static inline std::string GenericToString(const std::shared_ptr& value) { + std::stringstream ss; + return value ? 
value->ToString() : ""; +} + +static inline std::string GenericToString(const std::shared_ptr& value) { + std::stringstream ss; + ss << value->type->ToString() << ":" << value->ToString(); + return ss.str(); +} + +static inline std::string GenericToString( + const std::shared_ptr& value) { + std::stringstream ss; + ss << "KeyValueMetadata{"; + if (value) { + bool first = true; + for (const auto& pair : value->sorted_pairs()) { + if (!first) ss << ", "; + first = false; + ss << pair.first << ':' << pair.second; + } + } + ss << '}'; + return ss.str(); +} + +static inline std::string GenericToString(const Datum& value) { + switch (value.kind()) { + case Datum::NONE: + return ""; + case Datum::SCALAR: + return GenericToString(value.scalar()); + case Datum::ARRAY: { + std::stringstream ss; + ss << value.type()->ToString() << ':' << value.make_array()->ToString(); + return ss.str(); + } + case Datum::CHUNKED_ARRAY: + case Datum::RECORD_BATCH: + case Datum::TABLE: + case Datum::COLLECTION: + return value.ToString(); + } + return value.ToString(); +} + +template +static inline std::string GenericToString(const std::vector& value) { + std::stringstream ss; + ss << "["; + bool first = true; + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + if (!first) ss << ", "; + first = false; + ss << GenericToString(*it); + } + ss << ']'; + return ss.str(); +} + +static inline std::string GenericToString(SortOrder value) { + switch (value) { + case SortOrder::Ascending: + return "Ascending"; + case SortOrder::Descending: + return "Descending"; + } + return ""; +} + +static inline std::string GenericToString(const std::vector& value) { + std::stringstream ss; + ss << '['; + bool first = true; + for (const auto& key : value) { + if (!first) { + ss << ", "; + } + first = false; + ss << key.ToString(); + } + ss << ']'; + return ss.str(); +} + +template +static inline bool GenericEquals(const T& left, const T& 
right) { + return left == right; +} + +template +static inline bool GenericEquals(const std::shared_ptr& left, + const std::shared_ptr& right) { + if (left && right) { + return left->Equals(*right); + } + return left == right; +} + +static inline bool IsEmpty(const std::shared_ptr& meta) { + return !meta || meta->size() == 0; +} + +static inline bool GenericEquals(const std::shared_ptr& left, + const std::shared_ptr& right) { + // Special case since null metadata is considered equivalent to empty + if (IsEmpty(left) || IsEmpty(right)) { + return IsEmpty(left) && IsEmpty(right); + } + return left->Equals(*right); +} + +template +static inline bool GenericEquals(const std::vector& left, + const std::vector& right) { + if (left.size() != right.size()) return false; + for (size_t i = 0; i < left.size(); i++) { + if (!GenericEquals(left[i], right[i])) return false; + } + return true; +} + +template +static inline decltype(TypeTraits::ArrowType>::type_singleton()) +GenericTypeSingleton() { + return TypeTraits::ArrowType>::type_singleton(); +} + +template +static inline enable_if_same, + std::shared_ptr> +GenericTypeSingleton() { + return map(binary(), binary()); +} + +template +static inline enable_if_t::value, std::shared_ptr> +GenericTypeSingleton() { + return TypeTraits::Type>::type_singleton(); +} + +template +static inline enable_if_same> +GenericTypeSingleton() { + std::vector> fields; + fields.emplace_back(new Field("name", GenericTypeSingleton())); + fields.emplace_back(new Field("order", GenericTypeSingleton())); + return std::make_shared(std::move(fields)); +} + +// N.B. 
ordering of overloads is relatively fragile +template +static inline Result()))> GenericToScalar( + const T& value) { + return MakeScalar(value); +} + +// For Clang/libc++: when iterating through vector, we can't +// pass it by reference so the overload above doesn't apply +static inline Result> GenericToScalar(bool value) { + return MakeScalar(value); +} + +template ::value>> +static inline Result> GenericToScalar(const T value) { + using CType = typename EnumTraits::CType; + return GenericToScalar(static_cast(value)); +} + +static inline Result> GenericToScalar(const SortKey& value) { + ARROW_ASSIGN_OR_RAISE(auto name, GenericToScalar(value.name)); + ARROW_ASSIGN_OR_RAISE(auto order, GenericToScalar(value.order)); + return StructScalar::Make({name, order}, {"name", "order"}); +} + +static inline Result> GenericToScalar( + const std::shared_ptr& value) { + auto ty = GenericTypeSingleton>(); + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder)); + auto* map_builder = checked_cast(builder.get()); + auto* key_builder = checked_cast(map_builder->key_builder()); + auto* item_builder = checked_cast(map_builder->item_builder()); + RETURN_NOT_OK(map_builder->Append()); + if (value) { + RETURN_NOT_OK(key_builder->AppendValues(value->keys())); + RETURN_NOT_OK(item_builder->AppendValues(value->values())); + } + std::shared_ptr arr; + RETURN_NOT_OK(map_builder->Finish(&arr)); + return arr->GetScalar(0); +} + +template +static inline Result> GenericToScalar( + const std::vector& value) { + std::shared_ptr type = GenericTypeSingleton(); + std::vector> scalars; + scalars.reserve(value.size()); + // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis + for (auto it = value.begin(); it != value.end(); it++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, GenericToScalar(*it)); + scalars.push_back(std::move(scalar)); + } + std::unique_ptr builder; + RETURN_NOT_OK( + MakeBuilder(default_memory_pool(), type ? 
type : scalars[0]->type, &builder)); + RETURN_NOT_OK(builder->AppendScalars(scalars)); + std::shared_ptr out; + RETURN_NOT_OK(builder->Finish(&out)); + return std::make_shared(std::move(out)); +} + +static inline Result> GenericToScalar( + const std::shared_ptr& value) { + if (!value) { + return Status::Invalid("shared_ptr is nullptr"); + } + return MakeNullScalar(value); +} + +static inline Result> GenericToScalar( + const std::shared_ptr& value) { + return value; +} + +static inline Result> GenericToScalar( + const std::shared_ptr& value) { + return std::make_shared(value); +} + +static inline Result> GenericToScalar(const Datum& value) { + // TODO(ARROW-9434): store in a union instead. + switch (value.kind()) { + case Datum::ARRAY: + return GenericToScalar(value.make_array()); + break; + default: + return Status::NotImplemented("Cannot serialize Datum kind ", value.kind()); + } +} + +template +static inline enable_if_primitive_ctype::ArrowType, Result> +GenericFromScalar(const std::shared_ptr& value) { + using ArrowType = typename CTypeTraits::ArrowType; + using ScalarType = typename TypeTraits::ScalarType; + if (value->type->id() != ArrowType::type_id) { + return Status::Invalid("Expected type ", ArrowType::type_id, " but got ", + value->type->ToString()); + } + const auto& holder = checked_cast(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + return holder.value; +} + +template +static inline enable_if_primitive_ctype::Type, Result> +GenericFromScalar(const std::shared_ptr& value) { + ARROW_ASSIGN_OR_RAISE(auto raw_val, + GenericFromScalar::CType>(value)); + return ValidateEnumValue(raw_val); +} + +template +using enable_if_same_result = enable_if_same>; + +template +static inline enable_if_same_result GenericFromScalar( + const std::shared_ptr& value) { + if (!is_base_binary_like(value->type->id())) { + return Status::Invalid("Expected binary-like type but got ", value->type->ToString()); + } + const auto& holder = 
checked_cast(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + return holder.value->ToString(); +} + +template +static inline enable_if_same_result GenericFromScalar( + const std::shared_ptr& value) { + if (value->type->id() != Type::STRUCT) { + return Status::Invalid("Expected type STRUCT but got ", value->type->id()); + } + if (!value->is_valid) return Status::Invalid("Got null scalar"); + const auto& holder = checked_cast(*value); + ARROW_ASSIGN_OR_RAISE(auto name_holder, holder.field("name")); + ARROW_ASSIGN_OR_RAISE(auto order_holder, holder.field("order")); + ARROW_ASSIGN_OR_RAISE(auto name, GenericFromScalar(name_holder)); + ARROW_ASSIGN_OR_RAISE(auto order, GenericFromScalar(order_holder)); + return SortKey{std::move(name), order}; +} + +template +static inline enable_if_same_result> GenericFromScalar( + const std::shared_ptr& value) { + return value->type; +} + +template +static inline enable_if_same_result> GenericFromScalar( + const std::shared_ptr& value) { + return value; +} + +template +static inline enable_if_same_result> +GenericFromScalar(const std::shared_ptr& value) { + auto ty = GenericTypeSingleton>(); + if (!value->type->Equals(ty)) { + return Status::Invalid("Expected ", ty->ToString(), " but got ", + value->type->ToString()); + } + const auto& holder = checked_cast(*value); + std::vector keys; + std::vector values; + const auto& list = checked_cast(*holder.value); + const auto& key_arr = checked_cast(*list.field(0)); + const auto& value_arr = checked_cast(*list.field(1)); + for (int64_t i = 0; i < list.length(); i++) { + keys.push_back(key_arr.GetString(i)); + values.push_back(value_arr.GetString(i)); + } + return key_value_metadata(std::move(keys), std::move(values)); +} + +template +static inline enable_if_same_result GenericFromScalar( + const std::shared_ptr& value) { + if (value->type->id() == Type::LIST) { + const auto& holder = checked_cast(*value); + return holder.value; + } + // TODO(ARROW-9434): handle 
other possible datum kinds by looking for a union + return Status::Invalid("Cannot deserialize Datum from ", value->ToString()); +} + +template +static enable_if_same::ArrowType, ListType, Result> +GenericFromScalar(const std::shared_ptr& value) { + using ValueType = typename T::value_type; + if (value->type->id() != Type::LIST) { + return Status::Invalid("Expected type LIST but got ", value->type->ToString()); + } + const auto& holder = checked_cast(*value); + if (!holder.is_valid) return Status::Invalid("Got null scalar"); + std::vector result; + for (int i = 0; i < holder.value->length(); i++) { + ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i)); + ARROW_ASSIGN_OR_RAISE(auto v, GenericFromScalar(scalar)); + result.push_back(std::move(v)); + } + return result; +} + +template +struct StringifyImpl { + template + StringifyImpl(const Options& obj, const Tuple& props) + : obj_(obj), members_(props.size()) { + props.ForEach(*this); + } + + template + void operator()(const Property& prop, size_t i) { + std::stringstream ss; + ss << prop.name() << '=' << GenericToString(prop.get(obj_)); + members_[i] = ss.str(); + } + + std::string Finish() { + return "{" + arrow::internal::JoinStrings(members_, ", ") + "}"; + } + + const Options& obj_; + std::vector members_; +}; + +template +struct CompareImpl { + template + CompareImpl(const Options& l, const Options& r, const Tuple& props) + : left_(l), right_(r) { + props.ForEach(*this); + } + + template + void operator()(const Property& prop, size_t) { + equal_ &= GenericEquals(prop.get(left_), prop.get(right_)); + } + + const Options& left_; + const Options& right_; + bool equal_ = true; +}; + +template +struct ToStructScalarImpl { + template + ToStructScalarImpl(const Options& obj, const Tuple& props, + std::vector* field_names, + std::vector>* values) + : obj_(obj), field_names_(field_names), values_(values) { + props.ForEach(*this); + } + + template + void operator()(const Property& prop, size_t) { + if 
(!status_.ok()) return; + auto result = GenericToScalar(prop.get(obj_)); + if (!result.ok()) { + status_ = result.status().WithMessage("Could not serialize field ", prop.name(), + " of options type ", Options::kTypeName, ": ", + result.status().message()); + return; + } + field_names_->emplace_back(prop.name()); + values_->push_back(result.MoveValueUnsafe()); + } + + const Options& obj_; + Status status_; + std::vector* field_names_; + std::vector>* values_; +}; + +template +struct FromStructScalarImpl { + template + FromStructScalarImpl(Options* obj, const StructScalar& scalar, const Tuple& props) + : obj_(obj), scalar_(scalar) { + props.ForEach(*this); + } + + template + void operator()(const Property& prop, size_t) { + if (!status_.ok()) return; + auto maybe_holder = scalar_.field(std::string(prop.name())); + if (!maybe_holder.ok()) { + status_ = maybe_holder.status().WithMessage( + "Cannot deserialize field ", prop.name(), " of options type ", + Options::kTypeName, ": ", maybe_holder.status().message()); + return; + } + auto holder = maybe_holder.MoveValueUnsafe(); + auto result = GenericFromScalar(holder); + if (!result.ok()) { + status_ = result.status().WithMessage("Cannot deserialize field ", prop.name(), + " of options type ", Options::kTypeName, ": ", + result.status().message()); + return; + } + prop.set(obj_, result.MoveValueUnsafe()); + } + + Options* obj_; + Status status_; + const StructScalar& scalar_; +}; + +template +const FunctionOptionsType* GetFunctionOptionsType(const Properties&... 
properties) { + static const class OptionsType : public GenericOptionsType { + public: + explicit OptionsType(const arrow::internal::PropertyTuple properties) + : properties_(properties) {} + + const char* type_name() const override { return Options::kTypeName; } + + std::string Stringify(const FunctionOptions& options) const override { + const auto& self = checked_cast(options); + return StringifyImpl(self, properties_).Finish(); + } + bool Compare(const FunctionOptions& options, + const FunctionOptions& other) const override { + const auto& lhs = checked_cast(options); + const auto& rhs = checked_cast(other); + return CompareImpl(lhs, rhs, properties_).equal_; + } + Status ToStructScalar(const FunctionOptions& options, + std::vector* field_names, + std::vector>* values) const override { + const auto& self = checked_cast(options); + RETURN_NOT_OK( + ToStructScalarImpl(self, properties_, field_names, values).status_); + return Status::OK(); + } + Result> FromStructScalar( + const StructScalar& scalar) const override { + auto options = std::unique_ptr(new Options()); + RETURN_NOT_OK( + FromStructScalarImpl(options.get(), scalar, properties_).status_); + return std::move(options); + } + + private: + const arrow::internal::PropertyTuple properties_; + } instance(arrow::internal::MakeProperties(properties...)); + return &instance; +} + +} // namespace internal +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index b6f1815b89e..7aca10ef0fa 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -21,16 +21,114 @@ #include +#include "arrow/compute/api_aggregate.h" +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/api_vector.h" +#include "arrow/compute/cast.h" #include "arrow/compute/function.h" #include "arrow/compute/kernel.h" #include "arrow/datum.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" #include 
"arrow/type.h" +#include "arrow/util/key_value_metadata.h" namespace arrow { namespace compute { +TEST(FunctionOptions, Equality) { + std::vector> options; + options.emplace_back(new ScalarAggregateOptions()); + options.emplace_back(new ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/1)); + options.emplace_back(new ModeOptions()); + options.emplace_back(new ModeOptions(/*n=*/2)); + options.emplace_back(new VarianceOptions()); + options.emplace_back(new VarianceOptions(/*ddof=*/2)); + options.emplace_back(new QuantileOptions()); + options.emplace_back( + new QuantileOptions(/*q=*/0.75, QuantileOptions::Interpolation::MIDPOINT)); + options.emplace_back(new TDigestOptions()); + options.emplace_back( + new TDigestOptions(/*q=*/0.75, /*delta=*/50, /*buffer_size=*/1024)); + options.emplace_back(new IndexOptions(ScalarFromJSON(int64(), "16"))); + options.emplace_back(new IndexOptions(ScalarFromJSON(boolean(), "true"))); + options.emplace_back(new IndexOptions(ScalarFromJSON(boolean(), "null"))); + options.emplace_back(new ArithmeticOptions()); + options.emplace_back(new ArithmeticOptions(/*check_overflow=*/true)); + options.emplace_back(new ElementWiseAggregateOptions()); + options.emplace_back(new ElementWiseAggregateOptions(/*skip_nulls=*/false)); + options.emplace_back(new JoinOptions()); + options.emplace_back(new JoinOptions(JoinOptions::REPLACE, "replacement")); + options.emplace_back(new MatchSubstringOptions("pattern")); + options.emplace_back(new MatchSubstringOptions("pattern", /*ignore_case=*/true)); + options.emplace_back(new SplitOptions()); + options.emplace_back(new SplitOptions(/*max_splits=*/2, /*reverse=*/true)); + options.emplace_back(new SplitPatternOptions("pattern")); + options.emplace_back( + new SplitPatternOptions("pattern", /*max_splits=*/2, /*reverse=*/true)); + options.emplace_back(new ReplaceSubstringOptions("pattern", "replacement")); + options.emplace_back( + new ReplaceSubstringOptions("pattern", "replacement", 
/*max_replacements=*/2)); + options.emplace_back(new ReplaceSliceOptions(0, 1, "foo")); + options.emplace_back(new ReplaceSliceOptions(1, -1, "bar")); + options.emplace_back(new ExtractRegexOptions("pattern")); + options.emplace_back(new ExtractRegexOptions("pattern2")); + options.emplace_back(new SetLookupOptions(ArrayFromJSON(int64(), "[1, 2, 3, 4]"))); + options.emplace_back(new SetLookupOptions(ArrayFromJSON(boolean(), "[true, false]"))); + options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI)); + options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO)); + options.emplace_back(new PadOptions(5, " ")); + options.emplace_back(new PadOptions(10, "A")); + options.emplace_back(new TrimOptions(" ")); + options.emplace_back(new TrimOptions("abc")); + options.emplace_back(new SliceOptions(/*start=*/1)); + options.emplace_back(new SliceOptions(/*start=*/1, /*stop=*/-5, /*step=*/-2)); + // N.B. we never actually use field_nullability or field_metadata in Arrow + options.emplace_back(new MakeStructOptions({"col1"}, {true}, {})); + options.emplace_back(new MakeStructOptions({"col1"}, {false}, {})); + options.emplace_back( + new MakeStructOptions({"col1"}, {false}, {key_value_metadata({{"key", "val"}})})); + options.emplace_back(new DayOfWeekOptions(false, 1)); + options.emplace_back(new CastOptions(CastOptions::Safe(boolean()))); + options.emplace_back(new CastOptions(CastOptions::Unsafe(int64()))); + options.emplace_back(new FilterOptions()); + options.emplace_back( + new FilterOptions(FilterOptions::NullSelectionBehavior::EMIT_NULL)); + options.emplace_back(new TakeOptions()); + options.emplace_back(new TakeOptions(/*boundscheck=*/false)); + options.emplace_back(new DictionaryEncodeOptions()); + options.emplace_back( + new DictionaryEncodeOptions(DictionaryEncodeOptions::NullEncodingBehavior::ENCODE)); + options.emplace_back(new ArraySortOptions()); + options.emplace_back(new ArraySortOptions(SortOrder::Descending)); + 
options.emplace_back(new SortOptions()); + options.emplace_back(new SortOptions({SortKey("key", SortOrder::Ascending)})); + options.emplace_back(new SortOptions( + {SortKey("key", SortOrder::Descending), SortKey("value", SortOrder::Descending)})); + options.emplace_back(new PartitionNthOptions(/*pivot=*/0)); + options.emplace_back(new PartitionNthOptions(/*pivot=*/42)); + + for (size_t i = 0; i < options.size(); i++) { + const size_t prev_i = i == 0 ? options.size() - 1 : i - 1; + const FunctionOptions& cur = *options[i]; + const FunctionOptions& prev = *options[prev_i]; + SCOPED_TRACE(cur.type_name()); + SCOPED_TRACE(cur.ToString()); + ASSERT_EQ(cur, cur); + ASSERT_NE(cur, prev); + ASSERT_NE(prev, cur); + ASSERT_NE("", cur.ToString()); + + ASSERT_OK_AND_ASSIGN(auto serialized, cur.Serialize()); + const auto* type_name = cur.type_name(); + ASSERT_OK_AND_ASSIGN( + auto deserialized, + FunctionOptions::Deserialize(std::string(type_name, std::strlen(type_name)), + *serialized)); + ASSERT_TRUE(cur.Equals(*deserialized)); + } +} + struct ExecBatch; TEST(Arity, Basics) { @@ -87,8 +185,7 @@ TEST(VectorFunction, Basics) { } auto ExecNYI = [](KernelContext* ctx, const ExecBatch& args, Datum* out) { - ctx->SetStatus(Status::NotImplemented("NYI")); - return; + return Status::NotImplemented("NYI"); }; template @@ -181,13 +278,15 @@ TEST(ScalarAggregateFunction, Basics) { ASSERT_EQ(Function::SCALAR_AGGREGATE, func.kind()); } -std::unique_ptr NoopInit(KernelContext*, const KernelInitArgs&) { +Result> NoopInit(KernelContext*, const KernelInitArgs&) { return nullptr; } -void NoopConsume(KernelContext*, const ExecBatch&) {} -void NoopMerge(KernelContext*, const KernelState&, KernelState*) {} -void NoopFinalize(KernelContext*, Datum*) {} +Status NoopConsume(KernelContext*, const ExecBatch&) { return Status::OK(); } +Status NoopMerge(KernelContext*, const KernelState&, KernelState*) { + return Status::OK(); +} +Status NoopFinalize(KernelContext*, Datum*) { return Status::OK(); } 
TEST(ScalarAggregateFunction, DispatchExact) { ScalarAggregateFunction func("agg_test", Arity::Unary(), /*doc=*/nullptr); diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc index 88b42716fa2..f131f524d2e 100644 --- a/cpp/src/arrow/compute/kernel.cc +++ b/cpp/src/arrow/compute/kernel.cc @@ -59,15 +59,25 @@ Result> KernelContext::AllocateBitmap(int64_t n return result; } -void KernelContext::SetStatus(const Status& status) { - if (ARROW_PREDICT_TRUE(status.ok())) { - return; +Status Kernel::InitAll(KernelContext* ctx, const KernelInitArgs& args, + std::vector>* states) { + for (auto& state : *states) { + ARROW_ASSIGN_OR_RAISE(state, args.kernel->init(ctx, args)); } - status_ = status; + return Status::OK(); } -/// \brief Clear any error status -void KernelContext::ResetStatus() { status_ = Status::OK(); } +Result> ScalarAggregateKernel::MergeAll( + const ScalarAggregateKernel* kernel, KernelContext* ctx, + std::vector> states) { + auto out = std::move(states.back()); + states.pop_back(); + ctx->SetState(out.get()); + for (auto& state : states) { + RETURN_NOT_OK(kernel->merge(ctx, std::move(*state), out.get())); + } + return std::move(out); +} // ---------------------------------------------------------------------- // Some basic TypeMatcher implementations @@ -392,8 +402,7 @@ KernelSignature::KernelSignature(std::vector in_types, OutputType out out_type_(std::move(out_type)), is_varargs_(is_varargs), hash_code_(0) { - // VarArgs sigs must have only a single input type to use for argument validation - DCHECK(!is_varargs || (is_varargs && (in_types_.size() == 1))); + DCHECK(!is_varargs || (is_varargs && (in_types_.size() >= 1))); } std::shared_ptr KernelSignature::Make(std::vector in_types, @@ -420,8 +429,8 @@ bool KernelSignature::Equals(const KernelSignature& other) const { bool KernelSignature::MatchesInputs(const std::vector& args) const { if (is_varargs_) { - for (const auto& arg : args) { - if (!in_types_[0].Matches(arg)) { + for 
(size_t i = 0; i < args.size(); ++i) { + if (!in_types_[std::min(i, in_types_.size() - 1)].Matches(args[i])) { return false; } } @@ -454,15 +463,19 @@ std::string KernelSignature::ToString() const { std::stringstream ss; if (is_varargs_) { - ss << "varargs[" << in_types_[0].ToString() << "]"; + ss << "varargs["; } else { ss << "("; - for (size_t i = 0; i < in_types_.size(); ++i) { - if (i > 0) { - ss << ", "; - } - ss << in_types_[i].ToString(); + } + for (size_t i = 0; i < in_types_.size(); ++i) { + if (i > 0) { + ss << ", "; } + ss << in_types_[i].ToString(); + } + if (is_varargs_) { + ss << "]"; + } else { ss << ")"; } ss << " -> " << out_type_.ToString(); diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index b99b41170d2..099bd95bbf2 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -41,7 +41,7 @@ namespace arrow { namespace compute { -struct FunctionOptions; +class FunctionOptions; /// \brief Base class for opaque kernel-specific state. For example, if there /// is some kind of initialization required. @@ -63,22 +63,6 @@ class ARROW_EXPORT KernelContext { /// byte is preemptively zeroed to help avoid ASAN or valgrind issues. Result> AllocateBitmap(int64_t num_bits); - /// \brief Indicate that an error has occurred, to be checked by a exec caller - /// \param[in] status a Status instance. - /// - /// \note Will not overwrite a prior set Status, so we will have the first - /// error that occurred until ExecContext::ResetStatus is called. - void SetStatus(const Status& status); - - /// \brief Clear any error status. - void ResetStatus(); - - /// \brief Return true if an error has occurred. - bool HasError() const { return !status_.ok(); } - - /// \brief Return the current status of the context. - const Status& status() const { return status_; } - /// \brief Assign the active KernelState to be utilized for each stage of /// kernel execution. 
Ownership and memory lifetime of the KernelState must /// be minded separately. @@ -96,21 +80,9 @@ class ARROW_EXPORT KernelContext { private: ExecContext* exec_ctx_; - Status status_; - KernelState* state_; + KernelState* state_ = NULLPTR; }; -// A macro to invoke for error control flow after invoking functions (such as -// kernel init or exec functions) that propagate errors via KernelContext. -#define ARROW_CTX_RETURN_IF_ERROR(CTX) \ - do { \ - if (ARROW_PREDICT_FALSE((CTX)->HasError())) { \ - Status s = (CTX)->status(); \ - (CTX)->ResetStatus(); \ - return s; \ - } \ - } while (0) - /// \brief The standard kernel execution API that must be implemented for /// SCALAR and VECTOR kernel types. This includes both stateless and stateful /// kernels. Kernels depending on some execution state access that state via @@ -119,7 +91,7 @@ class ARROW_EXPORT KernelContext { /// into pre-allocated memory if they are able, though for some kernels /// (e.g. in cases when a builder like StringBuilder) must be employed this may /// not be possible. -using ArrayKernelExec = std::function; +using ArrayKernelExec = std::function; /// \brief An type-checking interface to permit customizable validation rules /// for use with InputType and KernelSignature. This is for scenarios where the @@ -349,6 +321,9 @@ class ARROW_EXPORT OutputType { this->resolver_ = other.resolver_; } + OutputType& operator=(const OutputType&) = default; + OutputType& operator=(OutputType&&) = default; + /// \brief Return the shape and type of the expected output value of the /// kernel given the value descriptors (shapes and types) of the input /// arguments. The resolver may make use of state information kept in the @@ -391,8 +366,10 @@ class ARROW_EXPORT OutputType { /// \brief Holds the input types and output type of the kernel. /// -/// VarArgs functions should pass a single input type to be used to validate -/// the input types of a function invocation. 
+/// VarArgs functions with minimum N arguments should pass up to N input types to be +/// used to validate the input types of a function invocation. The first N-1 types +/// will be matched against the first N-1 arguments, and the last type will be +/// matched against the remaining arguments. class ARROW_EXPORT KernelSignature { public: KernelSignature(std::vector in_types, OutputType out_type, @@ -523,9 +500,8 @@ struct KernelInitArgs { }; /// \brief Common initializer function for all kernel types. -/// If an error occurs it will be stored in the KernelContext; nullptr will be returned. -using KernelInit = - std::function(KernelContext*, const KernelInitArgs&)>; +using KernelInit = std::function>( + KernelContext*, const KernelInitArgs&)>; /// \brief Base type for kernels. Contains the function signature and /// optionally the state initialization function, along with some common @@ -548,6 +524,10 @@ struct Kernel { /// set up any options or state relevant for execution. KernelInit init; + /// \brief Create a vector of new KernelState for invocations of this kernel. + static Status InitAll(KernelContext*, const KernelInitArgs&, + std::vector>*); + /// \brief Indicates whether execution can benefit from parallelization /// (splitting large chunks into smaller chunks and using multiple /// threads). Some kernels may not support parallel execution at @@ -608,7 +588,7 @@ struct ScalarKernel : public ArrayKernel { // VectorKernel (for VectorFunction) /// \brief See VectorKernel::finalize member for usage -using VectorFinalize = std::function*)>; +using VectorFinalize = std::function*)>; /// \brief Kernel data structure for implementations of VectorFunction. 
In /// addition to the members found in ArrayKernel, contains an optional @@ -663,13 +643,13 @@ struct VectorKernel : public ArrayKernel { // ---------------------------------------------------------------------- // ScalarAggregateKernel (for ScalarAggregateFunction) -using ScalarAggregateConsume = std::function; +using ScalarAggregateConsume = std::function; using ScalarAggregateMerge = - std::function; + std::function; // Finalize returns Datum to permit multiple return values -using ScalarAggregateFinalize = std::function; +using ScalarAggregateFinalize = std::function; /// \brief Kernel data structure for implementations of /// ScalarAggregateFunction. The four necessary components of an aggregation @@ -699,6 +679,12 @@ struct ScalarAggregateKernel : public Kernel { KernelSignature::Make(std::move(in_types), std::move(out_type)), std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {} + /// \brief Merge a vector of KernelStates into a single KernelState. + /// The merged state will be returned and will be set on the KernelContext. + static Result> MergeAll( + const ScalarAggregateKernel* kernel, KernelContext* ctx, + std::vector> states); + ScalarAggregateConsume consume; ScalarAggregateMerge merge; ScalarAggregateFinalize finalize; @@ -707,19 +693,22 @@ struct ScalarAggregateKernel : public Kernel { // ---------------------------------------------------------------------- // HashAggregateKernel (for HashAggregateFunction) -using HashAggregateConsume = std::function; +using HashAggregateResize = std::function; + +using HashAggregateConsume = std::function; using HashAggregateMerge = - std::function; + std::function; // Finalize returns Datum to permit multiple return values -using HashAggregateFinalize = std::function; +using HashAggregateFinalize = std::function; /// \brief Kernel data structure for implementations of /// HashAggregateFunction. 
The four necessary components of an aggregation /// kernel are the init, consume, merge, and finalize functions. /// /// * init: creates a new KernelState for a kernel. +/// * resize: ensure that the KernelState can accommodate the specified number of groups. /// * consume: processes an ExecBatch (which includes the argument as well /// as an array of group identifiers) and updates the KernelState found in the /// KernelContext. @@ -730,20 +719,24 @@ struct HashAggregateKernel : public Kernel { HashAggregateKernel() = default; HashAggregateKernel(std::shared_ptr sig, KernelInit init, - HashAggregateConsume consume, HashAggregateMerge merge, - HashAggregateFinalize finalize) + HashAggregateResize resize, HashAggregateConsume consume, + HashAggregateMerge merge, HashAggregateFinalize finalize) : Kernel(std::move(sig), std::move(init)), + resize(std::move(resize)), consume(std::move(consume)), merge(std::move(merge)), finalize(std::move(finalize)) {} HashAggregateKernel(std::vector in_types, OutputType out_type, - KernelInit init, HashAggregateMerge merge, - HashAggregateConsume consume, HashAggregateFinalize finalize) + KernelInit init, HashAggregateConsume consume, + HashAggregateResize resize, HashAggregateMerge merge, + HashAggregateFinalize finalize) : HashAggregateKernel( KernelSignature::Make(std::move(in_types), std::move(out_type)), - std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {} + std::move(init), std::move(resize), std::move(consume), std::move(merge), + std::move(finalize)) {} + HashAggregateResize resize; HashAggregateConsume consume; HashAggregateMerge merge; HashAggregateFinalize finalize; diff --git a/cpp/src/arrow/compute/kernel_test.cc b/cpp/src/arrow/compute/kernel_test.cc index a5ef9d44e18..a63c42d4fde 100644 --- a/cpp/src/arrow/compute/kernel_test.cc +++ b/cpp/src/arrow/compute/kernel_test.cc @@ -468,15 +468,28 @@ TEST(KernelSignature, MatchesInputs) { } TEST(KernelSignature, VarArgsMatchesInputs) { - 
KernelSignature sig({int8()}, utf8(), /*is_varargs=*/true); - - std::vector args = {int8()}; - ASSERT_TRUE(sig.MatchesInputs(args)); - args.push_back(ValueDescr::Scalar(int8())); - args.push_back(ValueDescr::Array(int8())); - ASSERT_TRUE(sig.MatchesInputs(args)); - args.push_back(int32()); - ASSERT_FALSE(sig.MatchesInputs(args)); + { + KernelSignature sig({int8()}, utf8(), /*is_varargs=*/true); + + std::vector args = {int8()}; + ASSERT_TRUE(sig.MatchesInputs(args)); + args.push_back(ValueDescr::Scalar(int8())); + args.push_back(ValueDescr::Array(int8())); + ASSERT_TRUE(sig.MatchesInputs(args)); + args.push_back(int32()); + ASSERT_FALSE(sig.MatchesInputs(args)); + } + { + KernelSignature sig({int8(), utf8()}, utf8(), /*is_varargs=*/true); + + std::vector args = {int8()}; + ASSERT_TRUE(sig.MatchesInputs(args)); + args.push_back(ValueDescr::Scalar(utf8())); + args.push_back(ValueDescr::Array(utf8())); + ASSERT_TRUE(sig.MatchesInputs(args)); + args.push_back(int32()); + ASSERT_FALSE(sig.MatchesInputs(args)); + } } TEST(KernelSignature, ToString) { diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 5e223a1f906..474ce1418fd 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -27,14 +27,17 @@ add_arrow_compute_test(scalar_test scalar_nested_test.cc scalar_set_lookup_test.cc scalar_string_test.cc + scalar_temporal_test.cc scalar_validity_test.cc scalar_fill_null_test.cc + scalar_if_else_test.cc test_util.cc) add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_cast_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(scalar_if_else_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute") 
add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute") @@ -45,6 +48,7 @@ add_arrow_compute_test(vector_test SOURCES vector_hash_test.cc vector_nested_test.cc + vector_replace_test.cc vector_selection_test.cc vector_sort_test.cc test_util.cc) @@ -52,6 +56,7 @@ add_arrow_compute_test(vector_test add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_partition_benchmark PREFIX "arrow-compute") +add_arrow_benchmark(vector_replace_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_selection_benchmark PREFIX "arrow-compute") # ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 61dc8cb403c..a7df66695b2 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -27,16 +27,16 @@ namespace compute { namespace { -void AggregateConsume(KernelContext* ctx, const ExecBatch& batch) { - checked_cast(ctx->state())->Consume(ctx, batch); +Status AggregateConsume(KernelContext* ctx, const ExecBatch& batch) { + return checked_cast(ctx->state())->Consume(ctx, batch); } -void AggregateMerge(KernelContext* ctx, KernelState&& src, KernelState* dst) { - checked_cast(dst)->MergeFrom(ctx, std::move(src)); +Status AggregateMerge(KernelContext* ctx, KernelState&& src, KernelState* dst) { + return checked_cast(dst)->MergeFrom(ctx, std::move(src)); } -void AggregateFinalize(KernelContext* ctx, Datum* out) { - checked_cast(ctx->state())->Finalize(ctx, out); +Status AggregateFinalize(KernelContext* ctx, Datum* out) { + return checked_cast(ctx->state())->Finalize(ctx, out); } } // namespace @@ -56,72 +56,91 @@ namespace aggregate { // Count implementation struct CountImpl : public ScalarAggregator { - explicit CountImpl(CountOptions options) : options(std::move(options)) {} - - 
void Consume(KernelContext*, const ExecBatch& batch) override { - const ArrayData& input = *batch[0].array(); - const int64_t nulls = input.GetNullCount(); - this->nulls += nulls; - this->non_nulls += input.length - nulls; + explicit CountImpl(ScalarAggregateOptions options) : options(std::move(options)) {} + + Status Consume(KernelContext*, const ExecBatch& batch) override { + if (batch[0].is_array()) { + const ArrayData& input = *batch[0].array(); + const int64_t nulls = input.GetNullCount(); + this->nulls += nulls; + this->non_nulls += input.length - nulls; + } else { + const Scalar& input = *batch[0].scalar(); + this->nulls += !input.is_valid * batch.length; + this->non_nulls += input.is_valid * batch.length; + } + return Status::OK(); } - void MergeFrom(KernelContext*, KernelState&& src) override { + Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other_state = checked_cast(src); this->non_nulls += other_state.non_nulls; this->nulls += other_state.nulls; + return Status::OK(); } - void Finalize(KernelContext* ctx, Datum* out) override { + Status Finalize(KernelContext* ctx, Datum* out) override { const auto& state = checked_cast(*ctx->state()); - switch (state.options.count_mode) { - case CountOptions::COUNT_NON_NULL: - *out = Datum(state.non_nulls); - break; - case CountOptions::COUNT_NULL: - *out = Datum(state.nulls); - break; - default: - ctx->SetStatus(Status::Invalid("Unknown CountOptions encountered")); - break; + if (state.options.skip_nulls) { + *out = Datum(state.non_nulls); + } else { + *out = Datum(state.nulls); } + return Status::OK(); } - CountOptions options; + ScalarAggregateOptions options; int64_t non_nulls = 0; int64_t nulls = 0; }; -std::unique_ptr CountInit(KernelContext*, const KernelInitArgs& args) { +Result> CountInit(KernelContext*, + const KernelInitArgs& args) { return ::arrow::internal::make_unique( - static_cast(*args.options)); + static_cast(*args.options)); } // 
---------------------------------------------------------------------- // Sum implementation template -struct SumImplDefault : public SumImpl {}; +struct SumImplDefault : public SumImpl { + explicit SumImplDefault(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; template -struct MeanImplDefault : public MeanImpl {}; +struct MeanImplDefault : public MeanImpl { + explicit MeanImplDefault(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; -std::unique_ptr SumInit(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); +Result> SumInit(KernelContext* ctx, + const KernelInitArgs& args) { + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } -std::unique_ptr MeanInit(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); +Result> MeanInit(KernelContext* ctx, + const KernelInitArgs& args) { + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } // ---------------------------------------------------------------------- // MinMax implementation -std::unique_ptr MinMaxInit(KernelContext* ctx, const KernelInitArgs& args) { +Result> MinMaxInit(KernelContext* ctx, + const KernelInitArgs& args) { MinMaxInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options)); + static_cast(*args.options)); return visitor.Create(); } @@ -129,13 +148,21 @@ std::unique_ptr MinMaxInit(KernelContext* ctx, const KernelInitArgs // Any implementation struct BooleanAnyImpl : public ScalarAggregator { - void Consume(KernelContext*, const ExecBatch& batch) override { + explicit BooleanAnyImpl(ScalarAggregateOptions options) : options(std::move(options)) {} + + Status Consume(KernelContext*, const ExecBatch& batch) override { // short-circuit if seen a True already if (this->any == true) { - 
return; + return Status::OK(); + } + if (batch[0].is_scalar()) { + const auto& scalar = *batch[0].scalar(); + this->has_nulls = !scalar.is_valid; + this->any = scalar.is_valid && checked_cast(scalar).value; + return Status::OK(); } - const auto& data = *batch[0].array(); + this->has_nulls = data.GetNullCount() > 0; arrow::internal::OptionalBinaryBitBlockCounter counter( data.buffers[0], data.offset, data.buffers[1], data.offset, data.length); int64_t position = 0; @@ -147,34 +174,60 @@ struct BooleanAnyImpl : public ScalarAggregator { } position += block.length; } + return Status::OK(); } - void MergeFrom(KernelContext*, KernelState&& src) override { + Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); this->any |= other.any; + this->has_nulls |= other.has_nulls; + return Status::OK(); } - void Finalize(KernelContext*, Datum* out) override { - out->value = std::make_shared(this->any); + Status Finalize(KernelContext* ctx, Datum* out) override { + if (!options.skip_nulls && !this->any && this->has_nulls) { + out->value = std::make_shared(); + } else { + out->value = std::make_shared(this->any); + } + return Status::OK(); } + bool any = false; + bool has_nulls = false; + ScalarAggregateOptions options; }; -std::unique_ptr AnyInit(KernelContext*, const KernelInitArgs& args) { - return ::arrow::internal::make_unique(); +Result> AnyInit(KernelContext*, const KernelInitArgs& args) { + const ScalarAggregateOptions options = + static_cast(*args.options); + return ::arrow::internal::make_unique( + static_cast(*args.options)); } // ---------------------------------------------------------------------- // All implementation struct BooleanAllImpl : public ScalarAggregator { - void Consume(KernelContext*, const ExecBatch& batch) override { + explicit BooleanAllImpl(ScalarAggregateOptions options) : options(std::move(options)) {} + + Status Consume(KernelContext*, const ExecBatch& batch) override { // short-circuit if seen a 
false already if (this->all == false) { - return; + return Status::OK(); + } + // short-circuit if seen a null already + if (!options.skip_nulls && this->has_nulls) { + return Status::OK(); + } + if (batch[0].is_scalar()) { + const auto& scalar = *batch[0].scalar(); + this->has_nulls = !scalar.is_valid; + this->all = !scalar.is_valid || checked_cast(scalar).value; + return Status::OK(); } - const auto& data = *batch[0].array(); + this->has_nulls = data.GetNullCount() > 0; arrow::internal::OptionalBinaryBitBlockCounter counter( data.buffers[1], data.offset, data.buffers[0], data.offset, data.length); int64_t position = 0; @@ -186,23 +239,161 @@ struct BooleanAllImpl : public ScalarAggregator { } position += block.length; } + + return Status::OK(); } - void MergeFrom(KernelContext*, KernelState&& src) override { + Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); this->all &= other.all; + this->has_nulls |= other.has_nulls; + return Status::OK(); } - void Finalize(KernelContext*, Datum* out) override { - out->value = std::make_shared(this->all); + Status Finalize(KernelContext*, Datum* out) override { + if (!options.skip_nulls && this->all && this->has_nulls) { + out->value = std::make_shared(); + } else { + out->value = std::make_shared(this->all); + } + return Status::OK(); } + bool all = true; + bool has_nulls = false; + ScalarAggregateOptions options; }; -std::unique_ptr AllInit(KernelContext*, const KernelInitArgs& args) { - return ::arrow::internal::make_unique(); +Result> AllInit(KernelContext*, const KernelInitArgs& args) { + return ::arrow::internal::make_unique( + static_cast(*args.options)); } +// ---------------------------------------------------------------------- +// Index implementation + +template +struct IndexImpl : public ScalarAggregator { + using ArgValue = typename internal::GetViewType::T; + + explicit IndexImpl(IndexOptions options, KernelState* raw_state) + : options(std::move(options)), 
seen(0), index(-1) { + if (auto state = static_cast*>(raw_state)) { + seen = state->seen; + index = state->index; + } + } + + Status Consume(KernelContext* ctx, const ExecBatch& batch) override { + // short-circuit + if (index >= 0 || !options.value->is_valid) { + return Status::OK(); + } + + auto input = batch[0].array(); + seen = input->length; + const ArgValue desired = internal::UnboxScalar::Unbox(*options.value); + int64_t i = 0; + + ARROW_UNUSED(internal::VisitArrayValuesInline( + *input, + [&](ArgValue v) -> Status { + if (v == desired) { + index = i; + return Status::Cancelled("Found"); + } else { + ++i; + return Status::OK(); + } + }, + [&]() -> Status { + ++i; + return Status::OK(); + })); + + return Status::OK(); + } + + Status MergeFrom(KernelContext*, KernelState&& src) override { + const auto& other = checked_cast(src); + if (index < 0 && other.index >= 0) { + index = seen + other.index; + } + seen += other.seen; + return Status::OK(); + } + + Status Finalize(KernelContext*, Datum* out) override { + out->value = std::make_shared(index >= 0 ? 
index : -1); + return Status::OK(); + } + + const IndexOptions options; + int64_t seen = 0; + int64_t index = -1; +}; + +struct IndexInit { + std::unique_ptr state; + KernelContext* ctx; + const IndexOptions& options; + const DataType& type; + + IndexInit(KernelContext* ctx, const IndexOptions& options, const DataType& type) + : ctx(ctx), options(options), type(type) {} + + Status Visit(const DataType& type) { + return Status::NotImplemented("Index kernel not implemented for ", type.ToString()); + } + + Status Visit(const BooleanType&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + + template + enable_if_number Visit(const Type&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + + template + enable_if_base_binary Visit(const Type&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + + template + enable_if_date Visit(const Type&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + + template + enable_if_time Visit(const Type&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + + template + enable_if_timestamp Visit(const Type&) { + state.reset(new IndexImpl(options, ctx->state())); + return Status::OK(); + } + + Result> Create() { + RETURN_NOT_OK(VisitTypeInline(type, this)); + return std::move(state); + } + + static Result> Init(KernelContext* ctx, + const KernelInitArgs& args) { + IndexInit visitor(ctx, static_cast(*args.options), + *args.inputs[0].type); + return visitor.Create(); + } +}; + void AddBasicAggKernels(KernelInit init, const std::vector>& types, std::shared_ptr out_ty, ScalarAggregateFunction* func, @@ -214,13 +405,33 @@ void AddBasicAggKernels(KernelInit init, } } +void AddScalarAggKernels(KernelInit init, + const std::vector>& types, + std::shared_ptr out_ty, + ScalarAggregateFunction* func) { + for (const auto& ty : types) { + // scalar[InT] -> scalar[OutT] + auto sig = 
KernelSignature::Make({InputType::Scalar(ty)}, ValueDescr::Scalar(out_ty)); + AddAggKernel(std::move(sig), init, func, SimdLevel::NONE); + } +} + +void AddArrayScalarAggKernels(KernelInit init, + const std::vector>& types, + std::shared_ptr out_ty, + ScalarAggregateFunction* func, + SimdLevel::type simd_level = SimdLevel::NONE) { + AddBasicAggKernels(init, types, out_ty, func, simd_level); + AddScalarAggKernels(init, types, out_ty, func); +} + void AddMinMaxKernels(KernelInit init, const std::vector>& types, ScalarAggregateFunction* func, SimdLevel::type simd_level) { for (const auto& ty : types) { - // array[T] -> scalar[struct] + // any[T] -> scalar[struct] auto out_ty = struct_({field("min", ty), field("max", ty)}); - auto sig = KernelSignature::Make({InputType::Array(ty)}, ValueDescr::Scalar(out_ty)); + auto sig = KernelSignature::Make({InputType(ty)}, ValueDescr::Scalar(out_ty)); AddAggKernel(std::move(sig), init, func, simd_level); } } @@ -231,54 +442,85 @@ namespace internal { namespace { const FunctionDoc count_doc{"Count the number of null / non-null values", - ("By default, non-null values are counted.\n" - "This can be changed through CountOptions."), + ("By default, only non-null values are counted.\n" + "This can be changed through ScalarAggregateOptions."), {"array"}, - "CountOptions"}; + "ScalarAggregateOptions"}; const FunctionDoc sum_doc{ - "Sum values of a numeric array", ("Null values are ignored."), {"array"}}; - -const FunctionDoc mean_doc{"Compute the mean of a numeric array", - ("Null values are ignored. The result is always computed\n" - "as a double, regardless of the input types"), - {"array"}}; + "Compute the sum of a numeric array", + ("Null values are ignored by default. 
Minimum count of non-null\n" + "values can be set and null is returned if too few are present.\n" + "This can be changed through ScalarAggregateOptions."), + {"array"}, + "ScalarAggregateOptions"}; + +const FunctionDoc mean_doc{ + "Compute the mean of a numeric array", + ("Null values are ignored by default. Minimum count of non-null\n" + "values can be set and null is returned if too few are " + "present.\nThis can be changed through ScalarAggregateOptions.\n" + "The result is always computed as a double, regardless of the input types."), + {"array"}, + "ScalarAggregateOptions"}; const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numeric array", ("Null values are ignored by default.\n" - "This can be changed through MinMaxOptions."), + "This can be changed through ScalarAggregateOptions."), {"array"}, - "MinMaxOptions"}; + "ScalarAggregateOptions"}; const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true", - ("Null values are ignored."), - {"array"}}; + ("Null values are ignored by default.\n" + "If null values are taken into account by setting " + "ScalarAggregateOptions parameter skip_nulls = false then " + "Kleene logic is used.\n" + "See KleeneOr for more details on Kleene logic."), + {"array"}, + "ScalarAggregateOptions"}; const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true", - ("Null values are ignored."), - {"array"}}; + ("Null values are ignored by default.\n" + "If null values are taken into account by setting " + "ScalarAggregateOptions parameter skip_nulls = false then " + "Kleene logic is used.\n" + "See KleeneAnd for more details on Kleene logic."), + {"array"}, + "ScalarAggregateOptions"}; + +const FunctionDoc index_doc{"Find the index of the first occurrence of a given value", + ("The result is always computed as an int64_t, regardless\n" + "of the offset type of the input array."), + {"array"}, + "IndexOptions"}; } // namespace void 
RegisterScalarAggregateBasic(FunctionRegistry* registry) { - static auto default_count_options = CountOptions::Defaults(); + static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults(); + auto func = std::make_shared( - "count", Arity::Unary(), &count_doc, &default_count_options); + "count", Arity::Unary(), &count_doc, &default_scalar_aggregate_options); // Takes any array input, outputs int64 scalar InputType any_array(ValueDescr::ARRAY); AddAggKernel(KernelSignature::Make({any_array}, ValueDescr::Scalar(int64())), aggregate::CountInit, func.get()); + AddAggKernel( + KernelSignature::Make({InputType(ValueDescr::SCALAR)}, ValueDescr::Scalar(int64())), + aggregate::CountInit, func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("sum", Arity::Unary(), &sum_doc); - aggregate::AddBasicAggKernels(aggregate::SumInit, {boolean()}, int64(), func.get()); - aggregate::AddBasicAggKernels(aggregate::SumInit, SignedIntTypes(), int64(), - func.get()); - aggregate::AddBasicAggKernels(aggregate::SumInit, UnsignedIntTypes(), uint64(), - func.get()); - aggregate::AddBasicAggKernels(aggregate::SumInit, FloatingPointTypes(), float64(), - func.get()); + func = std::make_shared("sum", Arity::Unary(), &sum_doc, + &default_scalar_aggregate_options); + aggregate::AddArrayScalarAggKernels(aggregate::SumInit, {boolean()}, int64(), + func.get()); + aggregate::AddArrayScalarAggKernels(aggregate::SumInit, SignedIntTypes(), int64(), + func.get()); + aggregate::AddArrayScalarAggKernels(aggregate::SumInit, UnsignedIntTypes(), uint64(), + func.get()); + aggregate::AddArrayScalarAggKernels(aggregate::SumInit, FloatingPointTypes(), float64(), + func.get()); // Add the SIMD variants for sum #if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512) auto cpu_info = arrow::internal::CpuInfo::GetInstance(); @@ -295,10 +537,12 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { #endif 
DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("mean", Arity::Unary(), &mean_doc); - aggregate::AddBasicAggKernels(aggregate::MeanInit, {boolean()}, float64(), func.get()); - aggregate::AddBasicAggKernels(aggregate::MeanInit, NumericTypes(), float64(), - func.get()); + func = std::make_shared("mean", Arity::Unary(), &mean_doc, + &default_scalar_aggregate_options); + aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, {boolean()}, float64(), + func.get()); + aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, NumericTypes(), float64(), + func.get()); // Add the SIMD variants for mean #if defined(ARROW_HAVE_RUNTIME_AVX2) if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) { @@ -312,9 +556,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { #endif DCHECK_OK(registry->AddFunction(std::move(func))); - static auto default_minmax_options = MinMaxOptions::Defaults(); - func = std::make_shared("min_max", Arity::Unary(), - &min_max_doc, &default_minmax_options); + func = std::make_shared( + "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {boolean()}, func.get()); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, NumericTypes(), func.get()); // Add the SIMD variants for min max @@ -332,13 +575,27 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); // any - func = std::make_shared("any", Arity::Unary(), &any_doc); - aggregate::AddBasicAggKernels(aggregate::AnyInit, {boolean()}, boolean(), func.get()); + func = std::make_shared("any", Arity::Unary(), &any_doc, + &default_scalar_aggregate_options); + aggregate::AddArrayScalarAggKernels(aggregate::AnyInit, {boolean()}, boolean(), + func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); // all - func = std::make_shared("all", Arity::Unary(), &all_doc); - aggregate::AddBasicAggKernels(aggregate::AllInit, 
{boolean()}, boolean(), func.get()); + func = std::make_shared("all", Arity::Unary(), &all_doc, + &default_scalar_aggregate_options); + aggregate::AddArrayScalarAggKernels(aggregate::AllInit, {boolean()}, boolean(), + func.get()); + DCHECK_OK(registry->AddFunction(std::move(func))); + + // index + func = std::make_shared("index", Arity::Unary(), &index_doc); + aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, BaseBinaryTypes(), int64(), + func.get()); + aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, PrimitiveTypes(), int64(), + func.get()); + aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, TemporalTypes(), int64(), + func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc index feeb66a1489..8d3e5a0409d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc @@ -25,30 +25,43 @@ namespace aggregate { // Sum implementation template -struct SumImplAvx2 : public SumImpl {}; +struct SumImplAvx2 : public SumImpl { + explicit SumImplAvx2(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; template -struct MeanImplAvx2 : public MeanImpl {}; +struct MeanImplAvx2 : public MeanImpl { + explicit MeanImplAvx2(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; -std::unique_ptr SumInitAvx2(KernelContext* ctx, const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); +Result> SumInitAvx2(KernelContext* ctx, + const KernelInitArgs& args) { + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } -std::unique_ptr MeanInitAvx2(KernelContext* ctx, - const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); +Result> MeanInitAvx2(KernelContext* ctx, + const KernelInitArgs& args) { + SumLikeInit visitor( + ctx, 
*args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } // ---------------------------------------------------------------------- // MinMax implementation -std::unique_ptr MinMaxInitAvx2(KernelContext* ctx, - const KernelInitArgs& args) { +Result> MinMaxInitAvx2(KernelContext* ctx, + const KernelInitArgs& args) { MinMaxInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options)); + static_cast(*args.options)); return visitor.Create(); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc index 522564a8469..4f8ad74a086 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc @@ -25,31 +25,43 @@ namespace aggregate { // Sum implementation template -struct SumImplAvx512 : public SumImpl {}; +struct SumImplAvx512 : public SumImpl { + explicit SumImplAvx512(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; template -struct MeanImplAvx512 : public MeanImpl {}; +struct MeanImplAvx512 : public MeanImpl { + explicit MeanImplAvx512(const ScalarAggregateOptions& options_) { + this->options = options_; + } +}; -std::unique_ptr SumInitAvx512(KernelContext* ctx, - const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); +Result> SumInitAvx512(KernelContext* ctx, + const KernelInitArgs& args) { + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } -std::unique_ptr MeanInitAvx512(KernelContext* ctx, - const KernelInitArgs& args) { - SumLikeInit visitor(ctx, *args.inputs[0].type); +Result> MeanInitAvx512(KernelContext* ctx, + const KernelInitArgs& args) { + SumLikeInit visitor( + ctx, *args.inputs[0].type, + static_cast(*args.options)); return visitor.Create(); } // ---------------------------------------------------------------------- // MinMax 
implementation -std::unique_ptr MinMaxInitAvx512(KernelContext* ctx, - const KernelInitArgs& args) { +Result> MinMaxInitAvx512(KernelContext* ctx, + const KernelInitArgs& args) { MinMaxInitState visitor( ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), - static_cast(*args.options)); + static_cast(*args.options)); return visitor.Create(); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 5029c1855c0..3d02b273066 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_internal.h" @@ -58,45 +59,61 @@ struct SumImpl : public ScalarAggregator { using SumType = typename FindAccumulatorType::Type; using OutputType = typename TypeTraits::ScalarType; - void Consume(KernelContext*, const ExecBatch& batch) override { - const auto& data = batch[0].array(); - this->count = data->length - data->GetNullCount(); - if (is_boolean_type::value) { - this->sum = static_cast(BooleanArray(data).true_count()); + Status Consume(KernelContext*, const ExecBatch& batch) override { + if (batch[0].is_array()) { + const auto& data = batch[0].array(); + this->count += data->length - data->GetNullCount(); + if (is_boolean_type::value) { + this->sum += + static_cast(BooleanArray(data).true_count()); + } else { + this->sum += + arrow::compute::detail::SumArray( + *data); + } } else { - this->sum = - arrow::compute::detail::SumArray(*data); + const auto& data = *batch[0].scalar(); + this->count += data.is_valid * batch.length; + if (data.is_valid) { + this->sum += internal::UnboxScalar::Unbox(data) * batch.length; + } } + return Status::OK(); } - void MergeFrom(KernelContext*, KernelState&& src) override { + Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other 
= checked_cast(src); this->count += other.count; this->sum += other.sum; + return Status::OK(); } - void Finalize(KernelContext*, Datum* out) override { - if (this->count == 0) { + Status Finalize(KernelContext*, Datum* out) override { + if (this->count < options.min_count) { out->value = std::make_shared(); } else { out->value = MakeScalar(this->sum); } + return Status::OK(); } size_t count = 0; typename SumType::c_type sum = 0; + ScalarAggregateOptions options; }; template struct MeanImpl : public SumImpl { - void Finalize(KernelContext*, Datum* out) override { - if (this->count == 0) { + Status Finalize(KernelContext*, Datum* out) override { + if (this->count < options.min_count) { out->value = std::make_shared(); } else { const double mean = static_cast(this->sum) / this->count; out->value = std::make_shared(mean); } + return Status::OK(); } + ScalarAggregateOptions options; }; template