diff --git a/.env b/.env index 587430579f9..510e11d9568 100644 --- a/.env +++ b/.env @@ -47,7 +47,6 @@ FEDORA=33 PYTHON=3.6 LLVM=12 CLANG_TOOLS=8 -RUST=nightly-2021-03-24 GO=1.15 NODE=14 MAVEN=3.5.4 diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml index 3049ae706b0..da668bcdc3b 100644 --- a/.github/workflows/cancel.yml +++ b/.github/workflows/cancel.yml @@ -115,10 +115,3 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} workflowFileName: ruby.yml skipEventTypes: '["push", "schedule"]' - - uses: potiuk/cancel-workflow-runs@master - name: "Cancel stale Rust runs" - with: - cancelMode: allDuplicates - token: ${{ secrets.GITHUB_TOKEN }} - workflowFileName: rust.yml - skipEventTypes: '["push", "schedule"]' diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index d1b01848004..fdbb53c29da 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -29,7 +29,7 @@ env: jobs: lint: - name: Lint C++, Python, R, Rust, Docker, RAT + name: Lint C++, Python, R, Docker, RAT runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 098e1bad7f4..235b5918902 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -45,19 +45,9 @@ lang-R: lang-ruby: - ruby/**/* -lang-rust: - - rust/**/* - -datafusion: - - rust/datafusion/**/* - -ballista: - - rust/ballista/**/* - flight: - cpp/src/arrow/flight/**/* - r/R/flight.* - - rust/arrow-flight/**/* - python/pyarrow/*flight.* gandiva: @@ -71,4 +61,3 @@ parquet: - cpp/src/parquet/**/* - r/R/parquet.* - ruby/red-parquet/**/* - - rust/parquet*/**/* diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index a4f97be3b9c..fb41f36caf8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -62,6 +62,11 @@ jobs: fetch-depth: 0 - name: Fetch Submodules and Tags run: ci/scripts/util_checkout.sh + - name: Checkout Arrow Rust + uses: actions/checkout@v2 + with: + repository: apache/arrow-rs + path: rust - name: Free Up Disk Space run: ci/scripts/util_cleanup.sh - name: Cache Docker Volumes diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml deleted file mode 100644 index 9c0a4ea72f1..00000000000 --- a/.github/workflows/rust.yml +++ /dev/null @@ -1,480 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -name: Rust - -on: - push: - paths: - - '.github/workflows/rust.yml' - - 'rust/**' - - 'format/Flight.proto' - pull_request: - paths: - - '.github/workflows/rust.yml' - - 'rust/**' - - 'format/Flight.proto' - -jobs: - - # build the library, a compilation step used by multiple steps below - linux-build-lib: - name: Build Libraries on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - steps: - - uses: actions/checkout@v2 - - name: Cache Cargo - uses: actions/cache@v2 - with: - # these represent dependencies downloaded by cargo - # and thus do not depend on the OS, arch nor rust version. - path: /github/home/.cargo - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - # these represent compiled steps of both dependencies and arrow - # and thus are specific for a particular OS, arch and rust version. - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }}- - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Build Workspace - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust - cargo build - # Ballista is currently not part of the main workspace so requires a separate build step - - name: Build Ballista - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/ballista/rust - # snmalloc requires cmake so build without default features - cargo build --no-default-features - - # test the crate - linux-test: - name: Test Workspace on AMD64 Rust ${{ matrix.rust }} - needs: [linux-build-lib] - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Run tests - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust - # run tests on all workspace members with default feature list - cargo test - # test datafusion examples - cd datafusion-examples - cargo test --no-default-features - cargo run --example csv_sql - cargo run --example parquet_sql - cd .. 
- cd arrow - # re-run tests on arrow workspace with additional features - cargo test --features=prettyprint - cargo run --example builders - cargo run --example dynamic_types - cargo run --example read_csv - cargo run --example read_csv_infer_schema - # Ballista is currently not part of the main workspace so requires a separate test step - - name: Run Ballista tests - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/ballista/rust - # snmalloc requires cmake so build without default features - cargo test --no-default-features - - # test the --features "simd" of the arrow crate. This requires nightly. - linux-test-simd: - name: Test SIMD on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - arch: [amd64] - rust: [nightly-2021-03-24] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Run tests - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo test --features "simd" - - windows-and-macos: - name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }} - runs-on: ${{ matrix.os }} - timeout-minutes: 40 - strategy: - matrix: - os: [windows-latest, macos-latest] - rust: [stable] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - # TODO: this won't cache anything, which is expensive. Setup this action - # with a OS-dependent path. - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt - - name: Run tests - shell: bash - run: | - export ARROW_TEST_DATA=$(pwd)/testing/data - export PARQUET_TEST_DATA=$(pwd)/cpp/submodules/parquet-testing/data - # do not produce debug symbols to keep memory usage down - export RUSTFLAGS="-C debuginfo=0" - cd rust - cargo test - - clippy: - name: Clippy - needs: [linux-build-lib] - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - # this key equals the ones on `linux-build-lib` for re-use - key: ${{ runner.os }}-${{ matrix.arch }}-target-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt clippy - - name: Run clippy - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust - cargo clippy --all-targets --workspace -- -D warnings -A clippy::redundant_field_names - - miri-checks: - name: MIRI - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - arch: [amd64] - rust: [nightly-2021-03-24] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - uses: actions/cache@v2 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: ${{ runner.os }}-cargo-miri-${{ hashFiles('**/Cargo.lock') }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt clippy miri - - name: Run Miri Checks - env: - RUST_BACKTRACE: full - RUST_LOG: 'trace' - run: | - export MIRIFLAGS="-Zmiri-disable-isolation" - cd rust - cargo miri setup - cargo clean - # Ignore MIRI errors until we can get a clean run - cargo miri test || true - - coverage: - name: Coverage - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - arch: [amd64] - rust: [stable] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /home/runner/.cargo - # this key is not equal because the user is different than on a container (runner vs github) - key: cargo-coverage-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /home/runner/target - # this key is not equal because coverage uses different compilation flags. 
- key: ${{ runner.os }}-${{ matrix.arch }}-target-coverage-cache-${{ matrix.rust }}- - - name: Run coverage - run: | - export CARGO_HOME="/home/runner/.cargo" - export CARGO_TARGET_DIR="/home/runner/target" - - export ARROW_TEST_DATA=$(pwd)/testing/data - export PARQUET_TEST_DATA=$(pwd)/cpp/submodules/parquet-testing/data - - # 2020-11-15: There is a cargo-tarpaulin regression in 0.17.0 - # see https://github.com/xd009642/tarpaulin/issues/618 - cargo install --version 0.16.0 cargo-tarpaulin - cd rust - cargo tarpaulin --out Xml - - name: Report coverage - continue-on-error: true - run: bash <(curl -s https://codecov.io/bash) - - # test FFI against the C-Data interface exposed by pyarrow - pyarrow-integration-test: - name: Test Pyarrow C Data Interface - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - rust: [stable] - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup default ${{ matrix.rust }} - rustup component add rustfmt clippy - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /home/runner/.cargo - key: cargo-maturin-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /home/runner/target - # this key is not equal because maturin uses different compilation flags. - key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}- - - uses: actions/setup-python@v2 - with: - python-version: '3.7' - - name: Install Python dependencies - run: python -m pip install --upgrade pip setuptools wheel - - name: Run tests - run: | - export CARGO_HOME="/home/runner/.cargo" - export CARGO_TARGET_DIR="/home/runner/target" - - cd rust/arrow-pyarrow-integration-testing - - python -m venv venv - source venv/bin/activate - - pip install maturin==0.8.2 toml==0.10.1 pyarrow==1.0.0 - maturin develop - python -m unittest discover tests - - # test the arrow crate builds against wasm32 in stable rust - wasm32-build: - name: Build wasm32 on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - arch: [amd64] - rust: [nightly-2021-03-24] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-wasm32-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup override set ${{ matrix.rust }} - rustup component add rustfmt - rustup target add wasm32-unknown-unknown - - name: Build arrow crate - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo build --target wasm32-unknown-unknown - - # test the projects can build without default features - default-build: - name: Check No Defaults on AMD64 Rust ${{ matrix.rust }} - runs-on: ubuntu-latest - timeout-minutes: 40 - strategy: - matrix: - arch: [amd64] - rust: [stable] - container: - image: ${{ matrix.arch }}/rust - env: - # Disable full debug symbol generation to speed up CI build and keep memory down - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - ARROW_TEST_DATA: /__w/arrow/arrow/testing/data - PARQUET_TEST_DATA: /__w/arrow/arrow/cpp/submodules/parquet-testing/data - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Cache Cargo - uses: actions/cache@v2 - with: - path: /github/home/.cargo - # this key equals the ones on `linux-build-lib` for re-use - key: cargo-cache- - - name: Cache Rust dependencies - uses: actions/cache@v2 - with: - path: /github/home/target - key: ${{ runner.os }}-${{ matrix.arch }}-target-wasm32-cache-${{ matrix.rust }} - - name: Setup Rust toolchain - run: | - rustup toolchain install ${{ matrix.rust }} - rustup override set ${{ matrix.rust }} - rustup component add rustfmt - - name: Build arrow crate - run: | - export CARGO_HOME="/github/home/.cargo" - export CARGO_TARGET_DIR="/github/home/target" - cd rust/arrow - cargo check --all-targets --no-default-features diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d2d2d81d68..8b5a24476d8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,14 +29,6 @@ repos: entry: bash -c "git archive HEAD --prefix=apache-arrow/ --output=arrow-src.tar && ./dev/release/run-rat.sh arrow-src.tar" always_run: true pass_filenames: false - - id: rustfmt - name: Rust Format - language: system - entry: bash -c "cd rust && cargo +stable fmt --all -- --check" - files: ^rust/.*\.rs$ - types: - - file - - rust - id: cmake-format name: CMake Format language: python diff --git a/README.md b/README.md index e9e13537cc9..efe63e1b269 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Major components of the project include: - [Python libraries](https://github.com/apache/arrow/tree/master/python) - [R libraries](https://github.com/apache/arrow/tree/master/r) - [Ruby libraries](https://github.com/apache/arrow/tree/master/ruby) - - [Rust libraries](https://github.com/apache/arrow/tree/master/rust) + - [Rust libraries](https://github.com/apache/arrow-rs) Arrow is an [Apache Software Foundation](https://www.apache.org) project. Learn more at [arrow.apache.org](https://arrow.apache.org). 
diff --git a/ci/detect-changes.py b/ci/detect-changes.py index c32f6e040dd..14e71ed48ce 100644 --- a/ci/detect-changes.py +++ b/ci/detect-changes.py @@ -140,7 +140,7 @@ def list_github_actions_affected_files(): LANGUAGE_TOPICS = ['c_glib', 'cpp', 'docs', 'go', 'java', 'js', 'python', - 'r', 'ruby', 'rust', 'csharp'] + 'r', 'ruby', 'csharp'] ALL_TOPICS = LANGUAGE_TOPICS + ['integration', 'dev'] @@ -161,7 +161,7 @@ def list_github_actions_affected_files(): } COMPONENTS = {'cpp', 'java', 'c_glib', 'r', 'ruby', 'integration', 'js', - 'rust', 'csharp', 'go', 'docs', 'python', 'dev'} + 'csharp', 'go', 'docs', 'python', 'dev'} def get_affected_topics(affected_files): @@ -298,7 +298,6 @@ def test_get_affected_topics(): 'python': True, 'r': True, 'ruby': True, - 'rust': False, 'csharp': False, 'integration': True, 'dev': False @@ -315,7 +314,6 @@ def test_get_affected_topics(): 'python': True, 'r': True, 'ruby': True, - 'rust': True, 'csharp': True, 'integration': True, 'dev': False @@ -332,7 +330,6 @@ def test_get_affected_topics(): 'python': True, 'r': True, 'ruby': True, - 'rust': True, 'csharp': True, 'integration': True, 'dev': True, diff --git a/ci/docker/linux-apt-lint.dockerfile b/ci/docker/linux-apt-lint.dockerfile index 66538919c49..c711c4c883c 100644 --- a/ci/docker/linux-apt-lint.dockerfile +++ b/ci/docker/linux-apt-lint.dockerfile @@ -45,15 +45,6 @@ COPY --from=hadolint /bin/hadolint /usr/bin/hadolint COPY ci/scripts/install_iwyu.sh /arrow/ci/scripts/ RUN arrow/ci/scripts/install_iwyu.sh /tmp/iwyu /usr/local ${clang_tools} -# Rust linter -ARG rust=nightly-2021-03-24 -RUN curl https://sh.rustup.rs -sSf | \ - sh -s -- --default-toolchain stable -y -ENV PATH /root/.cargo/bin:$PATH -RUN rustup install ${rust} && \ - rustup default ${rust} && \ - rustup component add rustfmt - # Use python3 by default in scripts RUN ln -s /usr/bin/python3 /usr/local/bin/python && \ ln -s /usr/bin/pip3 /usr/local/bin/pip diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 06dd6b60370..5329e0abbe2 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -282,7 +282,6 @@ def build(ctx, src, build_dir, force, targets, **kwargs): LintCheck('rat', "Check all sources files for license texts via Apache RAT."), LintCheck('r', "Lint R files."), - LintCheck('rust', "Lint Rust files."), LintCheck('docker', "Lint Dockerfiles with hadolint."), ] diff --git a/dev/archery/archery/lang/rust.py b/dev/archery/archery/lang/rust.py deleted file mode 100644 index b1d765b7d52..00000000000 --- a/dev/archery/archery/lang/rust.py +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from ..utils.command import Command, default_bin - - -class Cargo(Command): - def __init__(self, cargo_bin=None): - self.bin = default_bin(cargo_bin, "cargo") diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 3b94d0139c0..0b0e8b46948 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -26,7 +26,6 @@ from .git import git from .logger import logger from ..lang.cpp import CppCMakeDefinition, CppConfiguration -from ..lang.rust import Cargo from ..lang.python import Autopep8, Flake8, NumpyDoc from .rat import Rat, exclusion_from_globs from .tmpdir import tmpdir @@ -292,20 +291,6 @@ def r_linter(src): yield LintResult.from_cmd(Bash().run(r_lint_sh, check=False)) -def rust_linter(src): - """Run Rust linter.""" - logger.info("Running Rust linter") - cargo = Cargo() - - if not cargo.available: - logger.error("Rust linter requested but cargo executable not found.") - return - - yield LintResult.from_cmd(cargo.run("+stable", "fmt", "--all", "--", - "--check", cwd=src.rust, - check=False)) - - class Hadolint(Command): def __init__(self, hadolint_bin=None): self.bin = default_bin(hadolint_bin, "hadolint") @@ -341,7 +326,7 @@ def docker_linter(src): def linter(src, fix=False, *, clang_format=False, cpplint=False, clang_tidy=False, iwyu=False, iwyu_all=False, python=False, numpydoc=False, cmake_format=False, rat=False, - r=False, rust=False, docker=False): + r=False, docker=False): """Run all linters.""" with tmpdir(prefix="arrow-lint-") as root: build_dir = os.path.join(root, "cpp-build") @@ -375,9 +360,6 @@ def linter(src, fix=False, *, clang_format=False, cpplint=False, if r: results.extend(r_linter(src)) - if rust: - results.extend(rust_linter(src)) - if docker: results.extend(docker_linter(src)) diff --git a/dev/archery/archery/utils/source.py b/dev/archery/archery/utils/source.py index f7e47a5a1b6..1080cb75d67 100644 --- a/dev/archery/archery/utils/source.py +++ b/dev/archery/archery/utils/source.py @@ -88,11 +88,6 @@ def r(self): """ Returns the r directory of an Arrow sources. """ return self.path / "r" - @property - def rust(self): - """ Returns the rust directory of an Arrow sources. """ - return self.path / "rust" - @property def git_backed(self): """ Indicate if the sources are backed by git. 
""" diff --git a/dev/release/01-prepare-test.rb b/dev/release/01-prepare-test.rb index 007e4da040c..3cc5418df0f 100644 --- a/dev/release/01-prepare-test.rb +++ b/dev/release/01-prepare-test.rb @@ -324,119 +324,6 @@ def test_version_pre_tag "+ VERSION = \"#{@release_version}\""], ], }, - { - path: "rust/arrow-flight/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }"], - ], - }, - { - path: "rust/arrow-pyarrow-integration-testing/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }"], - ], - }, - { - path: "rust/arrow/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ], - }, - { - path: "rust/benchmarks/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ], - }, - { - path: "rust/datafusion-examples/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ], - }, - { - path: "rust/datafusion/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\", features = [\"prettyprint\"] }", - "-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\", features = [\"arrow\"] }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\", features = [\"prettyprint\"] }", - "+parquet = { path = \"../parquet\", version = \"#{@release_version}\", features = [\"arrow\"] }"], - ], - }, - { - path: "rust/datafusion/README.md", - hunks: [ - ["-datafusion = \"#{@snapshot_version}\"", - "+datafusion = \"#{@release_version}\""], - ], - }, - { - path: "rust/integration-testing/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ], - }, - { - path: "rust/parquet/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\", optional = true }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\", optional = true }"], - ["-arrow = { path = \"../arrow\", version = \"#{@snapshot_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@release_version}\" }"], - ], - }, - { - path: "rust/parquet/README.md", - hunks: [ - ["-parquet = \"#{@snapshot_version}\"", - "+parquet = \"#{@release_version}\""], - ["-See [crate documentation](https://docs.rs/crate/parquet/#{@snapshot_version}) on available API.", - "+See [crate documentation](https://docs.rs/crate/parquet/#{@release_version}) on available API."], - ], - }, - { - path: "rust/parquet_derive/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }"], - ], - }, - { - path: "rust/parquet_derive/README.md", - hunks: [ - ["-parquet = \"#{@snapshot_version}\"", - "-parquet_derive = \"#{@snapshot_version}\"", - "+parquet = \"#{@release_version}\"", - "+parquet_derive = 
\"#{@release_version}\""], - ], - }, - { - path: "rust/parquet_derive_test/Cargo.toml", - hunks: [ - ["-version = \"#{@snapshot_version}\"", - "+version = \"#{@release_version}\""], - ["-parquet = { path = \"../parquet\", version = \"#{@snapshot_version}\" }", - "-parquet_derive = { path = \"../parquet_derive\", version = \"#{@snapshot_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", - "+parquet_derive = { path = \"../parquet_derive\", version = \"#{@release_version}\" }"], - ], - }, ], parse_patch(git("log", "-n", "1", "-p"))) end @@ -633,119 +520,6 @@ def test_version_post_tag "+ VERSION = \"#{@next_snapshot_version}\""], ], }, - { - path: "rust/arrow-flight/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\" }"], - ], - }, - { - path: "rust/arrow-pyarrow-integration-testing/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\" }"], - ], - }, - { - path: "rust/arrow/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/benchmarks/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/datafusion-examples/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/datafusion/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\", features = [\"prettyprint\"] }", - "-parquet = { path = \"../parquet\", version = \"#{@release_version}\", features = [\"arrow\"] }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\", features = [\"prettyprint\"] }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\", features = [\"arrow\"] }"], - ], - }, - { - path: "rust/datafusion/README.md", - hunks: [ - ["-datafusion = \"#{@release_version}\"", - "+datafusion = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/integration-testing/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/parquet/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\", optional = true }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\", optional = true }"], - ["-arrow = { path = \"../arrow\", version = \"#{@release_version}\" }", - "+arrow = { path = \"../arrow\", version = \"#{@next_snapshot_version}\" }"], - ], - }, - { - path: "rust/parquet/README.md", - hunks: [ - ["-parquet = \"#{@release_version}\"", - "+parquet = \"#{@next_snapshot_version}\""], - ["-See [crate documentation](https://docs.rs/crate/parquet/#{@release_version}) on available API.", - "+See [crate documentation](https://docs.rs/crate/parquet/#{@next_snapshot_version}) on available API."], - ], - }, - { - path: "rust/parquet_derive/Cargo.toml", 
- hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }"], - ], - }, - { - path: "rust/parquet_derive/README.md", - hunks: [ - ["-parquet = \"#{@release_version}\"", - "-parquet_derive = \"#{@release_version}\"", - "+parquet = \"#{@next_snapshot_version}\"", - "+parquet_derive = \"#{@next_snapshot_version}\""], - ], - }, - { - path: "rust/parquet_derive_test/Cargo.toml", - hunks: [ - ["-version = \"#{@release_version}\"", - "+version = \"#{@next_snapshot_version}\""], - ["-parquet = { path = \"../parquet\", version = \"#{@release_version}\" }", - "-parquet_derive = { path = \"../parquet_derive\", version = \"#{@release_version}\" }", - "+parquet = { path = \"../parquet\", version = \"#{@next_snapshot_version}\" }", - "+parquet_derive = { path = \"../parquet_derive\", version = \"#{@next_snapshot_version}\" }"], - ], - }, ], parse_patch(git("log", "-n", "1", "-p"))) end diff --git a/dev/release/post-07-rust.sh b/dev/release/post-07-rust.sh deleted file mode 100755 index 3c94607565f..00000000000 --- a/dev/release/post-07-rust.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# -*- indent-tabs-mode: nil; sh-indentation: 2; sh-basic-offset: 2 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -set -e -set -o pipefail - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit -fi - -version=$1 - -: ${INSTALL_RUST:=no} - -if [ "${INSTALL_RUST}" == "yes" ]; then - export RUSTUP_HOME="$(pwd)/release-rustup" - export CARGO_HOME="${RUSTUP_HOME}" - rm -rf "${RUSTUP_HOME}" - curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path - export PATH="${RUSTUP_HOME}/bin:$PATH" - source "${RUSTUP_HOME}/env" - rustup default stable - cargo login -fi - -archive_name=apache-arrow-${version} -tar_gz=${archive_name}.tar.gz -rm -f ${tar_gz} -curl \ - --remote-name \ - --fail \ - https://downloads.apache.org/arrow/arrow-${version}/${tar_gz} -rm -rf ${archive_name} -tar xf ${tar_gz} -modules=() -for cargo_toml in ${archive_name}/rust/*/Cargo.toml; do - module_dir=$(dirname ${cargo_toml}) - pushd ${module_dir} - cargo publish --allow-dirty - modules+=($(basename ${module_dir})) - popd -done -popd -rm -rf ${archive_name} -rm -f ${tar_gz} - -if [ "${INSTALL_RUST}" == "yes" ]; then - rm -rf "${RUSTUP_HOME}" -fi - -echo "Success! 
The released packages are available here:" -for module in ${modules[@]}; do - echo " https://crates.io/crates/${module}/${version}" -done diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index a50d729a7d5..13e431ceb8d 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -190,9 +190,6 @@ r/inst/include/cpp11.hpp r/inst/include/cpp11/*.hpp .gitattributes ruby/red-arrow/.yardopts -rust/arrow/test/data/*.csv -rust/rust-toolchain -rust/arrow-flight/src/arrow.flight.protocol.rs julia/Arrow/Project.toml julia/Arrow/README.md julia/Arrow/docs/Manifest.toml @@ -202,6 +199,3 @@ julia/Arrow/docs/mkdocs.yml julia/Arrow/docs/src/index.md julia/Arrow/docs/src/manual.md julia/Arrow/docs/src/reference.md -rust/ballista/rust/benchmarks/tpch/queries/q*.sql -rust/ballista/rust/scheduler/testdata/* -rust/ballista/ui/scheduler/yarn.lock diff --git a/dev/release/utils-prepare.sh b/dev/release/utils-prepare.sh index a1c884125a8..93ddb18b77c 100644 --- a/dev/release/utils-prepare.sh +++ b/dev/release/utils-prepare.sh @@ -145,23 +145,4 @@ update_versions() { rm -f */*/*/version.rb.bak git add */*/*/version.rb popd - - pushd "${ARROW_DIR}/rust" - sed -i.bak -E \ - -e "s/^version = \".+\"/version = \"${version}\"/g" \ - -e "s/^(arrow = .* version = )\".*\"(( .*)|(, features = .*)|(, optional = .*))$/\\1\"${version}\"\\2/g" \ - -e "s/^(arrow-flight = .* version = )\".+\"( .*)/\\1\"${version}\"\\2/g" \ - -e "s/^(parquet = .* version = )\".*\"(( .*)|(, features = .*))$/\\1\"${version}\"\\2/g" \ - -e "s/^(parquet_derive = .* version = )\".*\"(( .*)|(, features = .*))$/\\1\"${version}\"\\2/g" \ - */Cargo.toml - rm -f */Cargo.toml.bak - git add */Cargo.toml - - sed -i.bak -E \ - -e "s/^([^ ]+) = \".+\"/\\1 = \"${version}\"/g" \ - -e "s,docs\.rs/crate/([^/]+)/[^)]+,docs.rs/crate/\\1/${version},g" \ - */README.md - rm -f */README.md.bak - git add */README.md - popd } diff --git a/docs/source/developers/contributing.rst b/docs/source/developers/contributing.rst index 9aecf8a6915..e75d2c6336f 100644 --- a/docs/source/developers/contributing.rst +++ b/docs/source/developers/contributing.rst @@ -215,7 +215,7 @@ in the end. To make the review process smooth for everyone, try to for maintainers to accept. * Add new unit tests for your code. * Follow the style guides for the part(s) of the project you're modifying. - Some languages (C++, Python, and Rust, for example) run a lint check in + Some languages (C++ and Python, for example) run a lint check in continuous integration. For all languages, see their respective developer documentation and READMEs for style guidance. 
In general, try to make it look as if the codebase has a single author, and emulate any conventions you see, diff --git a/matlab/doc/matlab_interface_for_apache_arrow_design.md b/matlab/doc/matlab_interface_for_apache_arrow_design.md index de2bb13c39d..5d64c8e85bb 100644 --- a/matlab/doc/matlab_interface_for_apache_arrow_design.md +++ b/matlab/doc/matlab_interface_for_apache_arrow_design.md @@ -362,5 +362,5 @@ The table below provides a high-level roadmap for the development of specific ca [Add-On Explorer]: https://www.mathworks.com/help/matlab/matlab_env/get-add-ons.html [JavaScript user]: https://github.com/apache/arrow/tree/master/js [`apache-arrow` package via the `npm` package manager]: https://www.npmjs.com/package/apache-arrow -[Rust user]: https://github.com/apache/arrow/tree/master/rust +[Rust user]: https://github.com/apache/arrow-rs [`arrow` crate via the `cargo` package manager]: https://crates.io/crates/arrow diff --git a/rust/.gitignore b/rust/.gitignore deleted file mode 100644 index 389f4ab254b..00000000000 --- a/rust/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -Cargo.lock -target -rusty-tags.vi -.history -.flatbuffers/ diff --git a/rust/Cargo.toml b/rust/Cargo.toml deleted file mode 100644 index de26f87c778..00000000000 --- a/rust/Cargo.toml +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[workspace] -members = [ - "arrow", - "parquet", - "parquet_derive", - "parquet_derive_test", - "datafusion", - "datafusion-examples", - "arrow-flight", - "integration-testing", - "benchmarks", -] - -# this package is excluded because it requires different compilation flags, thereby significantly changing -# how it is compiled within the workspace, causing the whole workspace to be compiled from scratch -# this way, this is a stand-alone package that compiles independently of the others. -exclude = ["arrow-pyarrow-integration-testing", "ballista"] diff --git a/rust/README.md b/rust/README.md deleted file mode 100644 index 7fdef29bcdb..00000000000 --- a/rust/README.md +++ /dev/null @@ -1,186 +0,0 @@ - - -# Native Rust implementation of Apache Arrow - -[![Coverage Status](https://codecov.io/gh/apache/arrow/rust/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/arrow?branch=master) - -Welcome to the implementation of Arrow, the popular in-memory columnar format, in [Rust](https://www.rust-lang.org/). 
- -This part of the Arrow project is divided in 4 main components: - -| Crate | Description | Documentation | -|-----------|-------------|---------------| -|Arrow | Core functionality (memory layout, arrays, low level computations) | [(README)](arrow/README.md) | -|Parquet | Parquet support | [(README)](parquet/README.md) | -|Arrow-flight | Arrow data between processes | [(README)](arrow-flight/README.md) | -|DataFusion | In-memory query engine with SQL support | [(README)](datafusion/README.md) | -|Ballista | Distributed query execution | [(README)](ballista/README.md) | - -Independently, they support a vast array of functionality for in-memory computations. - -Together, they allow users to write an SQL query or a `DataFrame` (using the `datafusion` crate), run it against a parquet file (using the `parquet` crate), evaluate it in-memory using Arrow's columnar format (using the `arrow` crate), and send to another process (using the `arrow-flight` crate). - -Generally speaking, the `arrow` crate offers functionality to develop code that uses Arrow arrays, and `datafusion` offers most operations typically found in SQL, with the notable exceptions of: - -* `join` -* `window` functions - -There are too many features to enumerate here, but some notable mentions: - -* `Arrow` implements all formats in the specification except certain dictionaries -* `Arrow` supports SIMD operations to some of its vertical operations -* `DataFusion` supports `async` execution -* `DataFusion` supports user-defined functions, aggregates, and whole execution nodes - -You can find more details about each crate in their respective READMEs. - -## Arrow Rust Community - -We use the official [ASF Slack](https://s.apache.org/slack-invite) for informal discussions and coordination. This is -a great place to meet other contributors and get guidance on where to contribute. Join us in the `arrow-rust` channel. - -We use [ASF JIRA](https://issues.apache.org/jira/secure/Dashboard.jspa) as the system of record for new features -and bug fixes and this plays a critical role in the release process. - -For design discussions we generally collaborate on Google documents and file a JIRA linking to the document. - -There is also a bi-weekly Rust-specific sync call for the Arrow Rust community. This is hosted on Google Meet -at https://meet.google.com/ctp-yujs-aee on alternate Wednesday's at 09:00 US/Pacific, 12:00 US/Eastern. During -US daylight savings time this corresponds to 16:00 UTC and at other times this is 17:00 UTC. - -## Developer's guide to Arrow Rust - -### How to compile - -This is a standard cargo project with workspaces. To build it, you need to have `rust` and `cargo`: - -```bash -cd /rust && cargo build -``` - -You can also use rust's official docker image: - -```bash -docker run --rm -v $(pwd)/rust:/rust -it rust /bin/bash -c "cd /rust && cargo build" -``` - -The command above assumes that are in the root directory of the project, not in the same -directory as this README.md. - -You can also compile specific workspaces: - -```bash -cd /rust/arrow && cargo build -``` - -### Git Submodules - -Before running tests and examples, it is necessary to set up the local development environment. - -The tests rely on test data that is contained in git submodules. 
- -To pull down this data run the following: - -```bash -git submodule update --init -``` - -This populates data in two git submodules: - -- `../cpp/submodules/parquet_testing/data` (sourced from https://github.com/apache/parquet-testing.git) -- `../testing` (sourced from https://github.com/apache/arrow-testing) - -By default, `cargo test` will look for these directories at their -standard location. The following environment variables can be used to override the location: - -```bash -# Optionaly specify a different location for test data -export PARQUET_TEST_DATA=$(cd ../cpp/submodules/parquet-testing/data; pwd) -export ARROW_TEST_DATA=$(cd ../testing/data; pwd) -``` - -From here on, this is a pure Rust project and `cargo` can be used to run tests, benchmarks, docs and examples as usual. - - -### Running the tests - -Run tests using the Rust standard `cargo test` command: - -```bash -# run all tests. -cargo test - - -# run only tests for the arrow crate -cargo test -p arrow -``` - -## Code Formatting - -Our CI uses `rustfmt` to check code formatting. Before submitting a -PR be sure to run the following and check for lint issues: - -```bash -cargo +stable fmt --all -- --check -``` - -## Clippy Lints - -We recommend using `clippy` for checking lints during development. While we do not yet enforce `clippy` checks, we recommend not introducing new `clippy` errors or warnings. - -Run the following to check for clippy lints. - -``` -cargo clippy -``` - -If you use Visual Studio Code with the `rust-analyzer` plugin, you can enable `clippy` to run each time you save a file. See https://users.rust-lang.org/t/how-to-use-clippy-in-vs-code-with-rust-analyzer/41881. - -One of the concerns with `clippy` is that it often produces a lot of false positives, or that some recommendations may hurt readability. We do not have a policy of which lints are ignored, but if you disagree with a `clippy` lint, you may disable the lint and briefly justify it. - -Search for `allow(clippy::` in the codebase to identify lints that are ignored/allowed. We currently prefer ignoring lints on the lowest unit possible. -* If you are introducing a line that returns a lint warning or error, you may disable the lint on that line. -* If you have several lints on a function or module, you may disable the lint on the function or module. -* If a lint is pervasive across multiple modules, you may disable it at the crate level. - -## Git Pre-Commit Hook - -We can use [git pre-commit hook](https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks) to automate various kinds of git pre-commit checking/formatting. - -Suppose you are in the root directory of the project. - -First check if the file already exists: - -```bash -ls -l .git/hooks/pre-commit -``` - -If the file already exists, to avoid mistakenly **overriding**, you MAY have to check -the link source or file content. Else if not exist, let's safely soft link [pre-commit.sh](pre-commit.sh) as file `.git/hooks/pre-commit`: - -``` -ln -s ../../rust/pre-commit.sh .git/hooks/pre-commit -``` - -If sometimes you want to commit without checking, just run `git commit` with `--no-verify`: - -```bash -git commit --no-verify -m "... commit message ..." -``` diff --git a/rust/arrow-flight/Cargo.toml b/rust/arrow-flight/Cargo.toml deleted file mode 100644 index de6aa832315..00000000000 --- a/rust/arrow-flight/Cargo.toml +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[package] -name = "arrow-flight" -description = "Apache Arrow Flight" -version = "5.0.0-SNAPSHOT" -edition = "2018" -authors = ["Apache Arrow "] -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" -license = "Apache-2.0" - -[dependencies] -arrow = { path = "../arrow", version = "5.0.0-SNAPSHOT" } -tonic = "0.4" -bytes = "1" -prost = "0.7" -prost-derive = "0.7" -tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread"] } -futures = { version = "0.3", default-features = false, features = ["alloc"]} - -[build-dependencies] -tonic-build = "0.4" -# Pin specific version of the tonic-build dependencies to avoid auto-generated -# (and checked in) arrow.flight.protocol.rs from changing -proc-macro2 = "=1.0.24" - -#[lib] -#name = "flight" -#path = "src/lib.rs" diff --git a/rust/arrow-flight/README.md b/rust/arrow-flight/README.md deleted file mode 100644 index ba63f65bc48..00000000000 --- a/rust/arrow-flight/README.md +++ /dev/null @@ -1,29 +0,0 @@ - - -# Apache Arrow Flight - -Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. - -This crate simply provides the Rust implementation of the [Flight.proto](../../format/Flight.proto) gRPC protocol and provides an example that demonstrates how to build a Flight server implemented with Tonic. - -Note that building a Flight server also requires an implementation of Arrow IPC which is based on the Flatbuffers serialization framework. The Rust implementation of Arrow IPC is not yet complete although the generated Flatbuffers code is available as part of the core Arrow crate. - - - diff --git a/rust/arrow-flight/build.rs b/rust/arrow-flight/build.rs deleted file mode 100644 index ca232551455..00000000000 --- a/rust/arrow-flight/build.rs +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use std::{ - env, - fs::OpenOptions, - io::{Read, Write}, - path::Path, -}; - -fn main() -> Result<(), Box> { - // avoid rerunning build if the file has not changed - println!("cargo:rerun-if-changed=../../format/Flight.proto"); - - // override the build location, in order to check in the changes to proto files - env::set_var("OUT_DIR", "src"); - - // The current working directory can vary depending on how the project is being - // built or released so we build an absolute path to the proto file - let path = Path::new("../../format/Flight.proto"); - if path.exists() { - tonic_build::compile_protos("../../format/Flight.proto")?; - // read file contents to string - let mut file = OpenOptions::new() - .read(true) - .open("src/arrow.flight.protocol.rs")?; - let mut buffer = String::new(); - file.read_to_string(&mut buffer)?; - // append warning that file was auto-generate - let mut file = OpenOptions::new() - .write(true) - .truncate(true) - .open("src/arrow.flight.protocol.rs")?; - file.write_all("// This file was automatically generated through the build.rs script, and should not be edited.\n\n".as_bytes())?; - file.write_all(buffer.as_bytes())?; - } - - // As the proto file is checked in, the build should not fail if the file is not found - Ok(()) -} diff --git a/rust/arrow-flight/examples/server.rs b/rust/arrow-flight/examples/server.rs deleted file mode 100644 index 75d05378710..00000000000 --- a/rust/arrow-flight/examples/server.rs +++ /dev/null @@ -1,131 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use std::pin::Pin; - -use futures::Stream; -use tonic::transport::Server; -use tonic::{Request, Response, Status, Streaming}; - -use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, - HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, -}; - -#[derive(Clone)] -pub struct FlightServiceImpl {} - -#[tonic::async_trait] -impl FlightService for FlightServiceImpl { - type HandshakeStream = Pin< - Box> + Send + Sync + 'static>, - >; - type ListFlightsStream = - Pin> + Send + Sync + 'static>>; - type DoGetStream = - Pin> + Send + Sync + 'static>>; - type DoPutStream = - Pin> + Send + Sync + 'static>>; - type DoActionStream = Pin< - Box< - dyn Stream> - + Send - + Sync - + 'static, - >, - >; - type ListActionsStream = - Pin> + Send + Sync + 'static>>; - type DoExchangeStream = - Pin> + Send + Sync + 'static>>; - - async fn handshake( - &self, - _request: Request>, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } - - async fn list_flights( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } - - async fn get_flight_info( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } - - async fn get_schema( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } - - async fn do_get( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } - - async fn do_put( - &self, - _request: Request>, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } - - async fn do_action( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } - - async fn list_actions( - &self, - _request: Request, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } - - async fn do_exchange( - &self, - _request: Request>, - ) -> Result, Status> { - Err(Status::unimplemented("Not yet implemented")) - } -} - -#[tokio::main] -async fn main() -> Result<(), Box> { - let addr = "[::1]:50051".parse()?; - let service = FlightServiceImpl {}; - - let svc = FlightServiceServer::new(service); - - Server::builder().add_service(svc).serve(addr).await?; - - Ok(()) -} diff --git a/rust/arrow-flight/src/arrow.flight.protocol.rs b/rust/arrow-flight/src/arrow.flight.protocol.rs deleted file mode 100644 index 5fce526ff6e..00000000000 --- a/rust/arrow-flight/src/arrow.flight.protocol.rs +++ /dev/null @@ -1,1039 +0,0 @@ -// This file was automatically generated through the build.rs script, and should not be edited. - -/// -/// The request that a client provides to a server on handshake. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct HandshakeRequest { - /// - /// A defined protocol version - #[prost(uint64, tag = "1")] - pub protocol_version: u64, - /// - /// Arbitrary auth/handshake info. - #[prost(bytes = "vec", tag = "2")] - pub payload: ::prost::alloc::vec::Vec, -} -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct HandshakeResponse { - /// - /// A defined protocol version - #[prost(uint64, tag = "1")] - pub protocol_version: u64, - /// - /// Arbitrary auth/handshake info. - #[prost(bytes = "vec", tag = "2")] - pub payload: ::prost::alloc::vec::Vec, -} -/// -/// A message for doing simple auth. 
-#[derive(Clone, PartialEq, ::prost::Message)] -pub struct BasicAuth { - #[prost(string, tag = "2")] - pub username: ::prost::alloc::string::String, - #[prost(string, tag = "3")] - pub password: ::prost::alloc::string::String, -} -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Empty {} -/// -/// Describes an available action, including both the name used for execution -/// along with a short description of the purpose of the action. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ActionType { - #[prost(string, tag = "1")] - pub r#type: ::prost::alloc::string::String, - #[prost(string, tag = "2")] - pub description: ::prost::alloc::string::String, -} -/// -/// A service specific expression that can be used to return a limited set -/// of available Arrow Flight streams. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Criteria { - #[prost(bytes = "vec", tag = "1")] - pub expression: ::prost::alloc::vec::Vec, -} -/// -/// An opaque action specific for the service. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Action { - #[prost(string, tag = "1")] - pub r#type: ::prost::alloc::string::String, - #[prost(bytes = "vec", tag = "2")] - pub body: ::prost::alloc::vec::Vec, -} -/// -/// An opaque result returned after executing an action. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Result { - #[prost(bytes = "vec", tag = "1")] - pub body: ::prost::alloc::vec::Vec, -} -/// -/// Wrap the result of a getSchema call -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct SchemaResult { - /// schema of the dataset as described in Schema.fbs::Schema. - #[prost(bytes = "vec", tag = "1")] - pub schema: ::prost::alloc::vec::Vec, -} -/// -/// The name or tag for a Flight. May be used as a way to retrieve or generate -/// a flight or be used to expose a set of previously defined flights. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FlightDescriptor { - #[prost(enumeration = "flight_descriptor::DescriptorType", tag = "1")] - pub r#type: i32, - /// - /// Opaque value used to express a command. Should only be defined when - /// type = CMD. - #[prost(bytes = "vec", tag = "2")] - pub cmd: ::prost::alloc::vec::Vec, - /// - /// List of strings identifying a particular dataset. Should only be defined - /// when type = PATH. - #[prost(string, repeated, tag = "3")] - pub path: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, -} -/// Nested message and enum types in `FlightDescriptor`. -pub mod flight_descriptor { - /// - /// Describes what type of descriptor is defined. - #[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, - )] - #[repr(i32)] - pub enum DescriptorType { - /// Protobuf pattern, not used. - Unknown = 0, - /// - /// A named path that identifies a dataset. A path is composed of a string - /// or list of strings describing a particular dataset. This is conceptually - /// similar to a path inside a filesystem. - Path = 1, - /// - /// An opaque command to generate a dataset. - Cmd = 2, - } -} -/// -/// The access coordinates for retrieval of a dataset. With a FlightInfo, a -/// consumer is able to determine how to retrieve a dataset. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FlightInfo { - /// schema of the dataset as described in Schema.fbs::Schema. - #[prost(bytes = "vec", tag = "1")] - pub schema: ::prost::alloc::vec::Vec, - /// - /// The descriptor associated with this info. 
- #[prost(message, optional, tag = "2")] - pub flight_descriptor: ::core::option::Option, - /// - /// A list of endpoints associated with the flight. To consume the whole - /// flight, all endpoints must be consumed. - #[prost(message, repeated, tag = "3")] - pub endpoint: ::prost::alloc::vec::Vec, - /// Set these to -1 if unknown. - #[prost(int64, tag = "4")] - pub total_records: i64, - #[prost(int64, tag = "5")] - pub total_bytes: i64, -} -/// -/// A particular stream or split associated with a flight. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FlightEndpoint { - /// - /// Token used to retrieve this stream. - #[prost(message, optional, tag = "1")] - pub ticket: ::core::option::Option, - /// - /// A list of URIs where this ticket can be redeemed. If the list is - /// empty, the expectation is that the ticket can only be redeemed on the - /// current service where the ticket was generated. - #[prost(message, repeated, tag = "2")] - pub location: ::prost::alloc::vec::Vec, -} -/// -/// A location where a Flight service will accept retrieval of a particular -/// stream given a ticket. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Location { - #[prost(string, tag = "1")] - pub uri: ::prost::alloc::string::String, -} -/// -/// An opaque identifier that the service can use to retrieve a particular -/// portion of a stream. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Ticket { - #[prost(bytes = "vec", tag = "1")] - pub ticket: ::prost::alloc::vec::Vec, -} -/// -/// A batch of Arrow data as part of a stream of batches. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FlightData { - /// - /// The descriptor of the data. This is only relevant when a client is - /// starting a new DoPut stream. - #[prost(message, optional, tag = "1")] - pub flight_descriptor: ::core::option::Option, - /// - /// Header for message data as described in Message.fbs::Message. - #[prost(bytes = "vec", tag = "2")] - pub data_header: ::prost::alloc::vec::Vec, - /// - /// Application-defined metadata. - #[prost(bytes = "vec", tag = "3")] - pub app_metadata: ::prost::alloc::vec::Vec, - /// - /// The actual batch of Arrow data. Preferably handled with minimal-copies - /// coming last in the definition to help with sidecar patterns (it is - /// expected that some implementations will fetch this field off the wire - /// with specialized code to avoid extra memory copies). - #[prost(bytes = "vec", tag = "1000")] - pub data_body: ::prost::alloc::vec::Vec, -} -///* -/// The response message associated with the submission of a DoPut. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct PutResult { - #[prost(bytes = "vec", tag = "1")] - pub app_metadata: ::prost::alloc::vec::Vec, -} -#[doc = r" Generated client implementations."] -pub mod flight_service_client { - #![allow(unused_variables, dead_code, missing_docs)] - use tonic::codegen::*; - #[doc = ""] - #[doc = " A flight service is an endpoint for retrieving or storing Arrow data. A"] - #[doc = " flight service can expose one or more predefined endpoints that can be"] - #[doc = " accessed using the Arrow Flight Protocol. 
Additionally, a flight service"] - #[doc = " can expose a set of actions that are available."] - pub struct FlightServiceClient { - inner: tonic::client::Grpc, - } - impl FlightServiceClient { - #[doc = r" Attempt to create a new client by connecting to a given endpoint."] - pub async fn connect(dst: D) -> Result - where - D: std::convert::TryInto, - D::Error: Into, - { - let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; - Ok(Self::new(conn)) - } - } - impl FlightServiceClient - where - T: tonic::client::GrpcService, - T::ResponseBody: Body + HttpBody + Send + 'static, - T::Error: Into, - ::Error: Into + Send, - { - pub fn new(inner: T) -> Self { - let inner = tonic::client::Grpc::new(inner); - Self { inner } - } - pub fn with_interceptor( - inner: T, - interceptor: impl Into, - ) -> Self { - let inner = tonic::client::Grpc::with_interceptor(inner, interceptor); - Self { inner } - } - #[doc = ""] - #[doc = " Handshake between client and server. Depending on the server, the"] - #[doc = " handshake may be required to determine the token that should be used for"] - #[doc = " future operations. Both request and response are streams to allow multiple"] - #[doc = " round-trips depending on auth mechanism."] - pub async fn handshake( - &mut self, - request: impl tonic::IntoStreamingRequest, - ) -> Result< - tonic::Response>, - tonic::Status, - > { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/Handshake", - ); - self.inner - .streaming(request.into_streaming_request(), path, codec) - .await - } - #[doc = ""] - #[doc = " Get a list of available streams given a particular criteria. Most flight"] - #[doc = " services will expose one or more streams that are readily available for"] - #[doc = " retrieval. This api allows listing the streams available for"] - #[doc = " consumption. A user can also provide a criteria. The criteria can limit"] - #[doc = " the subset of streams that can be listed via this interface. Each flight"] - #[doc = " service allows its own definition of how to consume criteria."] - pub async fn list_flights( - &mut self, - request: impl tonic::IntoRequest, - ) -> Result< - tonic::Response>, - tonic::Status, - > { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/ListFlights", - ); - self.inner - .server_streaming(request.into_request(), path, codec) - .await - } - #[doc = ""] - #[doc = " For a given FlightDescriptor, get information about how the flight can be"] - #[doc = " consumed. This is a useful interface if the consumer of the interface"] - #[doc = " already can identify the specific flight to consume. This interface can"] - #[doc = " also allow a consumer to generate a flight stream through a specified"] - #[doc = " descriptor. For example, a flight descriptor might be something that"] - #[doc = " includes a SQL statement or a Pickled Python operation that will be"] - #[doc = " executed. 
In those cases, the descriptor will not be previously available"] - #[doc = " within the list of available streams provided by ListFlights but will be"] - #[doc = " available for consumption for the duration defined by the specific flight"] - #[doc = " service."] - pub async fn get_flight_info( - &mut self, - request: impl tonic::IntoRequest, - ) -> Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/GetFlightInfo", - ); - self.inner.unary(request.into_request(), path, codec).await - } - #[doc = ""] - #[doc = " For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema"] - #[doc = " This is used when a consumer needs the Schema of flight stream. Similar to"] - #[doc = " GetFlightInfo this interface may generate a new flight that was not previously"] - #[doc = " available in ListFlights."] - pub async fn get_schema( - &mut self, - request: impl tonic::IntoRequest, - ) -> Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/GetSchema", - ); - self.inner.unary(request.into_request(), path, codec).await - } - #[doc = ""] - #[doc = " Retrieve a single stream associated with a particular descriptor"] - #[doc = " associated with the referenced ticket. A Flight can be composed of one or"] - #[doc = " more streams where each stream can be retrieved using a separate opaque"] - #[doc = " ticket that the flight service uses for managing a collection of streams."] - pub async fn do_get( - &mut self, - request: impl tonic::IntoRequest, - ) -> Result< - tonic::Response>, - tonic::Status, - > { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/DoGet", - ); - self.inner - .server_streaming(request.into_request(), path, codec) - .await - } - #[doc = ""] - #[doc = " Push a stream to the flight service associated with a particular"] - #[doc = " flight stream. This allows a client of a flight service to upload a stream"] - #[doc = " of data. Depending on the particular flight service, a client consumer"] - #[doc = " could be allowed to upload a single stream per descriptor or an unlimited"] - #[doc = " number. 
In the latter, the service might implement a 'seal' action that"] - #[doc = " can be applied to a descriptor once all streams are uploaded."] - pub async fn do_put( - &mut self, - request: impl tonic::IntoStreamingRequest, - ) -> Result< - tonic::Response>, - tonic::Status, - > { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/DoPut", - ); - self.inner - .streaming(request.into_streaming_request(), path, codec) - .await - } - #[doc = ""] - #[doc = " Open a bidirectional data channel for a given descriptor. This"] - #[doc = " allows clients to send and receive arbitrary Arrow data and"] - #[doc = " application-specific metadata in a single logical stream. In"] - #[doc = " contrast to DoGet/DoPut, this is more suited for clients"] - #[doc = " offloading computation (rather than storage) to a Flight service."] - pub async fn do_exchange( - &mut self, - request: impl tonic::IntoStreamingRequest, - ) -> Result< - tonic::Response>, - tonic::Status, - > { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/DoExchange", - ); - self.inner - .streaming(request.into_streaming_request(), path, codec) - .await - } - #[doc = ""] - #[doc = " Flight services can support an arbitrary number of simple actions in"] - #[doc = " addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut"] - #[doc = " operations that are potentially available. DoAction allows a flight client"] - #[doc = " to do a specific action against a flight service. An action includes"] - #[doc = " opaque request and response objects that are specific to the type action"] - #[doc = " being undertaken."] - pub async fn do_action( - &mut self, - request: impl tonic::IntoRequest, - ) -> Result>, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/DoAction", - ); - self.inner - .server_streaming(request.into_request(), path, codec) - .await - } - #[doc = ""] - #[doc = " A flight service exposes all of the available action types that it has"] - #[doc = " along with descriptions. 
This allows different flight consumers to"] - #[doc = " understand the capabilities of the flight service."] - pub async fn list_actions( - &mut self, - request: impl tonic::IntoRequest, - ) -> Result< - tonic::Response>, - tonic::Status, - > { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/arrow.flight.protocol.FlightService/ListActions", - ); - self.inner - .server_streaming(request.into_request(), path, codec) - .await - } - } - impl Clone for FlightServiceClient { - fn clone(&self) -> Self { - Self { - inner: self.inner.clone(), - } - } - } - impl std::fmt::Debug for FlightServiceClient { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "FlightServiceClient {{ ... }}") - } - } -} -#[doc = r" Generated server implementations."] -pub mod flight_service_server { - #![allow(unused_variables, dead_code, missing_docs)] - use tonic::codegen::*; - #[doc = "Generated trait containing gRPC methods that should be implemented for use with FlightServiceServer."] - #[async_trait] - pub trait FlightService: Send + Sync + 'static { - #[doc = "Server streaming response type for the Handshake method."] - type HandshakeStream: futures_core::Stream> - + Send - + Sync - + 'static; - #[doc = ""] - #[doc = " Handshake between client and server. Depending on the server, the"] - #[doc = " handshake may be required to determine the token that should be used for"] - #[doc = " future operations. Both request and response are streams to allow multiple"] - #[doc = " round-trips depending on auth mechanism."] - async fn handshake( - &self, - request: tonic::Request>, - ) -> Result, tonic::Status>; - #[doc = "Server streaming response type for the ListFlights method."] - type ListFlightsStream: futures_core::Stream> - + Send - + Sync - + 'static; - #[doc = ""] - #[doc = " Get a list of available streams given a particular criteria. Most flight"] - #[doc = " services will expose one or more streams that are readily available for"] - #[doc = " retrieval. This api allows listing the streams available for"] - #[doc = " consumption. A user can also provide a criteria. The criteria can limit"] - #[doc = " the subset of streams that can be listed via this interface. Each flight"] - #[doc = " service allows its own definition of how to consume criteria."] - async fn list_flights( - &self, - request: tonic::Request, - ) -> Result, tonic::Status>; - #[doc = ""] - #[doc = " For a given FlightDescriptor, get information about how the flight can be"] - #[doc = " consumed. This is a useful interface if the consumer of the interface"] - #[doc = " already can identify the specific flight to consume. This interface can"] - #[doc = " also allow a consumer to generate a flight stream through a specified"] - #[doc = " descriptor. For example, a flight descriptor might be something that"] - #[doc = " includes a SQL statement or a Pickled Python operation that will be"] - #[doc = " executed. 
In those cases, the descriptor will not be previously available"] - #[doc = " within the list of available streams provided by ListFlights but will be"] - #[doc = " available for consumption for the duration defined by the specific flight"] - #[doc = " service."] - async fn get_flight_info( - &self, - request: tonic::Request, - ) -> Result, tonic::Status>; - #[doc = ""] - #[doc = " For a given FlightDescriptor, get the Schema as described in Schema.fbs::Schema"] - #[doc = " This is used when a consumer needs the Schema of flight stream. Similar to"] - #[doc = " GetFlightInfo this interface may generate a new flight that was not previously"] - #[doc = " available in ListFlights."] - async fn get_schema( - &self, - request: tonic::Request, - ) -> Result, tonic::Status>; - #[doc = "Server streaming response type for the DoGet method."] - type DoGetStream: futures_core::Stream> - + Send - + Sync - + 'static; - #[doc = ""] - #[doc = " Retrieve a single stream associated with a particular descriptor"] - #[doc = " associated with the referenced ticket. A Flight can be composed of one or"] - #[doc = " more streams where each stream can be retrieved using a separate opaque"] - #[doc = " ticket that the flight service uses for managing a collection of streams."] - async fn do_get( - &self, - request: tonic::Request, - ) -> Result, tonic::Status>; - #[doc = "Server streaming response type for the DoPut method."] - type DoPutStream: futures_core::Stream> - + Send - + Sync - + 'static; - #[doc = ""] - #[doc = " Push a stream to the flight service associated with a particular"] - #[doc = " flight stream. This allows a client of a flight service to upload a stream"] - #[doc = " of data. Depending on the particular flight service, a client consumer"] - #[doc = " could be allowed to upload a single stream per descriptor or an unlimited"] - #[doc = " number. In the latter, the service might implement a 'seal' action that"] - #[doc = " can be applied to a descriptor once all streams are uploaded."] - async fn do_put( - &self, - request: tonic::Request>, - ) -> Result, tonic::Status>; - #[doc = "Server streaming response type for the DoExchange method."] - type DoExchangeStream: futures_core::Stream> - + Send - + Sync - + 'static; - #[doc = ""] - #[doc = " Open a bidirectional data channel for a given descriptor. This"] - #[doc = " allows clients to send and receive arbitrary Arrow data and"] - #[doc = " application-specific metadata in a single logical stream. In"] - #[doc = " contrast to DoGet/DoPut, this is more suited for clients"] - #[doc = " offloading computation (rather than storage) to a Flight service."] - async fn do_exchange( - &self, - request: tonic::Request>, - ) -> Result, tonic::Status>; - #[doc = "Server streaming response type for the DoAction method."] - type DoActionStream: futures_core::Stream> - + Send - + Sync - + 'static; - #[doc = ""] - #[doc = " Flight services can support an arbitrary number of simple actions in"] - #[doc = " addition to the possible ListFlights, GetFlightInfo, DoGet, DoPut"] - #[doc = " operations that are potentially available. DoAction allows a flight client"] - #[doc = " to do a specific action against a flight service. 
An action includes"] - #[doc = " opaque request and response objects that are specific to the type action"] - #[doc = " being undertaken."] - async fn do_action( - &self, - request: tonic::Request, - ) -> Result, tonic::Status>; - #[doc = "Server streaming response type for the ListActions method."] - type ListActionsStream: futures_core::Stream> - + Send - + Sync - + 'static; - #[doc = ""] - #[doc = " A flight service exposes all of the available action types that it has"] - #[doc = " along with descriptions. This allows different flight consumers to"] - #[doc = " understand the capabilities of the flight service."] - async fn list_actions( - &self, - request: tonic::Request, - ) -> Result, tonic::Status>; - } - #[doc = ""] - #[doc = " A flight service is an endpoint for retrieving or storing Arrow data. A"] - #[doc = " flight service can expose one or more predefined endpoints that can be"] - #[doc = " accessed using the Arrow Flight Protocol. Additionally, a flight service"] - #[doc = " can expose a set of actions that are available."] - #[derive(Debug)] - pub struct FlightServiceServer { - inner: _Inner, - } - struct _Inner(Arc, Option); - impl FlightServiceServer { - pub fn new(inner: T) -> Self { - let inner = Arc::new(inner); - let inner = _Inner(inner, None); - Self { inner } - } - pub fn with_interceptor( - inner: T, - interceptor: impl Into, - ) -> Self { - let inner = Arc::new(inner); - let inner = _Inner(inner, Some(interceptor.into())); - Self { inner } - } - } - impl Service> for FlightServiceServer - where - T: FlightService, - B: HttpBody + Send + Sync + 'static, - B::Error: Into + Send + 'static, - { - type Response = http::Response; - type Error = Never; - type Future = BoxFuture; - fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { - Poll::Ready(Ok(())) - } - fn call(&mut self, req: http::Request) -> Self::Future { - let inner = self.inner.clone(); - match req.uri().path() { - "/arrow.flight.protocol.FlightService/Handshake" => { - #[allow(non_camel_case_types)] - struct HandshakeSvc(pub Arc); - impl - tonic::server::StreamingService - for HandshakeSvc - { - type Response = super::HandshakeResponse; - type ResponseStream = T::HandshakeStream; - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request< - tonic::Streaming, - >, - ) -> Self::Future { - let inner = self.0.clone(); - let fut = async move { (*inner).handshake(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor = inner.1; - let inner = inner.0; - let method = HandshakeSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.streaming(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/arrow.flight.protocol.FlightService/ListFlights" => { - #[allow(non_camel_case_types)] - struct ListFlightsSvc(pub Arc); - impl - tonic::server::ServerStreamingService - for ListFlightsSvc - { - type Response = super::FlightInfo; - type ResponseStream = T::ListFlightsStream; - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = self.0.clone(); - let fut = async move { (*inner).list_flights(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor 
= inner.1; - let inner = inner.0; - let method = ListFlightsSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.server_streaming(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/arrow.flight.protocol.FlightService/GetFlightInfo" => { - #[allow(non_camel_case_types)] - struct GetFlightInfoSvc(pub Arc); - impl - tonic::server::UnaryService - for GetFlightInfoSvc - { - type Response = super::FlightInfo; - type Future = - BoxFuture, tonic::Status>; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = self.0.clone(); - let fut = - async move { (*inner).get_flight_info(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor = inner.1.clone(); - let inner = inner.0; - let method = GetFlightInfoSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.unary(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/arrow.flight.protocol.FlightService/GetSchema" => { - #[allow(non_camel_case_types)] - struct GetSchemaSvc(pub Arc); - impl - tonic::server::UnaryService - for GetSchemaSvc - { - type Response = super::SchemaResult; - type Future = - BoxFuture, tonic::Status>; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = self.0.clone(); - let fut = async move { (*inner).get_schema(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor = inner.1.clone(); - let inner = inner.0; - let method = GetSchemaSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.unary(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/arrow.flight.protocol.FlightService/DoGet" => { - #[allow(non_camel_case_types)] - struct DoGetSvc(pub Arc); - impl - tonic::server::ServerStreamingService - for DoGetSvc - { - type Response = super::FlightData; - type ResponseStream = T::DoGetStream; - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = self.0.clone(); - let fut = async move { (*inner).do_get(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor = inner.1; - let inner = inner.0; - let method = DoGetSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.server_streaming(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/arrow.flight.protocol.FlightService/DoPut" => { - #[allow(non_camel_case_types)] - struct DoPutSvc(pub Arc); - impl - tonic::server::StreamingService - for DoPutSvc - { - type Response = super::PutResult; - type ResponseStream = T::DoPutStream; - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request>, - ) -> 
Self::Future { - let inner = self.0.clone(); - let fut = async move { (*inner).do_put(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor = inner.1; - let inner = inner.0; - let method = DoPutSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.streaming(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/arrow.flight.protocol.FlightService/DoExchange" => { - #[allow(non_camel_case_types)] - struct DoExchangeSvc(pub Arc); - impl - tonic::server::StreamingService - for DoExchangeSvc - { - type Response = super::FlightData; - type ResponseStream = T::DoExchangeStream; - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request>, - ) -> Self::Future { - let inner = self.0.clone(); - let fut = async move { (*inner).do_exchange(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor = inner.1; - let inner = inner.0; - let method = DoExchangeSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.streaming(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/arrow.flight.protocol.FlightService/DoAction" => { - #[allow(non_camel_case_types)] - struct DoActionSvc(pub Arc); - impl - tonic::server::ServerStreamingService - for DoActionSvc - { - type Response = super::Result; - type ResponseStream = T::DoActionStream; - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = self.0.clone(); - let fut = async move { (*inner).do_action(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor = inner.1; - let inner = inner.0; - let method = DoActionSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.server_streaming(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/arrow.flight.protocol.FlightService/ListActions" => { - #[allow(non_camel_case_types)] - struct ListActionsSvc(pub Arc); - impl - tonic::server::ServerStreamingService - for ListActionsSvc - { - type Response = super::ActionType; - type ResponseStream = T::ListActionsStream; - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = self.0.clone(); - let fut = async move { (*inner).list_actions(request).await }; - Box::pin(fut) - } - } - let inner = self.inner.clone(); - let fut = async move { - let interceptor = inner.1; - let inner = inner.0; - let method = ListActionsSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = if let Some(interceptor) = interceptor { - tonic::server::Grpc::with_interceptor(codec, interceptor) - } else { - tonic::server::Grpc::new(codec) - }; - let res = grpc.server_streaming(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - _ => Box::pin(async move { - 
Ok(http::Response::builder() - .status(200) - .header("grpc-status", "12") - .header("content-type", "application/grpc") - .body(tonic::body::BoxBody::empty()) - .unwrap()) - }), - } - } - } - impl Clone for FlightServiceServer { - fn clone(&self) -> Self { - let inner = self.inner.clone(); - Self { inner } - } - } - impl Clone for _Inner { - fn clone(&self) -> Self { - Self(self.0.clone(), self.1.clone()) - } - } - impl std::fmt::Debug for _Inner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0) - } - } - impl tonic::transport::NamedService for FlightServiceServer { - const NAME: &'static str = "arrow.flight.protocol.FlightService"; - } -} diff --git a/rust/arrow-flight/src/lib.rs b/rust/arrow-flight/src/lib.rs deleted file mode 100644 index 6af2e748678..00000000000 --- a/rust/arrow-flight/src/lib.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -include!("arrow.flight.protocol.rs"); - -pub mod utils; diff --git a/rust/arrow-flight/src/utils.rs b/rust/arrow-flight/src/utils.rs deleted file mode 100644 index 659668c0baf..00000000000 --- a/rust/arrow-flight/src/utils.rs +++ /dev/null @@ -1,167 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Utilities to assist with reading and writing Arrow data as Flight messages - -use std::convert::TryFrom; - -use crate::{FlightData, SchemaResult}; - -use arrow::array::ArrayRef; -use arrow::datatypes::{Schema, SchemaRef}; -use arrow::error::{ArrowError, Result}; -use arrow::ipc::{convert, reader, writer, writer::EncodedData, writer::IpcWriteOptions}; -use arrow::record_batch::RecordBatch; - -/// Convert a `RecordBatch` to a vector of `FlightData` representing the bytes of the dictionaries -/// and a `FlightData` representing the bytes of the batch's values -pub fn flight_data_from_arrow_batch( - batch: &RecordBatch, - options: &IpcWriteOptions, -) -> (Vec, FlightData) { - let data_gen = writer::IpcDataGenerator::default(); - let mut dictionary_tracker = writer::DictionaryTracker::new(false); - - let (encoded_dictionaries, encoded_batch) = data_gen - .encoded_batch(batch, &mut dictionary_tracker, &options) - .expect("DictionaryTracker configured above to not error on replacement"); - - let flight_dictionaries = encoded_dictionaries.into_iter().map(Into::into).collect(); - let flight_batch = encoded_batch.into(); - - (flight_dictionaries, flight_batch) -} - -impl From for FlightData { - fn from(data: EncodedData) -> Self { - FlightData { - data_header: data.ipc_message, - data_body: data.arrow_data, - ..Default::default() - } - } -} - -/// Convert a `Schema` to `SchemaResult` by converting to an IPC message -pub fn flight_schema_from_arrow_schema( - schema: &Schema, - options: &IpcWriteOptions, -) -> SchemaResult { - SchemaResult { - schema: flight_schema_as_flatbuffer(schema, options), - } -} - -/// Convert a `Schema` to `FlightData` by converting to an IPC message -pub fn flight_data_from_arrow_schema( - schema: &Schema, - options: &IpcWriteOptions, -) -> FlightData { - let data_header = flight_schema_as_flatbuffer(schema, options); - FlightData { - data_header, - ..Default::default() - } -} - -/// Convert a `Schema` to bytes in the format expected in `FlightInfo.schema` -pub fn ipc_message_from_arrow_schema( - arrow_schema: &Schema, - options: &IpcWriteOptions, -) -> Result> { - let encoded_data = flight_schema_as_encoded_data(arrow_schema, options); - - let mut schema = vec![]; - arrow::ipc::writer::write_message(&mut schema, encoded_data, options)?; - Ok(schema) -} - -fn flight_schema_as_flatbuffer( - arrow_schema: &Schema, - options: &IpcWriteOptions, -) -> Vec { - let encoded_data = flight_schema_as_encoded_data(arrow_schema, options); - encoded_data.ipc_message -} - -fn flight_schema_as_encoded_data( - arrow_schema: &Schema, - options: &IpcWriteOptions, -) -> EncodedData { - let data_gen = writer::IpcDataGenerator::default(); - data_gen.schema_to_bytes(arrow_schema, options) -} - -/// Try convert `FlightData` into an Arrow Schema -/// -/// Returns an error if the `FlightData` header is not a valid IPC schema -impl TryFrom<&FlightData> for Schema { - type Error = ArrowError; - fn try_from(data: &FlightData) -> Result { - convert::schema_from_bytes(&data.data_header[..]).map_err(|err| { - ArrowError::ParseError(format!( - "Unable to convert flight data to Arrow schema: {}", - err - )) - }) - } -} - -/// Try convert `SchemaResult` into an Arrow Schema -/// -/// Returns an error if the `FlightData` header is not a valid IPC schema -impl TryFrom<&SchemaResult> for Schema { - type Error = ArrowError; - fn try_from(data: &SchemaResult) -> Result { - convert::schema_from_bytes(&data.schema[..]).map_err(|err| { - ArrowError::ParseError(format!( - "Unable to convert schema result to Arrow 
schema: {}", - err - )) - }) - } -} - -/// Convert a FlightData message to a RecordBatch -pub fn flight_data_to_arrow_batch( - data: &FlightData, - schema: SchemaRef, - dictionaries_by_field: &[Option], -) -> Result { - // check that the data_header is a record batch message - let message = arrow::ipc::root_as_message(&data.data_header[..]).map_err(|err| { - ArrowError::ParseError(format!("Unable to get root as message: {:?}", err)) - })?; - - message - .header_as_record_batch() - .ok_or_else(|| { - ArrowError::ParseError( - "Unable to convert flight data header to a record batch".to_string(), - ) - }) - .map(|batch| { - reader::read_record_batch( - &data.data_body, - batch, - schema, - &dictionaries_by_field, - ) - })? -} - -// TODO: add more explicit conversion that exposes flight descriptor and metadata options diff --git a/rust/arrow-pyarrow-integration-testing/.cargo/config b/rust/arrow-pyarrow-integration-testing/.cargo/config deleted file mode 100644 index a127967f66c..00000000000 --- a/rust/arrow-pyarrow-integration-testing/.cargo/config +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[target.x86_64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] \ No newline at end of file diff --git a/rust/arrow-pyarrow-integration-testing/.gitignore b/rust/arrow-pyarrow-integration-testing/.gitignore deleted file mode 100644 index 82adb58b4d6..00000000000 --- a/rust/arrow-pyarrow-integration-testing/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -__pycache__ -venv diff --git a/rust/arrow-pyarrow-integration-testing/Cargo.toml b/rust/arrow-pyarrow-integration-testing/Cargo.toml deleted file mode 100644 index f95458dbcb5..00000000000 --- a/rust/arrow-pyarrow-integration-testing/Cargo.toml +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-
-[package]
-name = "arrow-pyarrow-integration-testing"
-description = ""
-version = "5.0.0-SNAPSHOT"
-homepage = "https://github.com/apache/arrow"
-repository = "https://github.com/apache/arrow"
-authors = ["Apache Arrow <dev@arrow.apache.org>"]
-license = "Apache-2.0"
-keywords = [ "arrow" ]
-edition = "2018"
-
-[lib]
-name = "arrow_pyarrow_integration_testing"
-crate-type = ["cdylib"]
-
-[dependencies]
-arrow = { path = "../arrow", version = "5.0.0-SNAPSHOT" }
-pyo3 = { version = "0.12.1", features = ["extension-module"] }
-
-[package.metadata.maturin]
-requires-dist = ["pyarrow>=1"]
diff --git a/rust/arrow-pyarrow-integration-testing/README.md b/rust/arrow-pyarrow-integration-testing/README.md
deleted file mode 100644
index 7e78aa9ec70..00000000000
--- a/rust/arrow-pyarrow-integration-testing/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-
-# Arrow C integration
-
-This is a Rust crate that tests compatibility between Rust's Arrow implementation and PyArrow.
-
-Note that this crate uses two languages and an external ABI:
-* `Rust`
-* `Python`
-* C ABI privately exposed by `PyArrow`.
-
-## Basic idea
-
-PyArrow exposes a C ABI to convert Arrow arrays to and from its C implementation; see [here](https://arrow.apache.org/docs/format/CDataInterface.html).
-
-This package uses the equivalent struct in Rust (`arrow::array::ArrowArray`), and verifies that
-we can use PyArrow's interface to move pointers from and to Rust.
-
-## Relevant literature
-
-* [Arrow's CDataInterface](https://arrow.apache.org/docs/format/CDataInterface.html)
-* [Rust's FFI](https://doc.rust-lang.org/nomicon/ffi.html)
-* [PyArrow private bindings](https://github.com/apache/arrow/blob/ae1d24efcc3f1ac2a876d8d9f544a34eb04ae874/python/pyarrow/array.pxi#L1226)
-* [PyO3](https://docs.rs/pyo3/0.12.1/pyo3/index.html)
-
-## How to develop
-
-```bash
-# prepare development environment (used to build wheel / install in development)
-python -m venv venv
-venv/bin/pip install maturin==0.8.2 toml==0.10.1 pyarrow==1.0.0
-```
-
-Whenever Rust code changes (your changes or via `git pull`):
-
-```bash
-source venv/bin/activate
-maturin develop
-python -m unittest discover tests
-```
diff --git a/rust/arrow-pyarrow-integration-testing/pyproject.toml b/rust/arrow-pyarrow-integration-testing/pyproject.toml
deleted file mode 100644
index 27480690e06..00000000000
--- a/rust/arrow-pyarrow-integration-testing/pyproject.toml
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
- -[build-system] -requires = ["maturin"] -build-backend = "maturin" diff --git a/rust/arrow-pyarrow-integration-testing/src/lib.rs b/rust/arrow-pyarrow-integration-testing/src/lib.rs deleted file mode 100644 index 5b5462d9c15..00000000000 --- a/rust/arrow-pyarrow-integration-testing/src/lib.rs +++ /dev/null @@ -1,188 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This library demonstrates a minimal usage of Rust's C data interface to pass -//! arrays from and to Python. - -use std::error; -use std::fmt; -use std::sync::Arc; - -use pyo3::exceptions::PyOSError; -use pyo3::wrap_pyfunction; -use pyo3::{libc::uintptr_t, prelude::*}; - -use arrow::array::{make_array_from_raw, ArrayRef, Int64Array}; -use arrow::compute::kernels; -use arrow::error::ArrowError; -use arrow::ffi; - -/// an error that bridges ArrowError with a Python error -#[derive(Debug)] -enum PyO3ArrowError { - ArrowError(ArrowError), -} - -impl fmt::Display for PyO3ArrowError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - PyO3ArrowError::ArrowError(ref e) => e.fmt(f), - } - } -} - -impl error::Error for PyO3ArrowError { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { - match *self { - // The cause is the underlying implementation error type. Is implicitly - // cast to the trait object `&error::Error`. This works because the - // underlying type already implements the `Error` trait. - PyO3ArrowError::ArrowError(ref e) => Some(e), - } - } -} - -impl From for PyO3ArrowError { - fn from(err: ArrowError) -> PyO3ArrowError { - PyO3ArrowError::ArrowError(err) - } -} - -impl From for PyErr { - fn from(err: PyO3ArrowError) -> PyErr { - PyOSError::new_err(err.to_string()) - } -} - -fn to_rust(ob: PyObject, py: Python) -> PyResult { - // prepare a pointer to receive the Array struct - let (array_pointer, schema_pointer) = - ffi::ArrowArray::into_raw(unsafe { ffi::ArrowArray::empty() }); - - // make the conversion through PyArrow's private API - // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds - ob.call_method1( - py, - "_export_to_c", - (array_pointer as uintptr_t, schema_pointer as uintptr_t), - )?; - - let array = unsafe { make_array_from_raw(array_pointer, schema_pointer) } - .map_err(|e| PyO3ArrowError::from(e))?; - Ok(array) -} - -fn to_py(array: ArrayRef, py: Python) -> PyResult { - let (array_pointer, schema_pointer) = - array.to_raw().map_err(|e| PyO3ArrowError::from(e))?; - - let pa = py.import("pyarrow")?; - - let array = pa.getattr("Array")?.call_method1( - "_import_from_c", - (array_pointer as uintptr_t, schema_pointer as uintptr_t), - )?; - Ok(array.to_object(py)) -} - -/// Returns `array + array` of an int64 array. 
-#[pyfunction] -fn double(array: PyObject, py: Python) -> PyResult { - // import - let array = to_rust(array, py)?; - - // perform some operation - let array = - array - .as_any() - .downcast_ref::() - .ok_or(PyO3ArrowError::ArrowError(ArrowError::ParseError( - "Expects an int64".to_string(), - )))?; - let array = - kernels::arithmetic::add(&array, &array).map_err(|e| PyO3ArrowError::from(e))?; - let array = Arc::new(array); - - // export - to_py(array, py) -} - -/// calls a lambda function that receives and returns an array -/// whose result must be the array multiplied by two -#[pyfunction] -fn double_py(lambda: PyObject, py: Python) -> PyResult { - // create - let array = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)])); - let expected = Arc::new(Int64Array::from(vec![Some(2), None, Some(6)])) as ArrayRef; - - // to py - let array = to_py(array, py)?; - - let array = lambda.call1(py, (array,))?; - - let array = to_rust(array, py)?; - - Ok(array == expected) -} - -/// Returns the substring -#[pyfunction] -fn substring(array: PyObject, start: i64, py: Python) -> PyResult { - // import - let array = to_rust(array, py)?; - - // substring - let array = kernels::substring::substring(array.as_ref(), start, &None) - .map_err(|e| PyO3ArrowError::from(e))?; - - // export - to_py(array, py) -} - -/// Returns the concatenate -#[pyfunction] -fn concatenate(array: PyObject, py: Python) -> PyResult { - // import - let array = to_rust(array, py)?; - - // concat - let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]) - .map_err(|e| PyO3ArrowError::from(e))?; - - // export - to_py(array, py) -} - -/// Converts to rust and back to python -#[pyfunction] -fn round_trip(array: PyObject, py: Python) -> PyResult { - // import - let array = to_rust(array, py)?; - - // export - to_py(array, py) -} - -#[pymodule] -fn arrow_pyarrow_integration_testing(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_wrapped(wrap_pyfunction!(double))?; - m.add_wrapped(wrap_pyfunction!(double_py))?; - m.add_wrapped(wrap_pyfunction!(substring))?; - m.add_wrapped(wrap_pyfunction!(concatenate))?; - m.add_wrapped(wrap_pyfunction!(round_trip))?; - Ok(()) -} diff --git a/rust/arrow-pyarrow-integration-testing/tests/test_sql.py b/rust/arrow-pyarrow-integration-testing/tests/test_sql.py deleted file mode 100644 index c0de382057c..00000000000 --- a/rust/arrow-pyarrow-integration-testing/tests/test_sql.py +++ /dev/null @@ -1,99 +0,0 @@ -# -*- coding: utf-8 -*- -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import unittest - -import pyarrow -import arrow_pyarrow_integration_testing - - -class TestCase(unittest.TestCase): - def test_primitive_python(self): - """ - Python -> Rust -> Python - """ - old_allocated = pyarrow.total_allocated_bytes() - a = pyarrow.array([1, 2, 3]) - b = arrow_pyarrow_integration_testing.double(a) - self.assertEqual(b, pyarrow.array([2, 4, 6])) - del a - del b - # No leak of C++ memory - self.assertEqual(old_allocated, pyarrow.total_allocated_bytes()) - - def test_primitive_rust(self): - """ - Rust -> Python -> Rust - """ - old_allocated = pyarrow.total_allocated_bytes() - - def double(array): - array = array.to_pylist() - return pyarrow.array([x * 2 if x is not None else None for x in array]) - - is_correct = arrow_pyarrow_integration_testing.double_py(double) - self.assertTrue(is_correct) - # No leak of C++ memory - self.assertEqual(old_allocated, pyarrow.total_allocated_bytes()) - - def test_string_python(self): - """ - Python -> Rust -> Python - """ - old_allocated = pyarrow.total_allocated_bytes() - a = pyarrow.array(["a", None, "ccc"]) - b = arrow_pyarrow_integration_testing.substring(a, 1) - self.assertEqual(b, pyarrow.array(["", None, "cc"])) - del a - del b - # No leak of C++ memory - self.assertEqual(old_allocated, pyarrow.total_allocated_bytes()) - - def test_time32_python(self): - """ - Python -> Rust -> Python - """ - old_allocated = pyarrow.total_allocated_bytes() - a = pyarrow.array([None, 1, 2], pyarrow.time32('s')) - b = arrow_pyarrow_integration_testing.concatenate(a) - expected = pyarrow.array([None, 1, 2] + [None, 1, 2], pyarrow.time32('s')) - self.assertEqual(b, expected) - del a - del b - del expected - # No leak of C++ memory - self.assertEqual(old_allocated, pyarrow.total_allocated_bytes()) - - def test_list_array(self): - """ - Python -> Rust -> Python - """ - old_allocated = pyarrow.total_allocated_bytes() - a = pyarrow.array([[], None, [1, 2], [4, 5, 6]], pyarrow.list_(pyarrow.int64())) - b = arrow_pyarrow_integration_testing.round_trip(a) - - b.validate(full=True) - assert a.to_pylist() == b.to_pylist() - assert a.type == b.type - del a - del b - # No leak of C++ memory - self.assertEqual(old_allocated, pyarrow.total_allocated_bytes()) - - - diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml deleted file mode 100644 index ac3b72e57b0..00000000000 --- a/rust/arrow/Cargo.toml +++ /dev/null @@ -1,151 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -[package] -name = "arrow" -version = "5.0.0-SNAPSHOT" -description = "Rust implementation of Apache Arrow" -homepage = "https://github.com/apache/arrow" -repository = "https://github.com/apache/arrow" -authors = ["Apache Arrow "] -license = "Apache-2.0" -keywords = [ "arrow" ] -include = [ - "benches/*.rs", - "src/**/*.rs", - "Cargo.toml", -] -edition = "2018" - -[lib] -name = "arrow" -path = "src/lib.rs" - -[dependencies] -serde = { version = "1.0", features = ["rc"] } -serde_derive = "1.0" -serde_json = { version = "1.0", features = ["preserve_order"] } -indexmap = "1.6" -rand = "0.7" -csv = "1.1" -num = "0.3" -regex = "1.3" -lazy_static = "1.4" -packed_simd = { version = "0.3.4", optional = true, package = "packed_simd_2" } -chrono = "0.4" -flatbuffers = "^0.8" -hex = "0.4" -prettytable-rs = { version = "0.8.0", optional = true } -lexical-core = "^0.7" - -[features] -default = [] -avx512 = [] -simd = ["packed_simd"] -prettyprint = ["prettytable-rs"] -# this is only intended to be used in single-threaded programs: it verifies that -# all allocated memory is being released (no memory leaks). -# See README for details -memory-check = [] - -[dev-dependencies] -criterion = "0.3" -flate2 = "1" -tempfile = "3" - -[build-dependencies] -cfg_aliases = "0.1" - -[[bench]] -name = "aggregate_kernels" -harness = false - -[[bench]] -name = "array_from_vec" -harness = false - -[[bench]] -name = "builder" -harness = false - -[[bench]] -name = "buffer_bit_ops" -harness = false - -[[bench]] -name = "boolean_kernels" -harness = false - -[[bench]] -name = "arithmetic_kernels" -harness = false - -[[bench]] -name = "cast_kernels" -harness = false - -[[bench]] -name = "comparison_kernels" -harness = false - -[[bench]] -name = "filter_kernels" -harness = false - -[[bench]] -name = "take_kernels" -harness = false - -[[bench]] -name = "length_kernel" -harness = false - -[[bench]] -name = "bit_length_kernel" -harness = false - -[[bench]] -name = "sort_kernel" -harness = false - -[[bench]] -name = "csv_writer" -harness = false - -[[bench]] -name = "json_reader" -harness = false - -[[bench]] -name = "equal" -harness = false - -[[bench]] -name = "array_slice" -harness = false - -[[bench]] -name = "concatenate_kernel" -harness = false - -[[bench]] -name = "mutable_array" -harness = false - -[[bench]] -name = "buffer_create" -harness = false diff --git a/rust/arrow/README.md b/rust/arrow/README.md deleted file mode 100644 index 674c3fc6c8b..00000000000 --- a/rust/arrow/README.md +++ /dev/null @@ -1,206 +0,0 @@ - - -# Native Rust implementation of Apache Arrow - -This crate contains a native Rust implementation of the [Arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html). - -## Developer's guide - -Common information for all Rust libraries in this project, including -testing, code formatting, and lints, can be found in the main Arrow -Rust [README.md](../README.md). - -Please refer to [lib.rs](src/lib.rs) for an introduction to this -specific crate and its current functionality. - -### How to check memory allocations - -This crate heavily uses `unsafe` due to how memory is allocated in cache lines. -We have a small tool to verify that this crate does not leak memory (beyond what the compiler already does) - -Run it with - -```bash -cargo test --features memory-check --lib -- --test-threads 1 -``` - -This runs all unit-tests on a single thread and counts all allocations and de-allocations. 
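For a concrete sense of the kind of allocation-heavy code such a leak-checking run exercises, here is a minimal, self-contained sketch (not part of the original README) that builds an `Int64Array` and runs an arithmetic kernel over it. It assumes only the public `arrow::array` and `arrow::compute::kernels::arithmetic` APIs that already appear elsewhere in this diff; every array and buffer it constructs goes through the crate's allocator and is released on drop.

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Int64Array};
use arrow::compute::kernels::arithmetic::add;
use arrow::error::Result;

fn main() -> Result<()> {
    // Build an Int64 array with a null slot, as in the deleted
    // arrow-pyarrow-integration-testing crate.
    let a = Int64Array::from(vec![Some(1), None, Some(3)]);

    // Element-wise addition; null slots propagate to the result.
    let doubled: ArrayRef = Arc::new(add(&a, &a)?);

    assert_eq!(doubled.len(), 3);
    assert_eq!(doubled.null_count(), 1);
    Ok(())
}
```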
-
-## Examples
-
-The examples folder shows how to construct some different types of Arrow
-arrays, including dynamic arrays created at runtime.
-
-Examples can be run using the `cargo run --example` command. For example:
-
-```bash
-cargo run --example builders
-cargo run --example dynamic_types
-cargo run --example read_csv
-```
-
-## IPC
-
-The expected flatc version is 1.12.0+, built from [flatbuffers](https://github.com/google/flatbuffers)
-master at a fixed commit ID by regen.sh.
-
-The IPC flatbuffer code was generated by running this command from the root of the project:
-
-```bash
-./regen.sh
-```
-
-The above script will run the `flatc` compiler and perform some adjustments to the source code:
-
-- Replace `type__` with `type_`
-- Remove the `org::apache::arrow::flatbuffers` namespace
-- Add includes to each generated file
-
-## Features
-
-Arrow uses the following features:
-
-* `simd` - Arrow uses the [packed_simd](https://crates.io/crates/packed_simd) crate to optimize many of the
-  implementations in the [compute](https://github.com/apache/arrow/tree/master/rust/arrow/src/compute)
-  module using SIMD intrinsics. These optimizations are turned *off* by default.
-  If the `simd` feature is enabled, an unstable version of Rust is required (we test with `nightly-2021-03-24`).
-* `flight`, which contains useful functions to convert between the Flight wire format and Arrow data
-* `prettyprint`, which is a utility for printing record batches
-
-Other than `simd`, all other features are enabled by default. Disabling `prettyprint` might be necessary in order to
-compile Arrow to the `wasm32-unknown-unknown` WASM target.
-
-## Guidelines in usage of `unsafe`
-
-[`unsafe`](https://doc.rust-lang.org/book/ch19-01-unsafe-rust.html) has a high maintenance cost because debugging and testing it is difficult, time consuming, often requires external tools (e.g. `valgrind`), and requires a higher-than-usual attention to detail. Undefined behavior is particularly difficult to identify and test, and usage of `unsafe` is the [primary cause of undefined behavior](https://doc.rust-lang.org/reference/behavior-considered-undefined.html) in a program written in Rust.
-For two real-world examples of where `unsafe` has consumed time in the past in this project, see [#8645](https://github.com/apache/arrow/pull/8645) and [#8829](https://github.com/apache/arrow/pull/8829).
-This crate only accepts the usage of `unsafe` code upon careful consideration, and strives to avoid it to the largest possible extent.
-
-### When can `unsafe` be used?
-
-Generally, `unsafe` should only be used when a `safe` counterpart is not available and there is no `safe` way to achieve additional performance in that area. The following is a summary of the current components of the crate that require `unsafe`:
-
-* alloc, dealloc and realloc of buffers along cache lines
-* Interpreting bytes as certain Rust types, for access, representation and compute
-* Foreign interfaces (C data interface)
-* Inter-process communication (IPC)
-* SIMD
-* Performance (e.g. omitting bounds checks, using pointers to avoid bounds checks)
-
-#### Cache-line aligned memory management
-
-The Arrow format recommends storing buffers aligned with cache lines, and this crate adopts this behavior.
-However, Rust's global allocator does not allocate memory aligned with cache lines. As such, many of the low-level operations related to memory management require `unsafe`.
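To make the cache-line alignment point concrete, here is a small illustrative sketch (not this crate's actual allocator) of why aligned allocation pulls in `unsafe`: the standard library only exposes alignment control through the raw `std::alloc` interface, whose layout and deallocation contract the caller must uphold by hand.

```rust
use std::alloc::{alloc_zeroed, dealloc, Layout};

const ALIGNMENT: usize = 64; // typical cache-line size

fn main() {
    // Describe a 1 KiB allocation aligned to a cache line.
    let layout = Layout::from_size_align(1024, ALIGNMENT).expect("valid layout");

    // SAFETY: `layout` has non-zero size, the pointer is checked for null
    // below, and it is freed with exactly the same layout.
    let ptr = unsafe { alloc_zeroed(layout) };
    assert!(!ptr.is_null(), "allocation failed");
    assert_eq!(ptr as usize % ALIGNMENT, 0);

    unsafe { dealloc(ptr, layout) };
}
```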
-
-#### Interpreting bytes
-
-The arrow format is specified in bytes (`u8`), which can be logically represented as certain types
-depending on the `DataType`.
-For many operations, such as access, representation, numerical computation and string manipulation,
-it is often necessary to interpret bytes as other physical types (e.g. `i32`).
-
-Usage of `unsafe` for the purpose of interpreting bytes in their corresponding type (according to the arrow specification) is allowed. Specifically, the pointer to the byte slice must be aligned to the type that it intends to represent, and the length of the slice must be a multiple of the size of the target type of the transmutation.
-
-#### FFI
-
-The arrow format declares an ABI for zero-copy from and to libraries that implement the specification
-(foreign interfaces). In Rust, receiving and sending pointers via FFI requires usage of `unsafe` because
-the compiler cannot derive the invariants (such as lifetimes, null pointers, and pointer alignment) from the source code alone; they are part of the FFI contract.
-
-#### IPC
-
-The arrow format declares an IPC protocol, which this crate supports. IPC is equivalent to an FFI in that the Rust compiler cannot reason about the contract's invariants.
-
-#### SIMD
-
-The API provided by the `packed_simd` library is currently `unsafe`. However, SIMD offers a significant performance improvement over non-SIMD operations.
-
-#### Performance
-
-Some operations are significantly faster when `unsafe` is used.
-
-A common usage of `unsafe` is to offer an API to access the `i`th element of an array (e.g. `UInt32Array`).
-This requires accessing the values buffer e.g. `array.buffers()[0]`, picking the slice
-`[i * size_of::<i32>(), (i + 1) * size_of::<i32>()]`, and then transmuting it to `i32`. In safe Rust,
-this operation requires boundary checks that are detrimental to performance.
-
-Usage of `unsafe` for performance reasons is justified only when all other alternatives have been exhausted and the performance benefits are sufficiently large (e.g. >~10%).
-
-### Considerations when introducing `unsafe`
-
-Usage of `unsafe` in this crate *must*:
-
-* not expose a public API as `safe` when there are necessary invariants for that API to be defined behavior.
-* have code documentation for why `safe` is not used or not possible
-* have code documentation about which invariant the user needs to enforce to ensure [soundness](https://rust-lang.github.io/unsafe-code-guidelines/glossary.html#soundness-of-code--of-a-library), or which invariant is being preserved.
-* if applicable, use `debug_assert`s to verify relevant invariants (e.g. bound checks)
-
-Example of code documentation:
-
-```rust
-// JUSTIFICATION
-//  Benefit
-//      Describe the benefit of using unsafe. E.g.
-//      "30% performance degradation if the safe counterpart is used, see bench X."
-//  Soundness
-//      Describe why the code remains sound (according to the definition of rust's unsafe code guidelines). E.g.
-//      "We bounded check these values at initialization and the array is immutable."
-let ... = unsafe { ... };
-```
-
-When adding this documentation to existing code that is not sound and cannot trivially be fixed, we should file
-specific JIRA issues and reference them in these code comments. For example:
-
-```rust
-// Soundness
-//      This is not sound because .... see https://issues.apache.org/jira/browse/ARROW-nnnnn
-```
-
-# Publishing to crates.io
-
-An Arrow committer can publish this crate to crates.io after an official
-project release has been made, using the following instructions.
-
-Follow [these
-instructions](https://doc.rust-lang.org/cargo/reference/publishing.html) to
-create an account and log in to crates.io before asking to be added as an owner
-of the [arrow crate](https://crates.io/crates/arrow).
-
-Check out the tag for the version to be released. For example:
-
-```bash
-git checkout apache-arrow-0.11.0
-```
-
-If the Cargo.toml in this tag already contains `version = "0.11.0"` (as it
-should), the crate can be published with the following command:
-
-```bash
-cargo publish
-```
-
-If the Cargo.toml does not have the correct version, it must be modified
-manually. Since there is then a locally modified file that is not committed to
-GitHub, use the following command instead:
-
-```bash
-cargo publish --allow-dirty
-```
diff --git a/rust/arrow/benches/aggregate_kernels.rs b/rust/arrow/benches/aggregate_kernels.rs
deleted file mode 100644
index 1724b7349c5..00000000000
--- a/rust/arrow/benches/aggregate_kernels.rs
+++ /dev/null
@@ -1,67 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#[macro_use]
-extern crate criterion;
-use criterion::Criterion;
-
-extern crate arrow;
-
-use arrow::compute::kernels::aggregate::*;
-use arrow::util::bench_util::*;
-use arrow::{array::*, datatypes::Float32Type};
-
-fn bench_sum(arr_a: &Float32Array) {
-    criterion::black_box(sum(&arr_a).unwrap());
-}
-
-fn bench_min(arr_a: &Float32Array) {
-    criterion::black_box(min(&arr_a).unwrap());
-}
-
-fn bench_max(arr_a: &Float32Array) {
-    criterion::black_box(max(&arr_a).unwrap());
-}
-
-fn bench_min_string(arr_a: &StringArray) {
-    criterion::black_box(min_string(&arr_a).unwrap());
-}
-
-fn add_benchmark(c: &mut Criterion) {
-    let arr_a = create_primitive_array::<Float32Type>(512, 0.0);
-
-    c.bench_function("sum 512", |b| b.iter(|| bench_sum(&arr_a)));
-    c.bench_function("min 512", |b| b.iter(|| bench_min(&arr_a)));
-    c.bench_function("max 512", |b| b.iter(|| bench_max(&arr_a)));
-
-    let arr_a = create_primitive_array::<Float32Type>(512, 0.5);
-
-    c.bench_function("sum nulls 512", |b| b.iter(|| bench_sum(&arr_a)));
-    c.bench_function("min nulls 512", |b| b.iter(|| bench_min(&arr_a)));
-    c.bench_function("max nulls 512", |b| b.iter(|| bench_max(&arr_a)));
-
-    let arr_b = create_string_array::<i32>(512, 0.0);
-    c.bench_function("min string 512", |b| b.iter(|| bench_min_string(&arr_b)));
-
-    let arr_b = create_string_array::<i32>(512, 0.5);
-    c.bench_function("min nulls string 512", |b| {
-        b.iter(|| bench_min_string(&arr_b))
-    });
-}
-
-criterion_group!(benches, add_benchmark);
-criterion_main!(benches);
diff --git a/rust/arrow/benches/arithmetic_kernels.rs b/rust/arrow/benches/arithmetic_kernels.rs
deleted file mode 100644
index 721157e2846..00000000000
--- a/rust/arrow/benches/arithmetic_kernels.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
- -#[macro_use] -extern crate criterion; -use criterion::Criterion; -use rand::Rng; - -use std::sync::Arc; - -extern crate arrow; - -use arrow::compute::kernels::limit::*; -use arrow::util::bench_util::*; -use arrow::{array::*, datatypes::Float32Type}; -use arrow::{compute::kernels::arithmetic::*, util::test_util::seedable_rng}; - -fn create_array(size: usize, with_nulls: bool) -> ArrayRef { - let null_density = if with_nulls { 0.5 } else { 0.0 }; - let array = create_primitive_array::(size, null_density); - Arc::new(array) -} - -fn bench_add(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(add(arr_a, arr_b).unwrap()); -} - -fn bench_subtract(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(subtract(&arr_a, &arr_b).unwrap()); -} - -fn bench_multiply(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(multiply(&arr_a, &arr_b).unwrap()); -} - -fn bench_divide(arr_a: &ArrayRef, arr_b: &ArrayRef) { - let arr_a = arr_a.as_any().downcast_ref::().unwrap(); - let arr_b = arr_b.as_any().downcast_ref::().unwrap(); - criterion::black_box(divide(&arr_a, &arr_b).unwrap()); -} - -fn bench_divide_scalar(array: &ArrayRef, divisor: f32) { - let array = array.as_any().downcast_ref::().unwrap(); - criterion::black_box(divide_scalar(&array, divisor).unwrap()); -} - -fn bench_limit(arr_a: &ArrayRef, max: usize) { - criterion::black_box(limit(arr_a, max)); -} - -fn add_benchmark(c: &mut Criterion) { - let arr_a = create_array(512, false); - let arr_b = create_array(512, false); - let scalar = seedable_rng().gen(); - - c.bench_function("add 512", |b| b.iter(|| bench_add(&arr_a, &arr_b))); - c.bench_function("subtract 512", |b| { - b.iter(|| bench_subtract(&arr_a, &arr_b)) - }); - c.bench_function("multiply 512", |b| { - b.iter(|| bench_multiply(&arr_a, &arr_b)) - }); - c.bench_function("divide 512", |b| b.iter(|| bench_divide(&arr_a, &arr_b))); - c.bench_function("divide_scalar 512", |b| { - b.iter(|| bench_divide_scalar(&arr_a, scalar)) - }); - c.bench_function("limit 512, 512", |b| b.iter(|| bench_limit(&arr_a, 512))); - - let arr_a_nulls = create_array(512, false); - let arr_b_nulls = create_array(512, false); - c.bench_function("add_nulls_512", |b| { - b.iter(|| bench_add(&arr_a_nulls, &arr_b_nulls)) - }); - c.bench_function("divide_nulls_512", |b| { - b.iter(|| bench_divide(&arr_a_nulls, &arr_b_nulls)) - }); - c.bench_function("divide_scalar_nulls_512", |b| { - b.iter(|| bench_divide_scalar(&arr_a_nulls, scalar)) - }); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/array_from_vec.rs b/rust/arrow/benches/array_from_vec.rs deleted file mode 100644 index 7740c6bc34e..00000000000 --- a/rust/arrow/benches/array_from_vec.rs +++ /dev/null @@ -1,120 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -extern crate arrow; - -use arrow::array::*; -use arrow::buffer::Buffer; -use arrow::datatypes::*; -use std::{convert::TryFrom, sync::Arc}; - -fn array_from_vec(n: usize) { - let mut v: Vec = Vec::with_capacity(n); - for i in 0..n { - v.push((i & 0xffff) as u8); - } - let arr_data = ArrayDataBuilder::new(DataType::Int32) - .add_buffer(Buffer::from(v)) - .build(); - criterion::black_box(Int32Array::from(arr_data)); -} - -fn array_string_from_vec(n: usize) { - let mut v: Vec> = Vec::with_capacity(n); - for i in 0..n { - if i % 2 == 0 { - v.push(Some("hello world")); - } else { - v.push(None); - } - } - criterion::black_box(StringArray::from(v)); -} - -fn struct_array_values( - n: usize, -) -> ( - &'static str, - Vec>, - &'static str, - Vec>, -) { - let mut strings: Vec> = Vec::with_capacity(n); - let mut ints: Vec> = Vec::with_capacity(n); - for _ in 0..n / 4 { - strings.extend_from_slice(&[Some("joe"), None, None, Some("mark")]); - ints.extend_from_slice(&[Some(1), Some(2), None, Some(4)]); - } - ("f1", strings, "f2", ints) -} - -fn struct_array_from_vec( - field1: &str, - strings: &[Option<&str>], - field2: &str, - ints: &[Option], -) { - let strings: ArrayRef = Arc::new(StringArray::from(strings.to_owned())); - let ints: ArrayRef = Arc::new(Int32Array::from(ints.to_owned())); - - criterion::black_box( - StructArray::try_from(vec![(field1, strings), (field2, ints)]).unwrap(), - ); -} - -fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("array_from_vec 128", |b| b.iter(|| array_from_vec(128))); - c.bench_function("array_from_vec 256", |b| b.iter(|| array_from_vec(256))); - c.bench_function("array_from_vec 512", |b| b.iter(|| array_from_vec(512))); - - c.bench_function("array_string_from_vec 128", |b| { - b.iter(|| array_string_from_vec(128)) - }); - c.bench_function("array_string_from_vec 256", |b| { - b.iter(|| array_string_from_vec(256)) - }); - c.bench_function("array_string_from_vec 512", |b| { - b.iter(|| array_string_from_vec(512)) - }); - - let (field1, strings, field2, ints) = struct_array_values(128); - c.bench_function("struct_array_from_vec 128", |b| { - b.iter(|| struct_array_from_vec(&field1, &strings, &field2, &ints)) - }); - - let (field1, strings, field2, ints) = struct_array_values(256); - c.bench_function("struct_array_from_vec 256", |b| { - b.iter(|| struct_array_from_vec(&field1, &strings, &field2, &ints)) - }); - - let (field1, strings, field2, ints) = struct_array_values(512); - c.bench_function("struct_array_from_vec 512", |b| { - b.iter(|| struct_array_from_vec(&field1, &strings, &field2, &ints)) - }); - - let (field1, strings, field2, ints) = struct_array_values(1024); - c.bench_function("struct_array_from_vec 1024", |b| { - b.iter(|| struct_array_from_vec(&field1, &strings, &field2, &ints)) - }); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/array_slice.rs b/rust/arrow/benches/array_slice.rs deleted file mode 100644 index a535c80d217..00000000000 --- a/rust/arrow/benches/array_slice.rs +++ /dev/null 
@@ -1,52 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -extern crate arrow; - -use arrow::array::*; -use std::sync::Arc; - -fn create_array_slice(array: &ArrayRef, length: usize) -> ArrayRef { - array.slice(0, length) -} - -fn create_array_with_nulls(size: usize) -> ArrayRef { - let array: Float64Array = (0..size) - .map(|i| if i % 2 == 0 { Some(1.0) } else { None }) - .collect(); - Arc::new(array) -} - -fn array_slice_benchmark(c: &mut Criterion) { - let array = create_array_with_nulls(4096); - c.bench_function("array_slice 128", |b| { - b.iter(|| create_array_slice(&array, 128)) - }); - c.bench_function("array_slice 512", |b| { - b.iter(|| create_array_slice(&array, 512)) - }); - c.bench_function("array_slice 2048", |b| { - b.iter(|| create_array_slice(&array, 2048)) - }); -} - -criterion_group!(benches, array_slice_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/bit_length_kernel.rs b/rust/arrow/benches/bit_length_kernel.rs deleted file mode 100644 index 51d31345712..00000000000 --- a/rust/arrow/benches/bit_length_kernel.rs +++ /dev/null @@ -1,46 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -extern crate arrow; - -use arrow::{array::*, compute::kernels::length::bit_length}; - -fn bench_bit_length(array: &StringArray) { - criterion::black_box(bit_length(array).unwrap()); -} - -fn add_benchmark(c: &mut Criterion) { - fn double_vec(v: Vec) -> Vec { - [&v[..], &v[..]].concat() - } - - // double ["hello", " ", "world", "!"] 10 times - let mut values = vec!["one", "on", "o", ""]; - for _ in 0..10 { - values = double_vec(values); - } - let array = StringArray::from(values); - - c.bench_function("bit_length", |b| b.iter(|| bench_bit_length(&array))); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/boolean_kernels.rs b/rust/arrow/benches/boolean_kernels.rs deleted file mode 100644 index 6559c4e4caf..00000000000 --- a/rust/arrow/benches/boolean_kernels.rs +++ /dev/null @@ -1,51 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -use arrow::util::bench_util::create_boolean_array; - -extern crate arrow; - -use arrow::array::*; -use arrow::compute::kernels::boolean as boolean_kernels; - -fn bench_and(lhs: &BooleanArray, rhs: &BooleanArray) { - criterion::black_box(boolean_kernels::and(lhs, rhs).unwrap()); -} - -fn bench_or(lhs: &BooleanArray, rhs: &BooleanArray) { - criterion::black_box(boolean_kernels::or(lhs, rhs).unwrap()); -} - -fn bench_not(array: &BooleanArray) { - criterion::black_box(boolean_kernels::not(&array).unwrap()); -} - -fn add_benchmark(c: &mut Criterion) { - let size = 2usize.pow(15); - let array1 = create_boolean_array(size, 0.0, 0.5); - let array2 = create_boolean_array(size, 0.0, 0.5); - c.bench_function("and", |b| b.iter(|| bench_and(&array1, &array2))); - c.bench_function("or", |b| b.iter(|| bench_or(&array1, &array2))); - c.bench_function("not", |b| b.iter(|| bench_not(&array1))); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/buffer_bit_ops.rs b/rust/arrow/benches/buffer_bit_ops.rs deleted file mode 100644 index 063f39c9272..00000000000 --- a/rust/arrow/benches/buffer_bit_ops.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -extern crate arrow; - -use arrow::buffer::{Buffer, MutableBuffer}; - -/// Helper function to create arrays -fn create_buffer(size: usize) -> Buffer { - let mut result = MutableBuffer::new(size).with_bitset(size, false); - - for i in 0..size { - result.as_slice_mut()[i] = 0b01010101 << i << (i % 4); - } - - result.into() -} - -fn bench_buffer_and(left: &Buffer, right: &Buffer) { - criterion::black_box((left & right).unwrap()); -} - -fn bench_buffer_or(left: &Buffer, right: &Buffer) { - criterion::black_box((left | right).unwrap()); -} - -fn bit_ops_benchmark(c: &mut Criterion) { - let left = create_buffer(512 * 10); - let right = create_buffer(512 * 10); - - c.bench_function("buffer_bit_ops and", |b| { - b.iter(|| bench_buffer_and(&left, &right)) - }); - - c.bench_function("buffer_bit_ops or", |b| { - b.iter(|| bench_buffer_or(&left, &right)) - }); -} - -criterion_group!(benches, bit_ops_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/buffer_create.rs b/rust/arrow/benches/buffer_create.rs deleted file mode 100644 index d628e031ce6..00000000000 --- a/rust/arrow/benches/buffer_create.rs +++ /dev/null @@ -1,190 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#[macro_use] -extern crate criterion; -use arrow::util::test_util::seedable_rng; -use criterion::Criterion; -use rand::distributions::Uniform; -use rand::Rng; - -extern crate arrow; - -use arrow::{ - buffer::{Buffer, MutableBuffer}, - datatypes::ToByteSlice, -}; - -fn mutable_buffer_from_iter(data: &[Vec]) -> Vec { - criterion::black_box( - data.iter() - .map(|vec| vec.iter().copied().collect::().into()) - .collect::>(), - ) -} - -fn buffer_from_iter(data: &[Vec]) -> Vec { - criterion::black_box( - data.iter() - .map(|vec| vec.iter().copied().collect::()) - .collect::>(), - ) -} - -fn mutable_buffer_iter_bitset(data: &[Vec]) -> Vec { - criterion::black_box({ - data.iter() - .map(|datum| { - let mut result = MutableBuffer::new((data.len() + 7) / 8) - .with_bitset(datum.len(), false); - for (i, value) in datum.iter().enumerate() { - if *value { - unsafe { - arrow::util::bit_util::set_bit_raw(result.as_mut_ptr(), i); - } - } - } - result.into() - }) - .collect::>() - }) -} - -fn mutable_iter_extend_from_slice(data: &[Vec], capacity: usize) -> Buffer { - criterion::black_box({ - let mut result = MutableBuffer::new(capacity); - - data.iter().for_each(|vec| { - vec.iter() - .for_each(|elem| result.extend_from_slice(elem.to_byte_slice())) - }); - - result.into() - }) -} - -fn mutable_buffer(data: &[Vec], capacity: usize) -> Buffer { - criterion::black_box({ - let mut result = MutableBuffer::new(capacity); - - data.iter().for_each(|vec| result.extend_from_slice(vec)); - - result.into() - }) -} - -fn mutable_buffer_extend(data: &[Vec], capacity: usize) -> Buffer { - criterion::black_box({ - let mut result = MutableBuffer::new(capacity); - - data.iter() - .for_each(|vec| result.extend(vec.iter().copied())); - - result.into() - }) -} - -fn from_slice(data: &[Vec], capacity: usize) -> Buffer { - criterion::black_box({ - let mut a = Vec::::with_capacity(capacity); - - data.iter().for_each(|vec| a.extend(vec)); - - Buffer::from(a.to_byte_slice()) - }) -} - -fn create_data(size: usize) -> Vec> { - let rng = &mut seedable_rng(); - let range = Uniform::new(0, 33); - - (0..size) - .map(|_| { - let size = rng.sample(range); - seedable_rng() - .sample_iter(&range) - .take(size as usize) - .collect() - }) - .collect() -} - -fn create_data_bool(size: usize) -> Vec> { - let rng = &mut seedable_rng(); - let range = Uniform::new(0, 33); - - (0..size) - .map(|_| { - let size = rng.sample(range); - seedable_rng() - .sample_iter(&range) - .take(size as usize) - .map(|x| x > 15) - .collect() - }) - .collect() -} -fn benchmark(c: &mut Criterion) { - let size = 2usize.pow(15); - let data = create_data(size); - - let bool_data = create_data_bool(size); - let cap = data.iter().map(|i| i.len()).sum(); - let byte_cap = cap * std::mem::size_of::(); - - c.bench_function("mutable iter extend_from_slice", |b| { - b.iter(|| { - mutable_iter_extend_from_slice( - criterion::black_box(&data), - criterion::black_box(0), - ) - }) - }); - c.bench_function("mutable", |b| { - b.iter(|| mutable_buffer(criterion::black_box(&data), criterion::black_box(0))) - }); - - c.bench_function("mutable extend", |b| { - b.iter(|| mutable_buffer_extend(&data, 0)) - }); - - c.bench_function("mutable prepared", |b| { - b.iter(|| { - mutable_buffer(criterion::black_box(&data), criterion::black_box(byte_cap)) - }) - }); - - c.bench_function("from_slice", |b| { - b.iter(|| from_slice(criterion::black_box(&data), criterion::black_box(0))) - }); - c.bench_function("from_slice prepared", |b| { - b.iter(|| from_slice(criterion::black_box(&data), 
criterion::black_box(cap))) - }); - - c.bench_function("MutableBuffer iter bitset", |b| { - b.iter(|| mutable_buffer_iter_bitset(criterion::black_box(&bool_data))) - }); - c.bench_function("MutableBuffer::from_iter bool", |b| { - b.iter(|| mutable_buffer_from_iter(criterion::black_box(&bool_data))) - }); - c.bench_function("Buffer::from_iter bool", |b| { - b.iter(|| buffer_from_iter(criterion::black_box(&bool_data))) - }); -} - -criterion_group!(benches, benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/builder.rs b/rust/arrow/benches/builder.rs deleted file mode 100644 index fd9f319e397..00000000000 --- a/rust/arrow/benches/builder.rs +++ /dev/null @@ -1,116 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate arrow; -extern crate criterion; -extern crate rand; - -use std::mem::size_of; - -use criterion::*; -use rand::distributions::Standard; - -use arrow::array::*; -use arrow::util::test_util::seedable_rng; -use rand::Rng; - -// Build arrays with 512k elements. 
-const BATCH_SIZE: usize = 8 << 10; -const NUM_BATCHES: usize = 64; - -fn bench_primitive(c: &mut Criterion) { - let data: [i64; BATCH_SIZE] = [100; BATCH_SIZE]; - - let mut group = c.benchmark_group("bench_primitive"); - group.throughput(Throughput::Bytes( - ((data.len() * NUM_BATCHES * size_of::()) as u32).into(), - )); - group.bench_function("bench_primitive", |b| { - b.iter(|| { - let mut builder = Int64Builder::new(64); - for _ in 0..NUM_BATCHES { - let _ = black_box(builder.append_slice(&data[..])); - } - black_box(builder.finish()); - }) - }); - group.finish(); -} - -fn bench_primitive_nulls(c: &mut Criterion) { - let mut group = c.benchmark_group("bench_primitive_nulls"); - group.bench_function("bench_primitive_nulls", |b| { - b.iter(|| { - let mut builder = UInt8Builder::new(64); - for _ in 0..NUM_BATCHES * BATCH_SIZE { - let _ = black_box(builder.append_null()); - } - black_box(builder.finish()); - }) - }); - group.finish(); -} - -fn bench_bool(c: &mut Criterion) { - let data: Vec = seedable_rng() - .sample_iter(&Standard) - .take(BATCH_SIZE) - .collect(); - let data_len = data.len(); - - let mut group = c.benchmark_group("bench_bool"); - group.throughput(Throughput::Bytes( - ((data_len * NUM_BATCHES * size_of::()) as u32).into(), - )); - group.bench_function("bench_bool", |b| { - b.iter(|| { - let mut builder = BooleanBuilder::new(64); - for _ in 0..NUM_BATCHES { - let _ = black_box(builder.append_slice(&data[..])); - } - black_box(builder.finish()); - }) - }); - group.finish(); -} - -fn bench_string(c: &mut Criterion) { - const SAMPLE_STRING: &str = "sample string"; - let mut group = c.benchmark_group("bench_primitive"); - group.throughput(Throughput::Bytes( - ((BATCH_SIZE * NUM_BATCHES * SAMPLE_STRING.len()) as u32).into(), - )); - group.bench_function("bench_string", |b| { - b.iter(|| { - let mut builder = StringBuilder::new(64); - for _ in 0..NUM_BATCHES * BATCH_SIZE { - let _ = black_box(builder.append_value(SAMPLE_STRING)); - } - black_box(builder.finish()); - }) - }); - group.finish(); -} - -criterion_group!( - benches, - bench_primitive, - bench_primitive_nulls, - bench_bool, - bench_string -); -criterion_main!(benches); diff --git a/rust/arrow/benches/cast_kernels.rs b/rust/arrow/benches/cast_kernels.rs deleted file mode 100644 index d164e1facfd..00000000000 --- a/rust/arrow/benches/cast_kernels.rs +++ /dev/null @@ -1,185 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#[macro_use] -extern crate criterion; -use criterion::Criterion; -use rand::distributions::{Distribution, Standard, Uniform}; -use rand::Rng; - -use std::sync::Arc; - -extern crate arrow; - -use arrow::array::*; -use arrow::compute::cast; -use arrow::datatypes::*; -use arrow::util::bench_util::*; -use arrow::util::test_util::seedable_rng; - -fn build_array(size: usize) -> ArrayRef -where - Standard: Distribution, -{ - let array = create_primitive_array::(size, 0.1); - Arc::new(array) -} - -fn build_utf8_date_array(size: usize, with_nulls: bool) -> ArrayRef { - use chrono::NaiveDate; - - // use random numbers to avoid spurious compiler optimizations wrt to branching - let mut rng = seedable_rng(); - let mut builder = StringBuilder::new(size); - let range = Uniform::new(0, 737776); - - for _ in 0..size { - if with_nulls && rng.gen::() > 0.8 { - builder.append_null().unwrap(); - } else { - let string = NaiveDate::from_num_days_from_ce(rng.sample(range)) - .format("%Y-%m-%d") - .to_string(); - builder.append_value(&string).unwrap(); - } - } - Arc::new(builder.finish()) -} - -fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef { - use chrono::NaiveDateTime; - - // use random numbers to avoid spurious compiler optimizations wrt to branching - let mut rng = seedable_rng(); - let mut builder = StringBuilder::new(size); - let range = Uniform::new(0, 1608071414123); - - for _ in 0..size { - if with_nulls && rng.gen::() > 0.8 { - builder.append_null().unwrap(); - } else { - let string = NaiveDateTime::from_timestamp(rng.sample(range), 0) - .format("%Y-%m-%dT%H:%M:%S") - .to_string(); - builder.append_value(&string).unwrap(); - } - } - Arc::new(builder.finish()) -} - -// cast array from specified primitive array type to desired data type -fn cast_array(array: &ArrayRef, to_type: DataType) { - criterion::black_box(cast(array, &to_type).unwrap()); -} - -fn add_benchmark(c: &mut Criterion) { - let i32_array = build_array::(512); - let i64_array = build_array::(512); - let f32_array = build_array::(512); - let f32_utf8_array = cast(&build_array::(512), &DataType::Utf8).unwrap(); - - let f64_array = build_array::(512); - let date64_array = build_array::(512); - let date32_array = build_array::(512); - let time32s_array = build_array::(512); - let time64ns_array = build_array::(512); - let time_ns_array = build_array::(512); - let time_ms_array = build_array::(512); - let utf8_date_array = build_utf8_date_array(512, true); - let utf8_date_time_array = build_utf8_date_time_array(512, true); - - c.bench_function("cast int32 to int32 512", |b| { - b.iter(|| cast_array(&i32_array, DataType::Int32)) - }); - c.bench_function("cast int32 to uint32 512", |b| { - b.iter(|| cast_array(&i32_array, DataType::UInt32)) - }); - c.bench_function("cast int32 to float32 512", |b| { - b.iter(|| cast_array(&i32_array, DataType::Float32)) - }); - c.bench_function("cast int32 to float64 512", |b| { - b.iter(|| cast_array(&i32_array, DataType::Float64)) - }); - c.bench_function("cast int32 to int64 512", |b| { - b.iter(|| cast_array(&i32_array, DataType::Int64)) - }); - c.bench_function("cast float32 to int32 512", |b| { - b.iter(|| cast_array(&f32_array, DataType::Int32)) - }); - c.bench_function("cast float64 to float32 512", |b| { - b.iter(|| cast_array(&f64_array, DataType::Float32)) - }); - c.bench_function("cast float64 to uint64 512", |b| { - b.iter(|| cast_array(&f64_array, DataType::UInt64)) - }); - c.bench_function("cast int64 to int32 512", |b| { - b.iter(|| cast_array(&i64_array, 
DataType::Int32)) - }); - c.bench_function("cast date64 to date32 512", |b| { - b.iter(|| cast_array(&date64_array, DataType::Date32)) - }); - c.bench_function("cast date32 to date64 512", |b| { - b.iter(|| cast_array(&date32_array, DataType::Date64)) - }); - c.bench_function("cast time32s to time32ms 512", |b| { - b.iter(|| cast_array(&time32s_array, DataType::Time32(TimeUnit::Millisecond))) - }); - c.bench_function("cast time32s to time64us 512", |b| { - b.iter(|| cast_array(&time32s_array, DataType::Time64(TimeUnit::Microsecond))) - }); - c.bench_function("cast time64ns to time32s 512", |b| { - b.iter(|| cast_array(&time64ns_array, DataType::Time32(TimeUnit::Second))) - }); - c.bench_function("cast timestamp_ns to timestamp_s 512", |b| { - b.iter(|| { - cast_array( - &time_ns_array, - DataType::Timestamp(TimeUnit::Nanosecond, None), - ) - }) - }); - c.bench_function("cast timestamp_ms to timestamp_ns 512", |b| { - b.iter(|| { - cast_array( - &time_ms_array, - DataType::Timestamp(TimeUnit::Nanosecond, None), - ) - }) - }); - c.bench_function("cast utf8 to f32", |b| { - b.iter(|| cast_array(&f32_utf8_array, DataType::Float32)) - }); - c.bench_function("cast i64 to string 512", |b| { - b.iter(|| cast_array(&i64_array, DataType::Utf8)) - }); - c.bench_function("cast f32 to string 512", |b| { - b.iter(|| cast_array(&f32_array, DataType::Utf8)) - }); - - c.bench_function("cast timestamp_ms to i64 512", |b| { - b.iter(|| cast_array(&time_ms_array, DataType::Int64)) - }); - c.bench_function("cast utf8 to date32 512", |b| { - b.iter(|| cast_array(&utf8_date_array, DataType::Date32)) - }); - c.bench_function("cast utf8 to date64 512", |b| { - b.iter(|| cast_array(&utf8_date_time_array, DataType::Date64)) - }); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/comparison_kernels.rs b/rust/arrow/benches/comparison_kernels.rs deleted file mode 100644 index a3df556efcf..00000000000 --- a/rust/arrow/benches/comparison_kernels.rs +++ /dev/null @@ -1,201 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -extern crate arrow; - -use arrow::compute::*; -use arrow::datatypes::ArrowNumericType; -use arrow::util::bench_util::*; -use arrow::{array::*, datatypes::Float32Type}; - -fn bench_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, -{ - eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_eq_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - eq_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - -fn bench_neq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, -{ - neq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_neq_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - neq_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - -fn bench_lt(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, -{ - lt(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_lt_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - lt_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - -fn bench_lt_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, -{ - lt_eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_lt_eq_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - lt_eq_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - -fn bench_gt(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, -{ - gt(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_gt_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - gt_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - -fn bench_gt_eq(arr_a: &PrimitiveArray, arr_b: &PrimitiveArray) -where - T: ArrowNumericType, -{ - gt_eq(criterion::black_box(arr_a), criterion::black_box(arr_b)).unwrap(); -} - -fn bench_gt_eq_scalar(arr_a: &PrimitiveArray, value_b: T::Native) -where - T: ArrowNumericType, -{ - gt_eq_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - -fn bench_like_utf8_scalar(arr_a: &StringArray, value_b: &str) { - like_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)).unwrap(); -} - -fn bench_nlike_utf8_scalar(arr_a: &StringArray, value_b: &str) { - nlike_utf8_scalar(criterion::black_box(arr_a), criterion::black_box(value_b)) - .unwrap(); -} - -fn add_benchmark(c: &mut Criterion) { - let size = 65536; - let arr_a = create_primitive_array_with_seed::(size, 0.0, 42); - let arr_b = create_primitive_array_with_seed::(size, 0.0, 43); - - let arr_string = create_string_array::(size, 0.0); - - c.bench_function("eq Float32", |b| b.iter(|| bench_eq(&arr_a, &arr_b))); - c.bench_function("eq scalar Float32", |b| { - b.iter(|| bench_eq_scalar(&arr_a, 1.0)) - }); - - c.bench_function("neq Float32", |b| b.iter(|| bench_neq(&arr_a, &arr_b))); - c.bench_function("neq scalar Float32", |b| { - b.iter(|| bench_neq_scalar(&arr_a, 1.0)) - }); - - c.bench_function("lt Float32", |b| b.iter(|| bench_lt(&arr_a, &arr_b))); - c.bench_function("lt scalar Float32", |b| { - b.iter(|| bench_lt_scalar(&arr_a, 1.0)) - }); - - c.bench_function("lt_eq Float32", |b| b.iter(|| 
bench_lt_eq(&arr_a, &arr_b))); - c.bench_function("lt_eq scalar Float32", |b| { - b.iter(|| bench_lt_eq_scalar(&arr_a, 1.0)) - }); - - c.bench_function("gt Float32", |b| b.iter(|| bench_gt(&arr_a, &arr_b))); - c.bench_function("gt scalar Float32", |b| { - b.iter(|| bench_gt_scalar(&arr_a, 1.0)) - }); - - c.bench_function("gt_eq Float32", |b| b.iter(|| bench_gt_eq(&arr_a, &arr_b))); - c.bench_function("gt_eq scalar Float32", |b| { - b.iter(|| bench_gt_eq_scalar(&arr_a, 1.0)) - }); - - c.bench_function("like_utf8 scalar equals", |b| { - b.iter(|| bench_like_utf8_scalar(&arr_string, "xxxx")) - }); - - c.bench_function("like_utf8 scalar contains", |b| { - b.iter(|| bench_like_utf8_scalar(&arr_string, "%xxxx%")) - }); - - c.bench_function("like_utf8 scalar ends with", |b| { - b.iter(|| bench_like_utf8_scalar(&arr_string, "xxxx%")) - }); - - c.bench_function("like_utf8 scalar starts with", |b| { - b.iter(|| bench_like_utf8_scalar(&arr_string, "%xxxx")) - }); - - c.bench_function("like_utf8 scalar complex", |b| { - b.iter(|| bench_like_utf8_scalar(&arr_string, "%xx_xx%xxx")) - }); - - c.bench_function("nlike_utf8 scalar equals", |b| { - b.iter(|| bench_nlike_utf8_scalar(&arr_string, "xxxx")) - }); - - c.bench_function("nlike_utf8 scalar contains", |b| { - b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xxxx%")) - }); - - c.bench_function("nlike_utf8 scalar ends with", |b| { - b.iter(|| bench_nlike_utf8_scalar(&arr_string, "xxxx%")) - }); - - c.bench_function("nlike_utf8 scalar starts with", |b| { - b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xxxx")) - }); - - c.bench_function("nlike_utf8 scalar complex", |b| { - b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xx_xx%xxx")) - }); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/concatenate_kernel.rs b/rust/arrow/benches/concatenate_kernel.rs deleted file mode 100644 index 3fff2abd179..00000000000 --- a/rust/arrow/benches/concatenate_kernel.rs +++ /dev/null @@ -1,66 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-
-#[macro_use]
-extern crate criterion;
-use criterion::Criterion;
-
-extern crate arrow;
-
-use arrow::array::*;
-use arrow::compute::concat;
-use arrow::datatypes::*;
-use arrow::util::bench_util::*;
-
-fn bench_concat(v1: &dyn Array, v2: &dyn Array) {
-    criterion::black_box(concat(&[v1, v2]).unwrap());
-}
-
-fn bench_concat_arrays(arrays: &[&dyn Array]) {
-    criterion::black_box(concat(arrays).unwrap());
-}
-
-fn add_benchmark(c: &mut Criterion) {
-    let v1 = create_primitive_array::<Int32Type>(1024, 0.0);
-    let v2 = create_primitive_array::<Int32Type>(1024, 0.0);
-    c.bench_function("concat i32 1024", |b| b.iter(|| bench_concat(&v1, &v2)));
-
-    let v1 = create_primitive_array::<Int32Type>(1024, 0.5);
-    let v2 = create_primitive_array::<Int32Type>(1024, 0.5);
-    c.bench_function("concat i32 nulls 1024", |b| {
-        b.iter(|| bench_concat(&v1, &v2))
-    });
-
-    let small_array = create_primitive_array::<Int32Type>(4, 0.0);
-    let arrays: Vec<_> = (0..1024).map(|_| &small_array as &dyn Array).collect();
-    c.bench_function("concat 1024 arrays i32 4", |b| {
-        b.iter(|| bench_concat_arrays(&arrays))
-    });
-
-    let v1 = create_string_array::<i32>(1024, 0.0);
-    let v2 = create_string_array::<i32>(1024, 0.0);
-    c.bench_function("concat str 1024", |b| b.iter(|| bench_concat(&v1, &v2)));
-
-    let v1 = create_string_array::<i32>(1024, 0.5);
-    let v2 = create_string_array::<i32>(1024, 0.5);
-    c.bench_function("concat str nulls 1024", |b| {
-        b.iter(|| bench_concat(&v1, &v2))
-    });
-}
-
-criterion_group!(benches, add_benchmark);
-criterion_main!(benches);
diff --git a/rust/arrow/benches/csv_writer.rs b/rust/arrow/benches/csv_writer.rs
deleted file mode 100644
index 9b018530938..00000000000
--- a/rust/arrow/benches/csv_writer.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-extern crate arrow;
-extern crate criterion;
-
-use criterion::*;
-
-use arrow::array::*;
-use arrow::csv;
-use arrow::datatypes::*;
-use arrow::record_batch::RecordBatch;
-use std::fs::File;
-use std::sync::Arc;
-
-fn record_batches_to_csv() {
-    let schema = Schema::new(vec![
-        Field::new("c1", DataType::Utf8, false),
-        Field::new("c2", DataType::Float64, true),
-        Field::new("c3", DataType::UInt32, false),
-        Field::new("c3", DataType::Boolean, true),
-    ]);
-
-    let c1 = StringArray::from(vec![
-        "Lorem ipsum dolor sit amet",
-        "consectetur adipiscing elit",
-        "sed do eiusmod tempor",
-    ]);
-    let c2 = PrimitiveArray::<Float64Type>::from(vec![
-        Some(123.564532),
-        None,
-        Some(-556132.25),
-    ]);
-    let c3 = PrimitiveArray::<UInt32Type>::from(vec![3, 2, 1]);
-    let c4 = BooleanArray::from(vec![Some(true), Some(false), None]);
-
-    let b = RecordBatch::try_new(
-        Arc::new(schema),
-        vec![Arc::new(c1), Arc::new(c2), Arc::new(c3), Arc::new(c4)],
-    )
-    .unwrap();
-    let file = File::create("target/bench_write_csv.csv").unwrap();
-    let mut writer = csv::Writer::new(file);
-    let batches = vec![&b, &b, &b, &b, &b, &b, &b, &b, &b, &b, &b];
-    #[allow(clippy::unit_arg)]
-    criterion::black_box(for batch in batches {
-        writer.write(batch).unwrap()
-    });
-}
-
-fn criterion_benchmark(c: &mut Criterion) {
-    c.bench_function("record_batches_to_csv", |b| b.iter(record_batches_to_csv));
-}
-
-criterion_group!(benches, criterion_benchmark);
-criterion_main!(benches);
diff --git a/rust/arrow/benches/equal.rs b/rust/arrow/benches/equal.rs
deleted file mode 100644
index af535506e86..00000000000
--- a/rust/arrow/benches/equal.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Allowed because we use `arr == arr` in benchmarks
-#![allow(clippy::eq_op)]
-
-#[macro_use]
-extern crate criterion;
-use criterion::Criterion;
-
-extern crate arrow;
-
-use arrow::util::bench_util::*;
-use arrow::{array::*, datatypes::Float32Type};
-
-fn bench_equal<A: Array + PartialEq<A>>(arr_a: &A) {
-    criterion::black_box(arr_a == arr_a);
-}
-
-fn add_benchmark(c: &mut Criterion) {
-    let arr_a = create_primitive_array::<Float32Type>(512, 0.0);
-    c.bench_function("equal_512", |b| b.iter(|| bench_equal(&arr_a)));
-
-    let arr_a_nulls = create_primitive_array::<Float32Type>(512, 0.5);
-    c.bench_function("equal_nulls_512", |b| b.iter(|| bench_equal(&arr_a_nulls)));
-
-    let arr_a = create_string_array::<i32>(512, 0.0);
-    c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a)));
-
-    let arr_a_nulls = create_string_array::<i32>(512, 0.5);
-    c.bench_function("equal_string_nulls_512", |b| {
-        b.iter(|| bench_equal(&arr_a_nulls))
-    });
-
-    let arr_a = create_boolean_array(512, 0.0, 0.5);
-    c.bench_function("equal_bool_512", |b| b.iter(|| bench_equal(&arr_a)));
-
-    let arr_a = create_boolean_array(513, 0.0, 0.5);
-    c.bench_function("equal_bool_513", |b| b.iter(|| bench_equal(&arr_a)));
-}
-
-criterion_group!(benches, add_benchmark);
-criterion_main!(benches);
diff --git a/rust/arrow/benches/filter_kernels.rs b/rust/arrow/benches/filter_kernels.rs
deleted file mode 100644
index ca317b4676c..00000000000
--- a/rust/arrow/benches/filter_kernels.rs
+++ /dev/null
@@ -1,106 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-extern crate arrow; - -use arrow::compute::Filter; -use arrow::util::bench_util::*; - -use arrow::array::*; -use arrow::compute::{build_filter, filter}; -use arrow::datatypes::{Float32Type, UInt8Type}; - -use criterion::{criterion_group, criterion_main, Criterion}; - -fn bench_filter(data_array: &dyn Array, filter_array: &BooleanArray) { - criterion::black_box(filter(data_array, filter_array).unwrap()); -} - -fn bench_built_filter<'a>(filter: &Filter<'a>, data: &impl Array) { - criterion::black_box(filter(&data.data())); -} - -fn add_benchmark(c: &mut Criterion) { - let size = 65536; - let filter_array = create_boolean_array(size, 0.0, 0.5); - let dense_filter_array = create_boolean_array(size, 0.0, 1.0 - 1.0 / 1024.0); - let sparse_filter_array = create_boolean_array(size, 0.0, 1.0 / 1024.0); - - let filter = build_filter(&filter_array).unwrap(); - let dense_filter = build_filter(&dense_filter_array).unwrap(); - let sparse_filter = build_filter(&sparse_filter_array).unwrap(); - - let data_array = create_primitive_array::(size, 0.0); - - c.bench_function("filter u8", |b| { - b.iter(|| bench_filter(&data_array, &filter_array)) - }); - c.bench_function("filter u8 high selectivity", |b| { - b.iter(|| bench_filter(&data_array, &dense_filter_array)) - }); - c.bench_function("filter u8 low selectivity", |b| { - b.iter(|| bench_filter(&data_array, &sparse_filter_array)) - }); - - c.bench_function("filter context u8", |b| { - b.iter(|| bench_built_filter(&filter, &data_array)) - }); - c.bench_function("filter context u8 high selectivity", |b| { - b.iter(|| bench_built_filter(&dense_filter, &data_array)) - }); - c.bench_function("filter context u8 low selectivity", |b| { - b.iter(|| bench_built_filter(&sparse_filter, &data_array)) - }); - - let data_array = create_primitive_array::(size, 0.5); - c.bench_function("filter context u8 w NULLs", |b| { - b.iter(|| bench_built_filter(&filter, &data_array)) - }); - c.bench_function("filter context u8 w NULLs high selectivity", |b| { - b.iter(|| bench_built_filter(&dense_filter, &data_array)) - }); - c.bench_function("filter context u8 w NULLs low selectivity", |b| { - b.iter(|| bench_built_filter(&sparse_filter, &data_array)) - }); - - let data_array = create_primitive_array::(size, 0.5); - c.bench_function("filter f32", |b| { - b.iter(|| bench_filter(&data_array, &filter_array)) - }); - c.bench_function("filter context f32", |b| { - b.iter(|| bench_built_filter(&filter, &data_array)) - }); - c.bench_function("filter context f32 high selectivity", |b| { - b.iter(|| bench_built_filter(&dense_filter, &data_array)) - }); - c.bench_function("filter context f32 low selectivity", |b| { - b.iter(|| bench_built_filter(&sparse_filter, &data_array)) - }); - - let data_array = create_string_array::(size, 0.5); - c.bench_function("filter context string", |b| { - b.iter(|| bench_built_filter(&filter, &data_array)) - }); - c.bench_function("filter context string high selectivity", |b| { - b.iter(|| bench_built_filter(&dense_filter, &data_array)) - }); - c.bench_function("filter context string low selectivity", |b| { - b.iter(|| bench_built_filter(&sparse_filter, &data_array)) - }); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/json_reader.rs b/rust/arrow/benches/json_reader.rs deleted file mode 100644 index ef3ddf0537b..00000000000 --- a/rust/arrow/benches/json_reader.rs +++ /dev/null @@ -1,112 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate arrow; -extern crate criterion; - -use criterion::*; - -use arrow::datatypes::*; -use arrow::json::ReaderBuilder; -use std::io::Cursor; -use std::sync::Arc; - -fn json_primitive_to_record_batch() { - let schema = Arc::new(Schema::new(vec![ - Field::new("c1", DataType::Utf8, true), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::UInt32, true), - Field::new("c4", DataType::Boolean, true), - ])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let json_content = r#" - {"c1": "eleven", "c2": 6.2222222225, "c3": 5.0, "c4": false} - {"c1": "twelve", "c2": -55555555555555.2, "c3": 3} - {"c1": null, "c2": 3, "c3": 125, "c4": null} - {"c2": -35, "c3": 100.0, "c4": true} - {"c1": "fifteen", "c2": null, "c4": true} - {"c1": "eleven", "c2": 6.2222222225, "c3": 5.0, "c4": false} - {"c1": "twelve", "c2": -55555555555555.2, "c3": 3} - {"c1": null, "c2": 3, "c3": 125, "c4": null} - {"c2": -35, "c3": 100.0, "c4": true} - {"c1": "fifteen", "c2": null, "c4": true} - "#; - let cursor = Cursor::new(json_content); - let mut reader = builder.build(cursor).unwrap(); - #[allow(clippy::unit_arg)] - criterion::black_box({ - reader.next().unwrap(); - }); -} - -fn json_list_primitive_to_record_batch() { - let schema = Arc::new(Schema::new(vec![ - Field::new( - "c1", - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - true, - ), - Field::new( - "c2", - DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - true, - ), - Field::new( - "c3", - DataType::List(Box::new(Field::new("item", DataType::UInt32, true))), - true, - ), - Field::new( - "c4", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - true, - ), - ])); - let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let json_content = r#" - {"c1": ["eleven"], "c2": [6.2222222225, -3.2, null], "c3": [5.0, 6], "c4": [false, true]} - {"c1": ["twelve"], "c2": [-55555555555555.2, 12500000.0], "c3": [3, 4, 5]} - {"c1": null, "c2": [3], "c3": [125, 127, 129], "c4": [null, false, true]} - {"c2": [-35], "c3": [100.0, 200.0], "c4": null} - {"c1": ["fifteen"], "c2": [null, 2.1, 1.5, -3], "c4": [true, false, null]} - {"c1": ["fifteen"], "c2": [], "c4": [true, false, null]} - {"c1": ["eleven"], "c2": [6.2222222225, -3.2, null], "c3": [5.0, 6], "c4": [false, true]} - {"c1": ["twelve"], "c2": [-55555555555555.2, 12500000.0], "c3": [3, 4, 5]} - {"c1": null, "c2": [3], "c3": [125, 127, 129], "c4": [null, false, true]} - {"c2": [-35], "c3": [100.0, 200.0], "c4": null} - {"c1": ["fifteen"], "c2": [null, 2.1, 1.5, -3], "c4": [true, false, null]} - {"c1": ["fifteen"], "c2": [], "c4": [true, false, null]} - "#; - let cursor = Cursor::new(json_content); - let mut reader = builder.build(cursor).unwrap(); - #[allow(clippy::unit_arg)] - 
criterion::black_box({ - reader.next().unwrap(); - }); -} - -fn criterion_benchmark(c: &mut Criterion) { - c.bench_function("json_primitive_to_record_batch", |b| { - b.iter(json_primitive_to_record_batch) - }); - c.bench_function("json_list_primitive_to_record_batch", |b| { - b.iter(json_list_primitive_to_record_batch) - }); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/length_kernel.rs b/rust/arrow/benches/length_kernel.rs deleted file mode 100644 index b70f6374f8f..00000000000 --- a/rust/arrow/benches/length_kernel.rs +++ /dev/null @@ -1,47 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -extern crate arrow; - -use arrow::array::*; -use arrow::compute::kernels::length::length; - -fn bench_length(array: &StringArray) { - criterion::black_box(length(array).unwrap()); -} - -fn add_benchmark(c: &mut Criterion) { - fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> { - [&v[..], &v[..]].concat() - } - - // double ["one", "on", "o", ""] 10 times - let mut values = vec!["one", "on", "o", ""]; - for _ in 0..10 { - values = double_vec(values); - } - let array = StringArray::from(values); - - c.bench_function("length", |b| b.iter(|| bench_length(&array))); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/mutable_array.rs b/rust/arrow/benches/mutable_array.rs deleted file mode 100644 index 52da38a1d54..00000000000 --- a/rust/arrow/benches/mutable_array.rs +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License.
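// Illustrative sketch (not from the removed benchmark): what the length benchmark
// above measures. The `length` kernel maps each string in a StringArray to its byte
// length; for Utf8 input it is expected to return an Int32Array. Assumes the
// pre-donation crate paths named in the imports above.
use arrow::array::{Array, Int32Array, StringArray};
use arrow::compute::kernels::length::length;

fn length_usage() -> arrow::error::Result<()> {
    let strings = StringArray::from(vec!["one", "on", "o", ""]);
    let lengths = length(&strings)?;
    let lengths = lengths.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(lengths.value(0), 3); // "one" is three bytes long
    assert_eq!(lengths.value(3), 0); // the empty string has length zero
    Ok(())
}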
- -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -use rand::Rng; - -extern crate arrow; - -use arrow::util::test_util::seedable_rng; -use arrow::{array::*, util::bench_util::create_string_array}; - -fn create_slices(size: usize) -> Vec<(usize, usize)> { - let rng = &mut seedable_rng(); - - (0..size) - .map(|_| { - let start = rng.gen_range(0, size / 2); - let end = rng.gen_range(start + 1, size); - (start, end) - }) - .collect() -} - -fn bench<T: Array>(v1: &T, slices: &[(usize, usize)]) { - let mut mutable = MutableArrayData::new(vec![v1.data_ref()], false, 5); - for (start, end) in slices { - mutable.extend(0, *start, *end) - } - mutable.freeze(); -} - -fn add_benchmark(c: &mut Criterion) { - let v1 = create_string_array::<i32>(1024, 0.0); - let v2 = create_slices(1024); - c.bench_function("mutable str 1024", |b| b.iter(|| bench(&v1, &v2))); - - let v1 = create_string_array::<i32>(1024, 0.5); - let v2 = create_slices(1024); - c.bench_function("mutable str nulls 1024", |b| b.iter(|| bench(&v1, &v2))); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/sort_kernel.rs b/rust/arrow/benches/sort_kernel.rs deleted file mode 100644 index 74dc0ceae18..00000000000 --- a/rust/arrow/benches/sort_kernel.rs +++ /dev/null @@ -1,121 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License.
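// Illustrative sketch (not from the removed benchmark): the mutable_array benchmark
// above drives MutableArrayData, which copies slices of existing arrays into a new
// ArrayData without going through a typed builder. A minimal sketch, assuming the
// pre-donation crate API used by that benchmark:
use arrow::array::{make_array, Array, MutableArrayData, StringArray};

fn mutable_array_usage() {
    let source = StringArray::from(vec!["a", "b", "c", "d"]);
    // One source array (index 0), no null buffer needed, initial capacity 4.
    let mut mutable = MutableArrayData::new(vec![source.data_ref()], false, 4);
    // Copy rows 1..3 ("b" and "c") from source 0 into the output.
    mutable.extend(0, 1, 3);
    let out = make_array(mutable.freeze());
    assert_eq!(out.len(), 2);
}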
- -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -use std::sync::Arc; - -extern crate arrow; - -use arrow::compute::kernels::sort::{lexsort, SortColumn}; -use arrow::util::bench_util::*; -use arrow::{array::*, datatypes::Float32Type}; - -fn create_array(size: usize, with_nulls: bool) -> ArrayRef { - let null_density = if with_nulls { 0.5 } else { 0.0 }; - let array = create_primitive_array::(size, null_density); - Arc::new(array) -} - -fn bench_sort(arr_a: &ArrayRef, array_b: &ArrayRef, limit: Option) { - let columns = vec![ - SortColumn { - values: arr_a.clone(), - options: None, - }, - SortColumn { - values: array_b.clone(), - options: None, - }, - ]; - - criterion::black_box(lexsort(&columns, limit).unwrap()); -} - -fn add_benchmark(c: &mut Criterion) { - let arr_a = create_array(2u64.pow(10) as usize, false); - let arr_b = create_array(2u64.pow(10) as usize, false); - - c.bench_function("sort 2^10", |b| b.iter(|| bench_sort(&arr_a, &arr_b, None))); - - let arr_a = create_array(2u64.pow(12) as usize, false); - let arr_b = create_array(2u64.pow(12) as usize, false); - - c.bench_function("sort 2^12", |b| b.iter(|| bench_sort(&arr_a, &arr_b, None))); - - let arr_a = create_array(2u64.pow(10) as usize, true); - let arr_b = create_array(2u64.pow(10) as usize, true); - - c.bench_function("sort nulls 2^10", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, None)) - }); - - let arr_a = create_array(2u64.pow(12) as usize, true); - let arr_b = create_array(2u64.pow(12) as usize, true); - - c.bench_function("sort nulls 2^12", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, None)) - }); - - // with limit - { - let arr_a = create_array(2u64.pow(12) as usize, false); - let arr_b = create_array(2u64.pow(12) as usize, false); - c.bench_function("sort 2^12 limit 10", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(10))) - }); - - let arr_a = create_array(2u64.pow(12) as usize, false); - let arr_b = create_array(2u64.pow(12) as usize, false); - c.bench_function("sort 2^12 limit 100", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(100))) - }); - - let arr_a = create_array(2u64.pow(12) as usize, false); - let arr_b = create_array(2u64.pow(12) as usize, false); - c.bench_function("sort 2^12 limit 1000", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(1000))) - }); - - let arr_a = create_array(2u64.pow(12) as usize, false); - let arr_b = create_array(2u64.pow(12) as usize, false); - c.bench_function("sort 2^12 limit 2^12", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(2u64.pow(12) as usize))) - }); - - let arr_a = create_array(2u64.pow(12) as usize, true); - let arr_b = create_array(2u64.pow(12) as usize, true); - - c.bench_function("sort nulls 2^12 limit 10", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(10))) - }); - c.bench_function("sort nulls 2^12 limit 100", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(100))) - }); - c.bench_function("sort nulls 2^12 limit 1000", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(1000))) - }); - c.bench_function("sort nulls 2^12 limit 2^12", |b| { - b.iter(|| bench_sort(&arr_a, &arr_b, Some(2u64.pow(12) as usize))) - }); - } -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/benches/take_kernels.rs b/rust/arrow/benches/take_kernels.rs deleted file mode 100644 index 2853eb5d476..00000000000 --- a/rust/arrow/benches/take_kernels.rs +++ /dev/null @@ -1,128 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[macro_use] -extern crate criterion; -use criterion::Criterion; - -use rand::Rng; - -extern crate arrow; - -use arrow::compute::take; -use arrow::datatypes::*; -use arrow::util::test_util::seedable_rng; -use arrow::{array::*, util::bench_util::*}; - -fn create_random_index(size: usize, null_density: f32) -> UInt32Array { - let mut rng = seedable_rng(); - let mut builder = UInt32Builder::new(size); - for _ in 0..size { - if rng.gen::() < null_density { - builder.append_null().unwrap() - } else { - let value = rng.gen_range::(0u32, size as u32); - builder.append_value(value).unwrap(); - } - } - builder.finish() -} - -fn bench_take(values: &dyn Array, indices: &UInt32Array) { - criterion::black_box(take(values, &indices, None).unwrap()); -} - -fn add_benchmark(c: &mut Criterion) { - let values = create_primitive_array::(512, 0.0); - let indices = create_random_index(512, 0.0); - c.bench_function("take i32 512", |b| b.iter(|| bench_take(&values, &indices))); - let values = create_primitive_array::(1024, 0.0); - let indices = create_random_index(1024, 0.0); - c.bench_function("take i32 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - - let indices = create_random_index(512, 0.5); - c.bench_function("take i32 nulls 512", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - let values = create_primitive_array::(1024, 0.0); - let indices = create_random_index(1024, 0.5); - c.bench_function("take i32 nulls 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - - let values = create_boolean_array(512, 0.0, 0.5); - let indices = create_random_index(512, 0.0); - c.bench_function("take bool 512", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - let values = create_boolean_array(1024, 0.0, 0.5); - let indices = create_random_index(1024, 0.0); - c.bench_function("take bool 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - - let values = create_boolean_array(512, 0.0, 0.5); - let indices = create_random_index(512, 0.5); - c.bench_function("take bool nulls 512", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - let values = create_boolean_array(1024, 0.0, 0.5); - let indices = create_random_index(1024, 0.5); - c.bench_function("take bool nulls 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - - let values = create_string_array::(512, 0.0); - let indices = create_random_index(512, 0.0); - c.bench_function("take str 512", |b| b.iter(|| bench_take(&values, &indices))); - - let values = create_string_array::(1024, 0.0); - let indices = create_random_index(1024, 0.0); - c.bench_function("take str 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - - let values = create_string_array::(512, 0.0); - let indices = create_random_index(512, 0.5); - c.bench_function("take str null indices 512", |b| { - b.iter(|| bench_take(&values, &indices)) 
- }); - - let values = create_string_array::<i32>(1024, 0.0); - let indices = create_random_index(1024, 0.5); - c.bench_function("take str null indices 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - - let values = create_string_array::<i32>(1024, 0.5); - - let indices = create_random_index(1024, 0.0); - c.bench_function("take str null values 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); - - let values = create_string_array::<i32>(1024, 0.5); - let indices = create_random_index(1024, 0.5); - c.bench_function("take str null values null indices 1024", |b| { - b.iter(|| bench_take(&values, &indices)) - }); -} - -criterion_group!(benches, add_benchmark); -criterion_main!(benches); diff --git a/rust/arrow/build.rs b/rust/arrow/build.rs deleted file mode 100644 index 2e3a711533c..00000000000 --- a/rust/arrow/build.rs +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use cfg_aliases::cfg_aliases; - -fn main() { - println!("cargo:rerun-if-changed=build.rs"); - // Setup cfg aliases - cfg_aliases! { - simd: { all(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"), feature = "simd") }, - } -} diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs deleted file mode 100644 index 61cce0ed97a..00000000000 --- a/rust/arrow/examples/builders.rs +++ /dev/null @@ -1,131 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -///!
Many builders are available to easily create different types of arrow arrays -extern crate arrow; - -use std::sync::Arc; - -use arrow::array::{ - Array, ArrayData, BooleanArray, Int32Array, Int32Builder, ListArray, PrimitiveArray, - StringArray, StructArray, -}; -use arrow::buffer::Buffer; -use arrow::datatypes::{DataType, Date64Type, Field, Time64NanosecondType, ToByteSlice}; - -fn main() { - // Primitive Arrays - // - // Primitive arrays are arrays of fixed-width primitive types (bool, u8, u16, u32, - // u64, i8, i16, i32, i64, f32, f64) - - // Create a new builder with a capacity of 100 - let mut primitive_array_builder = Int32Builder::new(100); - - // Append an individual primitive value - primitive_array_builder.append_value(55).unwrap(); - - // Append a null value - primitive_array_builder.append_null().unwrap(); - - // Append a slice of primitive values - primitive_array_builder.append_slice(&[39, 89, 12]).unwrap(); - - // Append lots of values - primitive_array_builder.append_null().unwrap(); - primitive_array_builder - .append_slice(&(25..50).collect::>()) - .unwrap(); - - // Build the `PrimitiveArray` - let primitive_array = primitive_array_builder.finish(); - // Long arrays will have an ellipsis printed in the middle - println!("{:?}", primitive_array); - - // Arrays can also be built from `Vec>`. `None` - // represents a null value in the array. - let date_array: PrimitiveArray = - vec![Some(1550902545147), None, Some(1550902545147)].into(); - println!("{:?}", date_array); - - let time_array: PrimitiveArray = - (0..100).collect::>().into(); - println!("{:?}", time_array); - - // We can build arrays directly from the underlying buffers. - - // BinaryArrays are arrays of byte arrays, where each byte array - // is a slice of an underlying buffer. - - // Array data: ["hello", null, "parquet"] - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let offsets: [i32; 4] = [0, 5, 5, 12]; - - let array_data = ArrayData::builder(DataType::Utf8) - .len(3) - .add_buffer(Buffer::from(offsets.to_byte_slice())) - .add_buffer(Buffer::from(&values[..])) - .null_bit_buffer(Buffer::from([0b00000101])) - .build(); - let binary_array = StringArray::from(array_data); - println!("{:?}", binary_array); - - // ListArrays are similar to ByteArrays: they are arrays of other - // arrays, where each child array is a slice of the underlying - // buffer. - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - let list_array = ListArray::from(list_data); - - println!("{:?}", list_array); - - // StructArrays are arrays of tuples, where each tuple element is - // from a child array. (In other words, they're like zipping - // multiple columns into one and giving each subcolumn a label.) - - // StructArrays can be constructed using the StructArray::from - // helper, which takes the underlying arrays and field types. 
- let struct_array = StructArray::from(vec![ - ( - Field::new("b", DataType::Boolean, false), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, - ), - ( - Field::new("c", DataType::Int32, false), - Arc::new(Int32Array::from(vec![42, 28, 19, 31])), - ), - ]); - println!("{:?}", struct_array); -} diff --git a/rust/arrow/examples/dynamic_types.rs b/rust/arrow/examples/dynamic_types.rs deleted file mode 100644 index 58e41560e23..00000000000 --- a/rust/arrow/examples/dynamic_types.rs +++ /dev/null @@ -1,101 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -///! This example demonstrates dealing with mixed types dynamically at runtime -use std::sync::Arc; - -extern crate arrow; - -use arrow::array::*; -use arrow::datatypes::*; -use arrow::error::Result; -use arrow::record_batch::*; - -fn main() -> Result<()> { - // define schema - let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new( - "nested", - DataType::Struct(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Float64, false), - Field::new("c", DataType::Float64, false), - ]), - false, - ), - ]); - - // create some data - let id = Int32Array::from(vec![1, 2, 3, 4, 5]); - - let nested = StructArray::from(vec![ - ( - Field::new("a", DataType::Utf8, false), - Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])) as Arc, - ), - ( - Field::new("b", DataType::Float64, false), - Arc::new(Float64Array::from(vec![1.1, 2.2, 3.3, 4.4, 5.5])), - ), - ( - Field::new("c", DataType::Float64, false), - Arc::new(Float64Array::from(vec![2.2, 3.3, 4.4, 5.5, 6.6])), - ), - ]); - - // build a record batch - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id), Arc::new(nested)])?; - - process(&batch); - Ok(()) -} - -/// Create a new batch by performing a projection of id, nested.c -fn process(batch: &RecordBatch) { - let id = batch.column(0); - let nested = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - - let _nested_b = nested - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - let nested_c: &Float64Array = nested - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - - let projected_schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("sum", DataType::Float64, false), - ]); - - let _ = RecordBatch::try_new( - Arc::new(projected_schema), - vec![ - id.clone(), // NOTE: this is cloning the Arc not the array data - Arc::new(Float64Array::from(nested_c.data().clone())), - ], - ); -} diff --git a/rust/arrow/examples/read_csv.rs b/rust/arrow/examples/read_csv.rs deleted file mode 100644 index 9e2b9c34c86..00000000000 --- a/rust/arrow/examples/read_csv.rs +++ /dev/null @@ -1,43 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under 
one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate arrow; - -use std::fs::File; -use std::sync::Arc; - -use arrow::csv; -use arrow::datatypes::{DataType, Field, Schema}; -#[cfg(feature = "prettyprint")] -use arrow::util::pretty::print_batches; - -fn main() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = csv::Reader::new(file, Arc::new(schema), false, None, 1024, None, None); - let _batch = csv.next().unwrap().unwrap(); - #[cfg(feature = "prettyprint")] - { - print_batches(&[_batch]).unwrap(); - } -} diff --git a/rust/arrow/examples/read_csv_infer_schema.rs b/rust/arrow/examples/read_csv_infer_schema.rs deleted file mode 100644 index 93253e72cff..00000000000 --- a/rust/arrow/examples/read_csv_infer_schema.rs +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate arrow; - -use arrow::csv; -#[cfg(feature = "prettyprint")] -use arrow::util::pretty::print_batches; -use std::fs::File; - -fn main() { - let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - let builder = csv::ReaderBuilder::new() - .has_header(true) - .infer_schema(Some(100)); - let mut csv = builder.build(file).unwrap(); - let _batch = csv.next().unwrap().unwrap(); - #[cfg(feature = "prettyprint")] - { - print_batches(&[_batch]).unwrap(); - } -} diff --git a/rust/arrow/examples/tensor_builder.rs b/rust/arrow/examples/tensor_builder.rs deleted file mode 100644 index 1ef53920e04..00000000000 --- a/rust/arrow/examples/tensor_builder.rs +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -///! Tensor builder example -extern crate arrow; - -use arrow::array::*; //{Int32BufferBuilder, Float32BufferBuilder}; -use arrow::buffer::Buffer; -use arrow::datatypes::ToByteSlice; -use arrow::error::Result; -use arrow::tensor::{Float32Tensor, Int32Tensor}; - -fn main() -> Result<()> { - // Building a tensor using the buffer builder for Int32 - // The buffer builder will pad the appended numbers - // to match the required size for each buffer - let mut builder = Int32BufferBuilder::new(16); - for i in 0..16 { - builder.append(i); - } - let buf = builder.finish(); - - // When building a tensor the buffer and shape are required - // The new function will estimate the expected stride for the - // storage data - let tensor = Int32Tensor::try_new(buf, Some(vec![2, 8]), None, None)?; - println!("Int32 Tensor"); - println!("{:?}", tensor); - - // Creating a tensor using float type buffer builder - let mut builder = Float32BufferBuilder::new(4); - builder.append(1.0); - builder.append(2.0); - builder.append(3.0); - builder.append(4.0); - let buf = builder.finish(); - - // When building the tensor the buffer and shape are necessary - // The new function will estimate the expected stride for the - // storage data - let tensor = Float32Tensor::try_new(buf, Some(vec![2, 2]), None, None)?; - println!("\nFloat32 Tensor"); - println!("{:?}", tensor); - - // In order to build a tensor from an array the function to_byte_slice add the - // required padding to the elements in the array. - let buf = Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7, 9, 10].to_byte_slice()); - let tensor = Int32Tensor::try_new(buf, Some(vec![2, 5]), None, None)?; - println!("\nInt32 Tensor"); - println!("{:?}", tensor); - - Ok(()) -} diff --git a/rust/arrow/format-0ed34c83.patch b/rust/arrow/format-0ed34c83.patch deleted file mode 100644 index 5da0a0c51f0..00000000000 --- a/rust/arrow/format-0ed34c83.patch +++ /dev/null @@ -1,220 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
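// Illustrative sketch (not from the removed example): the Tensor constructors used in
// tensor_builder above compute row-major strides when `strides` is None. The helper
// below is hypothetical, for illustration only, and states the calculation in byte
// units (element stride times element size); the real crate keeps this logic internal.
fn row_major_byte_strides(shape: &[usize], elem_size: usize) -> Vec<usize> {
    // The innermost dimension advances by one element; each outer dimension advances
    // by the full extent of everything nested inside it.
    let mut strides = vec![elem_size; shape.len()];
    for i in (0..shape.len().saturating_sub(1)).rev() {
        strides[i] = strides[i + 1] * shape[i + 1];
    }
    strides
}
// For the Int32 tensor of shape [2, 8] built above this yields [32, 4]: rows are
// 32 bytes apart and elements within a row are 4 bytes apart.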
- -diff --git a/format/Message.fbs b/format/Message.fbs -index 1a7e0dfff..f1c18d765 100644 ---- a/format/Message.fbs -+++ b/format/Message.fbs -@@ -28,7 +28,7 @@ namespace org.apache.arrow.flatbuf; - /// Metadata about a field at some level of a nested type tree (but not - /// its children). - /// --/// For example, a List with values [[1, 2, 3], null, [4], [5, 6], null] -+/// For example, a List with values `[[1, 2, 3], null, [4], [5, 6], null]` - /// would have {length: 5, null_count: 2} for its List node, and {length: 6, - /// null_count: 0} for its Int16 node, as separate FieldNode structs - struct FieldNode { -diff --git a/format/Schema.fbs b/format/Schema.fbs -index 3b37e5d85..3b00dd478 100644 ---- a/format/Schema.fbs -+++ b/format/Schema.fbs -@@ -110,10 +110,11 @@ table FixedSizeList { - /// not enforced. - /// - /// Map -+/// ```text - /// - child[0] entries: Struct - /// - child[0] key: K - /// - child[1] value: V --/// -+/// ``` - /// Neither the "entries" field nor the "key" field may be nullable. - /// - /// The metadata is structured so that Arrow systems without special handling -@@ -129,7 +130,7 @@ enum UnionMode:short { Sparse, Dense } - /// A union is a complex type with children in Field - /// By default ids in the type vector refer to the offsets in the children - /// optionally typeIds provides an indirection between the child offset and the type id --/// for each child typeIds[offset] is the id used in the type vector -+/// for each child `typeIds[offset]` is the id used in the type vector - table Union { - mode: UnionMode; - typeIds: [ int ]; // optional, describes typeid of each child. -diff --git a/format/SparseTensor.fbs b/format/SparseTensor.fbs -index 3fe8a7582..a6fd2f9e7 100644 ---- a/format/SparseTensor.fbs -+++ b/format/SparseTensor.fbs -@@ -37,21 +37,21 @@ namespace org.apache.arrow.flatbuf; - /// - /// For example, let X be a 2x3x4x5 tensor, and it has the following - /// 6 non-zero values: --/// -+/// ```text - /// X[0, 1, 2, 0] := 1 - /// X[1, 1, 2, 3] := 2 - /// X[0, 2, 1, 0] := 3 - /// X[0, 1, 3, 0] := 4 - /// X[0, 1, 2, 1] := 5 - /// X[1, 2, 0, 4] := 6 --/// -+/// ``` - /// In COO format, the index matrix of X is the following 4x6 matrix: --/// -+/// ```text - /// [[0, 0, 0, 0, 1, 1], - /// [1, 1, 1, 2, 1, 2], - /// [2, 2, 3, 1, 2, 0], - /// [0, 1, 0, 0, 3, 4]] --/// -+/// ``` - /// When isCanonical is true, the indices is sorted in lexicographical order - /// (row-major order), and it does not have duplicated entries. Otherwise, - /// the indices may not be sorted, or may have duplicated entries. -@@ -86,26 +86,27 @@ table SparseMatrixIndexCSX { - - /// indptrBuffer stores the location and size of indptr array that - /// represents the range of the rows. -- /// The i-th row spans from indptr[i] to indptr[i+1] in the data. -+ /// The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. - /// The length of this array is 1 + (the number of rows), and the type - /// of index value is long. - /// - /// For example, let X be the following 6x4 matrix: -- /// -+ /// ```text - /// X := [[0, 1, 2, 0], - /// [0, 0, 3, 0], - /// [0, 4, 0, 5], - /// [0, 0, 0, 0], - /// [6, 0, 7, 8], - /// [0, 9, 0, 0]]. -- /// -+ /// ``` - /// The array of non-zero values in X is: -- /// -+ /// ```text - /// values(X) = [1, 2, 3, 4, 5, 6, 7, 8, 9]. -- /// -+ /// ``` - /// And the indptr of X is: -- /// -+ /// ```text - /// indptr(X) = [0, 2, 3, 5, 5, 8, 10]. 
-+ /// ``` - indptrBuffer: Buffer (required); - - /// The type of values in indicesBuffer -@@ -116,9 +117,9 @@ table SparseMatrixIndexCSX { - /// The type of index value is long. - /// - /// For example, the indices of the above X is: -- /// -+ /// ```text - /// indices(X) = [1, 2, 2, 1, 3, 0, 2, 3, 1]. -- /// -+ /// ``` - /// Note that the indices are sorted in lexicographical order for each row. - indicesBuffer: Buffer (required); - } -@@ -126,7 +127,7 @@ table SparseMatrixIndexCSX { - /// Compressed Sparse Fiber (CSF) sparse tensor index. - table SparseTensorIndexCSF { - /// CSF is a generalization of compressed sparse row (CSR) index. -- /// See [smith2017knl]: http://shaden.io/pub-files/smith2017knl.pdf -+ /// See [smith2017knl](http://shaden.io/pub-files/smith2017knl.pdf) - /// - /// CSF index recursively compresses each dimension of a tensor into a set - /// of prefix trees. Each path from a root to leaf forms one tensor -@@ -135,7 +136,7 @@ table SparseTensorIndexCSF { - /// - /// For example, let X be a 2x3x4x5 tensor and let it have the following - /// 8 non-zero values: -- /// -+ /// ```text - /// X[0, 0, 0, 1] := 1 - /// X[0, 0, 0, 2] := 2 - /// X[0, 1, 0, 0] := 3 -@@ -144,9 +145,9 @@ table SparseTensorIndexCSF { - /// X[1, 1, 1, 0] := 6 - /// X[1, 1, 1, 1] := 7 - /// X[1, 1, 1, 2] := 8 -- /// -+ /// ``` - /// As a prefix tree this would be represented as: -- /// -+ /// ```text - /// 0 1 - /// / \ | - /// 0 1 1 -@@ -154,24 +155,24 @@ table SparseTensorIndexCSF { - /// 0 0 1 1 - /// /| /| | /| | - /// 1 2 0 2 0 0 1 2 -- -+ /// ``` - /// The type of values in indptrBuffers - indptrType: Int (required); - - /// indptrBuffers stores the sparsity structure. - /// Each two consecutive dimensions in a tensor correspond to a buffer in -- /// indptrBuffers. A pair of consecutive values at indptrBuffers[dim][i] -- /// and indptrBuffers[dim][i + 1] signify a range of nodes in -- /// indicesBuffers[dim + 1] who are children of indicesBuffers[dim][i] node. -+ /// indptrBuffers. A pair of consecutive values at `indptrBuffers[dim][i]` -+ /// and `indptrBuffers[dim][i + 1]` signify a range of nodes in -+ /// `indicesBuffers[dim + 1]` who are children of `indicesBuffers[dim][i]` node. - /// - /// For example, the indptrBuffers for the above X is: -- /// -+ /// ```text - /// indptrBuffer(X) = [ - /// [0, 2, 3], - /// [0, 1, 3, 4], - /// [0, 2, 4, 5, 8] - /// ]. -- /// -+ /// ``` - indptrBuffers: [Buffer] (required); - - /// The type of values in indicesBuffers -@@ -180,22 +181,22 @@ table SparseTensorIndexCSF { - /// indicesBuffers stores values of nodes. - /// Each tensor dimension corresponds to a buffer in indicesBuffers. - /// For example, the indicesBuffers for the above X is: -- /// -+ /// ```text - /// indicesBuffer(X) = [ - /// [0, 1], - /// [0, 1, 1], - /// [0, 0, 1, 1], - /// [1, 2, 0, 2, 0, 0, 1, 2] - /// ]. -- /// -+ /// ``` - indicesBuffers: [Buffer] (required); - - /// axisOrder stores the sequence in which dimensions were traversed to - /// produce the prefix tree. - /// For example, the axisOrder for the above X is: -- /// -+ /// ```text - /// axisOrder(X) = [0, 1, 2, 3]. -- /// -+ /// ``` - axisOrder: [int] (required); - } - diff --git a/rust/arrow/regen.sh b/rust/arrow/regen.sh deleted file mode 100755 index 9d384b6b63b..00000000000 --- a/rust/arrow/regen.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/bin/bash -e -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -# Change to the toplevel Rust directory -pushd $DIR/../../ - -echo "Build flatc from source ..." - -FB_URL="https://github.com/google/flatbuffers" -# https://github.com/google/flatbuffers/pull/6393 -FB_COMMIT="408cf5802415e1dea65fef7489a6c2f3740fb381" -FB_DIR="rust/arrow/.flatbuffers" -FLATC="$FB_DIR/bazel-bin/flatc" - -if [ -z $(which bazel) ]; then - echo "bazel is required to build flatc" - exit 1 -fi - -echo "Bazel version: $(bazel version | head -1 | awk -F':' '{print $2}')" - -if [ ! -e $FB_DIR ]; then - echo "git clone $FB_URL ..." - git clone -b master --no-tag --depth 1 $FB_URL $FB_DIR -else - echo "git pull $FB_URL ..." - git -C $FB_DIR pull -fi - -echo "hard reset to $FB_COMMIT" -git -C $FB_DIR reset --hard $FB_COMMIT - -pushd $FB_DIR -echo "run: bazel build :flatc ..." -bazel build :flatc -popd - -FB_PATCH="rust/arrow/format-0ed34c83.patch" -echo "Patch flatbuffer files with ${FB_PATCH} for cargo doc" -echo "NOTE: the patch MAY need update in case of changes in format/*.fbs" -git apply --check ${FB_PATCH} && git apply ${FB_PATCH} - -# Execute the code generation: -$FLATC --filename-suffix "" --rust -o rust/arrow/src/ipc/gen/ format/*.fbs - -# Reset changes to format/ -git checkout -- format - -# Now the files are wrongly named so we have to change that. -popd -pushd $DIR/src/ipc/gen - -PREFIX=$(cat <<'HEREDOC' -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#![allow(dead_code)] -#![allow(unused_imports)] - -use std::{cmp::Ordering, mem}; -use flatbuffers::EndianScalar; - -HEREDOC -) - -SCHEMA_IMPORT="\nuse crate::ipc::gen::Schema::*;" -SPARSE_TENSOR_IMPORT="\nuse crate::ipc::gen::SparseTensor::*;" -TENSOR_IMPORT="\nuse crate::ipc::gen::Tensor::*;" - -# For flatbuffer(1.12.0+), remove: use crate::${name}::\*; -names=("File" "Message" "Schema" "SparseTensor" "Tensor") - -# Remove all generated lines we don't need -for f in `ls *.rs`; do - if [[ $f == "mod.rs" ]]; then - continue - fi - - echo "Modifying: $f" - sed -i '' '/extern crate flatbuffers;/d' $f - sed -i '' '/use self::flatbuffers::EndianScalar;/d' $f - sed -i '' '/\#\[allow(unused_imports, dead_code)\]/d' $f - sed -i '' '/pub mod org {/d' $f - sed -i '' '/pub mod apache {/d' $f - sed -i '' '/pub mod arrow {/d' $f - sed -i '' '/pub mod flatbuf {/d' $f - sed -i '' '/} \/\/ pub mod flatbuf/d' $f - sed -i '' '/} \/\/ pub mod arrow/d' $f - sed -i '' '/} \/\/ pub mod apache/d' $f - sed -i '' '/} \/\/ pub mod org/d' $f - sed -i '' '/use std::mem;/d' $f - sed -i '' '/use std::cmp::Ordering;/d' $f - - # required by flatc 1.12.0+ - sed -i '' "/\#\!\[allow(unused_imports, dead_code)\]/d" $f - for name in ${names[@]}; do - sed -i '' "/use crate::${name}::\*;/d" $f - sed -i '' "s/use self::flatbuffers::Verifiable;/use flatbuffers::Verifiable;/g" $f - done - - # Replace all occurrences of "type__" with "type_", "TYPE__" with "TYPE_". - sed -i '' 's/type__/type_/g' $f - sed -i '' 's/TYPE__/TYPE_/g' $f - - # Some files need prefixes - if [[ $f == "File.rs" ]]; then - # Now prefix the file with the static contents - echo -e "${PREFIX}" "${SCHEMA_IMPORT}" | cat - $f > temp && mv temp $f - elif [[ $f == "Message.rs" ]]; then - echo -e "${PREFIX}" "${SCHEMA_IMPORT}" "${SPARSE_TENSOR_IMPORT}" "${TENSOR_IMPORT}" | cat - $f > temp && mv temp $f - elif [[ $f == "SparseTensor.rs" ]]; then - echo -e "${PREFIX}" "${SCHEMA_IMPORT}" "${TENSOR_IMPORT}" | cat - $f > temp && mv temp $f - elif [[ $f == "Tensor.rs" ]]; then - echo -e "${PREFIX}" "${SCHEMA_IMPORT}" | cat - $f > temp && mv temp $f - else - echo "${PREFIX}" | cat - $f > temp && mv temp $f - fi -done - -# Return back to base directory -popd -cargo +stable fmt -- src/ipc/gen/* - -echo "DONE!" -echo "Please run 'cargo doc' and 'cargo test' with nightly and stable, " -echo "and fix possible errors or warnings!" diff --git a/rust/arrow/src/alloc/alignment.rs b/rust/arrow/src/alloc/alignment.rs deleted file mode 100644 index dbf4602f83a..00000000000 --- a/rust/arrow/src/alloc/alignment.rs +++ /dev/null @@ -1,119 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// NOTE: Below code is written for spatial/temporal prefetcher optimizations. 
Memory allocation -// should align well with usage pattern of cache access and block sizes on layers of storage levels from -// registers to non-volatile memory. These alignments are all cache aware alignments incorporated -// from [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimicks Intel TBB's -// cache_aligned_allocator which exploits cache locality and minimizes prefetch signals -// resulting in less round trip time between the layers of storage. -// For further info: https://software.intel.com/en-us/node/506094 - -// 32-bit architecture and things other than netburst microarchitecture are using 64 bytes. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "x86")] -pub const ALIGNMENT: usize = 1 << 6; - -// Intel x86_64: -// L2D streamer from L1: -// Loads data or instructions from memory to the second-level cache. To use the streamer, -// organize the data or instructions in blocks of 128 bytes, aligned on 128 bytes. -// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "x86_64")] -pub const ALIGNMENT: usize = 1 << 7; - -// 24Kc: -// Data Line Size -// - https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00346-2B-24K-DTS-04.00.pdf -// - https://gitlab.e.foundation/e/devices/samsung/n7100/stable_android_kernel_samsung_smdk4412/commit/2dbac10263b2f3c561de68b4c369bc679352ccee -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "mips")] -pub const ALIGNMENT: usize = 1 << 5; -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "mips64")] -pub const ALIGNMENT: usize = 1 << 5; - -// Defaults for powerpc -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "powerpc")] -pub const ALIGNMENT: usize = 1 << 5; - -// Defaults for the ppc 64 -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "powerpc64")] -pub const ALIGNMENT: usize = 1 << 6; - -// e.g.: sifive -// - https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/riscv/sifive-l2-cache.txt#L41 -// in general all of them are the same. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "riscv")] -pub const ALIGNMENT: usize = 1 << 6; - -// This size is same across all hardware for this architecture. -// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2s390_2include_2asm_2cache_8h.html -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "s390x")] -pub const ALIGNMENT: usize = 1 << 8; - -// This size is same across all hardware for this architecture. -// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2sparc_2include_2asm_2cache_8h.html#a9400cc2ba37e33279bdbc510a6311fb4 -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "sparc")] -pub const ALIGNMENT: usize = 1 << 5; -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "sparc64")] -pub const ALIGNMENT: usize = 1 << 6; - -// On ARM cache line sizes are fixed. both v6 and v7. -// Need to add board specific or platform specific things later. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "thumbv6")] -pub const ALIGNMENT: usize = 1 << 5; -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "thumbv7")] -pub const ALIGNMENT: usize = 1 << 5; - -// Operating Systems cache size determines this. -// Currently no way to determine this without runtime inference. 
-/// Cache and allocation multiple alignment size -#[cfg(target_arch = "wasm32")] -pub const ALIGNMENT: usize = 1 << 6; - -// Same as v6 and v7. -// List goes like that: -// Cortex A, M, R, ARM v7, v7-M, Krait and NeoverseN uses this size. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "arm")] -pub const ALIGNMENT: usize = 1 << 5; - -// Combined from 4 sectors. Volta says 128. -// Prevent chunk optimizations better to go to the default size. -// If you have smaller data with less padded functionality then use 32 with force option. -// - https://devtalk.nvidia.com/default/topic/803600/variable-cache-line-width-/ -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "nvptx")] -pub const ALIGNMENT: usize = 1 << 7; -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "nvptx64")] -pub const ALIGNMENT: usize = 1 << 7; - -// This size is same across all hardware for this architecture. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "aarch64")] -pub const ALIGNMENT: usize = 1 << 6; diff --git a/rust/arrow/src/alloc/mod.rs b/rust/arrow/src/alloc/mod.rs deleted file mode 100644 index a225d32dd82..00000000000 --- a/rust/arrow/src/alloc/mod.rs +++ /dev/null @@ -1,136 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines memory-related functions, such as allocate/deallocate/reallocate memory -//! regions, cache and allocation alignments. - -use std::mem::size_of; -use std::ptr::NonNull; -use std::{ - alloc::{handle_alloc_error, Layout}, - sync::atomic::AtomicIsize, -}; - -mod alignment; -mod types; - -pub use alignment::ALIGNMENT; -pub use types::NativeType; - -// If this number is not zero after all objects have been `drop`, there is a memory leak -pub static mut ALLOCATIONS: AtomicIsize = AtomicIsize::new(0); - -#[inline] -unsafe fn null_pointer() -> NonNull { - NonNull::new_unchecked(ALIGNMENT as *mut T) -} - -/// Allocates a cache-aligned memory region of `size` bytes with uninitialized values. -/// This is more performant than using [allocate_aligned_zeroed] when all bytes will have -/// an unknown or non-zero value and is semantically similar to `malloc`. -pub fn allocate_aligned(size: usize) -> NonNull { - unsafe { - if size == 0 { - null_pointer() - } else { - let size = size * size_of::(); - ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst); - - let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); - let raw_ptr = std::alloc::alloc(layout) as *mut T; - NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) - } - } -} - -/// Allocates a cache-aligned memory region of `size` bytes with `0` on all of them. 
-/// This is more performant than using [allocate_aligned] and setting all bytes to zero -/// and is semantically similar to `calloc`. -pub fn allocate_aligned_zeroed(size: usize) -> NonNull { - unsafe { - if size == 0 { - null_pointer() - } else { - let size = size * size_of::(); - ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst); - - let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); - let raw_ptr = std::alloc::alloc_zeroed(layout) as *mut T; - NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) - } - } -} - -/// # Safety -/// -/// This function is unsafe because undefined behavior can result if the caller does not ensure all -/// of the following: -/// -/// * ptr must denote a block of memory currently allocated via this allocator, -/// -/// * size must be the same size that was used to allocate that block of memory, -pub unsafe fn free_aligned(ptr: NonNull, size: usize) { - if ptr != null_pointer() { - let size = size * size_of::(); - ALLOCATIONS.fetch_sub(size as isize, std::sync::atomic::Ordering::SeqCst); - std::alloc::dealloc( - ptr.as_ptr() as *mut u8, - Layout::from_size_align_unchecked(size, ALIGNMENT), - ); - } -} - -/// # Safety -/// -/// This function is unsafe because undefined behavior can result if the caller does not ensure all -/// of the following: -/// -/// * ptr must be currently allocated via this allocator, -/// -/// * new_size must be greater than zero. -/// -/// * new_size, when rounded up to the nearest multiple of [ALIGNMENT], must not overflow (i.e., -/// the rounded value must be less than usize::MAX). -pub unsafe fn reallocate( - ptr: NonNull, - old_size: usize, - new_size: usize, -) -> NonNull { - let old_size = old_size * size_of::(); - let new_size = new_size * size_of::(); - if ptr == null_pointer() { - return allocate_aligned(new_size); - } - - if new_size == 0 { - free_aligned(ptr, old_size); - return null_pointer(); - } - - ALLOCATIONS.fetch_add( - new_size as isize - old_size as isize, - std::sync::atomic::Ordering::SeqCst, - ); - let raw_ptr = std::alloc::realloc( - ptr.as_ptr() as *mut u8, - Layout::from_size_align_unchecked(old_size, ALIGNMENT), - new_size, - ) as *mut T; - NonNull::new(raw_ptr).unwrap_or_else(|| { - handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT)) - }) -} diff --git a/rust/arrow/src/alloc/types.rs b/rust/arrow/src/alloc/types.rs deleted file mode 100644 index c1f0ef99580..00000000000 --- a/rust/arrow/src/alloc/types.rs +++ /dev/null @@ -1,71 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::datatypes::DataType; - -/// A type that Rust's custom allocator knows how to allocate and deallocate. 
-/// This is implemented for all Arrow's physical types whose in-memory representation -/// matches Rust's physical types. Consider this trait sealed. -/// # Safety -/// Do not implement this trait. -pub unsafe trait NativeType: - Sized + Copy + std::fmt::Debug + std::fmt::Display + PartialEq + Default + Sized + 'static -{ - type Bytes: AsRef<[u8]>; - - /// Whether a DataType is a valid type for this physical representation. - fn is_valid(data_type: &DataType) -> bool; - - /// How this type represents itself as bytes in little endianess. - /// This is used for IPC, where data is communicated with a specific endianess. - fn to_le_bytes(&self) -> Self::Bytes; -} - -macro_rules! create_native { - ($native_ty:ty,$($impl_pattern:pat)|+) => { - unsafe impl NativeType for $native_ty { - type Bytes = [u8; std::mem::size_of::()]; - - #[inline] - fn to_le_bytes(&self) -> Self::Bytes { - Self::to_le_bytes(*self) - } - - #[inline] - fn is_valid(data_type: &DataType) -> bool { - matches!(data_type, $($impl_pattern)|+) - } - } - }; -} - -create_native!(u8, DataType::UInt8); -create_native!(u16, DataType::UInt16); -create_native!(u32, DataType::UInt32); -create_native!(u64, DataType::UInt64); -create_native!(i8, DataType::Int8); -create_native!(i16, DataType::Int16); -create_native!( - i32, - DataType::Int32 | DataType::Date32 | DataType::Time32(_) -); -create_native!( - i64, - DataType::Int64 | DataType::Date64 | DataType::Time64(_) | DataType::Timestamp(_, _) -); -create_native!(f32, DataType::Float32); -create_native!(f64, DataType::Float64); diff --git a/rust/arrow/src/arch/avx512.rs b/rust/arrow/src/arch/avx512.rs deleted file mode 100644 index 264532f3594..00000000000 --- a/rust/arrow/src/arch/avx512.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
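// Illustrative sketch (not from the removed module): how the allocator above is meant
// to be used. Allocations are sized in elements of a NativeType, come back aligned to
// the cache-aware ALIGNMENT constant, and must be freed with the same element count.
// Assumes `arrow::alloc` was publicly re-exported (as in the 4.x releases) and that
// the generic parameters stripped from the text above were `<T: NativeType>`.
use arrow::alloc::{allocate_aligned_zeroed, free_aligned, ALIGNMENT};

fn alloc_usage() {
    let len = 1024usize;
    // Zero-initialised, cache-aligned region able to hold 1024 i32 values.
    let ptr = allocate_aligned_zeroed::<i32>(len);
    assert_eq!(ptr.as_ptr() as usize % ALIGNMENT, 0);
    // Safety: the pointer came from this allocator and `len` matches the allocation.
    unsafe { free_aligned(ptr, len) };
}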
- -pub(crate) const AVX512_U8X64_LANES: usize = 64; - -#[target_feature(enable = "avx512f")] -pub(crate) unsafe fn avx512_bin_and(left: &[u8], right: &[u8], res: &mut [u8]) { - use core::arch::x86_64::{__m512i, _mm512_and_si512, _mm512_loadu_epi64}; - - let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _); - let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _); - let f = _mm512_and_si512(l, r); - let s = &f as *const __m512i as *const u8; - let d = res.get_unchecked_mut(0) as *mut _ as *mut u8; - std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>()); -} - -#[target_feature(enable = "avx512f")] -pub(crate) unsafe fn avx512_bin_or(left: &[u8], right: &[u8], res: &mut [u8]) { - use core::arch::x86_64::{__m512i, _mm512_loadu_epi64, _mm512_or_si512}; - - let l: __m512i = _mm512_loadu_epi64(left.as_ptr() as *const _); - let r: __m512i = _mm512_loadu_epi64(right.as_ptr() as *const _); - let f = _mm512_or_si512(l, r); - let s = &f as *const __m512i as *const u8; - let d = res.get_unchecked_mut(0) as *mut _ as *mut u8; - std::ptr::copy_nonoverlapping(s, d, std::mem::size_of::<__m512i>()); -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_bitwise_and_avx512() { - let buf1 = [0b00110011u8; 64]; - let buf2 = [0b11110000u8; 64]; - let mut buf3 = [0b00000000; 64]; - unsafe { - avx512_bin_and(&buf1, &buf2, &mut buf3); - }; - for i in buf3.iter() { - assert_eq!(&0b00110000u8, i); - } - } - - #[test] - fn test_bitwise_or_avx512() { - let buf1 = [0b00010011u8; 64]; - let buf2 = [0b11100000u8; 64]; - let mut buf3 = [0b00000000; 64]; - unsafe { - avx512_bin_or(&buf1, &buf2, &mut buf3); - }; - for i in buf3.iter() { - assert_eq!(&0b11110011u8, i); - } - } -} diff --git a/rust/arrow/src/arch/mod.rs b/rust/arrow/src/arch/mod.rs deleted file mode 100644 index 56d8f4c0e2c..00000000000 --- a/rust/arrow/src/arch/mod.rs +++ /dev/null @@ -1,22 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/// -/// Arch module contains architecture specific code. -/// Be aware that not all machines have these specific operations available. -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub(crate) mod avx512; diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs deleted file mode 100644 index 95a3117417e..00000000000 --- a/rust/arrow/src/array/array.rs +++ /dev/null @@ -1,640 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::fmt; -use std::sync::Arc; -use std::{any::Any, convert::TryFrom}; - -use super::*; -use crate::array::equal_json::JsonEqual; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::error::Result; -use crate::ffi; - -/// Trait for dealing with different types of array at runtime when the type of the -/// array is not known in advance. -pub trait Array: fmt::Debug + Send + Sync + JsonEqual { - /// Returns the array as [`Any`](std::any::Any) so that it can be - /// downcasted to a specific implementation. - /// - /// # Example: - /// - /// ``` - /// use std::sync::Arc; - /// use arrow::array::Int32Array; - /// use arrow::datatypes::{Schema, Field, DataType}; - /// use arrow::record_batch::RecordBatch; - /// - /// # fn main() -> arrow::error::Result<()> { - /// let id = Int32Array::from(vec![1, 2, 3, 4, 5]); - /// let batch = RecordBatch::try_new( - /// Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])), - /// vec![Arc::new(id)] - /// )?; - /// - /// let int32array = batch - /// .column(0) - /// .as_any() - /// .downcast_ref::() - /// .expect("Failed to downcast"); - /// # Ok(()) - /// # } - /// ``` - fn as_any(&self) -> &Any; - - /// Returns a reference to the underlying data of this array. - fn data(&self) -> &ArrayData; - - /// Returns a reference-counted pointer to the underlying data of this array. - fn data_ref(&self) -> &ArrayData { - self.data() - } - - /// Returns a reference to the [`DataType`](crate::datatypes::DataType) of this array. - /// - /// # Example: - /// - /// ``` - /// use arrow::datatypes::DataType; - /// use arrow::array::{Array, Int32Array}; - /// - /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); - /// - /// assert_eq!(*array.data_type(), DataType::Int32); - /// ``` - fn data_type(&self) -> &DataType { - self.data_ref().data_type() - } - - /// Returns a zero-copy slice of this array with the indicated offset and length. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::{Array, Int32Array}; - /// - /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); - /// // Make slice over the values [2, 3, 4] - /// let array_slice = array.slice(1, 3); - /// - /// assert_eq!(array_slice.as_ref(), &Int32Array::from(vec![2, 3, 4])); - /// ``` - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - make_array(self.data_ref().slice(offset, length)) - } - - /// Returns the length (i.e., number of elements) of this array. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::{Array, Int32Array}; - /// - /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); - /// - /// assert_eq!(array.len(), 5); - /// ``` - fn len(&self) -> usize { - self.data_ref().len() - } - - /// Returns whether this array is empty. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::{Array, Int32Array}; - /// - /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); - /// - /// assert_eq!(array.is_empty(), false); - /// ``` - fn is_empty(&self) -> bool { - self.data_ref().is_empty() - } - - /// Returns the offset into the underlying data used by this array(-slice). 
- /// Note that the underlying data can be shared by many arrays. - /// This defaults to `0`. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::{Array, Int32Array}; - /// - /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); - /// // Make slice over the values [2, 3, 4] - /// let array_slice = array.slice(1, 3); - /// - /// assert_eq!(array.offset(), 0); - /// assert_eq!(array_slice.offset(), 1); - /// ``` - fn offset(&self) -> usize { - self.data_ref().offset() - } - - /// Returns whether the element at `index` is null. - /// When using this function on a slice, the index is relative to the slice. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::{Array, Int32Array}; - /// - /// let array = Int32Array::from(vec![Some(1), None]); - /// - /// assert_eq!(array.is_null(0), false); - /// assert_eq!(array.is_null(1), true); - /// ``` - fn is_null(&self, index: usize) -> bool { - self.data_ref().is_null(index) - } - - /// Returns whether the element at `index` is not null. - /// When using this function on a slice, the index is relative to the slice. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::{Array, Int32Array}; - /// - /// let array = Int32Array::from(vec![Some(1), None]); - /// - /// assert_eq!(array.is_valid(0), true); - /// assert_eq!(array.is_valid(1), false); - /// ``` - fn is_valid(&self, index: usize) -> bool { - self.data_ref().is_valid(index) - } - - /// Returns the total number of null values in this array. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::{Array, Int32Array}; - /// - /// // Construct an array with values [1, NULL, NULL] - /// let array = Int32Array::from(vec![Some(1), None, None]); - /// - /// assert_eq!(array.null_count(), 2); - /// ``` - fn null_count(&self) -> usize { - self.data_ref().null_count() - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this array. - fn get_buffer_memory_size(&self) -> usize; - - /// Returns the total number of bytes of memory occupied physically by this array. - fn get_array_memory_size(&self) -> usize; - - /// returns two pointers that represent this array in the C Data Interface (FFI) - fn to_raw( - &self, - ) -> Result<(*const ffi::FFI_ArrowArray, *const ffi::FFI_ArrowSchema)> { - let data = self.data().clone(); - let array = ffi::ArrowArray::try_from(data)?; - Ok(ffi::ArrowArray::into_raw(array)) - } -} - -/// A reference-counted reference to a generic `Array`. -pub type ArrayRef = Arc; - -/// Constructs an array using the input `data`. -/// Returns a reference-counted `Array` instance. 
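Because `Array` is object safe and `ArrayRef` is a reference-counted `dyn Array`, columns of different types can be carried in one collection and downcast only at the point of use. A small sketch using just the trait methods shown above, assuming the `arrow` crate whose source this diff removes:

```rust
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Int32Array, StringArray};

fn main() {
    // Heterogeneous columns behind the dynamically typed `Array` trait.
    let columns: Vec<ArrayRef> = vec![
        Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as ArrayRef,
        Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef,
    ];
    for column in &columns {
        println!(
            "{:?}: len={} nulls={}",
            column.data_type(),
            column.len(),
            column.null_count()
        );
    }

    // Slicing is zero-copy: the data is shared and only an offset is recorded.
    let sliced = columns[0].slice(1, 2);
    assert_eq!(sliced.len(), 2);
    assert_eq!(sliced.offset(), 1);
}
```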
-pub fn make_array(data: ArrayData) -> ArrayRef { - match data.data_type() { - DataType::Boolean => Arc::new(BooleanArray::from(data)) as ArrayRef, - DataType::Int8 => Arc::new(Int8Array::from(data)) as ArrayRef, - DataType::Int16 => Arc::new(Int16Array::from(data)) as ArrayRef, - DataType::Int32 => Arc::new(Int32Array::from(data)) as ArrayRef, - DataType::Int64 => Arc::new(Int64Array::from(data)) as ArrayRef, - DataType::UInt8 => Arc::new(UInt8Array::from(data)) as ArrayRef, - DataType::UInt16 => Arc::new(UInt16Array::from(data)) as ArrayRef, - DataType::UInt32 => Arc::new(UInt32Array::from(data)) as ArrayRef, - DataType::UInt64 => Arc::new(UInt64Array::from(data)) as ArrayRef, - DataType::Float16 => panic!("Float16 datatype not supported"), - DataType::Float32 => Arc::new(Float32Array::from(data)) as ArrayRef, - DataType::Float64 => Arc::new(Float64Array::from(data)) as ArrayRef, - DataType::Date32 => Arc::new(Date32Array::from(data)) as ArrayRef, - DataType::Date64 => Arc::new(Date64Array::from(data)) as ArrayRef, - DataType::Time32(TimeUnit::Second) => { - Arc::new(Time32SecondArray::from(data)) as ArrayRef - } - DataType::Time32(TimeUnit::Millisecond) => { - Arc::new(Time32MillisecondArray::from(data)) as ArrayRef - } - DataType::Time64(TimeUnit::Microsecond) => { - Arc::new(Time64MicrosecondArray::from(data)) as ArrayRef - } - DataType::Time64(TimeUnit::Nanosecond) => { - Arc::new(Time64NanosecondArray::from(data)) as ArrayRef - } - DataType::Timestamp(TimeUnit::Second, _) => { - Arc::new(TimestampSecondArray::from(data)) as ArrayRef - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - Arc::new(TimestampMillisecondArray::from(data)) as ArrayRef - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - Arc::new(TimestampMicrosecondArray::from(data)) as ArrayRef - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(data)) as ArrayRef - } - DataType::Interval(IntervalUnit::YearMonth) => { - Arc::new(IntervalYearMonthArray::from(data)) as ArrayRef - } - DataType::Interval(IntervalUnit::DayTime) => { - Arc::new(IntervalDayTimeArray::from(data)) as ArrayRef - } - DataType::Duration(TimeUnit::Second) => { - Arc::new(DurationSecondArray::from(data)) as ArrayRef - } - DataType::Duration(TimeUnit::Millisecond) => { - Arc::new(DurationMillisecondArray::from(data)) as ArrayRef - } - DataType::Duration(TimeUnit::Microsecond) => { - Arc::new(DurationMicrosecondArray::from(data)) as ArrayRef - } - DataType::Duration(TimeUnit::Nanosecond) => { - Arc::new(DurationNanosecondArray::from(data)) as ArrayRef - } - DataType::Binary => Arc::new(BinaryArray::from(data)) as ArrayRef, - DataType::LargeBinary => Arc::new(LargeBinaryArray::from(data)) as ArrayRef, - DataType::FixedSizeBinary(_) => { - Arc::new(FixedSizeBinaryArray::from(data)) as ArrayRef - } - DataType::Utf8 => Arc::new(StringArray::from(data)) as ArrayRef, - DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef, - DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef, - DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef, - DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, - DataType::Union(_) => Arc::new(UnionArray::from(data)) as ArrayRef, - DataType::FixedSizeList(_, _) => { - Arc::new(FixedSizeListArray::from(data)) as ArrayRef - } - DataType::Dictionary(ref key_type, _) => match key_type.as_ref() { - DataType::Int8 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int16 => { - 
Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int32 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::Int64 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt8 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt16 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt32 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - DataType::UInt64 => { - Arc::new(DictionaryArray::::from(data)) as ArrayRef - } - dt => panic!("Unexpected dictionary key type {:?}", dt), - }, - DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, - DataType::Decimal(_, _) => Arc::new(DecimalArray::from(data)) as ArrayRef, - dt => panic!("Unexpected data type {:?}", dt), - } -} - -/// Creates a new empty array -pub fn new_empty_array(data_type: &DataType) -> ArrayRef { - let data = ArrayData::new_empty(data_type); - make_array(data) -} -/// Creates a new array of `data_type` of length `length` filled entirely of `NULL` values -pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { - // context: https://github.com/apache/arrow/pull/9469#discussion_r574761687 - match data_type { - DataType::Null => Arc::new(NullArray::new(length)), - DataType::Boolean => { - let null_buf: Buffer = MutableBuffer::new_null(length).into(); - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(null_buf.clone()), - 0, - vec![null_buf], - vec![], - )) - } - DataType::Int8 => new_null_sized_array::(data_type, length), - DataType::UInt8 => new_null_sized_array::(data_type, length), - DataType::Int16 => new_null_sized_array::(data_type, length), - DataType::UInt16 => new_null_sized_array::(data_type, length), - DataType::Float16 => unreachable!(), - DataType::Int32 => new_null_sized_array::(data_type, length), - DataType::UInt32 => new_null_sized_array::(data_type, length), - DataType::Float32 => new_null_sized_array::(data_type, length), - DataType::Date32 => new_null_sized_array::(data_type, length), - // expanding this into Date23{unit}Type results in needless branching - DataType::Time32(_) => new_null_sized_array::(data_type, length), - DataType::Int64 => new_null_sized_array::(data_type, length), - DataType::UInt64 => new_null_sized_array::(data_type, length), - DataType::Float64 => new_null_sized_array::(data_type, length), - DataType::Date64 => new_null_sized_array::(data_type, length), - // expanding this into Timestamp{unit}Type results in needless branching - DataType::Timestamp(_, _) => new_null_sized_array::(data_type, length), - DataType::Time64(_) => new_null_sized_array::(data_type, length), - DataType::Duration(_) => new_null_sized_array::(data_type, length), - DataType::Interval(unit) => match unit { - IntervalUnit::YearMonth => { - new_null_sized_array::(data_type, length) - } - IntervalUnit::DayTime => { - new_null_sized_array::(data_type, length) - } - }, - DataType::FixedSizeBinary(value_len) => make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; *value_len as usize * length])], - vec![], - )), - DataType::Binary | DataType::Utf8 => { - new_null_binary_array::(data_type, length) - } - DataType::LargeBinary | DataType::LargeUtf8 => { - new_null_binary_array::(data_type, length) - } - DataType::List(field) => { - new_null_list_array::(data_type, field.data_type(), length) - } - DataType::LargeList(field) => { - 
new_null_list_array::(data_type, field.data_type(), length) - } - DataType::FixedSizeList(field, value_len) => make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![], - vec![ - new_null_array(field.data_type(), *value_len as usize * length) - .data() - .clone(), - ], - )), - DataType::Struct(fields) => make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![], - fields - .iter() - .map(|field| ArrayData::new_empty(field.data_type())) - .collect(), - )), - DataType::Union(_) => { - unimplemented!("Creating null Union array not yet supported") - } - DataType::Dictionary(key, value) => { - let keys = new_null_array(key, length); - let keys = keys.data(); - - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - keys.null_buffer().cloned(), - 0, - keys.buffers().into(), - vec![new_empty_array(value.as_ref()).data().clone()], - )) - } - DataType::Decimal(_, _) => { - unimplemented!("Creating null Decimal array not yet supported") - } - } -} - -#[inline] -fn new_null_list_array( - data_type: &DataType, - child_data_type: &DataType, - length: usize, -) -> ArrayRef { - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from( - vec![OffsetSize::zero(); length + 1].to_byte_slice(), - )], - vec![ArrayData::new_empty(child_data_type)], - )) -} - -#[inline] -fn new_null_binary_array( - data_type: &DataType, - length: usize, -) -> ArrayRef { - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![ - Buffer::from(vec![OffsetSize::zero(); length + 1].to_byte_slice()), - MutableBuffer::new(0).into(), - ], - vec![], - )) -} - -#[inline] -fn new_null_sized_array( - data_type: &DataType, - length: usize, -) -> ArrayRef { - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; length * T::get_byte_width()])], - vec![], - )) -} - -/// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface -/// # Safety -/// Assumes that these pointers represent valid C Data Interfaces, both in memory -/// representation and lifetime via the `release` mechanism. -pub unsafe fn make_array_from_raw( - array: *const ffi::FFI_ArrowArray, - schema: *const ffi::FFI_ArrowSchema, -) -> Result { - let array = ffi::ArrowArray::try_from_raw(array, schema)?; - let data = ArrayData::try_from(array)?; - Ok(make_array(data)) -} -// Helper function for printing potentially long arrays. 
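`make_array` is the single dispatch point from a `DataType` to a concrete array implementation, and `new_null_array` builds an all-null column of any supported type on top of the same machinery. A hedged sketch of both, assuming the `arrow` crate shown in this diff, where `ArrayData::builder(...).build()` returns an owned `ArrayData`:

```rust
use arrow::array::{make_array, new_null_array, Array, ArrayData, Int32Array};
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

fn main() {
    // Raw ArrayData for [1, 2, 3]; make_array picks Int32Array from the DataType.
    let data = ArrayData::builder(DataType::Int32)
        .len(3)
        .add_buffer(Buffer::from_slice_ref(&[1i32, 2, 3]))
        .build();
    let array = make_array(data);
    let ints = array.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(ints.values(), &[1, 2, 3][..]);

    // An all-null Int32 column of length 4.
    let nulls = new_null_array(&DataType::Int32, 4);
    assert_eq!(nulls.len(), 4);
    assert_eq!(nulls.null_count(), 4);
}
```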
-pub(super) fn print_long_array( - array: &A, - f: &mut fmt::Formatter, - print_item: F, -) -> fmt::Result -where - A: Array, - F: Fn(&A, usize, &mut fmt::Formatter) -> fmt::Result, -{ - let head = std::cmp::min(10, array.len()); - - for i in 0..head { - if array.is_null(i) { - writeln!(f, " null,")?; - } else { - write!(f, " ")?; - print_item(&array, i, f)?; - writeln!(f, ",")?; - } - } - if array.len() > 10 { - if array.len() > 20 { - writeln!(f, " ...{} elements...,", array.len() - 20)?; - } - - let tail = std::cmp::max(head, array.len() - 10); - - for i in tail..array.len() { - if array.is_null(i) { - writeln!(f, " null,")?; - } else { - write!(f, " ")?; - print_item(&array, i, f)?; - writeln!(f, ",")?; - } - } - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - #[test] - fn test_empty_primitive() { - let array = new_empty_array(&DataType::Int32); - let a = array.as_any().downcast_ref::().unwrap(); - assert_eq!(a.len(), 0); - let expected: &[i32] = &[]; - assert_eq!(a.values(), expected); - } - - #[test] - fn test_empty_variable_sized() { - let array = new_empty_array(&DataType::Utf8); - let a = array.as_any().downcast_ref::().unwrap(); - assert_eq!(a.len(), 0); - assert_eq!(a.value_offsets()[0], 0i32); - } - - #[test] - fn test_empty_list_primitive() { - let data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let array = new_empty_array(&data_type); - let a = array.as_any().downcast_ref::().unwrap(); - assert_eq!(a.len(), 0); - assert_eq!(a.value_offsets()[0], 0i32); - } - - #[test] - fn test_null_boolean() { - let array = new_null_array(&DataType::Boolean, 9); - let a = array.as_any().downcast_ref::().unwrap(); - assert_eq!(a.len(), 9); - for i in 0..9 { - assert!(a.is_null(i)); - } - } - - #[test] - fn test_null_primitive() { - let array = new_null_array(&DataType::Int32, 9); - let a = array.as_any().downcast_ref::().unwrap(); - assert_eq!(a.len(), 9); - for i in 0..9 { - assert!(a.is_null(i)); - } - } - - #[test] - fn test_null_variable_sized() { - let array = new_null_array(&DataType::Utf8, 9); - let a = array.as_any().downcast_ref::().unwrap(); - assert_eq!(a.len(), 9); - assert_eq!(a.value_offsets()[9], 0i32); - for i in 0..9 { - assert!(a.is_null(i)); - } - } - - #[test] - fn test_null_list_primitive() { - let data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let array = new_null_array(&data_type, 9); - let a = array.as_any().downcast_ref::().unwrap(); - assert_eq!(a.len(), 9); - assert_eq!(a.value_offsets()[9], 0i32); - for i in 0..9 { - assert!(a.is_null(i)); - } - } - - #[test] - fn test_null_dictionary() { - let values = vec![None, None, None, None, None, None, None, None, None] - as Vec>; - - let array: DictionaryArray = values.into_iter().collect(); - let array = Arc::new(array) as ArrayRef; - - let null_array = new_null_array(array.data_type(), 9); - assert_eq!(&array, &null_array); - assert_eq!( - array.data().buffers()[0].len(), - null_array.data().buffers()[0].len() - ); - } -} diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs deleted file mode 100644 index bd04afa4c1f..00000000000 --- a/rust/arrow/src/array/array_binary.rs +++ /dev/null @@ -1,1157 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::convert::{From, TryInto}; -use std::fmt; -use std::mem; -use std::{any::Any, iter::FromIterator}; - -use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, - FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait, -}; -use crate::buffer::Buffer; -use crate::error::ArrowError; -use crate::util::bit_util; -use crate::{buffer::MutableBuffer, datatypes::DataType}; - -/// Like OffsetSizeTrait, but specialized for Binary -// This allow us to expose a constant datatype for the GenericBinaryArray -pub trait BinaryOffsetSizeTrait: OffsetSizeTrait { - const DATA_TYPE: DataType; -} - -impl BinaryOffsetSizeTrait for i32 { - const DATA_TYPE: DataType = DataType::Binary; -} - -impl BinaryOffsetSizeTrait for i64 { - const DATA_TYPE: DataType = DataType::LargeBinary; -} - -pub struct GenericBinaryArray { - data: ArrayData, - value_offsets: RawPtrBox, - value_data: RawPtrBox, -} - -impl GenericBinaryArray { - /// Returns the length for value at index `i`. - #[inline] - pub fn value_length(&self, i: usize) -> OffsetSize { - let offsets = self.value_offsets(); - offsets[i + 1] - offsets[i] - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[1].clone() - } - - /// Returns the offset values in the offsets buffer - #[inline] - pub fn value_offsets(&self) -> &[OffsetSize] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } - } - - /// Returns the element at index `i` as bytes slice - /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array - pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - let end = *self.value_offsets().get_unchecked(i + 1); - let start = *self.value_offsets().get_unchecked(i); - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. 
Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (end - start).to_usize().unwrap(), - ) - } - - /// Returns the element at index `i` as bytes slice - pub fn value(&self, i: usize) -> &[u8] { - assert!(i < self.data.len(), "BinaryArray out of bounds access"); - //Soundness: length checked above, offset buffer length is 1 larger than logical array length - let end = unsafe { self.value_offsets().get_unchecked(i + 1) }; - let start = unsafe { self.value_offsets().get_unchecked(i) }; - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - unsafe { - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (*end - *start).to_usize().unwrap(), - ) - } - } - - /// Creates a [GenericBinaryArray] from a vector of byte slices - pub fn from_vec(v: Vec<&[u8]>) -> Self { - let mut offsets = Vec::with_capacity(v.len() + 1); - let mut values = Vec::new(); - let mut length_so_far: OffsetSize = OffsetSize::zero(); - offsets.push(length_so_far); - for s in &v { - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); - values.extend_from_slice(s); - } - let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(v.len()) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - GenericBinaryArray::::from(array_data) - } - - /// Creates a [GenericBinaryArray] from a vector of Optional (null) byte slices - pub fn from_opt_vec(v: Vec>) -> Self { - v.into_iter().collect() - } - - fn from_list(v: GenericListArray) -> Self { - assert_eq!( - v.data_ref().child_data()[0].child_data().len(), - 0, - "BinaryArray can only be created from list array of u8 values \ - (i.e. List>)." - ); - assert_eq!( - v.data_ref().child_data()[0].data_type(), - &DataType::UInt8, - "BinaryArray can only be created from List arrays, mismatched data types." 
- ); - - let mut builder = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(v.len()) - .add_buffer(v.data_ref().buffers()[0].clone()) - .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()); - if let Some(bitmap) = v.data_ref().null_bitmap() { - builder = builder.null_bit_buffer(bitmap.bits.clone()) - } - - let data = builder.build(); - Self::from(data) - } -} - -impl<'a, T: BinaryOffsetSizeTrait> GenericBinaryArray { - /// constructs a new iterator - pub fn iter(&'a self) -> GenericBinaryIter<'a, T> { - GenericBinaryIter::<'a, T>::new(&self) - } -} - -impl fmt::Debug for GenericBinaryArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let prefix = if OffsetSize::is_large() { "Large" } else { "" }; - - write!(f, "{}BinaryArray\n[\n", prefix)?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for GenericBinaryArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [$name]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } -} - -impl From - for GenericBinaryArray -{ - fn from(data: ArrayData) -> Self { - assert_eq!( - data.data_type(), - &::DATA_TYPE, - "[Large]BinaryArray expects Datatype::[Large]Binary" - ); - assert_eq!( - data.buffers().len(), - 2, - "BinaryArray data should contain 2 buffers only (offsets and values)" - ); - let offsets = data.buffers()[0].as_ptr(); - let values = data.buffers()[1].as_ptr(); - Self { - data, - value_offsets: unsafe { RawPtrBox::new(offsets) }, - value_data: unsafe { RawPtrBox::new(values) }, - } - } -} - -impl FromIterator> - for GenericBinaryArray -where - Ptr: AsRef<[u8]>, -{ - fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - - let mut offsets = Vec::with_capacity(data_len + 1); - let mut values = Vec::new(); - let mut null_buf = MutableBuffer::new_null(data_len); - let mut length_so_far: OffsetSize = OffsetSize::zero(); - offsets.push(length_so_far); - - { - let null_slice = null_buf.as_slice_mut(); - - for (i, s) in iter.enumerate() { - if let Some(s) = s { - let s = s.as_ref(); - bit_util::set_bit(null_slice, i); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s); - } - // always add an element in offsets - offsets.push(length_so_far); - } - } - - // calculate actual data_len, which may be different from the iterator's upper bound - let data_len = offsets.len() - 1; - let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(data_len) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .null_bit_buffer(null_buf.into()) - .build(); - Self::from(array_data) - } -} - -/// An array where each element is a byte whose maximum length is represented by a i32. -pub type BinaryArray = GenericBinaryArray; - -/// An array where each element is a byte whose maximum length is represented by a i64. 
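For `BinaryArray` the usual entry points are `from_vec`, `from_opt_vec`, and the `FromIterator` impl above; values are read back as `&[u8]` slices and the iterator yields `Option<&[u8]>`. A short usage sketch, again assuming the `arrow` crate from this tree:

```rust
use arrow::array::{Array, BinaryArray};

fn main() {
    // `None` becomes a null slot; values are exposed as `&[u8]`.
    let array =
        BinaryArray::from_opt_vec(vec![Some(&b"one"[..]), None, Some(&b"three"[..])]);
    assert_eq!(array.len(), 3);
    assert!(array.is_null(1));
    assert_eq!(array.value(0), b"one");

    // The iterator yields Option<&[u8]> in order.
    let collected: Vec<Option<&[u8]>> = array.iter().collect();
    assert_eq!(collected[2], Some(&b"three"[..]));
}
```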
-pub type LargeBinaryArray = GenericBinaryArray; - -impl<'a, T: BinaryOffsetSizeTrait> IntoIterator for &'a GenericBinaryArray { - type Item = Option<&'a [u8]>; - type IntoIter = GenericBinaryIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - GenericBinaryIter::<'a, T>::new(self) - } -} - -impl From>> - for GenericBinaryArray -{ - fn from(v: Vec>) -> Self { - GenericBinaryArray::::from_opt_vec(v) - } -} - -impl From> - for GenericBinaryArray -{ - fn from(v: Vec<&[u8]>) -> Self { - GenericBinaryArray::::from_vec(v) - } -} - -impl From> for GenericBinaryArray { - fn from(v: GenericListArray) -> Self { - GenericBinaryArray::::from_list(v) - } -} - -/// A type of `FixedSizeListArray` whose elements are binaries. -pub struct FixedSizeBinaryArray { - data: ArrayData, - value_data: RawPtrBox, - length: i32, -} - -impl FixedSizeBinaryArray { - /// Returns the element at index `i` as a byte slice. - pub fn value(&self, i: usize) -> &[u8] { - assert!( - i < self.data.len(), - "FixedSizeBinaryArray out of bounds access" - ); - let offset = i.checked_add(self.data.offset()).unwrap(); - unsafe { - let pos = self.value_offset_at(offset); - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(pos as isize), - (self.value_offset_at(offset + 1) - pos) as usize, - ) - } - } - - /// Returns the offset for the element at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) - } - - /// Returns the length for an element. - /// - /// All elements have the same length as the array is a fixed size. - #[inline] - pub fn value_length(&self) -> i32 { - self.length - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[0].clone() - } - - /// Create an array from an iterable argument of sparse byte slices. - /// Sparsity means that items returned by the iterator are optional, i.e input argument can - /// contain `None` items. - /// - /// # Examles - /// - /// ``` - /// use arrow::array::FixedSizeBinaryArray; - /// let input_arg = vec![ - /// None, - /// Some(vec![7, 8]), - /// Some(vec![9, 10]), - /// None, - /// Some(vec![13, 14]), - /// None, - /// ]; - /// let array = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); - /// ``` - /// - /// # Errors - /// - /// Returns error if argument has length zero, or sizes of nested slices don't match. 
- pub fn try_from_sparse_iter(mut iter: T) -> Result - where - T: Iterator>, - U: AsRef<[u8]>, - { - let mut len = 0; - let mut size = None; - let mut byte = 0; - let mut null_buf = MutableBuffer::from_len_zeroed(0); - let mut buffer = MutableBuffer::from_len_zeroed(0); - let mut prepend = 0; - iter.try_for_each(|item| -> Result<(), ArrowError> { - // extend null bitmask by one byte per each 8 items - if byte == 0 { - null_buf.push(0u8); - byte = 8; - } - byte -= 1; - - if let Some(slice) = item { - let slice = slice.as_ref(); - if let Some(size) = size { - if size != slice.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Nested array size mismatch: one is {}, and the other is {}", - size, - slice.len() - ))); - } - } else { - size = Some(slice.len()); - buffer.extend_zeros(slice.len() * prepend); - } - bit_util::set_bit(null_buf.as_slice_mut(), len); - buffer.extend_from_slice(slice); - } else if let Some(size) = size { - buffer.extend_zeros(size); - } else { - prepend += 1; - } - - len += 1; - - Ok(()) - })?; - - if len == 0 { - return Err(ArrowError::InvalidArgumentError( - "Input iterable argument has no data".to_owned(), - )); - } - - let size = size.unwrap_or(0); - let array_data = ArrayData::new( - DataType::FixedSizeBinary(size as i32), - len, - None, - Some(null_buf.into()), - 0, - vec![buffer.into()], - vec![], - ); - Ok(FixedSizeBinaryArray::from(array_data)) - } - - /// Create an array from an iterable argument of byte slices. - /// - /// # Examles - /// - /// ``` - /// use arrow::array::FixedSizeBinaryArray; - /// let input_arg = vec![ - /// vec![1, 2], - /// vec![3, 4], - /// vec![5, 6], - /// ]; - /// let array = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); - /// ``` - /// - /// # Errors - /// - /// Returns error if argument has length zero, or sizes of nested slices don't match. 
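Both fallible constructors documented above reject an empty iterator and slices of differing lengths; `try_from_sparse_iter` additionally records a null bit for every `None`. A sketch exercising the two, assuming the `arrow` crate from this tree:

```rust
use arrow::array::{Array, FixedSizeBinaryArray};
use arrow::error::ArrowError;

fn main() -> Result<(), ArrowError> {
    // Sparse input: `None` becomes a null slot, the width is taken from the slices.
    let sparse = FixedSizeBinaryArray::try_from_sparse_iter(
        vec![None, Some(vec![7u8, 8]), Some(vec![9, 10])].into_iter(),
    )?;
    assert_eq!(sparse.len(), 3);
    assert!(sparse.is_null(0));
    assert_eq!(sparse.value(1), [7u8, 8]);
    assert_eq!(sparse.value_length(), 2);

    // Dense input: every item must have the same length.
    let dense =
        FixedSizeBinaryArray::try_from_iter(vec![vec![1u8, 2], vec![3, 4]].into_iter())?;
    assert_eq!(dense.len(), 2);
    Ok(())
}
```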
- pub fn try_from_iter(mut iter: T) -> Result - where - T: Iterator, - U: AsRef<[u8]>, - { - let mut len = 0; - let mut size = None; - let mut buffer = MutableBuffer::from_len_zeroed(0); - iter.try_for_each(|item| -> Result<(), ArrowError> { - let slice = item.as_ref(); - if let Some(size) = size { - if size != slice.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Nested array size mismatch: one is {}, and the other is {}", - size, - slice.len() - ))); - } - } else { - size = Some(slice.len()); - } - buffer.extend_from_slice(slice); - - len += 1; - - Ok(()) - })?; - - if len == 0 { - return Err(ArrowError::InvalidArgumentError( - "Input iterable argument has no data".to_owned(), - )); - } - - let size = size.unwrap_or(0); - let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32)) - .len(len) - .add_buffer(buffer.into()) - .build(); - Ok(FixedSizeBinaryArray::from(array_data)) - } - - #[inline] - fn value_offset_at(&self, i: usize) -> i32 { - self.length * i as i32 - } -} - -impl From for FixedSizeBinaryArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "FixedSizeBinaryArray data should contain 1 buffer only (values)" - ); - let value_data = data.buffers()[0].as_ptr(); - let length = match data.data_type() { - DataType::FixedSizeBinary(len) => *len, - _ => panic!("Expected data type to be FixedSizeBinary"), - }; - Self { - data, - value_data: unsafe { RawPtrBox::new(value_data) }, - length, - } - } -} - -/// Creates a `FixedSizeBinaryArray` from `FixedSizeList` array -impl From for FixedSizeBinaryArray { - fn from(v: FixedSizeListArray) -> Self { - assert_eq!( - v.data_ref().child_data()[0].child_data().len(), - 0, - "FixedSizeBinaryArray can only be created from list array of u8 values \ - (i.e. FixedSizeList>)." - ); - assert_eq!( - v.data_ref().child_data()[0].data_type(), - &DataType::UInt8, - "FixedSizeBinaryArray can only be created from FixedSizeList arrays, mismatched data types." - ); - - let mut builder = ArrayData::builder(DataType::FixedSizeBinary(v.value_length())) - .len(v.len()) - .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()); - if let Some(bitmap) = v.data_ref().null_bitmap() { - builder = builder.null_bit_buffer(bitmap.bits.clone()) - } - - let data = builder.build(); - Self::from(data) - } -} - -impl fmt::Debug for FixedSizeBinaryArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for FixedSizeBinaryArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } -} - -/// A type of `DecimalArray` whose elements are binaries. -pub struct DecimalArray { - data: ArrayData, - value_data: RawPtrBox, - precision: usize, - scale: usize, - length: i32, -} - -impl DecimalArray { - /// Returns the element at index `i` as i128. 
- pub fn value(&self, i: usize) -> i128 { - assert!(i < self.data.len(), "DecimalArray out of bounds access"); - let offset = i.checked_add(self.data.offset()).unwrap(); - let raw_val = unsafe { - let pos = self.value_offset_at(offset); - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(pos as isize), - (self.value_offset_at(offset + 1) - pos) as usize, - ) - }; - let as_array = raw_val.try_into(); - match as_array { - Ok(v) if raw_val.len() == 16 => i128::from_le_bytes(v), - _ => panic!("DecimalArray elements are not 128bit integers."), - } - } - - /// Returns the offset for the element at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) - } - - /// Returns the length for an element. - /// - /// All elements have the same length as the array is a fixed size. - #[inline] - pub fn value_length(&self) -> i32 { - self.length - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[0].clone() - } - - #[inline] - fn value_offset_at(&self, i: usize) -> i32 { - self.length * i as i32 - } - - pub fn from_fixed_size_list_array( - v: FixedSizeListArray, - precision: usize, - scale: usize, - ) -> Self { - assert_eq!( - v.data_ref().child_data()[0].child_data().len(), - 0, - "DecimalArray can only be created from list array of u8 values \ - (i.e. FixedSizeList>)." - ); - assert_eq!( - v.data_ref().child_data()[0].data_type(), - &DataType::UInt8, - "DecimalArray can only be created from FixedSizeList arrays, mismatched data types." - ); - - let mut builder = ArrayData::builder(DataType::Decimal(precision, scale)) - .len(v.len()) - .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()); - if let Some(bitmap) = v.data_ref().null_bitmap() { - builder = builder.null_bit_buffer(bitmap.bits.clone()) - } - - let data = builder.build(); - Self::from(data) - } - pub fn precision(&self) -> usize { - self.precision - } - - pub fn scale(&self) -> usize { - self.scale - } -} - -impl From for DecimalArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "DecimalArray data should contain 1 buffer only (values)" - ); - let values = data.buffers()[0].as_ptr(); - let (precision, scale) = match data.data_type() { - DataType::Decimal(precision, scale) => (*precision, *scale), - _ => panic!("Expected data type to be Decimal"), - }; - let length = 16; - Self { - data, - value_data: unsafe { RawPtrBox::new(values) }, - precision, - scale, - length, - } - } -} - -impl fmt::Debug for DecimalArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "DecimalArray<{}, {}>\n[\n", self.precision, self.scale)?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for DecimalArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [DecimalArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [DecimalArray]. 
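`DecimalArray` stores every element as the 16 little-endian bytes of an `i128`, while `precision` and `scale` travel on the `DataType`. A construction sketch mirroring the tests further down, assuming the `arrow` crate from this tree:

```rust
use arrow::array::{ArrayData, DecimalArray};
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

fn main() {
    // Two little-endian i128 values: 8_887_000_000 and -8_887_000_000,
    // i.e. 8887.000000 and -8887.000000 at scale 6.
    let mut bytes = [0u8; 32];
    bytes[..16].copy_from_slice(&8_887_000_000i128.to_le_bytes());
    bytes[16..].copy_from_slice(&(-8_887_000_000i128).to_le_bytes());

    let data = ArrayData::builder(DataType::Decimal(23, 6))
        .len(2)
        .add_buffer(Buffer::from(&bytes[..]))
        .build();
    let decimals = DecimalArray::from(data);
    assert_eq!(decimals.value(0), 8_887_000_000);
    assert_eq!(decimals.value(1), -8_887_000_000);
    assert_eq!((decimals.precision(), decimals.scale()), (23, 6));
}
```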
- fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } -} - -#[cfg(test)] -mod tests { - use crate::{ - array::{LargeListArray, ListArray}, - datatypes::Field, - }; - - use super::*; - - #[test] - fn test_binary_array() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let offsets: [i32; 4] = [0, 5, 5, 12]; - - // Array data: ["hello", "", "parquet"] - let array_data = ArrayData::builder(DataType::Binary) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let binary_array = BinaryArray::from(array_data); - assert_eq!(3, binary_array.len()); - assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], unsafe { - binary_array.value_unchecked(0) - }); - assert_eq!([] as [u8; 0], binary_array.value(1)); - assert_eq!([] as [u8; 0], unsafe { binary_array.value_unchecked(1) }); - assert_eq!( - [b'p', b'a', b'r', b'q', b'u', b'e', b't'], - binary_array.value(2) - ); - assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe { - binary_array.value_unchecked(2) - }); - assert_eq!(5, binary_array.value_offsets()[2]); - assert_eq!(7, binary_array.value_length(2)); - for i in 0..3 { - assert!(binary_array.is_valid(i)); - assert!(!binary_array.is_null(i)); - } - - // Test binary array with offset - let array_data = ArrayData::builder(DataType::Binary) - .len(4) - .offset(1) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let binary_array = BinaryArray::from(array_data); - assert_eq!( - [b'p', b'a', b'r', b'q', b'u', b'e', b't'], - binary_array.value(1) - ); - assert_eq!(5, binary_array.value_offsets()[0]); - assert_eq!(0, binary_array.value_length(0)); - assert_eq!(5, binary_array.value_offsets()[1]); - assert_eq!(7, binary_array.value_length(1)); - } - - #[test] - fn test_large_binary_array() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let offsets: [i64; 4] = [0, 5, 5, 12]; - - // Array data: ["hello", "", "parquet"] - let array_data = ArrayData::builder(DataType::LargeBinary) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let binary_array = LargeBinaryArray::from(array_data); - assert_eq!(3, binary_array.len()); - assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], unsafe { - binary_array.value_unchecked(0) - }); - assert_eq!([] as [u8; 0], binary_array.value(1)); - assert_eq!([] as [u8; 0], unsafe { binary_array.value_unchecked(1) }); - assert_eq!( - [b'p', b'a', b'r', b'q', b'u', b'e', b't'], - binary_array.value(2) - ); - assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe { - binary_array.value_unchecked(2) - }); - assert_eq!(5, binary_array.value_offsets()[2]); - assert_eq!(7, binary_array.value_length(2)); - for i in 0..3 { - assert!(binary_array.is_valid(i)); - assert!(!binary_array.is_null(i)); - } - - // Test binary array with offset - let array_data = ArrayData::builder(DataType::LargeBinary) - .len(4) - .offset(1) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let binary_array = LargeBinaryArray::from(array_data); - assert_eq!( - [b'p', b'a', 
b'r', b'q', b'u', b'e', b't'], - binary_array.value(1) - ); - assert_eq!([b'p', b'a', b'r', b'q', b'u', b'e', b't'], unsafe { - binary_array.value_unchecked(1) - }); - assert_eq!(5, binary_array.value_offsets()[0]); - assert_eq!(0, binary_array.value_length(0)); - assert_eq!(5, binary_array.value_offsets()[1]); - assert_eq!(7, binary_array.value_length(1)); - } - - #[test] - fn test_binary_array_from_list_array() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let values_data = ArrayData::builder(DataType::UInt8) - .len(12) - .add_buffer(Buffer::from(&values[..])) - .build(); - let offsets: [i32; 4] = [0, 5, 5, 12]; - - // Array data: ["hello", "", "parquet"] - let array_data1 = ArrayData::builder(DataType::Binary) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let binary_array1 = BinaryArray::from(array_data1); - - let data_type = - DataType::List(Box::new(Field::new("item", DataType::UInt8, false))); - let array_data2 = ArrayData::builder(data_type) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(values_data) - .build(); - let list_array = ListArray::from(array_data2); - let binary_array2 = BinaryArray::from(list_array); - - assert_eq!(2, binary_array2.data().buffers().len()); - assert_eq!(0, binary_array2.data().child_data().len()); - - assert_eq!(binary_array1.len(), binary_array2.len()); - assert_eq!(binary_array1.null_count(), binary_array2.null_count()); - assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets()); - for i in 0..binary_array1.len() { - assert_eq!(binary_array1.value(i), binary_array2.value(i)); - assert_eq!(binary_array1.value(i), unsafe { - binary_array2.value_unchecked(i) - }); - assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i)); - } - } - - #[test] - fn test_large_binary_array_from_list_array() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let values_data = ArrayData::builder(DataType::UInt8) - .len(12) - .add_buffer(Buffer::from(&values[..])) - .build(); - let offsets: [i64; 4] = [0, 5, 5, 12]; - - // Array data: ["hello", "", "parquet"] - let array_data1 = ArrayData::builder(DataType::LargeBinary) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let binary_array1 = LargeBinaryArray::from(array_data1); - - let data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::UInt8, false))); - let array_data2 = ArrayData::builder(data_type) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(values_data) - .build(); - let list_array = LargeListArray::from(array_data2); - let binary_array2 = LargeBinaryArray::from(list_array); - - assert_eq!(2, binary_array2.data().buffers().len()); - assert_eq!(0, binary_array2.data().child_data().len()); - - assert_eq!(binary_array1.len(), binary_array2.len()); - assert_eq!(binary_array1.null_count(), binary_array2.null_count()); - assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets()); - for i in 0..binary_array1.len() { - assert_eq!(binary_array1.value(i), binary_array2.value(i)); - assert_eq!(binary_array1.value(i), unsafe { - binary_array2.value_unchecked(i) - }); - assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i)); - } - } - - fn test_generic_binary_array_from_opt_vec() { - let values: Vec> = - vec![Some(b"one"), Some(b"two"), 
None, Some(b""), Some(b"three")]; - let array = GenericBinaryArray::::from_opt_vec(values); - assert_eq!(array.len(), 5); - assert_eq!(array.value(0), b"one"); - assert_eq!(array.value(1), b"two"); - assert_eq!(array.value(3), b""); - assert_eq!(array.value(4), b"three"); - assert_eq!(array.is_null(0), false); - assert_eq!(array.is_null(1), false); - assert_eq!(array.is_null(2), true); - assert_eq!(array.is_null(3), false); - assert_eq!(array.is_null(4), false); - } - - #[test] - fn test_large_binary_array_from_opt_vec() { - test_generic_binary_array_from_opt_vec::() - } - - #[test] - fn test_binary_array_from_opt_vec() { - test_generic_binary_array_from_opt_vec::() - } - - #[test] - fn test_binary_array_from_unbound_iter() { - // iterator that doesn't declare (upper) size bound - let value_iter = (0..) - .scan(0usize, |pos, i| { - if *pos < 10 { - *pos += 1; - Some(Some(format!("value {}", i))) - } else { - // actually returns up to 10 values - None - } - }) - // limited using take() - .take(100); - - let (_, upper_size_bound) = value_iter.size_hint(); - // the upper bound, defined by take above, is 100 - assert_eq!(upper_size_bound, Some(100)); - let binary_array: BinaryArray = value_iter.collect(); - // but the actual number of items in the array should be 10 - assert_eq!(binary_array.len(), 10); - } - - #[test] - #[should_panic( - expected = "assertion failed: `(left == right)`\n left: `UInt32`,\n \ - right: `UInt8`: BinaryArray can only be created from List arrays, \ - mismatched data types." - )] - fn test_binary_array_from_incorrect_list_array() { - let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; - let values_data = ArrayData::builder(DataType::UInt32) - .len(12) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let offsets: [i32; 4] = [0, 5, 5, 12]; - - let data_type = - DataType::List(Box::new(Field::new("item", DataType::UInt32, false))); - let array_data = ArrayData::builder(data_type) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(values_data) - .build(); - let list_array = ListArray::from(array_data); - BinaryArray::from(list_array); - } - - #[test] - fn test_fixed_size_binary_array() { - let values: [u8; 15] = *b"hellotherearrow"; - - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(3) - .add_buffer(Buffer::from(&values[..])) - .build(); - let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); - assert_eq!(3, fixed_size_binary_array.len()); - assert_eq!(0, fixed_size_binary_array.null_count()); - assert_eq!( - [b'h', b'e', b'l', b'l', b'o'], - fixed_size_binary_array.value(0) - ); - assert_eq!( - [b't', b'h', b'e', b'r', b'e'], - fixed_size_binary_array.value(1) - ); - assert_eq!( - [b'a', b'r', b'r', b'o', b'w'], - fixed_size_binary_array.value(2) - ); - assert_eq!(5, fixed_size_binary_array.value_length()); - assert_eq!(10, fixed_size_binary_array.value_offset(2)); - for i in 0..3 { - assert!(fixed_size_binary_array.is_valid(i)); - assert!(!fixed_size_binary_array.is_null(i)); - } - - // Test binary array with offset - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(2) - .offset(1) - .add_buffer(Buffer::from(&values[..])) - .build(); - let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); - assert_eq!( - [b't', b'h', b'e', b'r', b'e'], - fixed_size_binary_array.value(0) - ); - assert_eq!( - [b'a', b'r', b'r', b'o', b'w'], - fixed_size_binary_array.value(1) - ); - assert_eq!(2, fixed_size_binary_array.len()); - assert_eq!(5, 
fixed_size_binary_array.value_offset(0)); - assert_eq!(5, fixed_size_binary_array.value_length()); - assert_eq!(10, fixed_size_binary_array.value_offset(1)); - } - - #[test] - #[should_panic( - expected = "FixedSizeBinaryArray can only be created from list array of u8 values \ - (i.e. FixedSizeList>)." - )] - fn test_fixed_size_binary_array_from_incorrect_list_array() { - let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; - let values_data = ArrayData::builder(DataType::UInt32) - .len(12) - .add_buffer(Buffer::from_slice_ref(&values)) - .add_child_data(ArrayData::builder(DataType::Boolean).build()) - .build(); - - let array_data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Binary, false)), - 4, - )) - .len(3) - .add_child_data(values_data) - .build(); - let list_array = FixedSizeListArray::from(array_data); - FixedSizeBinaryArray::from(list_array); - } - - #[test] - #[should_panic(expected = "BinaryArray out of bounds access")] - fn test_binary_array_get_value_index_out_of_bound() { - let values: [u8; 12] = - [104, 101, 108, 108, 111, 112, 97, 114, 113, 117, 101, 116]; - let offsets: [i32; 4] = [0, 5, 5, 12]; - let array_data = ArrayData::builder(DataType::Binary) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let binary_array = BinaryArray::from(array_data); - binary_array.value(4); - } - - #[test] - fn test_binary_array_fmt_debug() { - let values: [u8; 15] = *b"hellotherearrow"; - - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(3) - .add_buffer(Buffer::from(&values[..])) - .build(); - let arr = FixedSizeBinaryArray::from(array_data); - assert_eq!( - "FixedSizeBinaryArray<5>\n[\n [104, 101, 108, 108, 111],\n [116, 104, 101, 114, 101],\n [97, 114, 114, 111, 119],\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_decimal_array() { - // let val_8887: [u8; 16] = [192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - // let val_neg_8887: [u8; 16] = [64, 36, 75, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]; - let values: [u8; 32] = [ - 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]; - let array_data = ArrayData::builder(DataType::Decimal(23, 6)) - .len(2) - .add_buffer(Buffer::from(&values[..])) - .build(); - let decimal_array = DecimalArray::from(array_data); - assert_eq!(8_887_000_000, decimal_array.value(0)); - assert_eq!(-8_887_000_000, decimal_array.value(1)); - assert_eq!(16, decimal_array.value_length()); - } - - #[test] - fn test_decimal_array_fmt_debug() { - let values: [u8; 32] = [ - 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - ]; - let array_data = ArrayData::builder(DataType::Decimal(23, 6)) - .len(2) - .add_buffer(Buffer::from(&values[..])) - .build(); - let arr = DecimalArray::from(array_data); - assert_eq!( - "DecimalArray<23, 6>\n[\n 8887000000,\n -8887000000,\n]", - format!("{:?}", arr) - ); - } -} diff --git a/rust/arrow/src/array/array_boolean.rs b/rust/arrow/src/array/array_boolean.rs deleted file mode 100644 index 67af85d167f..00000000000 --- a/rust/arrow/src/array/array_boolean.rs +++ /dev/null @@ -1,291 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::borrow::Borrow; -use std::convert::From; -use std::iter::{FromIterator, IntoIterator}; -use std::mem; -use std::{any::Any, fmt}; - -use super::*; -use super::{array::print_long_array, raw_pointer::RawPtrBox}; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::util::bit_util; - -/// Array of bools -pub struct BooleanArray { - data: ArrayData, - /// Pointer to the value array. The lifetime of this must be <= to the value buffer - /// stored in `data`, so it's safe to store. - raw_values: RawPtrBox, -} - -impl fmt::Debug for BooleanArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "BooleanArray\n[\n")?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl BooleanArray { - /// Returns the length of this array. - pub fn len(&self) -> usize { - self.data.len() - } - - /// Returns whether this array is empty. - pub fn is_empty(&self) -> bool { - self.data.is_empty() - } - - // Returns a new boolean array builder - pub fn builder(capacity: usize) -> BooleanBuilder { - BooleanBuilder::new(capacity) - } - - /// Returns a `Buffer` holding all the values of this array. - /// - /// Note this doesn't take the offset of this array into account. - pub fn values(&self) -> &Buffer { - &self.data.buffers()[0] - } - - /// Returns the boolean value at index `i`. - /// - /// # Safety - /// This doesn't check bounds, the caller must ensure that index < self.len() - pub unsafe fn value_unchecked(&self, i: usize) -> bool { - let offset = i + self.offset(); - bit_util::get_bit_raw(self.raw_values.as_ptr(), offset) - } - - /// Returns the boolean value at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - pub fn value(&self, i: usize) -> bool { - debug_assert!(i < self.len()); - unsafe { self.value_unchecked(i) } - } -} - -impl Array for BooleanArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [BooleanArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [BooleanArray]. 
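`BooleanArray` packs its values into a bitmap, one bit per element; building it from `Vec<Option<bool>>` additionally records a validity bitmap for the `None` slots. A brief usage sketch, assuming the `arrow` crate from this tree:

```rust
use arrow::array::{Array, BooleanArray};

fn main() {
    // Dense construction from plain bools.
    let dense = BooleanArray::from(vec![true, false, true]);
    assert_eq!(dense.len(), 3);
    assert!(dense.value(0));

    // Optional construction: `None` becomes a null slot.
    let sparse = BooleanArray::from(vec![Some(true), None, Some(false)]);
    assert_eq!(sparse.null_count(), 1);
    assert!(sparse.is_null(1));
}
```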
- fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } -} - -impl From> for BooleanArray { - fn from(data: Vec) -> Self { - let mut mut_buf = MutableBuffer::new_null(data.len()); - { - let mut_slice = mut_buf.as_slice_mut(); - for (i, b) in data.iter().enumerate() { - if *b { - bit_util::set_bit(mut_slice, i); - } - } - } - let array_data = ArrayData::builder(DataType::Boolean) - .len(data.len()) - .add_buffer(mut_buf.into()) - .build(); - BooleanArray::from(array_data) - } -} - -impl From>> for BooleanArray { - fn from(data: Vec>) -> Self { - BooleanArray::from_iter(data.iter()) - } -} - -impl From for BooleanArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "BooleanArray data should contain a single buffer only (values buffer)" - ); - let ptr = data.buffers()[0].as_ptr(); - Self { - data, - raw_values: unsafe { RawPtrBox::new(ptr) }, - } - } -} - -impl<'a> IntoIterator for &'a BooleanArray { - type Item = Option; - type IntoIter = BooleanIter<'a>; - - fn into_iter(self) -> Self::IntoIter { - BooleanIter::<'a>::new(self) - } -} - -impl<'a> BooleanArray { - /// constructs a new iterator - pub fn iter(&'a self) -> BooleanIter<'a> { - BooleanIter::<'a>::new(&self) - } -} - -impl>> FromIterator for BooleanArray { - fn from_iter>(iter: I) -> Self { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - - let num_bytes = bit_util::ceil(data_len, 8); - let mut null_buf = MutableBuffer::from_len_zeroed(num_bytes); - let mut val_buf = MutableBuffer::from_len_zeroed(num_bytes); - - let data = val_buf.as_slice_mut(); - - let null_slice = null_buf.as_slice_mut(); - iter.enumerate().for_each(|(i, item)| { - if let Some(a) = item.borrow() { - bit_util::set_bit(null_slice, i); - if *a { - bit_util::set_bit(data, i); - } - } - }); - - let data = ArrayData::new( - DataType::Boolean, - data_len, - None, - Some(null_buf.into()), - 0, - vec![val_buf.into()], - vec![], - ); - BooleanArray::from(data) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::buffer::Buffer; - use crate::datatypes::DataType; - - #[test] - fn test_boolean_fmt_debug() { - let arr = BooleanArray::from(vec![true, false, false]); - assert_eq!( - "BooleanArray\n[\n true,\n false,\n false,\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_boolean_with_null_fmt_debug() { - let mut builder = BooleanArray::builder(3); - builder.append_value(true).unwrap(); - builder.append_null().unwrap(); - builder.append_value(false).unwrap(); - let arr = builder.finish(); - assert_eq!( - "BooleanArray\n[\n true,\n null,\n false,\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_boolean_array_from_vec() { - let buf = Buffer::from([10_u8]); - let arr = BooleanArray::from(vec![false, true, false, true]); - assert_eq!(&buf, arr.values()); - assert_eq!(4, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..4 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i) - } - } - - #[test] - fn test_boolean_array_from_vec_option() { - let buf = Buffer::from([10_u8]); - let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]); - assert_eq!(&buf, arr.values()); - assert_eq!(4, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(1, arr.null_count()); - for i in 0..4 { - if i == 2 { - assert!(arr.is_null(i)); - 
assert!(!arr.is_valid(i)); - } else { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i) - } - } - } - - #[test] - fn test_boolean_array_builder() { - // Test building a boolean array with ArrayData builder and offset - // 000011011 - let buf = Buffer::from([27_u8]); - let buf2 = buf.clone(); - let data = ArrayData::builder(DataType::Boolean) - .len(5) - .offset(2) - .add_buffer(buf) - .build(); - let arr = BooleanArray::from(data); - assert_eq!(&buf2, arr.values()); - assert_eq!(5, arr.len()); - assert_eq!(2, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..3 { - assert_eq!(i != 0, arr.value(i), "failed at {}", i); - } - } - - #[test] - #[should_panic(expected = "BooleanArray data should contain a single buffer only \ - (values buffer)")] - fn test_boolean_array_invalid_buffer_len() { - let data = ArrayData::builder(DataType::Boolean).len(5).build(); - BooleanArray::from(data); - } -} diff --git a/rust/arrow/src/array/array_dictionary.rs b/rust/arrow/src/array/array_dictionary.rs deleted file mode 100644 index 5948658157e..00000000000 --- a/rust/arrow/src/array/array_dictionary.rs +++ /dev/null @@ -1,408 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::fmt; -use std::iter::IntoIterator; -use std::mem; -use std::{convert::From, iter::FromIterator}; - -use super::{ - make_array, Array, ArrayData, ArrayRef, PrimitiveArray, PrimitiveBuilder, - StringArray, StringBuilder, StringDictionaryBuilder, -}; -use crate::datatypes::ArrowNativeType; -use crate::datatypes::{ArrowDictionaryKeyType, ArrowPrimitiveType, DataType}; - -/// A dictionary array where each element is a single value indexed by an integer key. -/// This is mostly used to represent strings or a limited set of primitive types as integers, -/// for example when doing NLP analysis or representing chromosomes by name. -/// -/// Example **with nullable** data: -/// -/// ``` -/// use arrow::array::{DictionaryArray, Int8Array}; -/// use arrow::datatypes::Int8Type; -/// let test = vec!["a", "a", "b", "c"]; -/// let array : DictionaryArray = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect(); -/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)])); -/// ``` -/// -/// Example **without nullable** data: -/// -/// ``` -/// use arrow::array::{DictionaryArray, Int8Array}; -/// use arrow::datatypes::Int8Type; -/// let test = vec!["a", "a", "b", "c"]; -/// let array : DictionaryArray = test.into_iter().collect(); -/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); -/// ``` -pub struct DictionaryArray { - /// Data of this dictionary. 
Note that this is _not_ compatible with the C Data interface, - /// as, in the current implementation, `values` below are the first child of this struct. - data: ArrayData, - - /// The keys of this dictionary. These are constructed from the buffer and null bitmap - /// of `data`. - /// Also, note that these do not correspond to the true values of this array. Rather, they map - /// to the real values. - keys: PrimitiveArray, - - /// Array of dictionary values (can by any DataType). - values: ArrayRef, - - /// Values are ordered. - is_ordered: bool, -} - -impl<'a, K: ArrowPrimitiveType> DictionaryArray { - /// Return an iterator to the keys of this dictionary. - pub fn keys(&self) -> &PrimitiveArray { - &self.keys - } - - /// Returns an array view of the keys of this dictionary - pub fn keys_array(&self) -> PrimitiveArray { - let data = self.data_ref(); - let keys_data = ArrayData::new( - K::DATA_TYPE, - data.len(), - Some(data.null_count()), - data.null_buffer().cloned(), - data.offset(), - data.buffers().to_vec(), - vec![], - ); - PrimitiveArray::::from(keys_data) - } - - /// Returns the lookup key by doing reverse dictionary lookup - pub fn lookup_key(&self, value: &str) -> Option { - let rd_buf: &StringArray = - self.values.as_any().downcast_ref::().unwrap(); - - (0..rd_buf.len()) - .position(|i| rd_buf.value(i) == value) - .map(K::Native::from_usize) - .flatten() - } - - /// Returns an `ArrayRef` to the dictionary values. - pub fn values(&self) -> ArrayRef { - self.values.clone() - } - - /// Returns a clone of the value type of this list. - pub fn value_type(&self) -> DataType { - self.values.data_ref().data_type().clone() - } - - /// The length of the dictionary is the length of the keys array. - pub fn len(&self) -> usize { - self.keys.len() - } - - /// Whether this dictionary is empty - pub fn is_empty(&self) -> bool { - self.keys.is_empty() - } - - // Currently exists for compatibility purposes with Arrow IPC. - pub fn is_ordered(&self) -> bool { - self.is_ordered - } -} - -/// Constructs a `DictionaryArray` from an array data reference. -impl From for DictionaryArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "DictionaryArray data should contain a single buffer only (keys)." - ); - assert_eq!( - data.child_data().len(), - 1, - "DictionaryArray should contain a single child array (values)." - ); - - if let DataType::Dictionary(key_data_type, _) = data.data_type() { - if key_data_type.as_ref() != &T::DATA_TYPE { - panic!("DictionaryArray's data type must match.") - }; - // create a zero-copy of the keys' data - let keys = PrimitiveArray::::from(ArrayData::new( - T::DATA_TYPE, - data.len(), - Some(data.null_count()), - data.null_buffer().cloned(), - data.offset(), - data.buffers().to_vec(), - vec![], - )); - let values = make_array(data.child_data()[0].clone()); - Self { - data, - keys, - values, - is_ordered: false, - } - } else { - panic!("DictionaryArray must have Dictionary data type.") - } - } -} - -/// Constructs a `DictionaryArray` from an iterator of optional strings. -impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator> - for DictionaryArray -{ - fn from_iter>>(iter: I) -> Self { - let it = iter.into_iter(); - let (lower, _) = it.size_hint(); - let key_builder = PrimitiveBuilder::::new(lower); - let value_builder = StringBuilder::new(256); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); - it.for_each(|i| { - if let Some(i) = i { - // Note: impl ... 
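The doc comments above describe collecting strings into a dictionary; a hedged sketch of that usage with `Int8Type` keys (as in the doc example), the `main` wrapper added only for illustration:

```rust
use arrow::array::{Array, DictionaryArray, Int8Array, StringArray};
use arrow::datatypes::Int8Type;

fn main() {
    // Collecting &str values dictionary-encodes them with Int8 keys.
    let array: DictionaryArray<Int8Type> = vec!["a", "a", "b", "c"].into_iter().collect();

    assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
    assert_eq!(array.lookup_key("c"), Some(2));
    assert_eq!(array.lookup_key("missing"), None);

    // The values child holds the distinct strings in insertion order.
    let values = array.values();
    let strings = values.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!("b", strings.value(1));
}
```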
for Result> fails with - // error[E0117]: only traits defined in the current crate can be implemented for arbitrary types - builder - .append(i) - .expect("Unable to append a value to a dictionary array."); - } else { - builder - .append_null() - .expect("Unable to append a null value to a dictionary array."); - } - }); - - builder.finish() - } -} - -/// Constructs a `DictionaryArray` from an iterator of strings. -impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator<&'a str> - for DictionaryArray -{ - fn from_iter>(iter: I) -> Self { - let it = iter.into_iter(); - let (lower, _) = it.size_hint(); - let key_builder = PrimitiveBuilder::::new(lower); - let value_builder = StringBuilder::new(256); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); - it.for_each(|i| { - builder - .append(i) - .expect("Unable to append a value to a dictionary array."); - }); - - builder.finish() - } -} - -impl Array for DictionaryArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - fn get_buffer_memory_size(&self) -> usize { - // Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`. - self.data.get_buffer_memory_size() - } - - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() - + self.keys.get_array_memory_size() - + self.values.get_array_memory_size() - + mem::size_of_val(self) - } -} - -impl fmt::Debug for DictionaryArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - writeln!( - f, - "DictionaryArray {{keys: {:?} values: {:?}}}", - self.keys, self.values - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::{ - array::Int16Array, - datatypes::{Int32Type, Int8Type, UInt32Type, UInt8Type}, - }; - use crate::{ - array::Int16DictionaryArray, array::PrimitiveDictionaryBuilder, - datatypes::DataType, - }; - use crate::{buffer::Buffer, datatypes::ToByteSlice}; - - #[test] - fn test_dictionary_array() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int8) - .len(8) - .add_buffer(Buffer::from( - &[10_i8, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(), - )) - .build(); - - // Construct a buffer for value offsets, for the nested array: - let keys = Buffer::from(&[2_i16, 3, 4].to_byte_slice()); - - // Construct a dictionary array from the above two - let key_type = DataType::Int16; - let value_type = DataType::Int8; - let dict_data_type = - DataType::Dictionary(Box::new(key_type), Box::new(value_type)); - let dict_data = ArrayData::builder(dict_data_type.clone()) - .len(3) - .add_buffer(keys.clone()) - .add_child_data(value_data.clone()) - .build(); - let dict_array = Int16DictionaryArray::from(dict_data); - - let values = dict_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int8, dict_array.value_type()); - assert_eq!(3, dict_array.len()); - - // Null count only makes sense in terms of the component arrays. 
- assert_eq!(0, dict_array.null_count()); - assert_eq!(0, dict_array.values().null_count()); - assert_eq!(dict_array.keys(), &Int16Array::from(vec![2_i16, 3, 4])); - - // Now test with a non-zero offset - let dict_data = ArrayData::builder(dict_data_type) - .len(2) - .offset(1) - .add_buffer(keys) - .add_child_data(value_data.clone()) - .build(); - let dict_array = Int16DictionaryArray::from(dict_data); - - let values = dict_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int8, dict_array.value_type()); - assert_eq!(2, dict_array.len()); - assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4])); - } - - #[test] - fn test_dictionary_array_fmt_debug() { - let key_builder = PrimitiveBuilder::::new(3); - let value_builder = PrimitiveBuilder::::new(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); - builder.append(12345678).unwrap(); - builder.append_null().unwrap(); - builder.append(22345678).unwrap(); - let array = builder.finish(); - assert_eq!( - "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n null,\n 1,\n] values: PrimitiveArray\n[\n 12345678,\n 22345678,\n]}\n", - format!("{:?}", array) - ); - - let key_builder = PrimitiveBuilder::::new(20); - let value_builder = PrimitiveBuilder::::new(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); - for _ in 0..20 { - builder.append(1).unwrap(); - } - let array = builder.finish(); - assert_eq!( - "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n] values: PrimitiveArray\n[\n 1,\n]}\n", - format!("{:?}", array) - ); - } - - #[test] - fn test_dictionary_array_from_iter() { - let test = vec!["a", "a", "b", "c"]; - let array: DictionaryArray = test - .iter() - .map(|&x| if x == "b" { None } else { Some(x) }) - .collect(); - assert_eq!( - "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n null,\n 1,\n] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n", - format!("{:?}", array) - ); - - let array: DictionaryArray = test.into_iter().collect(); - assert_eq!( - "DictionaryArray {keys: PrimitiveArray\n[\n 0,\n 0,\n 1,\n 2,\n] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", - format!("{:?}", array) - ); - } - - #[test] - fn test_dictionary_array_reverse_lookup_key() { - let test = vec!["a", "a", "b", "c"]; - let array: DictionaryArray = test.into_iter().collect(); - - assert_eq!(array.lookup_key("c"), Some(2)); - - // Direction of building a dictionary is the iterator direction - let test = vec!["t3", "t3", "t2", "t2", "t1", "t3", "t4", "t1", "t0"]; - let array: DictionaryArray = test.into_iter().collect(); - - assert_eq!(array.lookup_key("t1"), Some(2)); - assert_eq!(array.lookup_key("non-existent"), None); - } - - #[test] - fn test_dictionary_keys_as_primitive_array() { - let test = vec!["a", "b", "c", "a"]; - let array: DictionaryArray = test.into_iter().collect(); - - let keys = array.keys_array(); - assert_eq!(&DataType::Int8, keys.data_type()); - assert_eq!(0, keys.null_count()); - assert_eq!(&[0, 1, 2, 0], keys.values()); - } - - #[test] - fn test_dictionary_keys_as_primitive_array_with_null() { - let test = vec![Some("a"), None, Some("b"), None, None, Some("a")]; - let array: DictionaryArray = test.into_iter().collect(); - - let keys = array.keys_array(); - assert_eq!(&DataType::Int32, keys.data_type()); - assert_eq!(3, keys.null_count()); - - assert_eq!(true, keys.is_valid(0)); - assert_eq!(false, keys.is_valid(1)); - assert_eq!(true, 
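`keys_array` (removed above) re-exposes the dictionary keys, including their null bitmap, as a plain primitive array; a brief sketch mirroring `test_dictionary_keys_as_primitive_array`:

```rust
use arrow::array::{Array, DictionaryArray};
use arrow::datatypes::{DataType, Int8Type};

fn main() {
    let array: DictionaryArray<Int8Type> = vec!["a", "b", "c", "a"].into_iter().collect();

    // keys_array materializes the keys as a standalone primitive array,
    // useful for kernels that only need the indices.
    let keys = array.keys_array();
    assert_eq!(&DataType::Int8, keys.data_type());
    assert_eq!(&[0, 1, 2, 0], keys.values());
}
```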
keys.is_valid(2)); - assert_eq!(false, keys.is_valid(3)); - assert_eq!(false, keys.is_valid(4)); - assert_eq!(true, keys.is_valid(5)); - - assert_eq!(0, keys.value(0)); - assert_eq!(1, keys.value(2)); - assert_eq!(0, keys.value(5)); - } -} diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs deleted file mode 100644 index 0e334631adf..00000000000 --- a/rust/arrow/src/array/array_list.rs +++ /dev/null @@ -1,1056 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::fmt; -use std::mem; - -use num::Num; - -use super::{ - array::print_long_array, make_array, raw_pointer::RawPtrBox, Array, ArrayData, - ArrayRef, BooleanBufferBuilder, GenericListArrayIter, PrimitiveArray, -}; -use crate::{ - buffer::MutableBuffer, - datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType, Field}, - error::ArrowError, -}; - -/// trait declaring an offset size, relevant for i32 vs i64 array types. -pub trait OffsetSizeTrait: ArrowNativeType + Num + Ord + std::ops::AddAssign { - fn is_large() -> bool; -} - -impl OffsetSizeTrait for i32 { - #[inline] - fn is_large() -> bool { - false - } -} - -impl OffsetSizeTrait for i64 { - #[inline] - fn is_large() -> bool { - true - } -} - -pub struct GenericListArray { - data: ArrayData, - values: ArrayRef, - value_offsets: RawPtrBox, -} - -impl GenericListArray { - /// Returns a reference to the values of this list. - pub fn values(&self) -> ArrayRef { - self.values.clone() - } - - /// Returns a clone of the value type of this list. - pub fn value_type(&self) -> DataType { - self.values.data_ref().data_type().clone() - } - - /// Returns ith value of this list array. - /// # Safety - /// Caller must ensure that the index is within the array bounds - pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { - let end = *self.value_offsets().get_unchecked(i + 1); - let start = *self.value_offsets().get_unchecked(i); - self.values - .slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) - } - - /// Returns ith value of this list array. - pub fn value(&self, i: usize) -> ArrayRef { - let end = self.value_offsets()[i + 1]; - let start = self.value_offsets()[i]; - self.values - .slice(start.to_usize().unwrap(), (end - start).to_usize().unwrap()) - } - - /// Returns the offset values in the offsets buffer - #[inline] - pub fn value_offsets(&self) -> &[OffsetSize] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } - } - - /// Returns the length for value at index `i`. 
- #[inline] - pub fn value_length(&self, i: usize) -> OffsetSize { - let offsets = self.value_offsets(); - offsets[i + 1] - offsets[i] - } - - /// constructs a new iterator - pub fn iter<'a>(&'a self) -> GenericListArrayIter<'a, OffsetSize> { - GenericListArrayIter::<'a, OffsetSize>::new(&self) - } - - #[inline] - fn get_type(data_type: &DataType) -> Option<&DataType> { - if OffsetSize::is_large() { - if let DataType::LargeList(child) = data_type { - Some(child.data_type()) - } else { - None - } - } else if let DataType::List(child) = data_type { - Some(child.data_type()) - } else { - None - } - } - - /// Creates a [`GenericListArray`] from an iterator of primitive values - /// # Example - /// ``` - /// # use arrow::array::ListArray; - /// # use arrow::datatypes::Int32Type; - /// let data = vec![ - /// Some(vec![Some(0), Some(1), Some(2)]), - /// None, - /// Some(vec![Some(3), None, Some(5)]), - /// Some(vec![Some(6), Some(7)]), - /// ]; - /// let list_array = ListArray::from_iter_primitive::(data); - /// println!("{:?}", list_array); - /// ``` - pub fn from_iter_primitive(iter: I) -> Self - where - T: ArrowPrimitiveType, - P: AsRef<[Option<::Native>]> - + IntoIterator::Native>>, - I: IntoIterator>, - { - let iterator = iter.into_iter(); - let (lower, _) = iterator.size_hint(); - - let mut offsets = - MutableBuffer::new((lower + 1) * std::mem::size_of::()); - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - let mut null_buf = BooleanBufferBuilder::new(lower); - - let values: PrimitiveArray = iterator - .filter_map(|maybe_slice| { - // regardless of whether the item is Some, the offsets and null buffers must be updated. - match &maybe_slice { - Some(x) => { - length_so_far += - OffsetSize::from_usize(x.as_ref().len()).unwrap(); - null_buf.append(true); - } - None => null_buf.append(false), - }; - offsets.push(length_so_far); - maybe_slice - }) - .flatten() - .collect(); - - let field = Box::new(Field::new("item", T::DATA_TYPE, true)); - let data_type = if OffsetSize::is_large() { - DataType::LargeList(field) - } else { - DataType::List(field) - }; - let data = ArrayData::builder(data_type) - .len(null_buf.len()) - .add_buffer(offsets.into()) - .add_child_data(values.data().clone()) - .null_bit_buffer(null_buf.into()) - .build(); - Self::from(data) - } -} - -impl From for GenericListArray { - fn from(data: ArrayData) -> Self { - Self::try_new_from_array_data(data).expect( - "Expected infallable creation of GenericListArray from ArrayDataRef failed", - ) - } -} - -impl GenericListArray { - fn try_new_from_array_data(data: ArrayData) -> Result { - if data.buffers().len() != 1 { - return Err(ArrowError::InvalidArgumentError( - format!("ListArray data should contain a single buffer only (value offsets), had {}", - data.len()))); - } - - if data.child_data().len() != 1 { - return Err(ArrowError::InvalidArgumentError(format!( - "ListArray should contain a single child array (values array), had {}", - data.child_data().len() - ))); - } - - let values = data.child_data()[0].clone(); - - if let Some(child_data_type) = Self::get_type(data.data_type()) { - if values.data_type() != child_data_type { - return Err(ArrowError::InvalidArgumentError(format!( - "[Large]ListArray's child datatype {:?} does not \ - correspond to the List's datatype {:?}", - values.data_type(), - child_data_type - ))); - } - } else { - return Err(ArrowError::InvalidArgumentError(format!( - "[Large]ListArray's datatype must be [Large]ListArray(). 
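`from_iter_primitive` (whose doc example appears above) builds a `ListArray` from nested `Option`s; a sketch assuming `Int32Type` items as in that example, with the downcast pattern taken from the tests below:

```rust
use arrow::array::{Array, Int32Array, ListArray};
use arrow::datatypes::Int32Type;

fn main() {
    let data = vec![
        Some(vec![Some(0), Some(1), Some(2)]),
        None,
        Some(vec![Some(3), None, Some(5)]),
    ];
    let list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(data);

    assert_eq!(3, list_array.len());
    assert_eq!(1, list_array.null_count());
    assert_eq!(3, list_array.value_length(0));

    // Each element is itself an ArrayRef; downcast to read the child values.
    let first = list_array.value(0);
    let first = first.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(2, first.value(2));
}
```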
It is {:?}", - data.data_type() - ))); - } - - let values = make_array(values); - let value_offsets = data.buffers()[0].as_ptr(); - - let value_offsets = unsafe { RawPtrBox::::new(value_offsets) }; - unsafe { - if !(*value_offsets.as_ptr().offset(0)).is_zero() { - return Err(ArrowError::InvalidArgumentError(String::from( - "offsets do not start at zero", - ))); - } - } - Ok(Self { - data, - values, - value_offsets, - }) - } -} - -impl Array for GenericListArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [ListArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [ListArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } -} - -impl fmt::Debug for GenericListArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let prefix = if OffsetSize::is_large() { "Large" } else { "" }; - - write!(f, "{}ListArray\n[\n", prefix)?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -/// A list array where each element is a variable-sized sequence of values with the same -/// type whose memory offsets between elements are represented by a i32. -pub type ListArray = GenericListArray; - -/// A list array where each element is a variable-sized sequence of values with the same -/// type whose memory offsets between elements are represented by a i64. -pub type LargeListArray = GenericListArray; - -/// A list array where each element is a fixed-size sequence of values with the same -/// type whose maximum length is represented by a i32. -pub struct FixedSizeListArray { - data: ArrayData, - values: ArrayRef, - length: i32, -} - -impl FixedSizeListArray { - /// Returns a reference to the values of this list. - pub fn values(&self) -> ArrayRef { - self.values.clone() - } - - /// Returns a clone of the value type of this list. - pub fn value_type(&self) -> DataType { - self.values.data_ref().data_type().clone() - } - - /// Returns ith value of this list array. - pub fn value(&self, i: usize) -> ArrayRef { - self.values - .slice(self.value_offset(i) as usize, self.value_length() as usize) - } - - /// Returns the offset for value at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) - } - - /// Returns the length for value at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. 
- #[inline] - pub const fn value_length(&self) -> i32 { - self.length - } - - #[inline] - const fn value_offset_at(&self, i: usize) -> i32 { - i as i32 * self.length - } -} - -impl From for FixedSizeListArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 0, - "FixedSizeListArray data should not contain a buffer for value offsets" - ); - assert_eq!( - data.child_data().len(), - 1, - "FixedSizeListArray should contain a single child array (values array)" - ); - let values = make_array(data.child_data()[0].clone()); - let length = match data.data_type() { - DataType::FixedSizeList(_, len) => { - if *len > 0 { - // check that child data is multiple of length - assert_eq!( - values.len() % *len as usize, - 0, - "FixedSizeListArray child array length should be a multiple of {}", - len - ); - } - - *len - } - _ => { - panic!("FixedSizeListArray data should contain a FixedSizeList data type") - } - }; - Self { - data, - values, - length, - } - } -} - -impl Array for FixedSizeListArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeListArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [FixedSizeListArray]. - fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() - + self.values().get_array_memory_size() - + mem::size_of_val(self) - } -} - -impl fmt::Debug for FixedSizeListArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "FixedSizeListArray<{}>\n[\n", self.value_length())?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -#[cfg(test)] -mod tests { - use crate::{ - alloc, - array::ArrayData, - array::Int32Array, - buffer::Buffer, - datatypes::Field, - datatypes::{Int32Type, ToByteSlice}, - util::bit_util, - }; - - use super::*; - - fn create_from_buffers() -> ListArray { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice())) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice()); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - ListArray::from(list_data) - } - - #[test] - fn test_from_iter_primitive() { - let data = vec![ - Some(vec![Some(0), Some(1), Some(2)]), - Some(vec![Some(3), Some(4), Some(5)]), - Some(vec![Some(6), Some(7)]), - ]; - let list_array = ListArray::from_iter_primitive::(data); - - let another = create_from_buffers(); - assert_eq!(list_array, another) - } - - #[test] - fn test_list_array() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); - - // Construct a list array from the 
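The `create_from_buffers` helper and `test_list_array` above build a `ListArray` directly from `ArrayData` (child values plus an offsets buffer); a condensed sketch of that construction:

```rust
use arrow::array::{Array, ArrayData, ListArray};
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType, Field};

fn main() {
    // Child values [0..8) and offsets [0, 3, 6, 8] encode [[0, 1, 2], [3, 4, 5], [6, 7]].
    let value_data = ArrayData::builder(DataType::Int32)
        .len(8)
        .add_buffer(Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4, 5, 6, 7]))
        .build();
    let value_offsets = Buffer::from_slice_ref(&[0i32, 3, 6, 8]);

    let list_data_type =
        DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
    let list_data = ArrayData::builder(list_data_type)
        .len(3)
        .add_buffer(value_offsets)
        .add_child_data(value_data)
        .build();
    let list_array = ListArray::from(list_data);

    assert_eq!(3, list_array.len());
    assert_eq!(&[0, 3, 6, 8], list_array.value_offsets());
    assert_eq!(2, list_array.value_length(2));
}
```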
above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type.clone()) - .len(3) - .add_buffer(value_offsets.clone()) - .add_child_data(value_data.clone()) - .build(); - let list_array = ListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offsets()[2]); - assert_eq!(2, list_array.value_length(2)); - assert_eq!( - 0, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - assert_eq!( - 0, - unsafe { list_array.value_unchecked(0) } - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - - // Now test with a non-zero offset - let list_data = ArrayData::builder(list_data_type) - .len(3) - .offset(1) - .add_buffer(value_offsets) - .add_child_data(value_data.clone()) - .build(); - let list_array = ListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offsets()[1]); - assert_eq!(2, list_array.value_length(1)); - assert_eq!( - 3, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - assert_eq!( - 3, - unsafe { list_array.value_unchecked(0) } - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - } - - #[test] - fn test_large_list_array() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 8]); - - // Construct a list array from the above two - let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type.clone()) - .len(3) - .add_buffer(value_offsets.clone()) - .add_child_data(value_data.clone()) - .build(); - let list_array = LargeListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offsets()[2]); - assert_eq!(2, list_array.value_length(2)); - assert_eq!( - 0, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - assert_eq!( - 0, - unsafe { list_array.value_unchecked(0) } - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - - // Now test with a non-zero offset - let list_data = ArrayData::builder(list_data_type) - .len(3) - .offset(1) - .add_buffer(value_offsets) - .add_child_data(value_data.clone()) - .build(); - let list_array = LargeListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - 
assert_eq!(6, list_array.value_offsets()[1]); - assert_eq!(2, list_array.value_length(1)); - assert_eq!( - 3, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - assert_eq!( - 3, - unsafe { list_array.value_unchecked(0) } - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - } - - #[test] - fn test_fixed_size_list_array() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(9) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8])) - .build(); - - // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), - 3, - ); - let list_data = ArrayData::builder(list_data_type.clone()) - .len(3) - .add_child_data(value_data.clone()) - .build(); - let list_array = FixedSizeListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - assert_eq!( - 0, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - - // Now test with a non-zero offset - let list_data = ArrayData::builder(list_data_type) - .len(3) - .offset(1) - .add_child_data(value_data.clone()) - .build(); - let list_array = FixedSizeListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!( - 3, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - assert_eq!(6, list_array.value_offset(1)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - #[should_panic( - expected = "FixedSizeListArray child array length should be a multiple of 3" - )] - fn test_fixed_size_list_array_unequal_children() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - - // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), - 3, - ); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_child_data(value_data) - .build(); - FixedSizeListArray::from(list_data); - } - - #[test] - fn test_list_array_slice() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref(&[0, 2, 2, 2, 4, 6, 6, 9, 9, 10]); - // 01011001 00000001 - let mut null_bits: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut null_bits, 0); - bit_util::set_bit(&mut null_bits, 3); - bit_util::set_bit(&mut null_bits, 4); - bit_util::set_bit(&mut null_bits, 6); - bit_util::set_bit(&mut null_bits, 8); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = 
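Likewise, `FixedSizeListArray` is built from a child array plus a fixed element length, as in `test_fixed_size_list_array` above; a condensed sketch:

```rust
use arrow::array::{Array, ArrayData, FixedSizeListArray};
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType, Field};

fn main() {
    // Nine child values grouped into three fixed-size lists of length 3.
    let value_data = ArrayData::builder(DataType::Int32)
        .len(9)
        .add_buffer(Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4, 5, 6, 7, 8]))
        .build();

    let list_data_type = DataType::FixedSizeList(
        Box::new(Field::new("item", DataType::Int32, false)),
        3,
    );
    let list_data = ArrayData::builder(list_data_type)
        .len(3)
        .add_child_data(value_data)
        .build();
    let list_array = FixedSizeListArray::from(list_data);

    assert_eq!(3, list_array.len());
    assert_eq!(3, list_array.value_length());
    assert_eq!(6, list_array.value_offset(2));
}
```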
ArrayData::builder(list_data_type) - .len(9) - .add_buffer(value_offsets) - .add_child_data(value_data.clone()) - .null_bit_buffer(Buffer::from(null_bits)) - .build(); - let list_array = ListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(9, list_array.len()); - assert_eq!(4, list_array.null_count()); - assert_eq!(2, list_array.value_offsets()[3]); - assert_eq!(2, list_array.value_length(3)); - - let sliced_array = list_array.slice(1, 6); - assert_eq!(6, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); - assert_eq!(3, sliced_array.null_count()); - - for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { - assert!(sliced_array.is_valid(i)); - } else { - assert!(sliced_array.is_null(i)); - } - } - - // Check offset and length for each non-null value. - let sliced_list_array = - sliced_array.as_any().downcast_ref::().unwrap(); - assert_eq!(2, sliced_list_array.value_offsets()[2]); - assert_eq!(2, sliced_list_array.value_length(2)); - assert_eq!(4, sliced_list_array.value_offsets()[3]); - assert_eq!(2, sliced_list_array.value_length(3)); - assert_eq!(6, sliced_list_array.value_offsets()[5]); - assert_eq!(3, sliced_list_array.value_length(5)); - } - - #[test] - fn test_large_list_array_slice() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref(&[0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); - // 01011001 00000001 - let mut null_bits: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut null_bits, 0); - bit_util::set_bit(&mut null_bits, 3); - bit_util::set_bit(&mut null_bits, 4); - bit_util::set_bit(&mut null_bits, 6); - bit_util::set_bit(&mut null_bits, 8); - - // Construct a list array from the above two - let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(9) - .add_buffer(value_offsets) - .add_child_data(value_data.clone()) - .null_bit_buffer(Buffer::from(null_bits)) - .build(); - let list_array = LargeListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(9, list_array.len()); - assert_eq!(4, list_array.null_count()); - assert_eq!(2, list_array.value_offsets()[3]); - assert_eq!(2, list_array.value_length(3)); - - let sliced_array = list_array.slice(1, 6); - assert_eq!(6, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); - assert_eq!(3, sliced_array.null_count()); - - for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { - assert!(sliced_array.is_valid(i)); - } else { - assert!(sliced_array.is_null(i)); - } - } - - // Check offset and length for each non-null value. 
- let sliced_list_array = sliced_array - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(2, sliced_list_array.value_offsets()[2]); - assert_eq!(2, sliced_list_array.value_length(2)); - assert_eq!(4, sliced_list_array.value_offsets()[3]); - assert_eq!(2, sliced_list_array.value_length(3)); - assert_eq!(6, sliced_list_array.value_offsets()[5]); - assert_eq!(3, sliced_list_array.value_length(5)); - } - - #[test] - #[should_panic(expected = "index out of bounds: the len is 10 but the index is 11")] - fn test_list_array_index_out_of_bound() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref(&[0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); - // 01011001 00000001 - let mut null_bits: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut null_bits, 0); - bit_util::set_bit(&mut null_bits, 3); - bit_util::set_bit(&mut null_bits, 4); - bit_util::set_bit(&mut null_bits, 6); - bit_util::set_bit(&mut null_bits, 8); - - // Construct a list array from the above two - let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(9) - .add_buffer(value_offsets) - .add_child_data(value_data) - .null_bit_buffer(Buffer::from(null_bits)) - .build(); - let list_array = LargeListArray::from(list_data); - assert_eq!(9, list_array.len()); - - list_array.value(10); - } - - #[test] - fn test_fixed_size_list_array_slice() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); - - // Set null buts for the nested array: - // [[0, 1], null, null, [6, 7], [8, 9]] - // 01011001 00000001 - let mut null_bits: [u8; 1] = [0; 1]; - bit_util::set_bit(&mut null_bits, 0); - bit_util::set_bit(&mut null_bits, 3); - bit_util::set_bit(&mut null_bits, 4); - - // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), - 2, - ); - let list_data = ArrayData::builder(list_data_type) - .len(5) - .add_child_data(value_data.clone()) - .null_bit_buffer(Buffer::from(null_bits)) - .build(); - let list_array = FixedSizeListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(5, list_array.len()); - assert_eq!(2, list_array.null_count()); - assert_eq!(6, list_array.value_offset(3)); - assert_eq!(2, list_array.value_length()); - - let sliced_array = list_array.slice(1, 4); - assert_eq!(4, sliced_array.len()); - assert_eq!(1, sliced_array.offset()); - assert_eq!(2, sliced_array.null_count()); - - for i in 0..sliced_array.len() { - if bit_util::get_bit(&null_bits, sliced_array.offset() + i) { - assert!(sliced_array.is_valid(i)); - } else { - assert!(sliced_array.is_null(i)); - } - } - - // Check offset and length for each non-null value. 
- let sliced_list_array = sliced_array - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(2, sliced_list_array.value_length()); - assert_eq!(6, sliced_list_array.value_offset(2)); - assert_eq!(8, sliced_list_array.value_offset(3)); - } - - #[test] - #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")] - fn test_fixed_size_list_array_index_out_of_bound() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); - - // Set null buts for the nested array: - // [[0, 1], null, null, [6, 7], [8, 9]] - // 01011001 00000001 - let mut null_bits: [u8; 1] = [0; 1]; - bit_util::set_bit(&mut null_bits, 0); - bit_util::set_bit(&mut null_bits, 3); - bit_util::set_bit(&mut null_bits, 4); - - // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), - 2, - ); - let list_data = ArrayData::builder(list_data_type) - .len(5) - .add_child_data(value_data) - .null_bit_buffer(Buffer::from(null_bits)) - .build(); - let list_array = FixedSizeListArray::from(list_data); - - list_array.value(10); - } - - #[test] - #[should_panic( - expected = "ListArray data should contain a single buffer only (value offsets)" - )] - fn test_list_array_invalid_buffer_len() { - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_child_data(value_data) - .build(); - ListArray::from(list_data); - } - - #[test] - #[should_panic( - expected = "ListArray should contain a single child array (values array)" - )] - fn test_list_array_invalid_child_array_len() { - let value_offsets = Buffer::from_slice_ref(&[0, 2, 5, 7]); - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .build(); - ListArray::from(list_data); - } - - #[test] - #[should_panic(expected = "offsets do not start at zero")] - fn test_list_array_invalid_value_offset_start() { - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - - let value_offsets = Buffer::from_slice_ref(&[2, 2, 5, 7]); - - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - ListArray::from(list_data); - } - - #[test] - #[should_panic(expected = "memory is not aligned")] - fn test_primitive_array_alignment() { - let ptr = alloc::allocate_aligned::(8); - let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; - let buf2 = buf.slice(1); - let array_data = ArrayData::builder(DataType::Int32).add_buffer(buf2).build(); - Int32Array::from(array_data); - } - - #[test] - #[should_panic(expected = "memory is not aligned")] - fn test_list_array_alignment() { - let ptr = alloc::allocate_aligned::(8); - let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; - let buf2 = buf.slice(1); - - let values: [i32; 8] = [0; 8]; - let value_data = ArrayData::builder(DataType::Int32) - .add_buffer(Buffer::from_slice_ref(&values)) - 
.build(); - - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .add_buffer(buf2) - .add_child_data(value_data) - .build(); - ListArray::from(list_data); - } -} diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs deleted file mode 100644 index d2b3b6686d9..00000000000 --- a/rust/arrow/src/array/array_primitive.rs +++ /dev/null @@ -1,942 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::borrow::Borrow; -use std::convert::From; -use std::fmt; -use std::iter::{FromIterator, IntoIterator}; -use std::mem; - -use chrono::{prelude::*, Duration}; - -use super::array::print_long_array; -use super::raw_pointer::RawPtrBox; -use super::*; -use crate::temporal_conversions; -use crate::util::bit_util; -use crate::{ - buffer::{Buffer, MutableBuffer}, - util::trusted_len_unzip, -}; - -/// Number of seconds in a day -const SECONDS_IN_DAY: i64 = 86_400; -/// Number of milliseconds in a second -const MILLISECONDS: i64 = 1_000; -/// Number of microseconds in a second -const MICROSECONDS: i64 = 1_000_000; -/// Number of nanoseconds in a second -const NANOSECONDS: i64 = 1_000_000_000; - -/// Array whose elements are of primitive types. -pub struct PrimitiveArray { - /// Underlying ArrayData - /// # Safety - /// must have exactly one buffer, aligned to type T - data: ArrayData, - /// Pointer to the value array. The lifetime of this must be <= to the value buffer - /// stored in `data`, so it's safe to store. - /// # Safety - /// raw_values must have a value equivalent to `data.buffers()[0].raw_data()` - /// raw_values must have alignment for type T::NativeType - raw_values: RawPtrBox, -} - -impl PrimitiveArray { - /// Returns the length of this array. - #[inline] - pub fn len(&self) -> usize { - self.data.len() - } - - /// Returns whether this array is empty. - pub fn is_empty(&self) -> bool { - self.data.is_empty() - } - - /// Returns a slice of the values of this array - #[inline] - pub fn values(&self) -> &[T::Native] { - // Soundness - // raw_values alignment & location is ensured by fn from(ArrayDataRef) - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.raw_values.as_ptr().add(self.data.offset()), - self.len(), - ) - } - } - - // Returns a new primitive array builder - pub fn builder(capacity: usize) -> PrimitiveBuilder { - PrimitiveBuilder::::new(capacity) - } - - /// Returns the primitive value at index `i`. 
- /// - /// # Safety - /// - /// caller must ensure that the passed in offset is less than the array len() - pub unsafe fn value_unchecked(&self, i: usize) -> T::Native { - let offset = i + self.offset(); - *self.raw_values.as_ptr().add(offset) - } - - /// Returns the primitive value at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - /// # Safety - /// caller must ensure that the passed in offset is less than the array len() - pub fn value(&self, i: usize) -> T::Native { - debug_assert!(i < self.len()); - unsafe { self.value_unchecked(i) } - } - - /// Creates a PrimitiveArray based on an iterator of values without nulls - pub fn from_iter_values>(iter: I) -> Self { - let val_buf: Buffer = iter.into_iter().collect(); - let data = ArrayData::new( - T::DATA_TYPE, - val_buf.len() / mem::size_of::<::Native>(), - None, - None, - 0, - vec![val_buf], - vec![], - ); - PrimitiveArray::from(data) - } - - /// Creates a PrimitiveArray based on a constant value with `count` elements - pub fn from_value(value: T::Native, count: usize) -> Self { - // # Safety: length is known - let val_buf = unsafe { Buffer::from_trusted_len_iter((0..count).map(|_| value)) }; - let data = ArrayData::new( - T::DATA_TYPE, - val_buf.len() / mem::size_of::<::Native>(), - None, - None, - 0, - vec![val_buf], - vec![], - ); - PrimitiveArray::from(data) - } -} - -impl Array for PrimitiveArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [PrimitiveArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [PrimitiveArray]. 
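`from_iter_values` and `from_value` (removed above) build dense primitive arrays without a validity bitmap; a short sketch using `Int32Array`, with the concrete values chosen only for illustration:

```rust
use arrow::array::{Array, Int32Array};

fn main() {
    // Dense construction without a null buffer.
    let a = Int32Array::from_iter_values(0..5);
    assert_eq!(5, a.len());
    assert_eq!(0, a.null_count());
    assert_eq!(&[0, 1, 2, 3, 4], a.values());

    // Broadcast a single value `count` times.
    let b = Int32Array::from_value(7, 3);
    assert_eq!(&[7, 7, 7], b.values());
}
```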
- fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of::>() - } -} - -fn as_datetime(v: i64) -> Option { - match T::DATA_TYPE { - DataType::Date32 => Some(temporal_conversions::date32_to_datetime(v as i32)), - DataType::Date64 => Some(temporal_conversions::date64_to_datetime(v)), - DataType::Time32(_) | DataType::Time64(_) => None, - DataType::Timestamp(unit, _) => match unit { - TimeUnit::Second => Some(temporal_conversions::timestamp_s_to_datetime(v)), - TimeUnit::Millisecond => { - Some(temporal_conversions::timestamp_ms_to_datetime(v)) - } - TimeUnit::Microsecond => { - Some(temporal_conversions::timestamp_us_to_datetime(v)) - } - TimeUnit::Nanosecond => { - Some(temporal_conversions::timestamp_ns_to_datetime(v)) - } - }, - // interval is not yet fully documented [ARROW-3097] - DataType::Interval(_) => None, - _ => None, - } -} - -fn as_date(v: i64) -> Option { - as_datetime::(v).map(|datetime| datetime.date()) -} - -fn as_time(v: i64) -> Option { - match T::DATA_TYPE { - DataType::Time32(unit) => { - // safe to immediately cast to u32 as `self.value(i)` is positive i32 - let v = v as u32; - match unit { - TimeUnit::Second => Some(temporal_conversions::time32s_to_time(v as i32)), - TimeUnit::Millisecond => { - Some(temporal_conversions::time32ms_to_time(v as i32)) - } - _ => None, - } - } - DataType::Time64(unit) => match unit { - TimeUnit::Microsecond => Some(temporal_conversions::time64us_to_time(v)), - TimeUnit::Nanosecond => Some(temporal_conversions::time64ns_to_time(v)), - _ => None, - }, - DataType::Timestamp(_, _) => as_datetime::(v).map(|datetime| datetime.time()), - DataType::Date32 | DataType::Date64 => Some(NaiveTime::from_hms(0, 0, 0)), - DataType::Interval(_) => None, - _ => None, - } -} - -fn as_duration(v: i64) -> Option { - match T::DATA_TYPE { - DataType::Duration(unit) => match unit { - TimeUnit::Second => Some(temporal_conversions::duration_s_to_duration(v)), - TimeUnit::Millisecond => { - Some(temporal_conversions::duration_ms_to_duration(v)) - } - TimeUnit::Microsecond => { - Some(temporal_conversions::duration_us_to_duration(v)) - } - TimeUnit::Nanosecond => { - Some(temporal_conversions::duration_ns_to_duration(v)) - } - }, - _ => None, - } -} - -impl PrimitiveArray -where - i64: std::convert::From, -{ - /// Returns value as a chrono `NaiveDateTime`, handling time resolution - /// - /// If a data type cannot be converted to `NaiveDateTime`, a `None` is returned. - /// A valid value is expected, thus the user should first check for validity. 
- pub fn value_as_datetime(&self, i: usize) -> Option { - as_datetime::(i64::from(self.value(i))) - } - - /// Returns value as a chrono `NaiveDate` by using `Self::datetime()` - /// - /// If a data type cannot be converted to `NaiveDate`, a `None` is returned - pub fn value_as_date(&self, i: usize) -> Option { - self.value_as_datetime(i).map(|datetime| datetime.date()) - } - - /// Returns a value as a chrono `NaiveTime` - /// - /// `Date32` and `Date64` return UTC midnight as they do not have time resolution - pub fn value_as_time(&self, i: usize) -> Option { - as_time::(i64::from(self.value(i))) - } - - /// Returns a value as a chrono `Duration` - /// - /// If a data type cannot be converted to `Duration`, a `None` is returned - pub fn value_as_duration(&self, i: usize) -> Option { - as_duration::(i64::from(self.value(i))) - } -} - -impl fmt::Debug for PrimitiveArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "PrimitiveArray<{:?}>\n[\n", T::DATA_TYPE)?; - print_long_array(self, f, |array, index, f| match T::DATA_TYPE { - DataType::Date32 | DataType::Date64 => { - let v = self.value(index).to_isize().unwrap() as i64; - match as_date::(v) { - Some(date) => write!(f, "{:?}", date), - None => write!(f, "null"), - } - } - DataType::Time32(_) | DataType::Time64(_) => { - let v = self.value(index).to_isize().unwrap() as i64; - match as_time::(v) { - Some(time) => write!(f, "{:?}", time), - None => write!(f, "null"), - } - } - DataType::Timestamp(_, _) => { - let v = self.value(index).to_isize().unwrap() as i64; - match as_datetime::(v) { - Some(datetime) => write!(f, "{:?}", datetime), - None => write!(f, "null"), - } - } - _ => fmt::Debug::fmt(&array.value(index), f), - })?; - write!(f, "]") - } -} - -impl<'a, T: ArrowPrimitiveType> IntoIterator for &'a PrimitiveArray { - type Item = Option<::Native>; - type IntoIter = PrimitiveIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - PrimitiveIter::<'a, T>::new(self) - } -} - -impl<'a, T: ArrowPrimitiveType> PrimitiveArray { - /// constructs a new iterator - pub fn iter(&'a self) -> PrimitiveIter<'a, T> { - PrimitiveIter::<'a, T>::new(&self) - } -} - -impl::Native>>> - FromIterator for PrimitiveArray -{ - fn from_iter>(iter: I) -> Self { - let iter = iter.into_iter(); - let (lower, _) = iter.size_hint(); - - let mut null_buf = BooleanBufferBuilder::new(lower); - - let buffer: Buffer = iter - .map(|item| { - if let Some(a) = item.borrow() { - null_buf.append(true); - *a - } else { - null_buf.append(false); - // this ensures that null items on the buffer are not arbitrary. - // This is important because falible operations can use null values (e.g. a vectorized "add") - // which may panic (e.g. overflow if the number on the slots happen to be very large). - T::Native::default() - } - }) - .collect(); - - let data = ArrayData::new( - T::DATA_TYPE, - null_buf.len(), - None, - Some(null_buf.into()), - 0, - vec![buffer], - vec![], - ); - PrimitiveArray::from(data) - } -} - -impl PrimitiveArray { - /// Creates a [`PrimitiveArray`] from an iterator of trusted length. - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. 
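The temporal accessors above (`value_as_datetime`, `value_as_date`, `value_as_time`, `value_as_duration`) convert raw values into chrono types; a sketch that assumes the `Date64Array` and `Time32MillisecondArray` aliases exist in this crate version, with values taken from the tests below:

```rust
use arrow::array::{Date64Array, Time32MillisecondArray};

fn main() {
    // Date64 stores milliseconds since the epoch; value_as_datetime converts
    // a valid slot to a chrono NaiveDateTime.
    let dates = Date64Array::from(vec![Some(1550902545147), None]);
    let dt = dates.value_as_datetime(0).unwrap();
    assert_eq!(1550902545147, dt.timestamp_millis());

    // Time32(Millisecond) has no date component, so only value_as_time succeeds.
    let times = Time32MillisecondArray::from(vec![37_800_005]);
    assert_eq!(None, times.value_as_datetime(0));
    let t = times.value_as_time(0).unwrap();
    assert_eq!("10:30:00.005", t.format("%H:%M:%S%.3f").to_string());
}
```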
- #[inline] - pub unsafe fn from_trusted_len_iter(iter: I) -> Self - where - P: std::borrow::Borrow::Native>>, - I: IntoIterator, - { - let iterator = iter.into_iter(); - let (_, upper) = iterator.size_hint(); - let len = upper.expect("trusted_len_unzip requires an upper limit"); - - let (null, buffer) = trusted_len_unzip(iterator); - - let data = - ArrayData::new(T::DATA_TYPE, len, None, Some(null), 0, vec![buffer], vec![]); - PrimitiveArray::from(data) - } -} - -// TODO: the macro is needed here because we'd get "conflicting implementations" error -// otherwise with both `From>` and `From>>`. -// We should revisit this in future. -macro_rules! def_numeric_from_vec { - ( $ty:ident ) => { - impl From::Native>> for PrimitiveArray<$ty> { - fn from(data: Vec<<$ty as ArrowPrimitiveType>::Native>) -> Self { - let array_data = ArrayData::builder($ty::DATA_TYPE) - .len(data.len()) - .add_buffer(Buffer::from_slice_ref(&data)) - .build(); - PrimitiveArray::from(array_data) - } - } - - // Constructs a primitive array from a vector. Should only be used for testing. - impl From::Native>>> - for PrimitiveArray<$ty> - { - fn from(data: Vec::Native>>) -> Self { - PrimitiveArray::from_iter(data.iter()) - } - } - }; -} - -def_numeric_from_vec!(Int8Type); -def_numeric_from_vec!(Int16Type); -def_numeric_from_vec!(Int32Type); -def_numeric_from_vec!(Int64Type); -def_numeric_from_vec!(UInt8Type); -def_numeric_from_vec!(UInt16Type); -def_numeric_from_vec!(UInt32Type); -def_numeric_from_vec!(UInt64Type); -def_numeric_from_vec!(Float32Type); -def_numeric_from_vec!(Float64Type); - -def_numeric_from_vec!(Date32Type); -def_numeric_from_vec!(Date64Type); -def_numeric_from_vec!(Time32SecondType); -def_numeric_from_vec!(Time32MillisecondType); -def_numeric_from_vec!(Time64MicrosecondType); -def_numeric_from_vec!(Time64NanosecondType); -def_numeric_from_vec!(IntervalYearMonthType); -def_numeric_from_vec!(IntervalDayTimeType); -def_numeric_from_vec!(DurationSecondType); -def_numeric_from_vec!(DurationMillisecondType); -def_numeric_from_vec!(DurationMicrosecondType); -def_numeric_from_vec!(DurationNanosecondType); -def_numeric_from_vec!(TimestampSecondType); -def_numeric_from_vec!(TimestampMillisecondType); -def_numeric_from_vec!(TimestampMicrosecondType); -def_numeric_from_vec!(TimestampNanosecondType); - -impl PrimitiveArray { - /// Construct a timestamp array from a vec of i64 values and an optional timezone - pub fn from_vec(data: Vec, timezone: Option) -> Self { - let array_data = - ArrayData::builder(DataType::Timestamp(T::get_time_unit(), timezone)) - .len(data.len()) - .add_buffer(Buffer::from_slice_ref(&data)) - .build(); - PrimitiveArray::from(array_data) - } -} - -impl PrimitiveArray { - /// Construct a timestamp array from a vec of Option values and an optional timezone - pub fn from_opt_vec(data: Vec>, timezone: Option) -> Self { - // TODO: duplicated from def_numeric_from_vec! 
macro, it looks possible to convert to generic - let data_len = data.len(); - let mut null_buf = MutableBuffer::new_null(data_len); - let mut val_buf = MutableBuffer::new(data_len * mem::size_of::()); - - { - let null_slice = null_buf.as_slice_mut(); - for (i, v) in data.iter().enumerate() { - if let Some(n) = v { - bit_util::set_bit(null_slice, i); - val_buf.push(*n); - } else { - val_buf.push(0i64); - } - } - } - - let array_data = - ArrayData::builder(DataType::Timestamp(T::get_time_unit(), timezone)) - .len(data_len) - .add_buffer(val_buf.into()) - .null_bit_buffer(null_buf.into()) - .build(); - PrimitiveArray::from(array_data) - } -} - -/// Constructs a `PrimitiveArray` from an array data reference. -impl From for PrimitiveArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "PrimitiveArray data should contain a single buffer only (values buffer)" - ); - - let ptr = data.buffers()[0].as_ptr(); - Self { - data, - raw_values: unsafe { RawPtrBox::new(ptr) }, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use std::thread; - - use crate::buffer::Buffer; - use crate::datatypes::DataType; - - #[test] - fn test_primitive_array_from_vec() { - let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 4]); - let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); - assert_eq!(buf, arr.data.buffers()[0]); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - - assert_eq!(64, arr.get_buffer_memory_size()); - assert_eq!(136, arr.get_array_memory_size()); - } - - #[test] - fn test_primitive_array_from_vec_option() { - // Test building a primitive array with null values - let arr = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(2, arr.null_count()); - for i in 0..5 { - if i % 2 == 0 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } else { - assert!(arr.is_null(i)); - assert!(!arr.is_valid(i)); - } - } - - assert_eq!(128, arr.get_buffer_memory_size()); - assert_eq!(216, arr.get_array_memory_size()); - } - - #[test] - fn test_date64_array_from_vec_option() { - // Test building a primitive array with null values - // we use Int32 and Int64 as a backing array, so all Int32 and Int64 conventions - // work - let arr: PrimitiveArray = - vec![Some(1550902545147), None, Some(1550902545147)].into(); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(1, arr.null_count()); - for i in 0..3 { - if i % 2 == 0 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(1550902545147, arr.value(i)); - // roundtrip to and from datetime - assert_eq!( - 1550902545147, - arr.value_as_datetime(i).unwrap().timestamp_millis() - ); - } else { - assert!(arr.is_null(i)); - assert!(!arr.is_valid(i)); - } - } - } - - #[test] - fn test_time32_millisecond_array_from_vec() { - // 1: 00:00:00.001 - // 37800005: 10:30:00.005 - // 86399210: 23:59:59.210 - let arr: PrimitiveArray = - vec![1, 37_800_005, 86_399_210].into(); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - let formatted = vec!["00:00:00.001", "10:30:00.005", "23:59:59.210"]; - for (i, formatted) in formatted.iter().enumerate().take(3) { - // check that we can't create dates or datetimes from time instances - assert_eq!(None, arr.value_as_datetime(i)); - assert_eq!(None, 
arr.value_as_date(i)); - let time = arr.value_as_time(i).unwrap(); - assert_eq!(*formatted, time.format("%H:%M:%S%.3f").to_string()); - } - } - - #[test] - fn test_time64_nanosecond_array_from_vec() { - // Test building a primitive array with null values - // we use Int32 and Int64 as a backing array, so all Int32 and Int64 conventions - // work - - // 1e6: 00:00:00.001 - // 37800005e6: 10:30:00.005 - // 86399210e6: 23:59:59.210 - let arr: PrimitiveArray = - vec![1_000_000, 37_800_005_000_000, 86_399_210_000_000].into(); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - let formatted = vec!["00:00:00.001", "10:30:00.005", "23:59:59.210"]; - for (i, item) in formatted.iter().enumerate().take(3) { - // check that we can't create dates or datetimes from time instances - assert_eq!(None, arr.value_as_datetime(i)); - assert_eq!(None, arr.value_as_date(i)); - let time = arr.value_as_time(i).unwrap(); - assert_eq!(*item, time.format("%H:%M:%S%.3f").to_string()); - } - } - - #[test] - fn test_interval_array_from_vec() { - // intervals are currently not treated specially, but are Int32 and Int64 arrays - let arr = IntervalYearMonthArray::from(vec![Some(1), None, Some(-5)]); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(1, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(1, arr.values()[0]); - assert!(arr.is_null(1)); - assert_eq!(-5, arr.value(2)); - assert_eq!(-5, arr.values()[2]); - - // a day_time interval contains days and milliseconds, but we do not yet have accessors for the values - let arr = IntervalDayTimeArray::from(vec![Some(1), None, Some(-5)]); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(1, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(1, arr.values()[0]); - assert!(arr.is_null(1)); - assert_eq!(-5, arr.value(2)); - assert_eq!(-5, arr.values()[2]); - } - - #[test] - fn test_duration_array_from_vec() { - let arr = DurationSecondArray::from(vec![Some(1), None, Some(-5)]); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(1, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(1, arr.values()[0]); - assert!(arr.is_null(1)); - assert_eq!(-5, arr.value(2)); - assert_eq!(-5, arr.values()[2]); - - let arr = DurationMillisecondArray::from(vec![Some(1), None, Some(-5)]); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(1, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(1, arr.values()[0]); - assert!(arr.is_null(1)); - assert_eq!(-5, arr.value(2)); - assert_eq!(-5, arr.values()[2]); - - let arr = DurationMicrosecondArray::from(vec![Some(1), None, Some(-5)]); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(1, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(1, arr.values()[0]); - assert!(arr.is_null(1)); - assert_eq!(-5, arr.value(2)); - assert_eq!(-5, arr.values()[2]); - - let arr = DurationNanosecondArray::from(vec![Some(1), None, Some(-5)]); - assert_eq!(3, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(1, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(1, arr.values()[0]); - assert!(arr.is_null(1)); - assert_eq!(-5, arr.value(2)); - assert_eq!(-5, arr.values()[2]); - } - - #[test] - fn test_timestamp_array_from_vec() { - let arr = TimestampSecondArray::from_vec(vec![1, -5], None); - assert_eq!(2, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(-5, arr.value(1)); 
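// A small sketch (not from the original source) of the nullable counterpart of
// the `from_vec` constructor exercised in this test, using the `from_opt_vec`
// constructor defined earlier in this file. Passing `None` for the timezone
// mirrors the surrounding tests; crate paths assume the `arrow` crate at this
// revision.
use arrow::array::{Array, TimestampSecondArray};

fn timestamp_from_opt_vec_sketch() {
    let arr = TimestampSecondArray::from_opt_vec(vec![Some(1), None, Some(-5)], None);
    assert_eq!(3, arr.len());
    assert_eq!(1, arr.null_count());
    assert!(arr.is_null(1));
    assert_eq!(-5, arr.value(2));
}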
- assert_eq!(&[1, -5], arr.values()); - - let arr = TimestampMillisecondArray::from_vec(vec![1, -5], None); - assert_eq!(2, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(-5, arr.value(1)); - assert_eq!(&[1, -5], arr.values()); - - let arr = TimestampMicrosecondArray::from_vec(vec![1, -5], None); - assert_eq!(2, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(-5, arr.value(1)); - assert_eq!(&[1, -5], arr.values()); - - let arr = TimestampNanosecondArray::from_vec(vec![1, -5], None); - assert_eq!(2, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - assert_eq!(1, arr.value(0)); - assert_eq!(-5, arr.value(1)); - assert_eq!(&[1, -5], arr.values()); - } - - #[test] - fn test_primitive_array_slice() { - let arr = Int32Array::from(vec![ - Some(0), - None, - Some(2), - None, - Some(4), - Some(5), - Some(6), - None, - None, - ]); - assert_eq!(9, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(4, arr.null_count()); - - let arr2 = arr.slice(2, 5); - assert_eq!(5, arr2.len()); - assert_eq!(2, arr2.offset()); - assert_eq!(1, arr2.null_count()); - - for i in 0..arr2.len() { - assert_eq!(i == 1, arr2.is_null(i)); - assert_eq!(i != 1, arr2.is_valid(i)); - } - let int_arr2 = arr2.as_any().downcast_ref::().unwrap(); - assert_eq!(2, int_arr2.values()[0]); - assert_eq!(&[4, 5, 6], &int_arr2.values()[2..5]); - - let arr3 = arr2.slice(2, 3); - assert_eq!(3, arr3.len()); - assert_eq!(4, arr3.offset()); - assert_eq!(0, arr3.null_count()); - - let int_arr3 = arr3.as_any().downcast_ref::().unwrap(); - assert_eq!(&[4, 5, 6], int_arr3.values()); - assert_eq!(4, int_arr3.value(0)); - assert_eq!(5, int_arr3.value(1)); - assert_eq!(6, int_arr3.value(2)); - } - - #[test] - fn test_boolean_array_slice() { - let arr = BooleanArray::from(vec![ - Some(true), - None, - Some(false), - None, - Some(true), - Some(false), - Some(true), - Some(false), - None, - Some(true), - ]); - - assert_eq!(10, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(3, arr.null_count()); - - let arr2 = arr.slice(3, 5); - assert_eq!(5, arr2.len()); - assert_eq!(3, arr2.offset()); - assert_eq!(1, arr2.null_count()); - - let bool_arr = arr2.as_any().downcast_ref::().unwrap(); - - assert_eq!(false, bool_arr.is_valid(0)); - - assert_eq!(true, bool_arr.is_valid(1)); - assert_eq!(true, bool_arr.value(1)); - - assert_eq!(true, bool_arr.is_valid(2)); - assert_eq!(false, bool_arr.value(2)); - - assert_eq!(true, bool_arr.is_valid(3)); - assert_eq!(true, bool_arr.value(3)); - - assert_eq!(true, bool_arr.is_valid(4)); - assert_eq!(false, bool_arr.value(4)); - } - - #[test] - fn test_int32_fmt_debug() { - let arr = Int32Array::from(vec![0, 1, 2, 3, 4]); - assert_eq!( - "PrimitiveArray\n[\n 0,\n 1,\n 2,\n 3,\n 4,\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_fmt_debug_up_to_20_elements() { - (1..=20).for_each(|i| { - let values = (0..i).collect::>(); - let array_expected = format!( - "PrimitiveArray\n[\n{}\n]", - values - .iter() - .map(|v| { format!(" {},", v) }) - .collect::>() - .join("\n") - ); - let array = Int16Array::from(values); - - assert_eq!(array_expected, format!("{:?}", array)); - }) - } - - #[test] - fn test_int32_with_null_fmt_debug() { - let mut builder = Int32Array::builder(3); - builder.append_slice(&[0, 1]).unwrap(); - builder.append_null().unwrap(); - builder.append_slice(&[3, 4]).unwrap(); - let arr = builder.finish(); - assert_eq!( - 
"PrimitiveArray\n[\n 0,\n 1,\n null,\n 3,\n 4,\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_timestamp_fmt_debug() { - let arr: PrimitiveArray = - TimestampMillisecondArray::from_vec( - vec![1546214400000, 1546214400000, -1546214400000], - None, - ); - assert_eq!( - "PrimitiveArray\n[\n 2018-12-31T00:00:00,\n 2018-12-31T00:00:00,\n 1921-01-02T00:00:00,\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_date32_fmt_debug() { - let arr: PrimitiveArray = vec![12356, 13548, -365].into(); - assert_eq!( - "PrimitiveArray\n[\n 2003-10-31,\n 2007-02-04,\n 1969-01-01,\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_time32second_fmt_debug() { - let arr: PrimitiveArray = vec![7201, 60054].into(); - assert_eq!( - "PrimitiveArray\n[\n 02:00:01,\n 16:40:54,\n]", - format!("{:?}", arr) - ); - } - - #[test] - #[should_panic(expected = "invalid time")] - fn test_time32second_invalid_neg() { - // The panic should come from chrono, not from arrow - let arr: PrimitiveArray = vec![-7201, -60054].into(); - println!("{:?}", arr); - } - - #[test] - fn test_primitive_array_builder() { - // Test building a primitive array with ArrayData builder and offset - let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 4]); - let buf2 = buf.clone(); - let data = ArrayData::builder(DataType::Int32) - .len(5) - .offset(2) - .add_buffer(buf) - .build(); - let arr = Int32Array::from(data); - assert_eq!(buf2, arr.data.buffers()[0]); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.null_count()); - for i in 0..3 { - assert_eq!((i + 2) as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_from_iter_values() { - // Test building a primitive array with from_iter_values - let arr: PrimitiveArray = PrimitiveArray::from_iter_values(0..10); - assert_eq!(10, arr.len()); - assert_eq!(0, arr.null_count()); - for i in 0..10i32 { - assert_eq!(i, arr.value(i as usize)); - } - } - - #[test] - fn test_primitive_array_from_unbound_iter() { - // iterator that doesn't declare (upper) size bound - let value_iter = (0..) - .scan(0usize, |pos, i| { - if *pos < 10 { - *pos += 1; - Some(Some(i)) - } else { - // actually returns up to 10 values - None - } - }) - // limited using take() - .take(100); - - let (_, upper_size_bound) = value_iter.size_hint(); - // the upper bound, defined by take above, is 100 - assert_eq!(upper_size_bound, Some(100)); - let primitive_array: PrimitiveArray = value_iter.collect(); - // but the actual number of items in the array should be 10 - assert_eq!(primitive_array.len(), 10); - } - - #[test] - #[should_panic(expected = "PrimitiveArray data should contain a single buffer only \ - (values buffer)")] - fn test_primitive_array_invalid_buffer_len() { - let data = ArrayData::builder(DataType::Int32).len(5).build(); - Int32Array::from(data); - } - - #[test] - fn test_access_array_concurrently() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let ret = thread::spawn(move || a.value(3)).join(); - - assert!(ret.is_ok()); - assert_eq!(8, ret.ok().unwrap()); - } -} diff --git a/rust/arrow/src/array/array_string.rs b/rust/arrow/src/array/array_string.rs deleted file mode 100644 index 0519148e6f4..00000000000 --- a/rust/arrow/src/array/array_string.rs +++ /dev/null @@ -1,528 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::convert::From; -use std::fmt; -use std::mem; -use std::{any::Any, iter::FromIterator}; - -use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericListArray, - GenericStringIter, OffsetSizeTrait, -}; -use crate::buffer::Buffer; -use crate::util::bit_util; -use crate::{buffer::MutableBuffer, datatypes::DataType}; - -/// Like OffsetSizeTrait, but specialized for Strings -// This allow us to expose a constant datatype for the GenericStringArray -pub trait StringOffsetSizeTrait: OffsetSizeTrait { - const DATA_TYPE: DataType; -} - -impl StringOffsetSizeTrait for i32 { - const DATA_TYPE: DataType = DataType::Utf8; -} - -impl StringOffsetSizeTrait for i64 { - const DATA_TYPE: DataType = DataType::LargeUtf8; -} - -/// Generic struct for \[Large\]StringArray -pub struct GenericStringArray { - data: ArrayData, - value_offsets: RawPtrBox, - value_data: RawPtrBox, -} - -impl GenericStringArray { - /// Returns the length for the element at index `i`. - #[inline] - pub fn value_length(&self, i: usize) -> OffsetSize { - let offsets = self.value_offsets(); - offsets[i + 1] - offsets[i] - } - - /// Returns the offset values in the offsets buffer - #[inline] - pub fn value_offsets(&self) -> &[OffsetSize] { - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the ArrayData instance. - unsafe { - std::slice::from_raw_parts( - self.value_offsets.as_ptr().add(self.data.offset()), - self.len() + 1, - ) - } - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[1].clone() - } - - /// Returns the element at index - /// # Safety - /// caller is responsible for ensuring that index is within the array bounds - pub unsafe fn value_unchecked(&self, i: usize) -> &str { - let end = self.value_offsets().get_unchecked(i + 1); - let start = self.value_offsets().get_unchecked(i); - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - // ISSUE: utf-8 well formedness is not checked - - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. 
Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - let slice = std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (*end - *start).to_usize().unwrap(), - ); - std::str::from_utf8_unchecked(slice) - } - - /// Returns the element at index `i` as &str - pub fn value(&self, i: usize) -> &str { - assert!(i < self.data.len(), "StringArray out of bounds access"); - //Soundness: length checked above, offset buffer length is 1 larger than logical array length - let end = unsafe { self.value_offsets().get_unchecked(i + 1) }; - let start = unsafe { self.value_offsets().get_unchecked(i) }; - - // Soundness - // pointer alignment & location is ensured by RawPtrBox - // buffer bounds/offset is ensured by the value_offset invariants - // ISSUE: utf-8 well formedness is not checked - unsafe { - // Safety of `to_isize().unwrap()` - // `start` and `end` are &OffsetSize, which is a generic type that implements the - // OffsetSizeTrait. Currently, only i32 and i64 implement OffsetSizeTrait, - // both of which should cleanly cast to isize on an architecture that supports - // 32/64-bit offsets - let slice = std::slice::from_raw_parts( - self.value_data.as_ptr().offset(start.to_isize().unwrap()), - (*end - *start).to_usize().unwrap(), - ); - std::str::from_utf8_unchecked(slice) - } - } - - fn from_list(v: GenericListArray) -> Self { - assert_eq!( - v.data().child_data()[0].child_data().len(), - 0, - "StringArray can only be created from list array of u8 values \ - (i.e. List>)." - ); - assert_eq!( - v.data().child_data()[0].data_type(), - &DataType::UInt8, - "StringArray can only be created from List arrays, mismatched data types." - ); - - let mut builder = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(v.len()) - .add_buffer(v.data().buffers()[0].clone()) - .add_buffer(v.data().child_data()[0].buffers()[0].clone()); - if let Some(bitmap) = v.data().null_bitmap() { - builder = builder.null_bit_buffer(bitmap.bits.clone()) - } - - let data = builder.build(); - Self::from(data) - } - - pub(crate) fn from_vec(v: Vec<&str>) -> Self { - let mut offsets = - MutableBuffer::new((v.len() + 1) * std::mem::size_of::()); - let mut values = MutableBuffer::new(0); - - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - for s in &v { - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); - values.extend_from_slice(s.as_bytes()); - } - let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(v.len()) - .add_buffer(offsets.into()) - .add_buffer(values.into()) - .build(); - Self::from(array_data) - } - - pub(crate) fn from_opt_vec(v: Vec>) -> Self { - v.into_iter().collect() - } - - /// Creates a `GenericStringArray` based on an iterator of values without nulls - pub fn from_iter_values>(iter: I) -> Self - where - Ptr: AsRef, - { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. 
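// Illustrative sketch (not from the original source): because the upper bound of
// `size_hint()` is used to pre-size the offsets buffer, `from_iter_values` suits
// iterators that report one, such as ranges, slices and `Vec` iterators. A
// minimal call, assuming the `arrow` crate at this revision:
fn from_iter_values_sketch() -> arrow::array::StringArray {
    use arrow::array::StringArray;
    let array = StringArray::from_iter_values(vec!["hello", "arrow"]);
    assert_eq!("arrow", array.value(1));
    array
}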
- - let mut offsets = - MutableBuffer::new((data_len + 1) * std::mem::size_of::()); - let mut values = MutableBuffer::new(0); - - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - for i in iter { - let s = i.as_ref(); - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); - values.extend_from_slice(s.as_bytes()); - } - let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(data_len) - .add_buffer(offsets.into()) - .add_buffer(values.into()) - .build(); - Self::from(array_data) - } -} - -impl<'a, Ptr, OffsetSize: StringOffsetSizeTrait> FromIterator> - for GenericStringArray -where - Ptr: AsRef, -{ - fn from_iter>>(iter: I) -> Self { - let iter = iter.into_iter(); - let (_, data_len) = iter.size_hint(); - let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. - - let offset_size = std::mem::size_of::(); - let mut offsets = MutableBuffer::new((data_len + 1) * offset_size); - let mut values = MutableBuffer::new(0); - let mut null_buf = MutableBuffer::new_null(data_len); - let null_slice = null_buf.as_slice_mut(); - let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); - - for (i, s) in iter.enumerate() { - let value_bytes = if let Some(ref s) = s { - // set null bit - bit_util::set_bit(null_slice, i); - let s_bytes = s.as_ref().as_bytes(); - length_so_far += OffsetSize::from_usize(s_bytes.len()).unwrap(); - s_bytes - } else { - b"" - }; - values.extend_from_slice(value_bytes); - offsets.push(length_so_far); - } - - // calculate actual data_len, which may be different from the iterator's upper bound - let data_len = (offsets.len() / offset_size) - 1; - let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) - .len(data_len) - .add_buffer(offsets.into()) - .add_buffer(values.into()) - .null_bit_buffer(null_buf.into()) - .build(); - Self::from(array_data) - } -} - -impl<'a, T: StringOffsetSizeTrait> IntoIterator for &'a GenericStringArray { - type Item = Option<&'a str>; - type IntoIter = GenericStringIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - GenericStringIter::<'a, T>::new(self) - } -} - -impl<'a, T: StringOffsetSizeTrait> GenericStringArray { - /// constructs a new iterator - pub fn iter(&'a self) -> GenericStringIter<'a, T> { - GenericStringIter::<'a, T>::new(&self) - } -} - -impl fmt::Debug for GenericStringArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let prefix = if OffsetSize::is_large() { "Large" } else { "" }; - - write!(f, "{}StringArray\n[\n", prefix)?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for GenericStringArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [$name]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [$name]. 
- fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } -} - -impl From - for GenericStringArray -{ - fn from(data: ArrayData) -> Self { - assert_eq!( - data.data_type(), - &::DATA_TYPE, - "[Large]StringArray expects Datatype::[Large]Utf8" - ); - assert_eq!( - data.buffers().len(), - 2, - "StringArray data should contain 2 buffers only (offsets and values)" - ); - let offsets = data.buffers()[0].as_ptr(); - let values = data.buffers()[1].as_ptr(); - Self { - data, - value_offsets: unsafe { RawPtrBox::new(offsets) }, - value_data: unsafe { RawPtrBox::new(values) }, - } - } -} - -impl From>> - for GenericStringArray -{ - fn from(v: Vec>) -> Self { - GenericStringArray::::from_opt_vec(v) - } -} - -impl From> - for GenericStringArray -{ - fn from(v: Vec<&str>) -> Self { - GenericStringArray::::from_vec(v) - } -} - -/// An array where each element is a variable-sized sequence of bytes representing a string -/// whose maximum length (in bytes) is represented by a i32. -pub type StringArray = GenericStringArray; - -/// An array where each element is a variable-sized sequence of bytes representing a string -/// whose maximum length (in bytes) is represented by a i64. -pub type LargeStringArray = GenericStringArray; - -impl From> for GenericStringArray { - fn from(v: GenericListArray) -> Self { - GenericStringArray::::from_list(v) - } -} - -#[cfg(test)] -mod tests { - use crate::array::{ListBuilder, StringBuilder}; - - use super::*; - - #[test] - fn test_string_array_from_u8_slice() { - let values: Vec<&str> = vec!["hello", "", "parquet"]; - - // Array data: ["hello", "", "parquet"] - let string_array = StringArray::from(values); - - assert_eq!(3, string_array.len()); - assert_eq!(0, string_array.null_count()); - assert_eq!("hello", string_array.value(0)); - assert_eq!("hello", unsafe { string_array.value_unchecked(0) }); - assert_eq!("", string_array.value(1)); - assert_eq!("", unsafe { string_array.value_unchecked(1) }); - assert_eq!("parquet", string_array.value(2)); - assert_eq!("parquet", unsafe { string_array.value_unchecked(2) }); - assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(7, string_array.value_length(2)); - for i in 0..3 { - assert!(string_array.is_valid(i)); - assert!(!string_array.is_null(i)); - } - } - - #[test] - #[should_panic(expected = "[Large]StringArray expects Datatype::[Large]Utf8")] - fn test_string_array_from_int() { - let array = LargeStringArray::from(vec!["a", "b"]); - StringArray::from(array.data().clone()); - } - - #[test] - fn test_large_string_array_from_u8_slice() { - let values: Vec<&str> = vec!["hello", "", "parquet"]; - - // Array data: ["hello", "", "parquet"] - let string_array = LargeStringArray::from(values); - - assert_eq!(3, string_array.len()); - assert_eq!(0, string_array.null_count()); - assert_eq!("hello", string_array.value(0)); - assert_eq!("hello", unsafe { string_array.value_unchecked(0) }); - assert_eq!("", string_array.value(1)); - assert_eq!("", unsafe { string_array.value_unchecked(1) }); - assert_eq!("parquet", string_array.value(2)); - assert_eq!("parquet", unsafe { string_array.value_unchecked(2) }); - assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(7, string_array.value_length(2)); - for i in 0..3 { - assert!(string_array.is_valid(i)); - assert!(!string_array.is_null(i)); - } - } - - #[test] - fn test_nested_string_array() { - let string_builder = StringBuilder::new(3); - let mut list_of_string_builder = ListBuilder::new(string_builder); - - 
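// A short sketch (not from the original source) of the `FromIterator` impl
// defined above: collecting `Option<&str>` sets the validity bit per slot and
// stores an empty slice for nulls. Crate paths assume the `arrow` crate at this
// revision.
use arrow::array::{Array, StringArray};

fn collect_optional_strings_sketch() {
    let array: StringArray = vec![Some("hello"), None, Some("arrow")].into_iter().collect();
    assert_eq!(3, array.len());
    assert_eq!(1, array.null_count());
    assert!(array.is_null(1));
    assert_eq!("arrow", array.value(2));
}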
list_of_string_builder.values().append_value("foo").unwrap(); - list_of_string_builder.values().append_value("bar").unwrap(); - list_of_string_builder.append(true).unwrap(); - - list_of_string_builder - .values() - .append_value("foobar") - .unwrap(); - list_of_string_builder.append(true).unwrap(); - let list_of_strings = list_of_string_builder.finish(); - - assert_eq!(list_of_strings.len(), 2); - - let first_slot = list_of_strings.value(0); - let first_list = first_slot.as_any().downcast_ref::().unwrap(); - assert_eq!(first_list.len(), 2); - assert_eq!(first_list.value(0), "foo"); - assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo"); - assert_eq!(first_list.value(1), "bar"); - assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar"); - - let second_slot = list_of_strings.value(1); - let second_list = second_slot.as_any().downcast_ref::().unwrap(); - assert_eq!(second_list.len(), 1); - assert_eq!(second_list.value(0), "foobar"); - assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar"); - } - - #[test] - #[should_panic(expected = "StringArray out of bounds access")] - fn test_string_array_get_value_index_out_of_bound() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let offsets: [i32; 4] = [0, 5, 5, 12]; - let array_data = ArrayData::builder(DataType::Utf8) - .len(3) - .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) - .build(); - let string_array = StringArray::from(array_data); - string_array.value(4); - } - - #[test] - fn test_string_array_fmt_debug() { - let arr: StringArray = vec!["hello", "arrow"].into(); - assert_eq!( - "StringArray\n[\n \"hello\",\n \"arrow\",\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_large_string_array_fmt_debug() { - let arr: LargeStringArray = vec!["hello", "arrow"].into(); - assert_eq!( - "LargeStringArray\n[\n \"hello\",\n \"arrow\",\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_string_array_from_iter() { - let data = vec![Some("hello"), None, Some("arrow")]; - // from Vec> - let array1 = StringArray::from(data.clone()); - // from Iterator> - let array2: StringArray = data.clone().into_iter().collect(); - // from Iterator> - let array3: StringArray = - data.into_iter().map(|x| x.map(|s| s.to_string())).collect(); - - assert_eq!(array1, array2); - assert_eq!(array2, array3); - } - - #[test] - fn test_string_array_from_iter_values() { - let data = vec!["hello", "hello2"]; - let array1 = StringArray::from_iter_values(data.iter()); - - assert_eq!(array1.value(0), "hello"); - assert_eq!(array1.value(1), "hello2"); - } - - #[test] - fn test_string_array_from_unbound_iter() { - // iterator that doesn't declare (upper) size bound - let string_iter = (0..) 
- .scan(0usize, |pos, i| { - if *pos < 10 { - *pos += 1; - Some(Some(format!("value {}", i))) - } else { - // actually returns up to 10 values - None - } - }) - // limited using take() - .take(100); - - let (_, upper_size_bound) = string_iter.size_hint(); - // the upper bound, defined by take above, is 100 - assert_eq!(upper_size_bound, Some(100)); - let string_array: StringArray = string_iter.collect(); - // but the actual number of items in the array should be 10 - assert_eq!(string_array.len(), 10); - } -} diff --git a/rust/arrow/src/array/array_struct.rs b/rust/arrow/src/array/array_struct.rs deleted file mode 100644 index 59ee527e5f8..00000000000 --- a/rust/arrow/src/array/array_struct.rs +++ /dev/null @@ -1,531 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::convert::{From, TryFrom}; -use std::fmt; -use std::iter::IntoIterator; -use std::mem; - -use super::{make_array, Array, ArrayData, ArrayRef}; -use crate::datatypes::DataType; -use crate::error::{ArrowError, Result}; -use crate::{ - buffer::{buffer_bin_or, Buffer}, - datatypes::Field, -}; - -/// A nested array type where each child (called *field*) is represented by a separate -/// array. -pub struct StructArray { - data: ArrayData, - pub(crate) boxed_fields: Vec, -} - -impl StructArray { - /// Returns the field at `pos`. - pub fn column(&self, pos: usize) -> &ArrayRef { - &self.boxed_fields[pos] - } - - /// Return the number of fields in this struct array - pub fn num_columns(&self) -> usize { - self.boxed_fields.len() - } - - /// Returns the fields of the struct array - pub fn columns(&self) -> Vec<&ArrayRef> { - self.boxed_fields.iter().collect() - } - - /// Returns child array refs of the struct array - pub fn columns_ref(&self) -> Vec { - self.boxed_fields.clone() - } - - /// Return field names in this struct array - pub fn column_names(&self) -> Vec<&str> { - match self.data.data_type() { - DataType::Struct(fields) => fields - .iter() - .map(|f| f.name().as_str()) - .collect::>(), - _ => unreachable!("Struct array's data type is not struct!"), - } - } - - /// Return child array whose field name equals to column_name - /// - /// Note: A schema can currently have duplicate field names, in which case - /// the first field will always be selected. 
- /// This issue will be addressed in [ARROW-11178](https://issues.apache.org/jira/browse/ARROW-11178) - pub fn column_by_name(&self, column_name: &str) -> Option<&ArrayRef> { - self.column_names() - .iter() - .position(|c| c == &column_name) - .map(|pos| self.column(pos)) - } -} - -impl From for StructArray { - fn from(data: ArrayData) -> Self { - let mut boxed_fields = vec![]; - for cd in data.child_data() { - let child_data = if data.offset() != 0 || data.len() != cd.len() { - cd.slice(data.offset(), data.len()) - } else { - cd.clone() - }; - boxed_fields.push(make_array(child_data)); - } - Self { data, boxed_fields } - } -} - -impl TryFrom> for StructArray { - type Error = ArrowError; - - /// builds a StructArray from a vector of names and arrays. - /// This errors if the values have a different length. - /// An entry is set to Null when all values are null. - fn try_from(values: Vec<(&str, ArrayRef)>) -> Result { - let values_len = values.len(); - - // these will be populated - let mut fields = Vec::with_capacity(values_len); - let mut child_data = Vec::with_capacity(values_len); - - // len: the size of the arrays. - let mut len: Option = None; - // null: the null mask of the arrays. - let mut null: Option = None; - for (field_name, array) in values { - let child_datum = array.data(); - let child_datum_len = child_datum.len(); - if let Some(len) = len { - if len != child_datum_len { - return Err(ArrowError::InvalidArgumentError( - format!("Array of field \"{}\" has length {}, but previous elements have length {}. - All arrays in every entry in a struct array must have the same length.", field_name, child_datum_len, len) - )); - } - } else { - len = Some(child_datum_len) - } - child_data.push(child_datum.clone()); - fields.push(Field::new( - field_name, - array.data_type().clone(), - child_datum.null_buffer().is_some(), - )); - - if let Some(child_null_buffer) = child_datum.null_buffer() { - let child_datum_offset = child_datum.offset(); - - null = Some(if let Some(null_buffer) = &null { - buffer_bin_or( - null_buffer, - 0, - child_null_buffer, - child_datum_offset, - child_datum_len, - ) - } else { - child_null_buffer.bit_slice(child_datum_offset, child_datum_len) - }); - } else if null.is_some() { - // when one of the fields has no nulls, them there is no null in the array - null = None; - } - } - let len = len.unwrap(); - - let mut builder = ArrayData::builder(DataType::Struct(fields)) - .len(len) - .child_data(child_data); - if let Some(null_buffer) = null { - builder = builder.null_bit_buffer(null_buffer); - } - - Ok(StructArray::from(builder.build())) - } -} - -impl Array for StructArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the length (i.e., number of elements) of this array - fn len(&self) -> usize { - self.data_ref().len() - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [StructArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [StructArray]. 
- fn get_array_memory_size(&self) -> usize { - self.data.get_array_memory_size() + mem::size_of_val(self) - } -} - -impl From> for StructArray { - fn from(v: Vec<(Field, ArrayRef)>) -> Self { - let (field_types, field_values): (Vec<_>, Vec<_>) = v.into_iter().unzip(); - - // Check the length of the child arrays - let length = field_values[0].len(); - for i in 1..field_values.len() { - assert_eq!( - length, - field_values[i].len(), - "all child arrays of a StructArray must have the same length" - ); - assert_eq!( - field_types[i].data_type(), - field_values[i].data().data_type(), - "the field data types must match the array data in a StructArray" - ) - } - - let data = ArrayData::builder(DataType::Struct(field_types)) - .child_data(field_values.into_iter().map(|a| a.data().clone()).collect()) - .len(length) - .build(); - Self::from(data) - } -} - -impl fmt::Debug for StructArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "StructArray\n[\n")?; - for (child_index, name) in self.column_names().iter().enumerate() { - let column = self.column(child_index); - writeln!( - f, - "-- child {}: \"{}\" ({:?})", - child_index, - name, - column.data_type() - )?; - fmt::Debug::fmt(column, f)?; - writeln!(f)?; - } - write!(f, "]") - } -} - -impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray { - fn from(pair: (Vec<(Field, ArrayRef)>, Buffer)) -> Self { - let (field_types, field_values): (Vec<_>, Vec<_>) = pair.0.into_iter().unzip(); - - // Check the length of the child arrays - let length = field_values[0].len(); - for i in 1..field_values.len() { - assert_eq!( - length, - field_values[i].len(), - "all child arrays of a StructArray must have the same length" - ); - assert_eq!( - field_types[i].data_type(), - field_values[i].data().data_type(), - "the field data types must match the array data in a StructArray" - ) - } - - let data = ArrayData::builder(DataType::Struct(field_types)) - .null_bit_buffer(pair.1) - .child_data(field_values.into_iter().map(|a| a.data().clone()).collect()) - .len(length) - .build(); - Self::from(data) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use std::sync::Arc; - - use crate::{ - array::BooleanArray, array::Float32Array, array::Float64Array, array::Int32Array, - array::StringArray, bitmap::Bitmap, - }; - use crate::{ - array::Int64Array, - datatypes::{DataType, Field}, - }; - use crate::{buffer::Buffer, datatypes::ToByteSlice}; - - #[test] - fn test_struct_array_builder() { - let array = BooleanArray::from(vec![false, false, true, true]); - let boolean_data = array.data(); - let array = Int64Array::from(vec![42, 28, 19, 31]); - let int_data = array.data(); - - let fields = vec![ - Field::new("a", DataType::Boolean, false), - Field::new("b", DataType::Int64, false), - ]; - let struct_array_data = ArrayData::builder(DataType::Struct(fields)) - .len(4) - .add_child_data(boolean_data.clone()) - .add_child_data(int_data.clone()) - .build(); - let struct_array = StructArray::from(struct_array_data); - - assert_eq!(boolean_data, struct_array.column(0).data()); - assert_eq!(int_data, struct_array.column(1).data()); - } - - #[test] - fn test_struct_array_from() { - let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true])); - let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31])); - - let struct_array = StructArray::from(vec![ - ( - Field::new("b", DataType::Boolean, false), - boolean.clone() as ArrayRef, - ), - ( - Field::new("c", DataType::Int32, false), - int.clone() as ArrayRef, - ), - ]); - 
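// A minimal sketch (not from the original source) of the fallible conversion
// defined above: `try_from` takes (name, array) pairs, errors when the child
// lengths differ, and the result can be queried by field name via
// `column_by_name`. The field names are illustrative; crate paths assume the
// `arrow` crate at this revision.
use std::convert::TryFrom;
use std::sync::Arc;
use arrow::array::{ArrayRef, Int32Array, StringArray, StructArray};

fn struct_try_from_sketch() {
    let names: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"]));
    let scores: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
    let arr = StructArray::try_from(vec![("name", names), ("score", scores)]).unwrap();
    assert_eq!(2, arr.num_columns());
    assert!(arr.column_by_name("score").is_some());
}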
assert_eq!(struct_array.column(0).as_ref(), boolean.as_ref()); - assert_eq!(struct_array.column(1).as_ref(), int.as_ref()); - assert_eq!(4, struct_array.len()); - assert_eq!(0, struct_array.null_count()); - assert_eq!(0, struct_array.offset()); - } - - /// validates that the in-memory representation follows [the spec](https://arrow.apache.org/docs/format/Columnar.html#struct-layout) - #[test] - fn test_struct_array_from_vec() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - ])); - let ints: ArrayRef = - Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); - - let arr = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - - let struct_data = arr.data(); - assert_eq!(4, struct_data.len()); - assert_eq!(1, struct_data.null_count()); - assert_eq!( - // 00001011 - &Some(Bitmap::from(Buffer::from(&[11_u8]))), - struct_data.null_bitmap() - ); - - let expected_string_data = ArrayData::builder(DataType::Utf8) - .len(4) - .null_bit_buffer(Buffer::from(&[9_u8])) - .add_buffer(Buffer::from(&[0, 3, 3, 3, 7].to_byte_slice())) - .add_buffer(Buffer::from(b"joemark")) - .build(); - - let expected_int_data = ArrayData::builder(DataType::Int32) - .len(4) - .null_bit_buffer(Buffer::from(&[11_u8])) - .add_buffer(Buffer::from(&[1, 2, 0, 4].to_byte_slice())) - .build(); - - assert_eq!(&expected_string_data, arr.column(0).data()); - - // TODO: implement equality for ArrayData - assert_eq!(expected_int_data.len(), arr.column(1).data().len()); - assert_eq!( - expected_int_data.null_count(), - arr.column(1).data().null_count() - ); - assert_eq!( - expected_int_data.null_bitmap(), - arr.column(1).data().null_bitmap() - ); - let expected_value_buf = expected_int_data.buffers()[0].clone(); - let actual_value_buf = arr.column(1).data().buffers()[0].clone(); - for i in 0..expected_int_data.len() { - if !expected_int_data.is_null(i) { - assert_eq!( - expected_value_buf.as_slice()[i * 4..(i + 1) * 4], - actual_value_buf.as_slice()[i * 4..(i + 1) * 4] - ); - } - } - } - - #[test] - fn test_struct_array_from_vec_error() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - // 3 elements, not 4 - ])); - let ints: ArrayRef = - Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)])); - - let arr = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]); - - match arr { - Err(ArrowError::InvalidArgumentError(e)) => { - assert!(e.starts_with("Array of field \"f2\" has length 4, but previous elements have length 3.")); - } - _ => panic!("This test got an unexpected error type"), - }; - } - - #[test] - #[should_panic( - expected = "the field data types must match the array data in a StructArray" - )] - fn test_struct_array_from_mismatched_types() { - StructArray::from(vec![ - ( - Field::new("b", DataType::Int16, false), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, - ), - ( - Field::new("c", DataType::Utf8, false), - Arc::new(Int32Array::from(vec![42, 28, 19, 31])), - ), - ]); - } - - #[test] - fn test_struct_array_slice() { - let boolean_data = ArrayData::builder(DataType::Boolean) - .len(5) - .add_buffer(Buffer::from([0b00010000])) - .null_bit_buffer(Buffer::from([0b00010001])) - .build(); - let int_data = ArrayData::builder(DataType::Int32) - .len(5) - .add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice())) - .null_bit_buffer(Buffer::from([0b00000110])) - .build(); - - let mut field_types = vec![]; - 
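// A sketch (not from the original source): the struct-level validity built via
// `ArrayData` in this test can also be supplied through the
// `From<(Vec<(Field, ArrayRef)>, Buffer)>` impl above. The bit pattern is an
// assumption chosen for illustration; crate paths assume the `arrow` crate at
// this revision.
use std::sync::Arc;
use arrow::array::{Array, ArrayRef, BooleanArray, Int32Array, StructArray};
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType, Field};

fn struct_with_validity_sketch() {
    let pairs = vec![
        (
            Field::new("a", DataType::Boolean, false),
            Arc::new(BooleanArray::from(vec![false, true, false])) as ArrayRef,
        ),
        (
            Field::new("b", DataType::Int32, false),
            Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
        ),
    ];
    // 0b00000101 marks slots 0 and 2 valid and slot 1 null at the struct level.
    let arr = StructArray::from((pairs, Buffer::from([0b00000101u8])));
    assert_eq!(1, arr.null_count());
    assert!(arr.is_null(1));
}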
field_types.push(Field::new("a", DataType::Boolean, false)); - field_types.push(Field::new("b", DataType::Int32, false)); - let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) - .len(5) - .add_child_data(boolean_data.clone()) - .add_child_data(int_data.clone()) - .null_bit_buffer(Buffer::from([0b00010111])) - .build(); - let struct_array = StructArray::from(struct_array_data); - - assert_eq!(5, struct_array.len()); - assert_eq!(1, struct_array.null_count()); - assert!(struct_array.is_valid(0)); - assert!(struct_array.is_valid(1)); - assert!(struct_array.is_valid(2)); - assert!(struct_array.is_null(3)); - assert!(struct_array.is_valid(4)); - assert_eq!(&boolean_data, struct_array.column(0).data()); - assert_eq!(&int_data, struct_array.column(1).data()); - - let c0 = struct_array.column(0); - let c0 = c0.as_any().downcast_ref::().unwrap(); - assert_eq!(5, c0.len()); - assert_eq!(3, c0.null_count()); - assert!(c0.is_valid(0)); - assert_eq!(false, c0.value(0)); - assert!(c0.is_null(1)); - assert!(c0.is_null(2)); - assert!(c0.is_null(3)); - assert!(c0.is_valid(4)); - assert_eq!(true, c0.value(4)); - - let c1 = struct_array.column(1); - let c1 = c1.as_any().downcast_ref::().unwrap(); - assert_eq!(5, c1.len()); - assert_eq!(3, c1.null_count()); - assert!(c1.is_null(0)); - assert!(c1.is_valid(1)); - assert_eq!(28, c1.value(1)); - assert!(c1.is_valid(2)); - assert_eq!(42, c1.value(2)); - assert!(c1.is_null(3)); - assert!(c1.is_null(4)); - - let sliced_array = struct_array.slice(2, 3); - let sliced_array = sliced_array.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_array.len()); - assert_eq!(2, sliced_array.offset()); - assert_eq!(1, sliced_array.null_count()); - assert!(sliced_array.is_valid(0)); - assert!(sliced_array.is_null(1)); - assert!(sliced_array.is_valid(2)); - - let sliced_c0 = sliced_array.column(0); - let sliced_c0 = sliced_c0.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_c0.len()); - assert_eq!(2, sliced_c0.offset()); - assert!(sliced_c0.is_null(0)); - assert!(sliced_c0.is_null(1)); - assert!(sliced_c0.is_valid(2)); - assert_eq!(true, sliced_c0.value(2)); - - let sliced_c1 = sliced_array.column(1); - let sliced_c1 = sliced_c1.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_c1.len()); - assert_eq!(2, sliced_c1.offset()); - assert!(sliced_c1.is_valid(0)); - assert_eq!(42, sliced_c1.value(0)); - assert!(sliced_c1.is_null(1)); - assert!(sliced_c1.is_null(2)); - } - - #[test] - #[should_panic( - expected = "all child arrays of a StructArray must have the same length" - )] - fn test_invalid_struct_child_array_lengths() { - StructArray::from(vec![ - ( - Field::new("b", DataType::Float32, false), - Arc::new(Float32Array::from(vec![1.1])) as Arc, - ), - ( - Field::new("c", DataType::Float64, false), - Arc::new(Float64Array::from(vec![2.2, 3.3])), - ), - ]); - } -} diff --git a/rust/arrow/src/array/array_union.rs b/rust/arrow/src/array/array_union.rs deleted file mode 100644 index 083d5bba15b..00000000000 --- a/rust/arrow/src/array/array_union.rs +++ /dev/null @@ -1,831 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Contains the `UnionArray` type. -//! -//! Each slot in a `UnionArray` can have a value chosen from a number of types. Each of the -//! possible types are named like the fields of a [`StructArray`](crate::array::StructArray). -//! A `UnionArray` can have two possible memory layouts, "dense" or "sparse". For more information -//! on please see the [specification](https://arrow.apache.org/docs/format/Columnar.html#union-layout). -//! -//! Builders are provided for `UnionArray`'s involving primitive types. `UnionArray`'s of nested -//! types are also supported but not via `UnionBuilder`, see the tests for examples. -//! -//! # Example: Dense Memory Layout -//! -//! ``` -//! use arrow::array::UnionBuilder; -//! use arrow::datatypes::{Float64Type, Int32Type}; -//! -//! # fn main() -> arrow::error::Result<()> { -//! let mut builder = UnionBuilder::new_dense(3); -//! builder.append::("a", 1).unwrap(); -//! builder.append::("b", 3.0).unwrap(); -//! builder.append::("a", 4).unwrap(); -//! let union = builder.build().unwrap(); -//! -//! assert_eq!(union.type_id(0), 0_i8); -//! assert_eq!(union.type_id(1), 1_i8); -//! assert_eq!(union.type_id(2), 0_i8); -//! -//! assert_eq!(union.value_offset(0), 0_i32); -//! assert_eq!(union.value_offset(1), 0_i32); -//! assert_eq!(union.value_offset(2), 1_i32); -//! -//! # Ok(()) -//! # } -//! ``` -//! -//! # Example: Sparse Memory Layout -//! ``` -//! use arrow::array::UnionBuilder; -//! use arrow::datatypes::{Float64Type, Int32Type}; -//! -//! # fn main() -> arrow::error::Result<()> { -//! let mut builder = UnionBuilder::new_sparse(3); -//! builder.append::("a", 1).unwrap(); -//! builder.append::("b", 3.0).unwrap(); -//! builder.append::("a", 4).unwrap(); -//! let union = builder.build().unwrap(); -//! -//! assert_eq!(union.type_id(0), 0_i8); -//! assert_eq!(union.type_id(1), 1_i8); -//! assert_eq!(union.type_id(2), 0_i8); -//! -//! assert_eq!(union.value_offset(0), 0_i32); -//! assert_eq!(union.value_offset(1), 1_i32); -//! assert_eq!(union.value_offset(2), 2_i32); -//! -//! # Ok(()) -//! # } -//! ``` -use crate::array::{data::count_nulls, make_array, Array, ArrayData, ArrayRef}; -use crate::buffer::Buffer; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; - -use core::fmt; -use std::any::Any; -use std::mem; -use std::mem::size_of; - -/// An Array that can represent slots of varying types. -pub struct UnionArray { - data: ArrayData, - boxed_fields: Vec, -} - -impl UnionArray { - /// Creates a new `UnionArray`. - /// - /// Accepts type ids, child arrays and optionally offsets (for dense unions) to create - /// a new `UnionArray`. This method makes no attempt to validate the data provided by the - /// caller and assumes that each of the components are correct and consistent with each other. - /// See `try_new` for an alternative that validates the data provided. - /// - /// # Data Consistency - /// - /// The `type_ids` `Buffer` should contain `i8` values. These values should be greater than - /// zero and must be less than the number of children provided in `child_arrays`. 
These values - /// are used to index into the `child_arrays`. - /// - /// The `value_offsets` `Buffer` is only provided in the case of a dense union, sparse unions - /// should use `None`. If provided the `value_offsets` `Buffer` should contain `i32` values. - /// These values should be greater than zero and must be less than the length of the overall - /// array. - /// - /// In both cases above we use signed integer types to maintain compatibility with other - /// Arrow implementations. - /// - /// In both of the cases above we are accepting `Buffer`'s which are assumed to be representing - /// `i8` and `i32` values respectively. `Buffer` objects are untyped and no attempt is made - /// to ensure that the data provided is valid. - pub fn new( - type_ids: Buffer, - value_offsets: Option, - child_arrays: Vec<(Field, ArrayRef)>, - bitmap_data: Option, - ) -> Self { - let (field_types, field_values): (Vec<_>, Vec<_>) = - child_arrays.into_iter().unzip(); - let len = type_ids.len(); - let mut builder = ArrayData::builder(DataType::Union(field_types)) - .add_buffer(type_ids) - .child_data(field_values.into_iter().map(|a| a.data().clone()).collect()) - .len(len); - if let Some(bitmap) = bitmap_data { - builder = builder.null_bit_buffer(bitmap) - } - let data = match value_offsets { - Some(b) => builder.add_buffer(b).build(), - None => builder.build(), - }; - Self::from(data) - } - /// Attempts to create a new `UnionArray` and validates the inputs provided. - pub fn try_new( - type_ids: Buffer, - value_offsets: Option, - child_arrays: Vec<(Field, ArrayRef)>, - bitmap: Option, - ) -> Result { - if let Some(b) = &value_offsets { - let nulls = count_nulls(bitmap.as_ref(), 0, type_ids.len()); - if ((type_ids.len() - nulls) * 4) != b.len() { - return Err(ArrowError::InvalidArgumentError( - "Type Ids and Offsets represent a different number of array slots." - .to_string(), - )); - } - } - - // Check the type_ids - let type_id_slice: &[i8] = unsafe { type_ids.typed_data() }; - let invalid_type_ids = type_id_slice - .iter() - .filter(|i| *i < &0) - .collect::>(); - if !invalid_type_ids.is_empty() { - return Err(ArrowError::InvalidArgumentError(format!( - "Type Ids must be positive and cannot be greater than the number of \ - child arrays, found:\n{:?}", - invalid_type_ids - ))); - } - - // Check the value offsets if provided - if let Some(offset_buffer) = &value_offsets { - let max_len = type_ids.len() as i32; - let offsets_slice: &[i32] = unsafe { offset_buffer.typed_data() }; - let invalid_offsets = offsets_slice - .iter() - .filter(|i| *i < &0 || *i > &max_len) - .collect::>(); - if !invalid_offsets.is_empty() { - return Err(ArrowError::InvalidArgumentError(format!( - "Offsets must be positive and within the length of the Array, \ - found:\n{:?}", - invalid_offsets - ))); - } - } - - Ok(Self::new(type_ids, value_offsets, child_arrays, bitmap)) - } - - /// Accesses the child array for `type_id`. - /// - /// # Panics - /// - /// Panics if the `type_id` provided is less than zero or greater than the number of types - /// in the `Union`. - pub fn child(&self, type_id: i8) -> ArrayRef { - assert!(0 <= type_id); - assert!((type_id as usize) < self.boxed_fields.len()); - self.boxed_fields[type_id as usize].clone() - } - - /// Returns the `type_id` for the array slot at `index`. - /// - /// # Panics - /// - /// Panics if `index` is greater than the length of the array. 
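// A usage sketch (not from the original source) combining the accessors
// documented below: reading slots back out of a dense union built with
// `UnionBuilder`, following the dense-layout example in this module's docs.
// Crate paths assume the `arrow` crate at this revision.
use arrow::array::{Array, Int32Array, UnionBuilder};
use arrow::datatypes::{Float64Type, Int32Type};

fn union_access_sketch() -> arrow::error::Result<()> {
    let mut builder = UnionBuilder::new_dense(3);
    builder.append::<Int32Type>("a", 1)?;
    builder.append::<Float64Type>("b", 3.0)?;
    builder.append::<Int32Type>("a", 4)?;
    let union = builder.build()?;

    // Slot 2 holds the second value appended under type id 0 ("a").
    assert_eq!(0_i8, union.type_id(2));
    assert_eq!(1_i32, union.value_offset(2));
    let slot = union.value(2);
    let slot = slot.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(4, slot.value(0));
    Ok(())
}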
- pub fn type_id(&self, index: usize) -> i8 { - assert!(index - self.offset() < self.len()); - self.data().buffers()[0].as_slice()[index] as i8 - } - - /// Returns the offset into the underlying values array for the array slot at `index`. - /// - /// # Panics - /// - /// Panics if `index` is greater than the length of the array. - pub fn value_offset(&self, index: usize) -> i32 { - assert!(index - self.offset() < self.len()); - if self.is_dense() { - // In format v4 unions had their own validity bitmap and offsets are compressed by omitting null values - // Starting with v5 unions don't have a validity bitmap and it's possible to directly index into the offsets buffer - let valid_slots = match self.data.null_buffer() { - Some(b) => b.count_set_bits_offset(0, index), - None => index, - }; - self.data().buffers()[1].as_slice()[valid_slots * size_of::()] as i32 - } else { - index as i32 - } - } - - /// Returns the array's value at `index`. - /// - /// # Panics - /// - /// Panics if `index` is greater than the length of the array. - pub fn value(&self, index: usize) -> ArrayRef { - let type_id = self.type_id(self.offset() + index); - let value_offset = self.value_offset(self.offset() + index) as usize; - let child_data = self.boxed_fields[type_id as usize].clone(); - child_data.slice(value_offset, 1) - } - - /// Returns the names of the types in the union. - pub fn type_names(&self) -> Vec<&str> { - match self.data.data_type() { - DataType::Union(fields) => fields - .iter() - .map(|f| f.name().as_str()) - .collect::>(), - _ => unreachable!("Union array's data type is not a union!"), - } - } - - /// Returns whether the `UnionArray` is dense (or sparse if `false`). - fn is_dense(&self) -> bool { - self.data().buffers().len() == 2 - } -} - -impl From for UnionArray { - fn from(data: ArrayData) -> Self { - let mut boxed_fields = vec![]; - for cd in data.child_data() { - boxed_fields.push(make_array(cd.clone())); - } - Self { data, boxed_fields } - } -} - -impl Array for UnionArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [UnionArray]. - fn get_buffer_memory_size(&self) -> usize { - let mut size = self.data.get_buffer_memory_size(); - for field in &self.boxed_fields { - size += field.get_buffer_memory_size(); - } - size - } - - /// Returns the total number of bytes of memory occupied physically by this [UnionArray]. 
- fn get_array_memory_size(&self) -> usize { - let mut size = self.data.get_array_memory_size(); - size += mem::size_of_val(self) - mem::size_of_val(&self.boxed_fields); - for field in &self.boxed_fields { - size += field.get_array_memory_size(); - } - size - } -} - -impl fmt::Debug for UnionArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let header = if self.is_dense() { - "UnionArray(Dense)\n[" - } else { - "UnionArray(Sparse)\n[" - }; - writeln!(f, "{}", header)?; - - writeln!(f, "-- type id buffer:")?; - writeln!(f, "{:?}", self.data().buffers()[0])?; - - if self.is_dense() { - writeln!(f, "-- offsets buffer:")?; - writeln!(f, "{:?}", self.data().buffers()[1])?; - } - - for (child_index, name) in self.type_names().iter().enumerate() { - let column = &self.boxed_fields[child_index]; - writeln!( - f, - "-- child {}: \"{}\" ({:?})", - child_index, - *name, - column.data_type() - )?; - fmt::Debug::fmt(column, f)?; - writeln!(f)?; - } - writeln!(f, "]") - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use std::sync::Arc; - - use crate::array::*; - use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field}; - - #[test] - fn test_dense_i32() { - let mut builder = UnionBuilder::new_dense(7); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append::("c", 5).unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union = builder.build().unwrap(); - - let expected_type_ids = vec![0_i8, 1, 2, 0, 2, 0, 1]; - let expected_value_offsets = vec![0_i32, 0, 0, 1, 1, 2, 1]; - let expected_array_values = [1_i32, 2, 3, 4, 5, 6, 7]; - - // Check type ids - assert_eq!( - union.data().buffers()[0], - Buffer::from_slice_ref(&expected_type_ids) - ); - for (i, id) in expected_type_ids.iter().enumerate() { - assert_eq!(id, &union.type_id(i)); - } - - // Check offsets - assert_eq!( - union.data().buffers()[1], - Buffer::from_slice_ref(&expected_value_offsets) - ); - for (i, id) in expected_value_offsets.iter().enumerate() { - assert_eq!(&union.value_offset(i), id); - } - - // Check data - assert_eq!( - union.data().child_data()[0].buffers()[0], - Buffer::from_slice_ref(&[1_i32, 4, 6]) - ); - assert_eq!( - union.data().child_data()[1].buffers()[0], - Buffer::from_slice_ref(&[2_i32, 7]) - ); - assert_eq!( - union.data().child_data()[2].buffers()[0], - Buffer::from_slice_ref(&[3_i32, 5]), - ); - - assert_eq!(expected_array_values.len(), union.len()); - for (i, expected_value) in expected_array_values.iter().enumerate() { - assert_eq!(false, union.is_null(i)); - let slot = union.value(i); - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(expected_value, &value); - } - } - - #[test] - fn test_dense_mixed() { - let mut builder = UnionBuilder::new_dense(7); - builder.append::("a", 1).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append::("c", 5).unwrap(); - builder.append::("a", 6).unwrap(); - let union = builder.build().unwrap(); - - assert_eq!(5, union.len()); - for i in 0..union.len() { - let slot = union.value(i); - assert_eq!(false, union.is_null(i)); - match i { - 0 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(1_i32, value); - } - 1 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = 
slot.value(0); - assert_eq!(3_i64, value); - } - 2 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(4_i32, value); - } - 3 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(5_i64, value); - } - 4 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(6_i32, value); - } - _ => unreachable!(), - } - } - } - - #[test] - fn test_dense_mixed_with_nulls() { - let mut builder = UnionBuilder::new_dense(7); - builder.append::("a", 1).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 10).unwrap(); - builder.append_null().unwrap(); - builder.append::("a", 6).unwrap(); - let union = builder.build().unwrap(); - - assert_eq!(5, union.len()); - for i in 0..union.len() { - let slot = union.value(i); - match i { - 0 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(1_i32, value); - } - 1 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(3_i64, value); - } - 2 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(10_i32, value); - } - 3 => assert!(union.is_null(i)), - 4 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(6_i32, value); - } - _ => unreachable!(), - } - } - } - - #[test] - fn test_dense_mixed_with_nulls_and_offset() { - let mut builder = UnionBuilder::new_dense(7); - builder.append::("a", 1).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 10).unwrap(); - builder.append_null().unwrap(); - builder.append::("a", 6).unwrap(); - let union = builder.build().unwrap(); - - let slice = union.slice(2, 3); - let new_union = slice.as_any().downcast_ref::().unwrap(); - - assert_eq!(3, new_union.len()); - for i in 0..new_union.len() { - let slot = new_union.value(i); - match i { - 0 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(10_i32, value); - } - 1 => assert!(new_union.is_null(i)), - 2 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(6_i32, value); - } - _ => unreachable!(), - } - } - } - - #[test] - fn test_dense_mixed_with_str() { - let string_array = StringArray::from(vec!["foo", "bar", "baz"]); - let int_array = Int32Array::from(vec![5, 6]); - let float_array = Float64Array::from(vec![10.0]); - - let type_ids = [1_i8, 0, 0, 2, 0, 1]; - let value_offsets = [0_i32, 0, 1, 0, 2, 1]; - - let type_id_buffer = Buffer::from_slice_ref(&type_ids); - let value_offsets_buffer = Buffer::from_slice_ref(&value_offsets); - - let mut children: Vec<(Field, Arc)> = Vec::new(); - children.push(( - Field::new("A", DataType::Utf8, false), - Arc::new(string_array), - )); - children.push((Field::new("B", DataType::Int32, false), Arc::new(int_array))); - children.push(( - Field::new("C", DataType::Float64, false), - Arc::new(float_array), - )); - let 
array = UnionArray::try_new( - type_id_buffer, - Some(value_offsets_buffer), - children, - None, - ) - .unwrap(); - - // Check type ids - assert_eq!(Buffer::from_slice_ref(&type_ids), array.data().buffers()[0]); - for (i, id) in type_ids.iter().enumerate() { - assert_eq!(id, &array.type_id(i)); - } - - // Check offsets - assert_eq!( - Buffer::from_slice_ref(&value_offsets), - array.data().buffers()[1] - ); - for (i, id) in value_offsets.iter().enumerate() { - assert_eq!(id, &array.value_offset(i)); - } - - // Check values - assert_eq!(6, array.len()); - - let slot = array.value(0); - let value = slot.as_any().downcast_ref::().unwrap().value(0); - assert_eq!(5, value); - - let slot = array.value(1); - let value = slot - .as_any() - .downcast_ref::() - .unwrap() - .value(0); - assert_eq!("foo", value); - - let slot = array.value(2); - let value = slot - .as_any() - .downcast_ref::() - .unwrap() - .value(0); - assert_eq!("bar", value); - - let slot = array.value(3); - let value = slot - .as_any() - .downcast_ref::() - .unwrap() - .value(0); - assert!(10.0 - value < f64::EPSILON); - - let slot = array.value(4); - let value = slot - .as_any() - .downcast_ref::() - .unwrap() - .value(0); - assert_eq!("baz", value); - - let slot = array.value(5); - let value = slot.as_any().downcast_ref::().unwrap().value(0); - assert_eq!(6, value); - } - - #[test] - fn test_sparse_i32() { - let mut builder = UnionBuilder::new_sparse(7); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.append::("c", 3).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append::("c", 5).unwrap(); - builder.append::("a", 6).unwrap(); - builder.append::("b", 7).unwrap(); - let union = builder.build().unwrap(); - - let expected_type_ids = vec![0_i8, 1, 2, 0, 2, 0, 1]; - let expected_array_values = [1_i32, 2, 3, 4, 5, 6, 7]; - - // Check type ids - assert_eq!( - Buffer::from_slice_ref(&expected_type_ids), - union.data().buffers()[0] - ); - for (i, id) in expected_type_ids.iter().enumerate() { - assert_eq!(id, &union.type_id(i)); - } - - // Check offsets, sparse union should only have a single buffer - assert_eq!(union.data().buffers().len(), 1); - - // Check data - assert_eq!( - union.data().child_data()[0].buffers()[0], - Buffer::from_slice_ref(&[1_i32, 0, 0, 4, 0, 6, 0]), - ); - assert_eq!( - Buffer::from_slice_ref(&[0_i32, 2_i32, 0, 0, 0, 0, 7]), - union.data().child_data()[1].buffers()[0] - ); - assert_eq!( - Buffer::from_slice_ref(&[0_i32, 0, 3_i32, 0, 5, 0, 0]), - union.data().child_data()[2].buffers()[0] - ); - - assert_eq!(expected_array_values.len(), union.len()); - for (i, expected_value) in expected_array_values.iter().enumerate() { - assert_eq!(false, union.is_null(i)); - let slot = union.value(i); - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(expected_value, &value); - } - } - - #[test] - fn test_sparse_mixed() { - let mut builder = UnionBuilder::new_sparse(5); - builder.append::("a", 1).unwrap(); - builder.append::("c", 3.0).unwrap(); - builder.append::("a", 4).unwrap(); - builder.append::("c", 5.0).unwrap(); - builder.append::("a", 6).unwrap(); - let union = builder.build().unwrap(); - - let expected_type_ids = vec![0_i8, 1, 0, 1, 0]; - - // Check type ids - assert_eq!( - Buffer::from_slice_ref(&expected_type_ids), - union.data().buffers()[0] - ); - for (i, id) in expected_type_ids.iter().enumerate() { - assert_eq!(id, &union.type_id(i)); - } - - // Check offsets, sparse union should only have a single 
buffer, i.e. no offsets - assert_eq!(union.data().buffers().len(), 1); - - for i in 0..union.len() { - let slot = union.value(i); - assert_eq!(false, union.is_null(i)); - match i { - 0 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(1_i32, value); - } - 1 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert!(value - 3_f64 < f64::EPSILON); - } - 2 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(4_i32, value); - } - 3 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert!(5_f64 - value < f64::EPSILON); - } - 4 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(6_i32, value); - } - _ => unreachable!(), - } - } - } - - #[test] - fn test_sparse_mixed_with_nulls() { - let mut builder = UnionBuilder::new_sparse(5); - builder.append::("a", 1).unwrap(); - builder.append_null().unwrap(); - builder.append::("c", 3.0).unwrap(); - builder.append::("a", 4).unwrap(); - let union = builder.build().unwrap(); - - let expected_type_ids = vec![0_i8, 0, 1, 0]; - - // Check type ids - assert_eq!( - Buffer::from_slice_ref(&expected_type_ids), - union.data().buffers()[0] - ); - for (i, id) in expected_type_ids.iter().enumerate() { - assert_eq!(id, &union.type_id(i)); - } - - // Check offsets, sparse union should only have a single buffer, i.e. no offsets - assert_eq!(union.data().buffers().len(), 1); - - for i in 0..union.len() { - let slot = union.value(i); - match i { - 0 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(1_i32, value); - } - 1 => assert!(union.is_null(i)), - 2 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert!(value - 3_f64 < f64::EPSILON); - } - 3 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(4_i32, value); - } - _ => unreachable!(), - } - } - } - - #[test] - fn test_sparse_mixed_with_nulls_and_offset() { - let mut builder = UnionBuilder::new_sparse(5); - builder.append::("a", 1).unwrap(); - builder.append_null().unwrap(); - builder.append::("c", 3.0).unwrap(); - builder.append_null().unwrap(); - builder.append::("a", 4).unwrap(); - let union = builder.build().unwrap(); - - let slice = union.slice(1, 4); - let new_union = slice.as_any().downcast_ref::().unwrap(); - - assert_eq!(4, new_union.len()); - for i in 0..new_union.len() { - let slot = new_union.value(i); - match i { - 0 => assert!(new_union.is_null(i)), - 1 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, new_union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert!(value - 3_f64 < f64::EPSILON); - } - 2 => assert!(new_union.is_null(i)), - 3 => { - let slot = slot.as_any().downcast_ref::().unwrap(); - assert_eq!(false, new_union.is_null(i)); - assert_eq!(slot.len(), 1); - let value = slot.value(0); - assert_eq!(4_i32, value); - } - _ => unreachable!(), - } - } - } -} diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs 
deleted file mode 100644 index 38df92ebb46..00000000000 --- a/rust/arrow/src/array/builder.rs +++ /dev/null @@ -1,3171 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines a [`BufferBuilder`](crate::array::BufferBuilder) capable -//! of creating a [`Buffer`](crate::buffer::Buffer) which can be used -//! as an internal buffer in an [`ArrayData`](crate::array::ArrayData) -//! object. - -use std::any::Any; -use std::collections::HashMap; -use std::fmt; -use std::marker::PhantomData; -use std::mem; -use std::sync::Arc; - -use crate::array::*; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; - -/// Converts a `MutableBuffer` to a `BufferBuilder`. -/// -/// `slots` is the number of array slots currently represented in the `MutableBuffer`. -pub(crate) fn mutable_buffer_to_builder( - mutable_buffer: MutableBuffer, - slots: usize, -) -> BufferBuilder { - BufferBuilder:: { - buffer: mutable_buffer, - len: slots, - _marker: PhantomData, - } -} - -/// Converts a `BufferBuilder` into its underlying `MutableBuffer`. -/// -/// `From` is not implemented because associated type bounds are unstable. -pub(crate) fn builder_to_mutable_buffer( - builder: BufferBuilder, -) -> MutableBuffer { - builder.buffer -} - -/// Builder for creating a [`Buffer`](crate::buffer::Buffer) object. -/// -/// A [`Buffer`](crate::buffer::Buffer) is the underlying data -/// structure of Arrow's [`Arrays`](crate::array::Array). -/// -/// For all supported types, there are type definitions for the -/// generic version of `BufferBuilder`, e.g. `UInt8BufferBuilder`. -/// -/// # Example: -/// -/// ``` -/// use arrow::array::UInt8BufferBuilder; -/// -/// # fn main() -> arrow::error::Result<()> { -/// let mut builder = UInt8BufferBuilder::new(100); -/// builder.append_slice(&[42, 43, 44]); -/// builder.append(45); -/// let buffer = builder.finish(); -/// -/// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 43, 44, 45]); -/// # Ok(()) -/// # } -/// ``` -#[derive(Debug)] -pub struct BufferBuilder { - buffer: MutableBuffer, - len: usize, - _marker: PhantomData, -} - -impl BufferBuilder { - /// Creates a new builder with initial capacity for _at least_ `capacity` - /// elements of type `T`. - /// - /// The capacity can later be manually adjusted with the - /// [`reserve()`](BufferBuilder::reserve) method. - /// Also the - /// [`append()`](BufferBuilder::append), - /// [`append_slice()`](BufferBuilder::append_slice) and - /// [`advance()`](BufferBuilder::advance) - /// methods automatically increase the capacity if needed. 
- /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// - /// assert!(builder.capacity() >= 10); - /// ``` - #[inline] - pub fn new(capacity: usize) -> Self { - let buffer = MutableBuffer::new(capacity * mem::size_of::()); - - Self { - buffer, - len: 0, - _marker: PhantomData, - } - } - - /// Returns the current number of array elements in the internal buffer. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.len(), 1); - /// ``` - pub fn len(&self) -> usize { - self.len - } - - /// Returns whether the internal buffer is empty. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.is_empty(), false); - /// ``` - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the actual capacity (number of elements) of the internal buffer. - /// - /// Note: the internal capacity returned by this method might be larger than - /// what you'd expect after setting the capacity in the `new()` or `reserve()` - /// functions. - pub fn capacity(&self) -> usize { - let byte_capacity = self.buffer.capacity(); - byte_capacity / std::mem::size_of::() - } - - /// Increases the number of elements in the internal buffer by `n` - /// and resizes the buffer as needed. - /// - /// The values of the newly added elements are 0. - /// This method is usually used when appending `NULL` values to the buffer - /// as they still require physical memory space. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.advance(2); - /// - /// assert_eq!(builder.len(), 2); - /// ``` - #[inline] - pub fn advance(&mut self, i: usize) { - let new_buffer_len = (self.len + i) * mem::size_of::(); - self.buffer.resize(new_buffer_len, 0); - self.len += i; - } - - /// Reserves memory for _at least_ `n` more elements of type `T`. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.reserve(10); - /// - /// assert!(builder.capacity() >= 20); - /// ``` - #[inline] - pub fn reserve(&mut self, n: usize) { - self.buffer.reserve(n * mem::size_of::()); - } - - /// Appends a value of type `T` into the builder, - /// growing the internal buffer as needed. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append(42); - /// - /// assert_eq!(builder.len(), 1); - /// ``` - #[inline] - pub fn append(&mut self, v: T) { - self.reserve(1); - self.buffer.push(v); - self.len += 1; - } - - /// Appends a value of type `T` into the builder N times, - /// growing the internal buffer as needed. - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_n(10, 42); - /// - /// assert_eq!(builder.len(), 10); - /// ``` - #[inline] - pub fn append_n(&mut self, n: usize, v: T) { - self.reserve(n); - for _ in 0..n { - self.buffer.push(v); - } - self.len += n; - } - - /// Appends a slice of type `T`, growing the internal buffer as needed. 
- /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_slice(&[42, 44, 46]); - /// - /// assert_eq!(builder.len(), 3); - /// ``` - #[inline] - pub fn append_slice(&mut self, slice: &[T]) { - self.buffer.extend_from_slice(slice); - self.len += slice.len(); - } - - /// Resets this builder and returns an immutable [`Buffer`](crate::buffer::Buffer). - /// - /// # Example: - /// - /// ``` - /// use arrow::array::UInt8BufferBuilder; - /// - /// let mut builder = UInt8BufferBuilder::new(10); - /// builder.append_slice(&[42, 44, 46]); - /// - /// let buffer = builder.finish(); - /// - /// assert_eq!(unsafe { buffer.typed_data::() }, &[42, 44, 46]); - /// ``` - #[inline] - pub fn finish(&mut self) -> Buffer { - let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); - self.len = 0; - buf.into() - } -} - -#[derive(Debug)] -pub struct BooleanBufferBuilder { - buffer: MutableBuffer, - len: usize, -} - -impl BooleanBufferBuilder { - #[inline] - pub fn new(capacity: usize) -> Self { - let byte_capacity = bit_util::ceil(capacity, 8); - let buffer = MutableBuffer::from_len_zeroed(byte_capacity); - Self { buffer, len: 0 } - } - - #[inline] - pub fn len(&self) -> usize { - self.len - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - #[inline] - pub fn capacity(&self) -> usize { - self.buffer.capacity() * 8 - } - - #[inline] - pub fn advance(&mut self, additional: usize) { - let new_len = self.len + additional; - let new_len_bytes = bit_util::ceil(new_len, 8); - if new_len_bytes > self.buffer.len() { - self.buffer.resize(new_len_bytes, 0); - } - self.len = new_len; - } - - /// Reserve space to at least `additional` new bits. - /// Capacity will be `>= self.len() + additional`. - /// New bytes are uninitialized and reading them is undefined behavior. 
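The buffer builders shown here are the raw building blocks underneath the typed array builders: `BufferBuilder<T>` collects native values, while `BooleanBufferBuilder` packs bits (typically validity bitmaps). A short sketch, assuming the per-type aliases such as `Int32BufferBuilder` provided by this module:

```rust
use arrow::array::{BooleanBufferBuilder, Int32BufferBuilder};

// Typed buffer builder: appends native i32 values into a MutableBuffer.
let mut ints = Int32BufferBuilder::new(4);
ints.append_slice(&[1, 2, 3]);
ints.append(4);
assert_eq!(ints.len(), 4);
let buffer = ints.finish();
assert_eq!(buffer.len(), 4 * std::mem::size_of::<i32>()); // Buffer length is in bytes

// Bit-packed builder, as used for null bitmaps.
let mut bits = BooleanBufferBuilder::new(8);
bits.append(true);
bits.append_n(2, false);
bits.append_slice(&[true, true]);
assert_eq!(bits.len(), 5);
let bitmap = bits.finish();
assert_eq!(bitmap.len(), 1); // five bits fit in a single byte
```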
- #[inline] - pub fn reserve(&mut self, additional: usize) { - let capacity = self.len + additional; - if capacity > self.capacity() { - // convert differential to bytes - let additional = bit_util::ceil(capacity, 8) - self.buffer.len(); - self.buffer.reserve(additional); - } - } - - #[inline] - pub fn append(&mut self, v: bool) { - self.advance(1); - if v { - unsafe { bit_util::set_bit_raw(self.buffer.as_mut_ptr(), self.len - 1) }; - } - } - - #[inline] - pub fn append_n(&mut self, additional: usize, v: bool) { - self.advance(additional); - if additional > 0 && v { - let offset = self.len() - additional; - (0..additional).for_each(|i| unsafe { - bit_util::set_bit_raw(self.buffer.as_mut_ptr(), offset + i) - }) - } - } - - #[inline] - pub fn append_slice(&mut self, slice: &[bool]) { - let additional = slice.len(); - self.advance(additional); - - let offset = self.len() - additional; - for (i, v) in slice.iter().enumerate() { - if *v { - unsafe { bit_util::set_bit_raw(self.buffer.as_mut_ptr(), offset + i) } - } - } - } - - #[inline] - pub fn finish(&mut self) -> Buffer { - let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0)); - self.len = 0; - buf.into() - } -} - -impl From for Buffer { - #[inline] - fn from(builder: BooleanBufferBuilder) -> Self { - builder.buffer.into() - } -} - -/// Trait for dealing with different array builders at runtime -pub trait ArrayBuilder: Any { - /// Returns the number of array slots in the builder - fn len(&self) -> usize; - - /// Returns whether number of array slots is zero - fn is_empty(&self) -> bool; - - /// Builds the array - fn finish(&mut self) -> ArrayRef; - - /// Returns the builder as a non-mutable `Any` reference. - /// - /// This is most useful when one wants to call non-mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_ref` to get a reference on the specific builder. - fn as_any(&self) -> &Any; - - /// Returns the builder as a mutable `Any` reference. - /// - /// This is most useful when one wants to call mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_mut` to get a reference on the specific builder. - fn as_any_mut(&mut self) -> &mut Any; - - /// Returns the boxed builder as a box of `Any`. 
- fn into_box_any(self: Box) -> Box; -} - -/// Array builder for fixed-width primitive types -#[derive(Debug)] -pub struct BooleanBuilder { - values_builder: BooleanBufferBuilder, - bitmap_builder: BooleanBufferBuilder, -} - -impl BooleanBuilder { - /// Creates a new primitive array builder - pub fn new(capacity: usize) -> Self { - Self { - values_builder: BooleanBufferBuilder::new(capacity), - bitmap_builder: BooleanBufferBuilder::new(capacity), - } - } - - /// Returns the capacity of this builder measured in slots of type `T` - pub fn capacity(&self) -> usize { - self.values_builder.capacity() - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value(&mut self, v: bool) -> Result<()> { - self.bitmap_builder.append(true); - self.values_builder.append(v); - Ok(()) - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.bitmap_builder.append(false); - self.values_builder.advance(1); - Ok(()) - } - - /// Appends an `Option` into the builder - #[inline] - pub fn append_option(&mut self, v: Option) -> Result<()> { - match v { - None => self.append_null()?, - Some(v) => self.append_value(v)?, - }; - Ok(()) - } - - /// Appends a slice of type `T` into the builder - #[inline] - pub fn append_slice(&mut self, v: &[bool]) -> Result<()> { - self.bitmap_builder.append_n(v.len(), true); - self.values_builder.append_slice(v); - Ok(()) - } - - /// Appends values from a slice of type `T` and a validity boolean slice - #[inline] - pub fn append_values(&mut self, values: &[bool], is_valid: &[bool]) -> Result<()> { - if values.len() != is_valid.len() { - return Err(ArrowError::InvalidArgumentError( - "Value and validity lengths must be equal".to_string(), - )); - } - self.bitmap_builder.append_slice(is_valid); - self.values_builder.append_slice(values); - Ok(()) - } - - /// Builds the [BooleanArray] and reset this builder. - pub fn finish(&mut self) -> BooleanArray { - let len = self.len(); - let null_bit_buffer = self.bitmap_builder.finish(); - let null_count = len - null_bit_buffer.count_set_bits(); - let mut builder = ArrayData::builder(DataType::Boolean) - .len(len) - .add_buffer(self.values_builder.finish()); - if null_count > 0 { - builder = builder.null_bit_buffer(null_bit_buffer); - } - let data = builder.build(); - BooleanArray::from(data) - } -} - -impl ArrayBuilder for BooleanBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.values_builder.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.values_builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -/// Array builder for fixed-width primitive types -#[derive(Debug)] -pub struct PrimitiveBuilder { - values_builder: BufferBuilder, - /// We only materialize the builder when we add `false`. - /// This optimization is **very** important for performance of `StringBuilder`. - bitmap_builder: Option, -} - -impl ArrayBuilder for PrimitiveBuilder { - /// Returns the builder as a non-mutable `Any` reference. 
- fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.values_builder.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.values_builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl PrimitiveBuilder { - /// Creates a new primitive array builder - pub fn new(capacity: usize) -> Self { - Self { - values_builder: BufferBuilder::::new(capacity), - bitmap_builder: None, - } - } - - /// Returns the capacity of this builder measured in slots of type `T` - pub fn capacity(&self) -> usize { - self.values_builder.capacity() - } - - /// Appends a value of type `T` into the builder - #[inline] - pub fn append_value(&mut self, v: T::Native) -> Result<()> { - if let Some(b) = self.bitmap_builder.as_mut() { - b.append(true); - } - self.values_builder.append(v); - Ok(()) - } - - /// Appends a null slot into the builder - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.materialize_bitmap_builder(); - self.bitmap_builder.as_mut().unwrap().append(false); - self.values_builder.advance(1); - Ok(()) - } - - /// Appends an `Option` into the builder - #[inline] - pub fn append_option(&mut self, v: Option) -> Result<()> { - match v { - None => self.append_null()?, - Some(v) => self.append_value(v)?, - }; - Ok(()) - } - - /// Appends a slice of type `T` into the builder - #[inline] - pub fn append_slice(&mut self, v: &[T::Native]) -> Result<()> { - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_n(v.len(), true); - } - self.values_builder.append_slice(v); - Ok(()) - } - - /// Appends values from a slice of type `T` and a validity boolean slice - #[inline] - pub fn append_values( - &mut self, - values: &[T::Native], - is_valid: &[bool], - ) -> Result<()> { - if values.len() != is_valid.len() { - return Err(ArrowError::InvalidArgumentError( - "Value and validity lengths must be equal".to_string(), - )); - } - if is_valid.iter().any(|v| !*v) { - self.materialize_bitmap_builder(); - } - if let Some(b) = self.bitmap_builder.as_mut() { - b.append_slice(is_valid); - } - self.values_builder.append_slice(values); - Ok(()) - } - - /// Builds the `PrimitiveArray` and reset this builder. - pub fn finish(&mut self) -> PrimitiveArray { - let len = self.len(); - let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); - let null_count = len - - null_bit_buffer - .as_ref() - .map(|b| b.count_set_bits()) - .unwrap_or(len); - let mut builder = ArrayData::builder(T::DATA_TYPE) - .len(len) - .add_buffer(self.values_builder.finish()); - if null_count > 0 { - builder = builder.null_bit_buffer(null_bit_buffer.unwrap()); - } - let data = builder.build(); - PrimitiveArray::::from(data) - } - - /// Builds the `DictionaryArray` and reset this builder. 
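`PrimitiveBuilder<T>` (aliased per type, e.g. `Int32Builder`) pairs a values `BufferBuilder` with a lazily materialized validity bitmap, which is only allocated once a null is appended. A small usage sketch:

```rust
use arrow::array::{Array, Int32Builder};

let mut builder = Int32Builder::new(8);
builder.append_value(1).unwrap();
builder.append_null().unwrap(); // materializes the validity bitmap
builder.append_option(Some(3)).unwrap();
builder.append_slice(&[4, 5]).unwrap();
let array = builder.finish();

assert_eq!(array.len(), 5);
assert_eq!(array.null_count(), 1);
assert!(array.is_null(1));
assert_eq!(array.value(4), 5);
```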
- pub fn finish_dict(&mut self, values: ArrayRef) -> DictionaryArray { - let len = self.len(); - let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish()); - let null_count = len - - null_bit_buffer - .as_ref() - .map(|b| b.count_set_bits()) - .unwrap_or(len); - let data_type = DataType::Dictionary( - Box::new(T::DATA_TYPE), - Box::new(values.data_type().clone()), - ); - let mut builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(self.values_builder.finish()); - if null_count > 0 { - builder = builder.null_bit_buffer(null_bit_buffer.unwrap()); - } - builder = builder.add_child_data(values.data().clone()); - DictionaryArray::::from(builder.build()) - } - - fn materialize_bitmap_builder(&mut self) { - if self.bitmap_builder.is_some() { - return; - } - let mut b = BooleanBufferBuilder::new(0); - b.reserve(self.values_builder.capacity()); - b.append_n(self.values_builder.len, true); - self.bitmap_builder = Some(b); - } -} - -/// Array builder for `ListArray` -#[derive(Debug)] -pub struct GenericListBuilder { - offsets_builder: BufferBuilder, - bitmap_builder: BooleanBufferBuilder, - values_builder: T, - len: OffsetSize, -} - -impl GenericListBuilder { - /// Creates a new `ListArrayBuilder` from a given values array builder - pub fn new(values_builder: T) -> Self { - let capacity = values_builder.len(); - Self::with_capacity(values_builder, capacity) - } - - /// Creates a new `ListArrayBuilder` from a given values array builder - /// `capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(values_builder: T, capacity: usize) -> Self { - let mut offsets_builder = BufferBuilder::::new(capacity + 1); - let len = OffsetSize::zero(); - offsets_builder.append(len); - Self { - offsets_builder, - bitmap_builder: BooleanBufferBuilder::new(capacity), - values_builder, - len, - } - } -} - -impl ArrayBuilder - for GenericListBuilder -where - T: 'static, -{ - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.len.to_usize().unwrap() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len == OffsetSize::zero() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl GenericListBuilder -where - T: 'static, -{ - /// Returns the child array builder as a mutable reference. - /// - /// This mutable reference can be used to append values into the child array builder, - /// but you must call `append` to delimit each distinct list value. - pub fn values(&mut self) -> &mut T { - &mut self.values_builder - } - - /// Finish the current variable-length list array slot - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.offsets_builder - .append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); - self.bitmap_builder.append(is_valid); - self.len += OffsetSize::one(); - Ok(()) - } - - /// Builds the `ListArray` and reset this builder. 
- pub fn finish(&mut self) -> GenericListArray { - let len = self.len(); - self.len = OffsetSize::zero(); - let values_arr = self - .values_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); - let values_data = values_arr.data(); - - let offset_buffer = self.offsets_builder.finish(); - let null_bit_buffer = self.bitmap_builder.finish(); - self.offsets_builder.append(self.len); - let field = Box::new(Field::new( - "item", - values_data.data_type().clone(), - true, // TODO: find a consistent way of getting this - )); - let data_type = if OffsetSize::is_large() { - DataType::LargeList(field) - } else { - DataType::List(field) - }; - let data = ArrayData::builder(data_type) - .len(len) - .add_buffer(offset_buffer) - .add_child_data(values_data.clone()) - .null_bit_buffer(null_bit_buffer) - .build(); - - GenericListArray::::from(data) - } -} - -pub type ListBuilder = GenericListBuilder; -pub type LargeListBuilder = GenericListBuilder; - -/// Array builder for `ListArray` -#[derive(Debug)] -pub struct FixedSizeListBuilder { - bitmap_builder: BooleanBufferBuilder, - values_builder: T, - len: usize, - list_len: i32, -} - -impl FixedSizeListBuilder { - /// Creates a new `FixedSizeListBuilder` from a given values array builder - /// `length` is the number of values within each array - pub fn new(values_builder: T, length: i32) -> Self { - let capacity = values_builder.len(); - Self::with_capacity(values_builder, length, capacity) - } - - /// Creates a new `FixedSizeListBuilder` from a given values array builder - /// `length` is the number of values within each array - /// `capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(values_builder: T, length: i32, capacity: usize) -> Self { - let mut offsets_builder = Int32BufferBuilder::new(capacity + 1); - offsets_builder.append(0); - Self { - bitmap_builder: BooleanBufferBuilder::new(capacity), - values_builder, - len: 0, - list_len: length, - } - } -} - -impl ArrayBuilder for FixedSizeListBuilder -where - T: 'static, -{ - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl FixedSizeListBuilder -where - T: 'static, -{ - /// Returns the child array builder as a mutable reference. - /// - /// This mutable reference can be used to append values into the child array builder, - /// but you must call `append` to delimit each distinct list value. - pub fn values(&mut self) -> &mut T { - &mut self.values_builder - } - - pub fn value_length(&self) -> i32 { - self.list_len - } - - /// Finish the current variable-length list array slot - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.bitmap_builder.append(is_valid); - self.len += 1; - Ok(()) - } - - /// Builds the `FixedSizeListBuilder` and reset this builder. 
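The list builders pair a child builder with an offsets buffer and a validity bitmap: values go in through `values()`, and each call to `append` closes one list slot. A sketch with `ListBuilder` over an `Int32Builder`:

```rust
use arrow::array::{Array, Int32Builder, ListBuilder};

let mut builder = ListBuilder::new(Int32Builder::new(8));

builder.values().append_value(1).unwrap();
builder.values().append_value(2).unwrap();
builder.append(true).unwrap();  // closes [1, 2]
builder.append(false).unwrap(); // a null list slot
builder.values().append_value(3).unwrap();
builder.append(true).unwrap();  // closes [3]

let list = builder.finish();
assert_eq!(list.len(), 3);
assert!(list.is_null(1));
```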
- pub fn finish(&mut self) -> FixedSizeListArray { - let len = self.len(); - self.len = 0; - let values_arr = self - .values_builder - .as_any_mut() - .downcast_mut::() - .unwrap() - .finish(); - let values_data = values_arr.data(); - - // check that values_data length is multiple of len if we have data - if len != 0 { - assert!( - values_data.len() / len == self.list_len as usize, - "Values of FixedSizeList must have equal lengths, values have length {} and list has {}", - values_data.len() / len, - self.list_len - ); - } - - let null_bit_buffer = self.bitmap_builder.finish(); - let data = ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", values_data.data_type().clone(), true)), - self.list_len, - )) - .len(len) - .add_child_data(values_data.clone()) - .null_bit_buffer(null_bit_buffer) - .build(); - - FixedSizeListArray::from(data) - } -} - -/// Array builder for `BinaryArray` -#[derive(Debug)] -pub struct GenericBinaryBuilder { - builder: GenericListBuilder, -} - -pub type BinaryBuilder = GenericBinaryBuilder; -pub type LargeBinaryBuilder = GenericBinaryBuilder; - -#[derive(Debug)] -pub struct GenericStringBuilder { - builder: GenericListBuilder, -} - -pub type StringBuilder = GenericStringBuilder; -pub type LargeStringBuilder = GenericStringBuilder; - -#[derive(Debug)] -pub struct FixedSizeBinaryBuilder { - builder: FixedSizeListBuilder, -} - -#[derive(Debug)] -pub struct DecimalBuilder { - builder: FixedSizeListBuilder, - precision: usize, - scale: usize, -} - -impl ArrayBuilder - for GenericBinaryBuilder -{ - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl ArrayBuilder - for GenericStringBuilder -{ - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - let a = GenericStringBuilder::::finish(self); - Arc::new(a) - } -} - -impl ArrayBuilder for FixedSizeBinaryBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. 
- fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl ArrayBuilder for DecimalBuilder { - /// Returns the builder as a non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl GenericBinaryBuilder { - /// Creates a new `GenericBinaryBuilder`, `capacity` is the number of bytes in the values - /// array - pub fn new(capacity: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); - Self { - builder: GenericListBuilder::new(values_builder), - } - } - - /// Appends a single byte value into the builder's values array. - /// - /// Note, when appending individual byte values you must call `append` to delimit each - /// distinct list value. - #[inline] - pub fn append_byte(&mut self, value: u8) -> Result<()> { - self.builder.values().append_value(value)?; - Ok(()) - } - - /// Appends a byte slice into the builder. - /// - /// Automatically calls the `append` method to delimit the slice appended in as a - /// distinct array element. - #[inline] - pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> { - self.builder.values().append_slice(value.as_ref())?; - self.builder.append(true)?; - Ok(()) - } - - /// Finish the current variable-length list array slot. - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.builder.append(is_valid) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.append(false) - } - - /// Builds the `BinaryArray` and reset this builder. - pub fn finish(&mut self) -> GenericBinaryArray { - GenericBinaryArray::::from(self.builder.finish()) - } -} - -impl GenericStringBuilder { - /// Creates a new `StringBuilder`, - /// `capacity` is the number of bytes of string data to pre-allocate space for in this builder - pub fn new(capacity: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); - Self { - builder: GenericListBuilder::new(values_builder), - } - } - - /// Creates a new `StringBuilder`, - /// `data_capacity` is the number of bytes of string data to pre-allocate space for in this builder - /// `item_capacity` is the number of items to pre-allocate space for in this builder - pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { - let values_builder = UInt8Builder::new(data_capacity); - Self { - builder: GenericListBuilder::with_capacity(values_builder, item_capacity), - } - } - - /// Appends a string into the builder. - /// - /// Automatically calls the `append` method to delimit the string appended in as a - /// distinct array element. 
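`GenericBinaryBuilder` (aliased as `BinaryBuilder`/`LargeBinaryBuilder`) wraps a list builder over `UInt8Builder`: whole slices go in via `append_value`, or single bytes via `append_byte` followed by `append` to close the slot. A sketch:

```rust
use arrow::array::{Array, BinaryBuilder};

let mut builder = BinaryBuilder::new(16);
builder.append_value(b"hello").unwrap();
builder.append_null().unwrap();
builder.append_byte(0xDE).unwrap();
builder.append_byte(0xAD).unwrap();
builder.append(true).unwrap(); // closes the two-byte value

let array = builder.finish();
assert_eq!(array.len(), 3);
assert!(array.is_null(1));
assert_eq!(array.value(0), "hello".as_bytes());
assert_eq!(array.value(2), &[0xDE_u8, 0xAD][..]);
```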
- #[inline] - pub fn append_value(&mut self, value: impl AsRef) -> Result<()> { - self.builder - .values() - .append_slice(value.as_ref().as_bytes())?; - self.builder.append(true)?; - Ok(()) - } - - /// Finish the current variable-length list array slot. - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.builder.append(is_valid) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.append(false) - } - - /// Builds the `StringArray` and reset this builder. - pub fn finish(&mut self) -> GenericStringArray { - GenericStringArray::::from(self.builder.finish()) - } -} - -impl FixedSizeBinaryBuilder { - /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values - /// array - pub fn new(capacity: usize, byte_width: i32) -> Self { - let values_builder = UInt8Builder::new(capacity); - Self { - builder: FixedSizeListBuilder::new(values_builder, byte_width), - } - } - - /// Appends a byte slice into the builder. - /// - /// Automatically calls the `append` method to delimit the slice appended in as a - /// distinct array element. - #[inline] - pub fn append_value(&mut self, value: impl AsRef<[u8]>) -> Result<()> { - if self.builder.value_length() != value.as_ref().len() as i32 { - return Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths".to_string() - )); - } - self.builder.values().append_slice(value.as_ref())?; - self.builder.append(true) - } - - /// Append a null value to the array. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - let length: usize = self.builder.value_length() as usize; - self.builder.values().append_slice(&vec![0u8; length][..])?; - self.builder.append(false) - } - - /// Builds the `FixedSizeBinaryArray` and reset this builder. - pub fn finish(&mut self) -> FixedSizeBinaryArray { - FixedSizeBinaryArray::from(self.builder.finish()) - } -} - -impl DecimalBuilder { - /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values - /// array - pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { - let values_builder = UInt8Builder::new(capacity); - let byte_width = 16; - Self { - builder: FixedSizeListBuilder::new(values_builder, byte_width), - precision, - scale, - } - } - - /// Appends a byte slice into the builder. - /// - /// Automatically calls the `append` method to delimit the slice appended in as a - /// distinct array element. - #[inline] - pub fn append_value(&mut self, value: i128) -> Result<()> { - let value_as_bytes = Self::from_i128_to_fixed_size_bytes( - value, - self.builder.value_length() as usize, - )?; - if self.builder.value_length() != value_as_bytes.len() as i32 { - return Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as DecimalBuilder value lengths".to_string() - )); - } - self.builder - .values() - .append_slice(value_as_bytes.as_slice())?; - self.builder.append(true) - } - - fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Result> { - if size > 16 { - return Err(ArrowError::InvalidArgumentError( - "DecimalBuilder only supports values up to 16 bytes.".to_string(), - )); - } - let res = v.to_le_bytes(); - let start_byte = 16 - size; - Ok(res[start_byte..16].to_vec()) - } - - /// Append a null value to the array. 
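`StringBuilder` behaves like the binary builder but appends UTF-8, and `FixedSizeBinaryBuilder` enforces that every non-null slot is exactly `byte_width` bytes long. A sketch of both, assuming the crate at this revision:

```rust
use arrow::array::{Array, FixedSizeBinaryBuilder, StringBuilder};

// Variable-length UTF-8 strings.
let mut strings = StringBuilder::new(16);
strings.append_value("foo").unwrap();
strings.append_null().unwrap();
strings.append_value("bar").unwrap();
let strings = strings.finish();
assert_eq!(strings.value(2), "bar");
assert!(strings.is_null(1));

// Fixed-width binary: byte_width = 2, so a 3-byte slice is rejected.
let mut fixed = FixedSizeBinaryBuilder::new(8, 2);
fixed.append_value(&[1u8, 2]).unwrap();
assert!(fixed.append_value(&[1u8, 2, 3]).is_err());
fixed.append_null().unwrap();
let fixed = fixed.finish();
assert_eq!(fixed.len(), 2);
```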
- #[inline] - pub fn append_null(&mut self) -> Result<()> { - let length: usize = self.builder.value_length() as usize; - self.builder.values().append_slice(&vec![0u8; length][..])?; - self.builder.append(false) - } - - /// Builds the `DecimalArray` and reset this builder. - pub fn finish(&mut self) -> DecimalArray { - DecimalArray::from_fixed_size_list_array( - self.builder.finish(), - self.precision, - self.scale, - ) - } -} - -/// Array builder for Struct types. -/// -/// Note that callers should make sure that methods of all the child field builders are -/// properly called to maintain the consistency of the data structure. -pub struct StructBuilder { - fields: Vec, - field_builders: Vec>, - bitmap_builder: BooleanBufferBuilder, - len: usize, -} - -impl fmt::Debug for StructBuilder { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("StructBuilder") - .field("fields", &self.fields) - .field("bitmap_builder", &self.bitmap_builder) - .field("len", &self.len) - .finish() - } -} - -impl ArrayBuilder for StructBuilder { - /// Returns the number of array slots in the builder. - /// - /// Note that this always return the first child field builder's length, and it is - /// the caller's responsibility to maintain the consistency that all the child field - /// builder should have the equal number of elements. - fn len(&self) -> usize { - self.len - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Builds the array. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } - - /// Returns the builder as a non-mutable `Any` reference. - /// - /// This is most useful when one wants to call non-mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_ref` to get a reference on the specific builder. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as a mutable `Any` reference. - /// - /// This is most useful when one wants to call mutable APIs on a specific builder - /// type. In this case, one can first cast this into a `Any`, and then use - /// `downcast_mut` to get a reference on the specific builder. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } -} - -/// Returns a builder with capacity `capacity` that corresponds to the datatype `DataType` -/// This function is useful to construct arrays from an arbitrary vectors with known/expected -/// schema. 
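`DecimalBuilder`, shown above, stores 128-bit unscaled integers in a 16-byte fixed-size list and records precision and scale on the resulting `DecimalArray`. A brief sketch; the precision and scale values are arbitrary:

```rust
use arrow::array::{Array, DecimalBuilder};

// Decimal(10, 2): the unscaled integer 12345 represents 123.45.
let mut builder = DecimalBuilder::new(4, 10, 2);
builder.append_value(12345_i128).unwrap();
builder.append_null().unwrap();
let array = builder.finish();

assert_eq!(array.len(), 2);
assert!(array.is_null(1));
```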
-pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { - match datatype { - DataType::Null => unimplemented!(), - DataType::Boolean => Box::new(BooleanBuilder::new(capacity)), - DataType::Int8 => Box::new(Int8Builder::new(capacity)), - DataType::Int16 => Box::new(Int16Builder::new(capacity)), - DataType::Int32 => Box::new(Int32Builder::new(capacity)), - DataType::Int64 => Box::new(Int64Builder::new(capacity)), - DataType::UInt8 => Box::new(UInt8Builder::new(capacity)), - DataType::UInt16 => Box::new(UInt16Builder::new(capacity)), - DataType::UInt32 => Box::new(UInt32Builder::new(capacity)), - DataType::UInt64 => Box::new(UInt64Builder::new(capacity)), - DataType::Float32 => Box::new(Float32Builder::new(capacity)), - DataType::Float64 => Box::new(Float64Builder::new(capacity)), - DataType::Binary => Box::new(BinaryBuilder::new(capacity)), - DataType::FixedSizeBinary(len) => { - Box::new(FixedSizeBinaryBuilder::new(capacity, *len)) - } - DataType::Decimal(precision, scale) => { - Box::new(DecimalBuilder::new(capacity, *precision, *scale)) - } - DataType::Utf8 => Box::new(StringBuilder::new(capacity)), - DataType::Date32 => Box::new(Date32Builder::new(capacity)), - DataType::Date64 => Box::new(Date64Builder::new(capacity)), - DataType::Time32(TimeUnit::Second) => { - Box::new(Time32SecondBuilder::new(capacity)) - } - DataType::Time32(TimeUnit::Millisecond) => { - Box::new(Time32MillisecondBuilder::new(capacity)) - } - DataType::Time64(TimeUnit::Microsecond) => { - Box::new(Time64MicrosecondBuilder::new(capacity)) - } - DataType::Time64(TimeUnit::Nanosecond) => { - Box::new(Time64NanosecondBuilder::new(capacity)) - } - DataType::Timestamp(TimeUnit::Second, _) => { - Box::new(TimestampSecondBuilder::new(capacity)) - } - DataType::Timestamp(TimeUnit::Millisecond, _) => { - Box::new(TimestampMillisecondBuilder::new(capacity)) - } - DataType::Timestamp(TimeUnit::Microsecond, _) => { - Box::new(TimestampMicrosecondBuilder::new(capacity)) - } - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Box::new(TimestampNanosecondBuilder::new(capacity)) - } - DataType::Interval(IntervalUnit::YearMonth) => { - Box::new(IntervalYearMonthBuilder::new(capacity)) - } - DataType::Interval(IntervalUnit::DayTime) => { - Box::new(IntervalDayTimeBuilder::new(capacity)) - } - DataType::Duration(TimeUnit::Second) => { - Box::new(DurationSecondBuilder::new(capacity)) - } - DataType::Duration(TimeUnit::Millisecond) => { - Box::new(DurationMillisecondBuilder::new(capacity)) - } - DataType::Duration(TimeUnit::Microsecond) => { - Box::new(DurationMicrosecondBuilder::new(capacity)) - } - DataType::Duration(TimeUnit::Nanosecond) => { - Box::new(DurationNanosecondBuilder::new(capacity)) - } - DataType::Struct(fields) => { - Box::new(StructBuilder::from_fields(fields.clone(), capacity)) - } - t => panic!("Data type {:?} is not currently supported", t), - } -} - -impl StructBuilder { - pub fn new(fields: Vec, field_builders: Vec>) -> Self { - Self { - fields, - field_builders, - bitmap_builder: BooleanBufferBuilder::new(0), - len: 0, - } - } - - pub fn from_fields(fields: Vec, capacity: usize) -> Self { - let mut builders = Vec::with_capacity(fields.len()); - for field in &fields { - builders.push(make_builder(field.data_type(), capacity)); - } - Self::new(fields, builders) - } - - /// Returns a mutable reference to the child field builder at index `i`. - /// Result will be `None` if the input type `T` provided doesn't match the actual - /// field builder's type. 
- pub fn field_builder(&mut self, i: usize) -> Option<&mut T> { - self.field_builders[i].as_any_mut().downcast_mut::() - } - - /// Returns the number of fields for the struct this builder is building. - pub fn num_fields(&self) -> usize { - self.field_builders.len() - } - - /// Appends an element (either null or non-null) to the struct. The actual elements - /// should be appended for each child sub-array in a consistent way. - #[inline] - pub fn append(&mut self, is_valid: bool) -> Result<()> { - self.bitmap_builder.append(is_valid); - self.len += 1; - Ok(()) - } - - /// Appends a null element to the struct. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.append(false) - } - - /// Builds the `StructArray` and reset this builder. - pub fn finish(&mut self) -> StructArray { - let mut child_data = Vec::with_capacity(self.field_builders.len()); - for f in &mut self.field_builders { - let arr = f.finish(); - child_data.push(arr.data().clone()); - } - - let null_bit_buffer = self.bitmap_builder.finish(); - let null_count = self.len - null_bit_buffer.count_set_bits(); - let mut builder = ArrayData::builder(DataType::Struct(self.fields.clone())) - .len(self.len) - .child_data(child_data); - if null_count > 0 { - builder = builder.null_bit_buffer(null_bit_buffer); - } - - self.len = 0; - - StructArray::from(builder.build()) - } -} - -/// `FieldData` is a helper struct to track the state of the fields in the `UnionBuilder`. -#[derive(Debug)] -struct FieldData { - /// The type id for this field - type_id: i8, - /// The Arrow data type represented in the `values_buffer`, which is untyped - data_type: DataType, - /// A buffer containing the values for this field in raw bytes - values_buffer: Option, - /// The number of array slots represented by the buffer - slots: usize, - /// A builder for the bitmap if required (for Sparse Unions) - bitmap_builder: Option, -} - -impl FieldData { - /// Creates a new `FieldData`. - fn new( - type_id: i8, - data_type: DataType, - bitmap_builder: Option, - ) -> Self { - Self { - type_id, - data_type, - values_buffer: Some(MutableBuffer::new(1)), - slots: 0, - bitmap_builder, - } - } - - /// Appends a single value to this `FieldData`'s `values_buffer`. - #[allow(clippy::unnecessary_wraps)] - fn append_to_values_buffer( - &mut self, - v: T::Native, - ) -> Result<()> { - let values_buffer = self - .values_buffer - .take() - .expect("Values buffer was never created"); - let mut builder: BufferBuilder = - mutable_buffer_to_builder(values_buffer, self.slots); - builder.append(v); - let mutable_buffer = builder_to_mutable_buffer(builder); - self.values_buffer = Some(mutable_buffer); - - self.slots += 1; - if let Some(b) = &mut self.bitmap_builder { - b.append(true) - }; - Ok(()) - } - - /// Appends a null to this `FieldData`. - #[allow(clippy::unnecessary_wraps)] - fn append_null(&mut self) -> Result<()> { - if let Some(b) = &mut self.bitmap_builder { - let values_buffer = self - .values_buffer - .take() - .expect("Values buffer was never created"); - let mut builder: BufferBuilder = - mutable_buffer_to_builder(values_buffer, self.slots); - builder.advance(1); - let mutable_buffer = builder_to_mutable_buffer(builder); - self.values_buffer = Some(mutable_buffer); - self.slots += 1; - b.append(false); - }; - Ok(()) - } - - /// Appends a null to this `FieldData` when the type is not known at compile time. 
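`StructBuilder` keeps one child builder per field (created through `make_builder` when `from_fields` is used), and the caller is responsible for appending to every child before calling `append` on the struct itself. A sketch; the field names "id" and "name" are arbitrary:

```rust
use arrow::array::{Array, Int32Builder, StringBuilder, StructBuilder};
use arrow::datatypes::{DataType, Field};

let fields = vec![
    Field::new("id", DataType::Int32, true),
    Field::new("name", DataType::Utf8, true),
];
let mut builder = StructBuilder::from_fields(fields, 4);

builder.field_builder::<Int32Builder>(0).unwrap().append_value(1).unwrap();
builder.field_builder::<StringBuilder>(1).unwrap().append_value("a").unwrap();
builder.append(true).unwrap();

builder.field_builder::<Int32Builder>(0).unwrap().append_null().unwrap();
builder.field_builder::<StringBuilder>(1).unwrap().append_null().unwrap();
builder.append(true).unwrap();

let array = builder.finish();
assert_eq!(array.len(), 2);
assert_eq!(array.num_columns(), 2);
```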
- /// - /// As the main `append` method of `UnionBuilder` is generic, we need a way to append null - /// slots to the fields that are not being appended to in the case of sparse unions. This - /// method solves this problem by appending dynamically based on `DataType`. - /// - /// Note, this method does **not** update the length of the `UnionArray` (this is done by the - /// main append operation) and assumes that it is called from a method that is generic over `T` - /// where `T` satisfies the bound `ArrowPrimitiveType`. - fn append_null_dynamic(&mut self) -> Result<()> { - match self.data_type { - DataType::Null => unimplemented!(), - DataType::Int8 => self.append_null::()?, - DataType::Int16 => self.append_null::()?, - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - self.append_null::()? - } - DataType::Int64 - | DataType::Timestamp(_, _) - | DataType::Date64 - | DataType::Time64(_) - | DataType::Interval(IntervalUnit::DayTime) - | DataType::Duration(_) => self.append_null::()?, - DataType::UInt8 => self.append_null::()?, - DataType::UInt16 => self.append_null::()?, - DataType::UInt32 => self.append_null::()?, - DataType::UInt64 => self.append_null::()?, - DataType::Float32 => self.append_null::()?, - DataType::Float64 => self.append_null::()?, - _ => unreachable!("All cases of types that satisfy the trait bounds over T are covered above."), - }; - Ok(()) - } -} - -/// Builder type for creating a new `UnionArray`. -#[derive(Debug)] -pub struct UnionBuilder { - /// The current number of slots in the array - len: usize, - /// Maps field names to `FieldData` instances which track the builders for that field - fields: HashMap, - /// Builder to keep track of type ids - type_id_builder: Int8BufferBuilder, - /// Builder to keep track of offsets (`None` for sparse unions) - value_offset_builder: Option, - /// Optional builder for null slots - bitmap_builder: Option, -} - -impl UnionBuilder { - /// Creates a new dense array builder. - pub fn new_dense(capacity: usize) -> Self { - Self { - len: 0, - fields: HashMap::default(), - type_id_builder: Int8BufferBuilder::new(capacity), - value_offset_builder: Some(Int32BufferBuilder::new(capacity)), - bitmap_builder: None, - } - } - - /// Creates a new sparse array builder. - pub fn new_sparse(capacity: usize) -> Self { - Self { - len: 0, - fields: HashMap::default(), - type_id_builder: Int8BufferBuilder::new(capacity), - value_offset_builder: None, - bitmap_builder: None, - } - } - - /// Appends a null to this builder. - #[inline] - pub fn append_null(&mut self) -> Result<()> { - if self.bitmap_builder.is_none() { - let mut builder = BooleanBufferBuilder::new(self.len + 1); - for _ in 0..self.len { - builder.append(true); - } - self.bitmap_builder = Some(builder) - } - self.bitmap_builder - .as_mut() - .expect("Cannot be None") - .append(false); - - self.type_id_builder.append(i8::default()); - - // Handle sparse union - if self.value_offset_builder.is_none() { - for (_, fd) in self.fields.iter_mut() { - fd.append_null_dynamic()?; - } - } - self.len += 1; - Ok(()) - } - - /// Appends a value to this builder. 
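For sparse unions every child must stay as long as the union itself, which is why appending a value (or a null) pads all the other children with nulls via `append_null_dynamic`. A sketch of the observable effect; the field names are arbitrary:

```rust
use arrow::array::{Array, UnionBuilder};
use arrow::datatypes::{Float64Type, Int32Type};

let mut builder = UnionBuilder::new_sparse(3);
builder.append::<Int32Type>("a", 1).unwrap();
builder.append_null().unwrap();
builder.append::<Float64Type>("b", 3.0).unwrap();
let union = builder.build().unwrap();

// Sparse layout: only a type-id buffer (no offsets), and every child
// array has the same length as the union.
assert_eq!(union.data().buffers().len(), 1);
assert_eq!(union.len(), 3);
assert!(union.is_null(1));
assert_eq!(union.data().child_data()[0].len(), 3);
assert_eq!(union.data().child_data()[1].len(), 3);
```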
- #[inline] - pub fn append( - &mut self, - type_name: &str, - v: T::Native, - ) -> Result<()> { - let type_name = type_name.to_string(); - - let mut field_data = match self.fields.remove(&type_name) { - Some(data) => data, - None => match self.value_offset_builder { - Some(_) => FieldData::new(self.fields.len() as i8, T::DATA_TYPE, None), - None => { - let mut fd = FieldData::new( - self.fields.len() as i8, - T::DATA_TYPE, - Some(BooleanBufferBuilder::new(1)), - ); - for _ in 0..self.len { - fd.append_null::()?; - } - fd - } - }, - }; - self.type_id_builder.append(field_data.type_id); - - match &mut self.value_offset_builder { - // Dense Union - Some(offset_builder) => { - offset_builder.append(field_data.slots as i32); - } - // Sparse Union - None => { - for (name, fd) in self.fields.iter_mut() { - if name != &type_name { - fd.append_null_dynamic()?; - } - } - } - } - field_data.append_to_values_buffer::(v)?; - self.fields.insert(type_name, field_data); - - // Update the bitmap builder if it exists - if let Some(b) = &mut self.bitmap_builder { - b.append(true); - } - self.len += 1; - Ok(()) - } - - /// Builds this builder creating a new `UnionArray`. - pub fn build(mut self) -> Result { - let type_id_buffer = self.type_id_builder.finish(); - let value_offsets_buffer = self.value_offset_builder.map(|mut b| b.finish()); - let mut children = Vec::new(); - for ( - name, - FieldData { - type_id, - data_type, - values_buffer, - slots, - bitmap_builder, - }, - ) in self.fields.into_iter() - { - let buffer = values_buffer - .expect("The `values_buffer` should only ever be None inside the `append` method.") - .into(); - let arr_data_builder = ArrayDataBuilder::new(data_type.clone()) - .add_buffer(buffer) - .len(slots); - // .build(); - let arr_data_ref = match bitmap_builder { - Some(mut bb) => arr_data_builder.null_bit_buffer(bb.finish()).build(), - None => arr_data_builder.build(), - }; - let array_ref = make_array(arr_data_ref); - children.push((type_id, (Field::new(&name, data_type, false), array_ref))) - } - - children.sort_by(|a, b| { - a.0.partial_cmp(&b.0) - .expect("This will never be None as type ids are always i8 values.") - }); - let children: Vec<_> = children.into_iter().map(|(_, b)| b).collect(); - let bitmap = self.bitmap_builder.map(|mut b| b.finish()); - - UnionArray::try_new(type_id_buffer, value_offsets_buffer, children, bitmap) - } -} - -/// Array builder for `DictionaryArray`. For example to map a set of byte indices -/// to f32 values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. -#[derive(Debug)] -pub struct PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - keys_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - map: HashMap, K::Native>, -} - -impl PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - /// Creates a new `PrimitiveDictionaryBuilder` from a keys builder and a value builder. - pub fn new( - keys_builder: PrimitiveBuilder, - values_builder: PrimitiveBuilder, - ) -> Self { - Self { - keys_builder, - values_builder, - map: HashMap::new(), - } - } -} - -impl ArrayBuilder for PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - /// Returns the builder as an non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as an mutable `Any` reference. 
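// A minimal usage sketch of the UnionBuilder shown above, assuming the
// pre-removal arrow 4.x re-exports from arrow::array; the field names "a"
// and "b" and the example function name are illustrative only.
use arrow::array::{Array, UnionBuilder};
use arrow::datatypes::{Float64Type, Int32Type};
use arrow::error::Result;

fn sparse_union_sketch() -> Result<()> {
    let mut builder = UnionBuilder::new_sparse(4);
    builder.append::<Int32Type>("a", 1)?;
    builder.append::<Float64Type>("b", 3.0)?;
    // appends a null slot and, for sparse unions, a null to every child field
    builder.append_null()?;
    builder.append::<Int32Type>("a", 4)?;
    let union = builder.build()?;
    assert_eq!(union.len(), 4);
    Ok(())
}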
- fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.keys_builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.keys_builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl PrimitiveDictionaryBuilder -where - K: ArrowPrimitiveType, - V: ArrowPrimitiveType, -{ - /// Append a primitive value to the array. Return an existing index - /// if already present in the values array or a new index if the - /// value is appended to the values array. - #[inline] - pub fn append(&mut self, value: V::Native) -> Result { - if let Some(&key) = self.map.get(value.to_byte_slice()) { - // Append existing value. - self.keys_builder.append_value(key)?; - Ok(key) - } else { - // Append new value. - let key = K::Native::from_usize(self.values_builder.len()) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; - self.values_builder.append_value(value)?; - self.keys_builder.append_value(key as K::Native)?; - self.map.insert(value.to_byte_slice().into(), key); - Ok(key) - } - } - - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.keys_builder.append_null() - } - - /// Builds the `DictionaryArray` and reset this builder. - pub fn finish(&mut self) -> DictionaryArray { - self.map.clear(); - let value_ref: ArrayRef = Arc::new(self.values_builder.finish()); - self.keys_builder.finish_dict(value_ref) - } -} - -/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices -/// to String values. Note that the use of a `HashMap` here will not scale to very large -/// arrays or result in an ordered dictionary. -/// -/// ``` -/// use arrow::{ -/// array::{ -/// Int8Array, StringArray, -/// PrimitiveBuilder, StringBuilder, StringDictionaryBuilder, -/// }, -/// datatypes::Int8Type, -/// }; -/// -/// // Create a dictionary array indexed by bytes whose values are Strings. -/// // It can thus hold up to 256 distinct string values. -/// -/// let key_builder = PrimitiveBuilder::::new(100); -/// let value_builder = StringBuilder::new(100); -/// let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); -/// -/// // The builder builds the dictionary value by value -/// builder.append("abc").unwrap(); -/// builder.append_null().unwrap(); -/// builder.append("def").unwrap(); -/// builder.append("def").unwrap(); -/// builder.append("abc").unwrap(); -/// let array = builder.finish(); -/// -/// assert_eq!( -/// array.keys(), -/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) -/// ); -/// -/// // Values are polymorphic and so require a downcast. -/// let av = array.values(); -/// let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); -/// -/// assert_eq!(ava.value(0), "abc"); -/// assert_eq!(ava.value(1), "def"); -/// -/// ``` -#[derive(Debug)] -pub struct StringDictionaryBuilder -where - K: ArrowDictionaryKeyType, -{ - keys_builder: PrimitiveBuilder, - values_builder: StringBuilder, - map: HashMap, K::Native>, -} - -impl StringDictionaryBuilder -where - K: ArrowDictionaryKeyType, -{ - /// Creates a new `StringDictionaryBuilder` from a keys builder and a value builder. 
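// A sketch of the PrimitiveDictionaryBuilder append/finish flow defined above,
// mirroring the tests later in this diff; UInt8 keys with UInt32 values are an
// illustrative choice.
use arrow::array::{PrimitiveBuilder, PrimitiveDictionaryBuilder, UInt8Array};
use arrow::datatypes::{UInt32Type, UInt8Type};
use arrow::error::Result;

fn primitive_dictionary_sketch() -> Result<()> {
    let keys = PrimitiveBuilder::<UInt8Type>::new(4);
    let values = PrimitiveBuilder::<UInt32Type>::new(4);
    let mut builder = PrimitiveDictionaryBuilder::new(keys, values);
    builder.append(100)?;
    builder.append(200)?;
    builder.append_null()?;
    builder.append(100)?; // duplicate value, so dictionary key 0 is re-used
    let array = builder.finish();
    assert_eq!(
        array.keys(),
        &UInt8Array::from(vec![Some(0), Some(1), None, Some(0)])
    );
    Ok(())
}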
- pub fn new(keys_builder: PrimitiveBuilder, values_builder: StringBuilder) -> Self { - Self { - keys_builder, - values_builder, - map: HashMap::new(), - } - } - - /// Creates a new `StringDictionaryBuilder` from a keys builder and a dictionary - /// which is initialized with the given values. - /// The indices of those dictionary values are used as keys. - /// - /// # Example - /// - /// ``` - /// use arrow::datatypes::Int16Type; - /// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder, Int16Array}; - /// use std::convert::TryFrom; - /// - /// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]); - /// - /// let mut builder = StringDictionaryBuilder::new_with_dictionary(PrimitiveBuilder::::new(3), &dictionary_values).unwrap(); - /// builder.append("def").unwrap(); - /// builder.append_null().unwrap(); - /// builder.append("abc").unwrap(); - /// - /// let dictionary_array = builder.finish(); - /// - /// let keys = dictionary_array.keys(); - /// - /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)])); - /// ``` - pub fn new_with_dictionary( - keys_builder: PrimitiveBuilder, - dictionary_values: &StringArray, - ) -> Result { - let dict_len = dictionary_values.len(); - let mut values_builder = - StringBuilder::with_capacity(dict_len, dictionary_values.value_data().len()); - let mut map: HashMap, K::Native> = HashMap::with_capacity(dict_len); - for i in 0..dict_len { - if dictionary_values.is_valid(i) { - let value = dictionary_values.value(i); - map.insert( - value.as_bytes().into(), - K::Native::from_usize(i) - .ok_or(ArrowError::DictionaryKeyOverflowError)?, - ); - values_builder.append_value(value)?; - } else { - values_builder.append_null()?; - } - } - Ok(Self { - keys_builder, - values_builder, - map, - }) - } -} - -impl ArrayBuilder for StringDictionaryBuilder -where - K: ArrowDictionaryKeyType, -{ - /// Returns the builder as an non-mutable `Any` reference. - fn as_any(&self) -> &Any { - self - } - - /// Returns the builder as an mutable `Any` reference. - fn as_any_mut(&mut self) -> &mut Any { - self - } - - /// Returns the boxed builder as a box of `Any`. - fn into_box_any(self: Box) -> Box { - self - } - - /// Returns the number of array slots in the builder - fn len(&self) -> usize { - self.keys_builder.len() - } - - /// Returns whether the number of array slots is zero - fn is_empty(&self) -> bool { - self.keys_builder.is_empty() - } - - /// Builds the array and reset this builder. - fn finish(&mut self) -> ArrayRef { - Arc::new(self.finish()) - } -} - -impl StringDictionaryBuilder -where - K: ArrowDictionaryKeyType, -{ - /// Append a primitive value to the array. Return an existing index - /// if already present in the values array or a new index if the - /// value is appended to the values array. - pub fn append(&mut self, value: impl AsRef) -> Result { - if let Some(&key) = self.map.get(value.as_ref().as_bytes()) { - // Append existing value. - self.keys_builder.append_value(key)?; - Ok(key) - } else { - // Append new value. - let key = K::Native::from_usize(self.values_builder.len()) - .ok_or(ArrowError::DictionaryKeyOverflowError)?; - self.values_builder.append_value(value.as_ref())?; - self.keys_builder.append_value(key as K::Native)?; - self.map.insert(value.as_ref().as_bytes().into(), key); - Ok(key) - } - } - - #[inline] - pub fn append_null(&mut self) -> Result<()> { - self.keys_builder.append_null() - } - - /// Builds the `DictionaryArray` and reset this builder. 
- pub fn finish(&mut self) -> DictionaryArray { - self.map.clear(); - let value_ref: ArrayRef = Arc::new(self.values_builder.finish()); - self.keys_builder.finish_dict(value_ref) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::array::Array; - use crate::bitmap::Bitmap; - - #[test] - fn test_builder_i32_empty() { - let mut b = Int32BufferBuilder::new(5); - assert_eq!(0, b.len()); - assert_eq!(16, b.capacity()); - let a = b.finish(); - assert_eq!(0, a.len()); - } - - #[test] - fn test_builder_i32_alloc_zero_bytes() { - let mut b = Int32BufferBuilder::new(0); - b.append(123); - let a = b.finish(); - assert_eq!(4, a.len()); - } - - #[test] - fn test_builder_i32() { - let mut b = Int32BufferBuilder::new(5); - for i in 0..5 { - b.append(i); - } - assert_eq!(16, b.capacity()); - let a = b.finish(); - assert_eq!(20, a.len()); - } - - #[test] - fn test_builder_i32_grow_buffer() { - let mut b = Int32BufferBuilder::new(2); - assert_eq!(16, b.capacity()); - for i in 0..20 { - b.append(i); - } - assert_eq!(32, b.capacity()); - let a = b.finish(); - assert_eq!(80, a.len()); - } - - #[test] - fn test_builder_finish() { - let mut b = Int32BufferBuilder::new(5); - assert_eq!(16, b.capacity()); - for i in 0..10 { - b.append(i); - } - let mut a = b.finish(); - assert_eq!(40, a.len()); - assert_eq!(0, b.len()); - assert_eq!(0, b.capacity()); - - // Try build another buffer after cleaning up. - for i in 0..20 { - b.append(i) - } - assert_eq!(32, b.capacity()); - a = b.finish(); - assert_eq!(80, a.len()); - } - - #[test] - fn test_reserve() { - let mut b = UInt8BufferBuilder::new(2); - assert_eq!(64, b.capacity()); - b.reserve(64); - assert_eq!(64, b.capacity()); - b.reserve(65); - assert_eq!(128, b.capacity()); - - let mut b = Int32BufferBuilder::new(2); - assert_eq!(16, b.capacity()); - b.reserve(16); - assert_eq!(16, b.capacity()); - b.reserve(17); - assert_eq!(32, b.capacity()); - } - - #[test] - fn test_append_slice() { - let mut b = UInt8BufferBuilder::new(0); - b.append_slice(b"Hello, "); - b.append_slice(b"World!"); - let buffer = b.finish(); - assert_eq!(13, buffer.len()); - - let mut b = Int32BufferBuilder::new(0); - b.append_slice(&[32, 54]); - let buffer = b.finish(); - assert_eq!(8, buffer.len()); - } - - #[test] - fn test_append_values() -> Result<()> { - let mut a = Int8Builder::new(0); - a.append_value(1)?; - a.append_null()?; - a.append_value(-2)?; - assert_eq!(a.len(), 3); - - // append values - let values = &[1, 2, 3, 4]; - let is_valid = &[true, true, false, true]; - a.append_values(values, is_valid)?; - - assert_eq!(a.len(), 7); - let array = a.finish(); - assert_eq!(array.value(0), 1); - assert_eq!(array.is_null(1), true); - assert_eq!(array.value(2), -2); - assert_eq!(array.value(3), 1); - assert_eq!(array.value(4), 2); - assert_eq!(array.is_null(5), true); - assert_eq!(array.value(6), 4); - - Ok(()) - } - - #[test] - fn test_write_bytes() { - let mut b = BooleanBufferBuilder::new(4); - b.append(false); - b.append(true); - b.append(false); - b.append(true); - assert_eq!(4, b.len()); - assert_eq!(512, b.capacity()); - let buffer = b.finish(); - assert_eq!(1, buffer.len()); - - let mut b = BooleanBufferBuilder::new(4); - b.append_slice(&[false, true, false, true]); - assert_eq!(4, b.len()); - assert_eq!(512, b.capacity()); - let buffer = b.finish(); - assert_eq!(1, buffer.len()); - } - - #[test] - fn test_boolean_array_builder_append_slice() { - let arr1 = - BooleanArray::from(vec![Some(true), Some(false), None, None, Some(false)]); - - let mut builder = 
BooleanArray::builder(0); - builder.append_slice(&[true, false]).unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - builder.append_value(false).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1, arr2); - } - - #[test] - fn test_boolean_array_builder_append_slice_large() { - let arr1 = BooleanArray::from(vec![true; 513]); - - let mut builder = BooleanArray::builder(512); - builder.append_slice(&[true; 513]).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1, arr2); - } - - #[test] - fn test_boolean_builder_increases_buffer_len() { - // 00000010 01001000 - let buf = Buffer::from([72_u8, 2_u8]); - let mut builder = BooleanBufferBuilder::new(8); - - for i in 0..16 { - if i == 3 || i == 6 || i == 9 { - builder.append(true); - } else { - builder.append(false); - } - } - let buf2 = builder.finish(); - - assert_eq!(buf.len(), buf2.len()); - assert_eq!(buf.as_slice(), buf2.as_slice()); - } - - #[test] - fn test_primitive_array_builder_i32() { - let mut builder = Int32Array::builder(5); - for i in 0..5 { - builder.append_value(i).unwrap(); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_date32() { - let mut builder = Date32Array::builder(5); - for i in 0..5 { - builder.append_value(i).unwrap(); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i32, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_timestamp_second() { - let mut builder = TimestampSecondArray::builder(5); - for i in 0..5 { - builder.append_value(i).unwrap(); - } - let arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..5 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i as i64, arr.value(i)); - } - } - - #[test] - fn test_primitive_array_builder_bool() { - // 00000010 01001000 - let buf = Buffer::from([72_u8, 2_u8]); - let mut builder = BooleanArray::builder(10); - for i in 0..10 { - if i == 3 || i == 6 || i == 9 { - builder.append_value(true).unwrap(); - } else { - builder.append_value(false).unwrap(); - } - } - - let arr = builder.finish(); - assert_eq!(&buf, arr.values()); - assert_eq!(10, arr.len()); - assert_eq!(0, arr.offset()); - assert_eq!(0, arr.null_count()); - for i in 0..10 { - assert!(!arr.is_null(i)); - assert!(arr.is_valid(i)); - assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at {}", i) - } - } - - #[test] - fn test_primitive_array_builder_append_option() { - let arr1 = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); - - let mut builder = Int32Array::builder(5); - builder.append_option(Some(0)).unwrap(); - builder.append_option(None).unwrap(); - builder.append_option(Some(2)).unwrap(); - builder.append_option(None).unwrap(); - builder.append_option(Some(4)).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - 
} - } - } - - #[test] - fn test_primitive_array_builder_append_null() { - let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); - - let mut builder = Int32Array::builder(5); - builder.append_value(0).unwrap(); - builder.append_value(2).unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - builder.append_value(4).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - } - } - } - - #[test] - fn test_primitive_array_builder_append_slice() { - let arr1 = Int32Array::from(vec![Some(0), Some(2), None, None, Some(4)]); - - let mut builder = Int32Array::builder(5); - builder.append_slice(&[0, 2]).unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - builder.append_value(4).unwrap(); - let arr2 = builder.finish(); - - assert_eq!(arr1.len(), arr2.len()); - assert_eq!(arr1.offset(), arr2.offset()); - assert_eq!(arr1.null_count(), arr2.null_count()); - for i in 0..5 { - assert_eq!(arr1.is_null(i), arr2.is_null(i)); - assert_eq!(arr1.is_valid(i), arr2.is_valid(i)); - if arr1.is_valid(i) { - assert_eq!(arr1.value(i), arr2.value(i)); - } - } - } - - #[test] - fn test_primitive_array_builder_finish() { - let mut builder = Int32Builder::new(5); - builder.append_slice(&[2, 4, 6, 8]).unwrap(); - let mut arr = builder.finish(); - assert_eq!(4, arr.len()); - assert_eq!(0, builder.len()); - - builder.append_slice(&[1, 3, 5, 7, 9]).unwrap(); - arr = builder.finish(); - assert_eq!(5, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_list_array_builder() { - let values_builder = Int32Builder::new(10); - let mut builder = ListBuilder::new(values_builder); - - // [[0, 1, 2], [3, 4, 5], [6, 7]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_value(4).unwrap(); - builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - let values = list_array.values().data().buffers()[0].clone(); - assert_eq!(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]), values); - assert_eq!( - Buffer::from_slice_ref(&[0, 3, 6, 8]), - list_array.data().buffers()[0].clone() - ); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offsets()[2]); - assert_eq!(2, list_array.value_length(2)); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - } - - #[test] - fn test_large_list_array_builder() { - let values_builder = Int32Builder::new(10); - let mut builder = LargeListBuilder::new(values_builder); - - // [[0, 1, 2], [3, 4, 5], [6, 7]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_value(4).unwrap(); - builder.values().append_value(5).unwrap(); - 
builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - let values = list_array.values().data().buffers()[0].clone(); - assert_eq!(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]), values); - assert_eq!( - Buffer::from_slice_ref(&[0i64, 3, 6, 8]), - list_array.data().buffers()[0].clone() - ); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offsets()[2]); - assert_eq!(2, list_array.value_length(2)); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - } - - #[test] - fn test_list_array_builder_nulls() { - let values_builder = Int32Builder::new(10); - let mut builder = ListBuilder::new(values_builder); - - // [[0, 1, 2], null, [3, null, 5], [6, 7]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(3, list_array.value_offsets()[2]); - assert_eq!(3, list_array.value_length(2)); - } - - #[test] - fn test_large_list_array_builder_nulls() { - let values_builder = Int32Builder::new(10); - let mut builder = LargeListBuilder::new(values_builder); - - // [[0, 1, 2], null, [3, null, 5], [6, 7]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.append(true).unwrap(); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(3, list_array.value_offsets()[2]); - assert_eq!(3, list_array.value_length(2)); - } - - #[test] - fn test_fixed_size_list_array_builder() { - let values_builder = Int32Builder::new(10); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - // [[0, 1, 2], null, [3, null, 5], [6, 7, null]] - builder.values().append_value(0).unwrap(); - builder.values().append_value(1).unwrap(); - builder.values().append_value(2).unwrap(); - builder.append(true).unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_null().unwrap(); - builder.append(false).unwrap(); - builder.values().append_value(3).unwrap(); - builder.values().append_null().unwrap(); - builder.values().append_value(5).unwrap(); - builder.append(true).unwrap(); - builder.values().append_value(6).unwrap(); - builder.values().append_value(7).unwrap(); - builder.values().append_null().unwrap(); - 
builder.append(true).unwrap(); - let list_array = builder.finish(); - - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - } - - #[test] - fn test_list_array_builder_finish() { - let values_builder = Int32Array::builder(5); - let mut builder = ListBuilder::new(values_builder); - - builder.values().append_slice(&[1, 2, 3]).unwrap(); - builder.append(true).unwrap(); - builder.values().append_slice(&[4, 5, 6]).unwrap(); - builder.append(true).unwrap(); - - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert_eq!(0, builder.len()); - - builder.values().append_slice(&[7, 8, 9]).unwrap(); - builder.append(true).unwrap(); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_fixed_size_list_array_builder_empty() { - let values_builder = Int32Array::builder(5); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - let arr = builder.finish(); - assert_eq!(0, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_fixed_size_list_array_builder_finish() { - let values_builder = Int32Array::builder(5); - let mut builder = FixedSizeListBuilder::new(values_builder, 3); - - builder.values().append_slice(&[1, 2, 3]).unwrap(); - builder.append(true).unwrap(); - builder.values().append_slice(&[4, 5, 6]).unwrap(); - builder.append(true).unwrap(); - - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert_eq!(0, builder.len()); - - builder.values().append_slice(&[7, 8, 9]).unwrap(); - builder.append(true).unwrap(); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_list_list_array_builder() { - let primitive_builder = Int32Builder::new(10); - let values_builder = ListBuilder::new(primitive_builder); - let mut builder = ListBuilder::new(values_builder); - - // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] - builder.values().values().append_value(1).unwrap(); - builder.values().values().append_value(2).unwrap(); - builder.values().append(true).unwrap(); - builder.values().values().append_value(3).unwrap(); - builder.values().values().append_value(4).unwrap(); - builder.values().append(true).unwrap(); - builder.append(true).unwrap(); - - builder.values().values().append_value(5).unwrap(); - builder.values().values().append_value(6).unwrap(); - builder.values().values().append_value(7).unwrap(); - builder.values().append(true).unwrap(); - builder.values().append(false).unwrap(); - builder.values().values().append_value(8).unwrap(); - builder.values().append(true).unwrap(); - builder.append(true).unwrap(); - - builder.append(false).unwrap(); - - builder.values().values().append_value(9).unwrap(); - builder.values().values().append_value(10).unwrap(); - builder.values().append(true).unwrap(); - builder.append(true).unwrap(); - - let list_array = builder.finish(); - - assert_eq!(4, list_array.len()); - assert_eq!(1, list_array.null_count()); - assert_eq!( - Buffer::from_slice_ref(&[0, 2, 5, 5, 6]), - list_array.data().buffers()[0].clone() - ); - - assert_eq!(6, list_array.values().data().len()); - assert_eq!(1, list_array.values().data().null_count()); - assert_eq!( - Buffer::from_slice_ref(&[0, 2, 4, 7, 7, 8, 10]), - list_array.values().data().buffers()[0].clone() - ); - - assert_eq!(10, list_array.values().data().child_data()[0].len()); - assert_eq!(0, 
list_array.values().data().child_data()[0].null_count()); - assert_eq!( - Buffer::from_slice_ref(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - list_array.values().data().child_data()[0].buffers()[0].clone() - ); - } - - #[test] - fn test_binary_array_builder() { - let mut builder = BinaryBuilder::new(20); - - builder.append_byte(b'h').unwrap(); - builder.append_byte(b'e').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'o').unwrap(); - builder.append(true).unwrap(); - builder.append(true).unwrap(); - builder.append_byte(b'w').unwrap(); - builder.append_byte(b'o').unwrap(); - builder.append_byte(b'r').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'd').unwrap(); - builder.append(true).unwrap(); - - let binary_array = builder.finish(); - - assert_eq!(3, binary_array.len()); - assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); - assert_eq!([] as [u8; 0], binary_array.value(1)); - assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); - assert_eq!(5, binary_array.value_offsets()[2]); - assert_eq!(5, binary_array.value_length(2)); - } - - #[test] - fn test_large_binary_array_builder() { - let mut builder = LargeBinaryBuilder::new(20); - - builder.append_byte(b'h').unwrap(); - builder.append_byte(b'e').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'o').unwrap(); - builder.append(true).unwrap(); - builder.append(true).unwrap(); - builder.append_byte(b'w').unwrap(); - builder.append_byte(b'o').unwrap(); - builder.append_byte(b'r').unwrap(); - builder.append_byte(b'l').unwrap(); - builder.append_byte(b'd').unwrap(); - builder.append(true).unwrap(); - - let binary_array = builder.finish(); - - assert_eq!(3, binary_array.len()); - assert_eq!(0, binary_array.null_count()); - assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0)); - assert_eq!([] as [u8; 0], binary_array.value(1)); - assert_eq!([b'w', b'o', b'r', b'l', b'd'], binary_array.value(2)); - assert_eq!(5, binary_array.value_offsets()[2]); - assert_eq!(5, binary_array.value_length(2)); - } - - #[test] - fn test_string_array_builder() { - let mut builder = StringBuilder::new(20); - - builder.append_value("hello").unwrap(); - builder.append(true).unwrap(); - builder.append_value("world").unwrap(); - - let string_array = builder.finish(); - - assert_eq!(3, string_array.len()); - assert_eq!(0, string_array.null_count()); - assert_eq!("hello", string_array.value(0)); - assert_eq!("", string_array.value(1)); - assert_eq!("world", string_array.value(2)); - assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(5, string_array.value_length(2)); - } - - #[test] - fn test_fixed_size_binary_builder() { - let mut builder = FixedSizeBinaryBuilder::new(15, 5); - - // [b"hello", null, "arrow"] - builder.append_value(b"hello").unwrap(); - builder.append_null().unwrap(); - builder.append_value(b"arrow").unwrap(); - let fixed_size_binary_array: FixedSizeBinaryArray = builder.finish(); - - assert_eq!( - &DataType::FixedSizeBinary(5), - fixed_size_binary_array.data_type() - ); - assert_eq!(3, fixed_size_binary_array.len()); - assert_eq!(1, fixed_size_binary_array.null_count()); - assert_eq!(10, fixed_size_binary_array.value_offset(2)); - assert_eq!(5, fixed_size_binary_array.value_length()); - } - - #[test] - fn test_decimal_builder() { - let mut builder = DecimalBuilder::new(30, 23, 6); - - builder.append_value(8_887_000_000).unwrap(); - 
builder.append_null().unwrap(); - builder.append_value(-8_887_000_000).unwrap(); - let decimal_array: DecimalArray = builder.finish(); - - assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); - assert_eq!(3, decimal_array.len()); - assert_eq!(1, decimal_array.null_count()); - assert_eq!(32, decimal_array.value_offset(2)); - assert_eq!(16, decimal_array.value_length()); - } - - #[test] - fn test_string_array_builder_finish() { - let mut builder = StringBuilder::new(10); - - builder.append_value("hello").unwrap(); - builder.append_value("world").unwrap(); - - let mut arr = builder.finish(); - assert_eq!(2, arr.len()); - assert_eq!(0, builder.len()); - - builder.append_value("arrow").unwrap(); - arr = builder.finish(); - assert_eq!(1, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_string_array_builder_append_string() { - let mut builder = StringBuilder::new(20); - - let var = "hello".to_owned(); - builder.append_value(&var).unwrap(); - builder.append(true).unwrap(); - builder.append_value("world").unwrap(); - - let string_array = builder.finish(); - - assert_eq!(3, string_array.len()); - assert_eq!(0, string_array.null_count()); - assert_eq!("hello", string_array.value(0)); - assert_eq!("", string_array.value(1)); - assert_eq!("world", string_array.value(2)); - assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(5, string_array.value_length(2)); - } - - #[test] - fn test_struct_array_builder() { - let string_builder = StringBuilder::new(4); - let int_builder = Int32Builder::new(4); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Utf8, false)); - field_builders.push(Box::new(string_builder) as Box); - fields.push(Field::new("f2", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - assert_eq!(2, builder.num_fields()); - - let string_builder = builder - .field_builder::(0) - .expect("builder at field 0 should be string builder"); - string_builder.append_value("joe").unwrap(); - string_builder.append_null().unwrap(); - string_builder.append_null().unwrap(); - string_builder.append_value("mark").unwrap(); - - let int_builder = builder - .field_builder::(1) - .expect("builder at field 1 should be int builder"); - int_builder.append_value(1).unwrap(); - int_builder.append_value(2).unwrap(); - int_builder.append_null().unwrap(); - int_builder.append_value(4).unwrap(); - - builder.append(true).unwrap(); - builder.append(true).unwrap(); - builder.append_null().unwrap(); - builder.append(true).unwrap(); - - let arr = builder.finish(); - - let struct_data = arr.data(); - assert_eq!(4, struct_data.len()); - assert_eq!(1, struct_data.null_count()); - assert_eq!( - &Some(Bitmap::from(Buffer::from(&[11_u8]))), - struct_data.null_bitmap() - ); - - let expected_string_data = ArrayData::builder(DataType::Utf8) - .len(4) - .null_bit_buffer(Buffer::from(&[9_u8])) - .add_buffer(Buffer::from_slice_ref(&[0, 3, 3, 3, 7])) - .add_buffer(Buffer::from_slice_ref(b"joemark")) - .build(); - - let expected_int_data = ArrayData::builder(DataType::Int32) - .len(4) - .null_bit_buffer(Buffer::from_slice_ref(&[11_u8])) - .add_buffer(Buffer::from_slice_ref(&[1, 2, 0, 4])) - .build(); - - assert_eq!(&expected_string_data, arr.column(0).data()); - - // TODO: implement equality for ArrayData - assert_eq!(expected_int_data.len(), arr.column(1).data().len()); - assert_eq!( - expected_int_data.null_count(), - 
arr.column(1).data().null_count() - ); - assert_eq!( - expected_int_data.null_bitmap(), - arr.column(1).data().null_bitmap() - ); - let expected_value_buf = expected_int_data.buffers()[0].clone(); - let actual_value_buf = arr.column(1).data().buffers()[0].clone(); - for i in 0..expected_int_data.len() { - if !expected_int_data.is_null(i) { - assert_eq!( - expected_value_buf.as_slice()[i * 4..(i + 1) * 4], - actual_value_buf.as_slice()[i * 4..(i + 1) * 4] - ); - } - } - } - - #[test] - fn test_struct_array_builder_finish() { - let int_builder = Int32Builder::new(10); - let bool_builder = BooleanBuilder::new(10); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - fields.push(Field::new("f2", DataType::Boolean, false)); - field_builders.push(Box::new(bool_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - builder - .field_builder::(0) - .unwrap() - .append_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) - .unwrap(); - builder - .field_builder::(1) - .unwrap() - .append_slice(&[ - false, true, false, true, false, true, false, true, false, true, - ]) - .unwrap(); - - // Append slot values - all are valid. - for _ in 0..10 { - assert!(builder.append(true).is_ok()) - } - - assert_eq!(10, builder.len()); - - let arr = builder.finish(); - - assert_eq!(10, arr.len()); - assert_eq!(0, builder.len()); - - builder - .field_builder::(0) - .unwrap() - .append_slice(&[1, 3, 5, 7, 9]) - .unwrap(); - builder - .field_builder::(1) - .unwrap() - .append_slice(&[false, true, false, true, false]) - .unwrap(); - - // Append slot values - all are valid. - for _ in 0..5 { - assert!(builder.append(true).is_ok()) - } - - assert_eq!(5, builder.len()); - - let arr = builder.finish(); - - assert_eq!(5, arr.len()); - assert_eq!(0, builder.len()); - } - - #[test] - fn test_struct_array_builder_from_schema() { - let mut fields = Vec::new(); - fields.push(Field::new("f1", DataType::Float32, false)); - fields.push(Field::new("f2", DataType::Utf8, false)); - let mut sub_fields = Vec::new(); - sub_fields.push(Field::new("g1", DataType::Int32, false)); - sub_fields.push(Field::new("g2", DataType::Boolean, false)); - let struct_type = DataType::Struct(sub_fields); - fields.push(Field::new("f3", struct_type, false)); - - let mut builder = StructBuilder::from_fields(fields, 5); - assert_eq!(3, builder.num_fields()); - assert!(builder.field_builder::(0).is_some()); - assert!(builder.field_builder::(1).is_some()); - assert!(builder.field_builder::(2).is_some()); - } - - #[test] - #[should_panic( - expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }) is not currently supported" - )] - fn test_struct_array_builder_from_schema_unsupported_type() { - let mut fields = Vec::new(); - fields.push(Field::new("f1", DataType::Int16, false)); - let list_type = - DataType::List(Box::new(Field::new("item", DataType::Int64, true))); - fields.push(Field::new("f2", list_type, false)); - - let _ = StructBuilder::from_fields(fields, 5); - } - - #[test] - fn test_struct_array_builder_field_builder_type_mismatch() { - let int_builder = Int32Builder::new(10); - - let mut fields = Vec::new(); - let mut field_builders = Vec::new(); - fields.push(Field::new("f1", DataType::Int32, false)); - field_builders.push(Box::new(int_builder) as Box); - - let mut builder = StructBuilder::new(fields, field_builders); - 
assert!(builder.field_builder::(0).is_none()); - } - - #[test] - fn test_primitive_dictionary_builder() { - let key_builder = PrimitiveBuilder::::new(3); - let value_builder = PrimitiveBuilder::::new(2); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); - builder.append(12345678).unwrap(); - builder.append_null().unwrap(); - builder.append(22345678).unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &UInt8Array::from(vec![Some(0), None, Some(1)]) - ); - - // Values are polymorphic and so require a downcast. - let av = array.values(); - let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); - let avs: &[u32] = ava.values(); - - assert_eq!(array.is_null(0), false); - assert_eq!(array.is_null(1), true); - assert_eq!(array.is_null(2), false); - - assert_eq!(avs, &[12345678, 22345678]); - } - - #[test] - fn test_string_dictionary_builder() { - let key_builder = PrimitiveBuilder::::new(5); - let value_builder = StringBuilder::new(2); - let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); - builder.append("abc").unwrap(); - builder.append_null().unwrap(); - builder.append("def").unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)]) - ); - - // Values are polymorphic and so require a downcast. - let av = array.values(); - let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); - - assert_eq!(ava.value(0), "abc"); - assert_eq!(ava.value(1), "def"); - } - - #[test] - fn test_string_dictionary_builder_with_existing_dictionary() { - let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]); - - let key_builder = PrimitiveBuilder::::new(6); - let mut builder = - StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary) - .unwrap(); - builder.append("abc").unwrap(); - builder.append_null().unwrap(); - builder.append("def").unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); - builder.append("ghi").unwrap(); - let array = builder.finish(); - - assert_eq!( - array.keys(), - &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)]) - ); - - // Values are polymorphic and so require a downcast. 
- let av = array.values(); - let ava: &StringArray = av.as_any().downcast_ref::().unwrap(); - - assert_eq!(ava.is_valid(0), false); - assert_eq!(ava.value(1), "def"); - assert_eq!(ava.value(2), "abc"); - assert_eq!(ava.value(3), "ghi"); - } - - #[test] - fn test_string_dictionary_builder_with_reserved_null_value() { - let dictionary: Vec> = vec![None]; - let dictionary = StringArray::from(dictionary); - - let key_builder = PrimitiveBuilder::::new(4); - let mut builder = - StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary) - .unwrap(); - builder.append("abc").unwrap(); - builder.append_null().unwrap(); - builder.append("def").unwrap(); - builder.append("abc").unwrap(); - let array = builder.finish(); - - assert_eq!(array.is_null(1), true); - assert_eq!(array.is_valid(1), false); - - let keys = array.keys_array(); - - assert_eq!(keys.value(0), 1); - assert_eq!(keys.is_null(1), true); - // zero initialization is currently guaranteed by Buffer allocation and resizing - assert_eq!(keys.value(1), 0); - assert_eq!(keys.value(2), 2); - assert_eq!(keys.value(3), 1); - } - - #[test] - #[should_panic(expected = "DictionaryKeyOverflowError")] - fn test_primitive_dictionary_overflow() { - let key_builder = PrimitiveBuilder::::new(257); - let value_builder = PrimitiveBuilder::::new(257); - let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); - // 256 unique keys. - for i in 0..256 { - builder.append(i + 1000).unwrap(); - } - // Special error if the key overflows (256th entry) - builder.append(1257).unwrap(); - } -} diff --git a/rust/arrow/src/array/cast.rs b/rust/arrow/src/array/cast.rs deleted file mode 100644 index 0477f2831f9..00000000000 --- a/rust/arrow/src/array/cast.rs +++ /dev/null @@ -1,84 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Defines helper functions for force Array type downcast - -use crate::array::*; -use crate::datatypes::*; - -/// Force downcast ArrayRef to PrimitiveArray -pub fn as_primitive_array(arr: &ArrayRef) -> &PrimitiveArray -where - T: ArrowPrimitiveType, -{ - arr.as_any() - .downcast_ref::>() - .expect("Unable to downcast to primitive array") -} - -/// Force downcast ArrayRef to DictionaryArray -pub fn as_dictionary_array(arr: &ArrayRef) -> &DictionaryArray -where - T: ArrowDictionaryKeyType, -{ - arr.as_any() - .downcast_ref::>() - .expect("Unable to downcast to dictionary array") -} - -#[doc = "Force downcast ArrayRef to GenericListArray"] -pub fn as_generic_list_array(arr: &ArrayRef) -> &GenericListArray { - arr.as_any() - .downcast_ref::>() - .expect("Unable to downcast to list array") -} - -#[doc = "Force downcast ArrayRef to ListArray"] -#[inline] -pub fn as_list_array(arr: &ArrayRef) -> &ListArray { - as_generic_list_array::(arr) -} - -#[doc = "Force downcast ArrayRef to LargeListArray"] -#[inline] -pub fn as_large_list_array(arr: &ArrayRef) -> &LargeListArray { - as_generic_list_array::(arr) -} - -macro_rules! array_downcast_fn { - ($name: ident, $arrty: ty, $arrty_str:expr) => { - #[doc = "Force downcast ArrayRef to "] - #[doc = $arrty_str] - pub fn $name(arr: &ArrayRef) -> &$arrty { - arr.as_any().downcast_ref::<$arrty>().expect(concat!( - "Unable to downcast to typed array through ", - stringify!($name) - )) - } - }; - - // use recursive macro to generate dynamic doc string for a given array type - ($name: ident, $arrty: ty) => { - array_downcast_fn!($name, $arrty, stringify!($arrty)); - }; -} - -array_downcast_fn!(as_string_array, StringArray); -array_downcast_fn!(as_largestring_array, LargeStringArray); -array_downcast_fn!(as_boolean_array, BooleanArray); -array_downcast_fn!(as_null_array, NullArray); -array_downcast_fn!(as_struct_array, StructArray); diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs deleted file mode 100644 index 7ae3858e35c..00000000000 --- a/rust/arrow/src/array/data.rs +++ /dev/null @@ -1,679 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates -//! common attributes and operations for Arrow array. 
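// A short sketch of the downcast helpers from cast.rs above: they recover a
// concrete array type from a dynamically typed ArrayRef and panic if the
// underlying type does not match. Variable names are illustrative.
use std::sync::Arc;

use arrow::array::{as_primitive_array, as_string_array, ArrayRef, Int32Array, StringArray};
use arrow::datatypes::Int32Type;

fn downcast_sketch() {
    let int_arr: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let str_arr: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"]));

    let ints: &Int32Array = as_primitive_array::<Int32Type>(&int_arr);
    assert_eq!(ints.value(2), 3);

    let strings: &StringArray = as_string_array(&str_arr);
    assert_eq!(strings.value(1), "b");
}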
- -use std::mem; -use std::sync::Arc; - -use crate::datatypes::{DataType, IntervalUnit}; -use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; -use crate::{ - buffer::{Buffer, MutableBuffer}, - util::bit_util, -}; - -use super::equal::equal; - -#[inline] -pub(crate) fn count_nulls( - null_bit_buffer: Option<&Buffer>, - offset: usize, - len: usize, -) -> usize { - if let Some(buf) = null_bit_buffer { - len.checked_sub(buf.count_set_bits_offset(offset, len)) - .unwrap() - } else { - 0 - } -} - -/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots). -#[inline] -pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] { - let empty_buffer = MutableBuffer::new(0); - match data_type { - DataType::Null => [empty_buffer, MutableBuffer::new(0)], - DataType::Boolean => { - let bytes = bit_util::ceil(capacity, 8); - let buffer = MutableBuffer::new(bytes); - [buffer, empty_buffer] - } - DataType::UInt8 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int8 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Float32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Float64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Date32 | DataType::Time32(_) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Date64 - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Timestamp(_, _) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Interval(IntervalUnit::YearMonth) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Interval(IntervalUnit::DayTime) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Utf8 | DataType::Binary => { - let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); - // safety: `unsafe` code assumes that this buffer is initialized with one element - buffer.push(0i32); - [buffer, MutableBuffer::new(capacity * mem::size_of::())] - } - DataType::LargeUtf8 | DataType::LargeBinary => { - let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); - // safety: `unsafe` code assumes that this buffer is initialized with one element - buffer.push(0i64); - [buffer, MutableBuffer::new(capacity * mem::size_of::())] - } - DataType::List(_) => { - // offset buffer always starts with a zero - let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); - buffer.push(0i32); - [buffer, empty_buffer] - } - DataType::LargeList(_) => { - // offset buffer always starts with a zero - let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); - buffer.push(0i64); - [buffer, empty_buffer] - } - DataType::FixedSizeBinary(size) => { - [MutableBuffer::new(capacity * *size as usize), empty_buffer] - } - DataType::Dictionary(child_data_type, _) => 
match child_data_type.as_ref() { - DataType::UInt8 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::UInt64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int8 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int16 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int32 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Int64 => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - _ => unreachable!(), - }, - DataType::Float16 => unreachable!(), - DataType::FixedSizeList(_, _) | DataType::Struct(_) => { - [empty_buffer, MutableBuffer::new(0)] - } - DataType::Decimal(_, _) => [ - MutableBuffer::new(capacity * mem::size_of::()), - empty_buffer, - ], - DataType::Union(_) => unimplemented!(), - } -} - -/// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`. -#[inline] -pub(crate) fn into_buffers( - data_type: &DataType, - buffer1: MutableBuffer, - buffer2: MutableBuffer, -) -> Vec { - match data_type { - DataType::Null | DataType::Struct(_) => vec![], - DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary => vec![buffer1.into(), buffer2.into()], - _ => vec![buffer1.into()], - } -} - -/// An generic representation of Arrow array data which encapsulates common attributes and -/// operations for Arrow array. Specific operations for different arrays types (e.g., -/// primitive, list, struct) are implemented in `Array`. -#[derive(Debug, Clone)] -pub struct ArrayData { - /// The data type for this array data - data_type: DataType, - - /// The number of elements in this array data - len: usize, - - /// The number of null elements in this array data - null_count: usize, - - /// The offset into this array data, in number of items - offset: usize, - - /// The buffers for this array data. Note that depending on the array types, this - /// could hold different kinds of buffers (e.g., value buffer, value offset buffer) - /// at different positions. - buffers: Vec, - - /// The child(ren) of this array. Only non-empty for nested types, currently - /// `ListArray` and `StructArray`. - child_data: Vec, - - /// The null bitmap. A `None` value for this indicates all values are non-null in - /// this array. - null_bitmap: Option, -} - -pub type ArrayDataRef = Arc; - -impl ArrayData { - pub fn new( - data_type: DataType, - len: usize, - null_count: Option, - null_bit_buffer: Option, - offset: usize, - buffers: Vec, - child_data: Vec, - ) -> Self { - let null_count = match null_count { - None => count_nulls(null_bit_buffer.as_ref(), offset, len), - Some(null_count) => null_count, - }; - let null_bitmap = null_bit_buffer.map(Bitmap::from); - Self { - data_type, - len, - null_count, - offset, - buffers, - child_data, - null_bitmap, - } - } - - /// Returns a builder to construct a `ArrayData` instance. 
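// A sketch of the ArrayData::new constructor defined above: an Int32 array of
// length three with no validity bitmap, so every slot is treated as non-null.
// The values are arbitrary.
use arrow::array::ArrayData;
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

fn array_data_new_sketch() {
    let values = Buffer::from_slice_ref(&[7i32, 8, 9]);
    let data = ArrayData::new(DataType::Int32, 3, None, None, 0, vec![values], vec![]);
    assert_eq!(data.len(), 3);
    assert_eq!(data.null_count(), 0);
    assert!(data.is_valid(2));
}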
- #[inline] - pub const fn builder(data_type: DataType) -> ArrayDataBuilder { - ArrayDataBuilder::new(data_type) - } - - /// Returns a reference to the data type of this array data - #[inline] - pub const fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Returns a slice of buffers for this array data - pub fn buffers(&self) -> &[Buffer] { - &self.buffers[..] - } - - /// Returns a slice of children data arrays - pub fn child_data(&self) -> &[ArrayData] { - &self.child_data[..] - } - - /// Returns whether the element at index `i` is null - pub fn is_null(&self, i: usize) -> bool { - if let Some(ref b) = self.null_bitmap { - return !b.is_set(self.offset + i); - } - false - } - - /// Returns a reference to the null bitmap of this array data - #[inline] - pub const fn null_bitmap(&self) -> &Option { - &self.null_bitmap - } - - /// Returns a reference to the null buffer of this array data. - pub fn null_buffer(&self) -> Option<&Buffer> { - self.null_bitmap().as_ref().map(|b| b.buffer_ref()) - } - - /// Returns whether the element at index `i` is not null - pub fn is_valid(&self, i: usize) -> bool { - if let Some(ref b) = self.null_bitmap { - return b.is_set(self.offset + i); - } - true - } - - /// Returns the length (i.e., number of elements) of this array - #[inline] - pub const fn len(&self) -> usize { - self.len - } - - // Returns whether array data is empty - #[inline] - pub const fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the offset of this array - #[inline] - pub const fn offset(&self) -> usize { - self.offset - } - - /// Returns the total number of nulls in this array - #[inline] - pub const fn null_count(&self) -> usize { - self.null_count - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData]. - pub fn get_buffer_memory_size(&self) -> usize { - let mut size = 0; - for buffer in &self.buffers { - size += buffer.capacity(); - } - if let Some(bitmap) = &self.null_bitmap { - size += bitmap.get_buffer_memory_size() - } - for child in &self.child_data { - size += child.get_buffer_memory_size(); - } - size - } - - /// Returns the total number of bytes of memory occupied physically by this [ArrayData]. - pub fn get_array_memory_size(&self) -> usize { - let mut size = 0; - // Calculate size of the fields that don't have [get_array_memory_size] method internally. - size += mem::size_of_val(self) - - mem::size_of_val(&self.buffers) - - mem::size_of_val(&self.null_bitmap) - - mem::size_of_val(&self.child_data); - - // Calculate rest of the fields top down which contain actual data - for buffer in &self.buffers { - size += mem::size_of_val(&buffer); - size += buffer.capacity(); - } - if let Some(bitmap) = &self.null_bitmap { - size += bitmap.get_array_memory_size() - } - for child in &self.child_data { - size += child.get_array_memory_size(); - } - - size - } - - /// Creates a zero-copy slice of itself. This creates a new [ArrayData] - /// with a different offset, len and a shifted null bitmap. - /// - /// # Panics - /// - /// Panics if `offset + length > self.len()`. 
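// A sketch combining the builder and the zero-copy slice described above; the
// validity bitmap 0b00001011 marks slot 2 of four as null, and the example
// values are illustrative.
use arrow::array::ArrayData;
use arrow::buffer::Buffer;
use arrow::datatypes::DataType;

fn array_data_slice_sketch() {
    let data = ArrayData::builder(DataType::Int32)
        .len(4)
        .add_buffer(Buffer::from_slice_ref(&[1i32, 2, 3, 4]))
        .null_bit_buffer(Buffer::from([0b00001011u8]))
        .build();
    assert_eq!(data.null_count(), 1);

    let sliced = data.slice(1, 3);
    assert_eq!(sliced.offset(), 1);
    assert_eq!(sliced.len(), 3);
    assert_eq!(sliced.null_count(), 1); // the null at slot 2 is still in range
}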
- pub fn slice(&self, offset: usize, length: usize) -> ArrayData { - assert!((offset + length) <= self.len()); - - let mut new_data = self.clone(); - - new_data.len = length; - new_data.offset = offset + self.offset; - - new_data.null_count = - count_nulls(new_data.null_buffer(), new_data.offset, new_data.len); - - new_data - } - - /// Returns the `buffer` as a slice of type `T` starting at self.offset - /// # Panics - /// This function panics if: - /// * the buffer is not byte-aligned with type T, or - /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable) - #[inline] - pub(crate) fn buffer(&self, buffer: usize) -> &[T] { - let values = unsafe { self.buffers[buffer].as_slice().align_to::() }; - if !values.0.is_empty() || !values.2.is_empty() { - panic!("The buffer is not byte-aligned with its interpretation") - }; - assert_ne!(self.data_type, DataType::Boolean); - &values.1[self.offset..] - } - - /// Returns a new empty [ArrayData] valid for `data_type`. - pub(super) fn new_empty(data_type: &DataType) -> Self { - let buffers = new_buffers(data_type, 0); - let [buffer1, buffer2] = buffers; - let buffers = into_buffers(data_type, buffer1, buffer2); - - let child_data = match data_type { - DataType::Null - | DataType::Boolean - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float32 - | DataType::Float64 - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Timestamp(_, _) - | DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary - | DataType::Interval(_) - | DataType::FixedSizeBinary(_) - | DataType::Decimal(_, _) => vec![], - DataType::List(field) => { - vec![Self::new_empty(field.data_type())] - } - DataType::FixedSizeList(field, _) => { - vec![Self::new_empty(field.data_type())] - } - DataType::LargeList(field) => { - vec![Self::new_empty(field.data_type())] - } - DataType::Struct(fields) => fields - .iter() - .map(|field| Self::new_empty(field.data_type())) - .collect(), - DataType::Union(_) => unimplemented!(), - DataType::Dictionary(_, data_type) => { - vec![Self::new_empty(data_type)] - } - DataType::Float16 => unreachable!(), - }; - - Self::new(data_type.clone(), 0, Some(0), None, 0, buffers, child_data) - } -} - -impl PartialEq for ArrayData { - fn eq(&self, other: &Self) -> bool { - equal(self, other) - } -} - -/// Builder for `ArrayData` type -#[derive(Debug)] -pub struct ArrayDataBuilder { - data_type: DataType, - len: usize, - null_count: Option, - null_bit_buffer: Option, - offset: usize, - buffers: Vec, - child_data: Vec, -} - -impl ArrayDataBuilder { - #[inline] - pub const fn new(data_type: DataType) -> Self { - Self { - data_type, - len: 0, - null_count: None, - null_bit_buffer: None, - offset: 0, - buffers: vec![], - child_data: vec![], - } - } - - #[inline] - pub const fn len(mut self, n: usize) -> Self { - self.len = n; - self - } - - pub fn null_bit_buffer(mut self, buf: Buffer) -> Self { - self.null_bit_buffer = Some(buf); - self - } - - #[inline] - pub const fn offset(mut self, n: usize) -> Self { - self.offset = n; - self - } - - pub fn buffers(mut self, v: Vec) -> Self { - self.buffers = v; - self - } - - pub fn add_buffer(mut self, b: Buffer) -> Self { - self.buffers.push(b); - self - } - - pub fn child_data(mut self, v: Vec) -> Self { - self.child_data = v; - self - } - - pub fn 
add_child_data(mut self, r: ArrayData) -> Self { - self.child_data.push(r); - self - } - - pub fn build(self) -> ArrayData { - ArrayData::new( - self.data_type, - self.len, - self.null_count, - self.null_bit_buffer, - self.offset, - self.buffers, - self.child_data, - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::buffer::Buffer; - use crate::util::bit_util; - - #[test] - fn test_new() { - let arr_data = - ArrayData::new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]); - assert_eq!(10, arr_data.len()); - assert_eq!(1, arr_data.null_count()); - assert_eq!(2, arr_data.offset()); - assert_eq!(0, arr_data.buffers().len()); - assert_eq!(0, arr_data.child_data().len()); - } - - #[test] - fn test_builder() { - let child_arr_data = ArrayData::new( - DataType::Int32, - 5, - Some(0), - None, - 0, - vec![Buffer::from_slice_ref(&[1i32, 2, 3, 4, 5])], - vec![], - ); - let v = vec![0, 1, 2, 3]; - let b1 = Buffer::from(&v[..]); - let arr_data = ArrayData::builder(DataType::Int32) - .len(20) - .offset(5) - .add_buffer(b1) - .null_bit_buffer(Buffer::from(vec![ - 0b01011111, 0b10110101, 0b01100011, 0b00011110, - ])) - .add_child_data(child_arr_data.clone()) - .build(); - - assert_eq!(20, arr_data.len()); - assert_eq!(10, arr_data.null_count()); - assert_eq!(5, arr_data.offset()); - assert_eq!(1, arr_data.buffers().len()); - assert_eq!(&[0, 1, 2, 3], arr_data.buffers()[0].as_slice()); - assert_eq!(1, arr_data.child_data().len()); - assert_eq!(child_arr_data, arr_data.child_data()[0]); - } - - #[test] - fn test_null_count() { - let mut bit_v: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut bit_v, 0); - bit_util::set_bit(&mut bit_v, 3); - bit_util::set_bit(&mut bit_v, 10); - let arr_data = ArrayData::builder(DataType::Int32) - .len(16) - .null_bit_buffer(Buffer::from(bit_v)) - .build(); - assert_eq!(13, arr_data.null_count()); - - // Test with offset - let mut bit_v: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut bit_v, 0); - bit_util::set_bit(&mut bit_v, 3); - bit_util::set_bit(&mut bit_v, 10); - let arr_data = ArrayData::builder(DataType::Int32) - .len(12) - .offset(2) - .null_bit_buffer(Buffer::from(bit_v)) - .build(); - assert_eq!(10, arr_data.null_count()); - } - - #[test] - fn test_null_buffer_ref() { - let mut bit_v: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut bit_v, 0); - bit_util::set_bit(&mut bit_v, 3); - bit_util::set_bit(&mut bit_v, 10); - let arr_data = ArrayData::builder(DataType::Int32) - .len(16) - .null_bit_buffer(Buffer::from(bit_v)) - .build(); - assert!(arr_data.null_buffer().is_some()); - assert_eq!(&bit_v, arr_data.null_buffer().unwrap().as_slice()); - } - - #[test] - fn test_slice() { - let mut bit_v: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut bit_v, 0); - bit_util::set_bit(&mut bit_v, 3); - bit_util::set_bit(&mut bit_v, 10); - let data = ArrayData::builder(DataType::Int32) - .len(16) - .null_bit_buffer(Buffer::from(bit_v)) - .build(); - let new_data = data.slice(1, 15); - assert_eq!(data.len() - 1, new_data.len()); - assert_eq!(1, new_data.offset()); - assert_eq!(data.null_count(), new_data.null_count()); - - // slice of a slice (removes one null) - let new_data = new_data.slice(1, 14); - assert_eq!(data.len() - 2, new_data.len()); - assert_eq!(2, new_data.offset()); - assert_eq!(data.null_count() - 1, new_data.null_count()); - } - - #[test] - fn test_equality() { - let int_data = ArrayData::builder(DataType::Int32).build(); - let float_data = ArrayData::builder(DataType::Float32).build(); - assert_ne!(int_data, float_data); - } - - #[test] - fn test_count_nulls() { 
- let null_buffer = Some(Buffer::from(vec![0b00010110, 0b10011111])); - let count = count_nulls(null_buffer.as_ref(), 0, 16); - assert_eq!(count, 7); - - let count = count_nulls(null_buffer.as_ref(), 4, 8); - assert_eq!(count, 3); - } -} diff --git a/rust/arrow/src/array/equal/boolean.rs b/rust/arrow/src/array/equal/boolean.rs deleted file mode 100644 index 35c9786e49f..00000000000 --- a/rust/arrow/src/array/equal/boolean.rs +++ /dev/null @@ -1,93 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::array::{data::count_nulls, ArrayData}; -use crate::buffer::Buffer; -use crate::util::bit_util::get_bit; - -use super::utils::{equal_bits, equal_len}; - -pub(super) fn boolean_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - mut lhs_start: usize, - mut rhs_start: usize, - mut len: usize, -) -> bool { - let lhs_values = lhs.buffers()[0].as_slice(); - let rhs_values = rhs.buffers()[0].as_slice(); - - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - - if lhs_null_count == 0 && rhs_null_count == 0 { - // Optimize performance for starting offset at u8 boundary. - if lhs_start % 8 == 0 && rhs_start % 8 == 0 { - let quot = len / 8; - if quot > 0 - && !equal_len( - lhs_values, - rhs_values, - lhs_start / 8 + lhs.offset(), - rhs_start / 8 + rhs.offset(), - quot, - ) - { - return false; - } - - // Calculate for suffix bits. - let rem = len % 8; - if rem == 0 { - return true; - } else { - let aligned_bits = len - rem; - lhs_start += aligned_bits; - rhs_start += aligned_bits; - len = rem - } - } - - equal_bits( - lhs_values, - rhs_values, - lhs_start + lhs.offset(), - rhs_start + rhs.offset(), - len, - ) - } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); - - let lhs_start = lhs.offset() + lhs_start; - let rhs_start = rhs.offset() + rhs_start; - - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_bits(lhs_values, rhs_values, lhs_pos, rhs_pos, 1) - }) - } -} diff --git a/rust/arrow/src/array/equal/decimal.rs b/rust/arrow/src/array/equal/decimal.rs deleted file mode 100644 index 1ee6ec9b543..00000000000 --- a/rust/arrow/src/array/equal/decimal.rs +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::array::{data::count_nulls, ArrayData}; -use crate::buffer::Buffer; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; - -use super::utils::equal_len; - -pub(super) fn decimal_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let size = match lhs.data_type() { - DataType::Decimal(_, _) => 16, - _ => unreachable!(), - }; - - let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * size..]; - let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * size..]; - - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - - if lhs_null_count == 0 && rhs_null_count == 0 { - equal_len( - lhs_values, - rhs_values, - size * lhs_start, - size * rhs_start, - size * len, - ) - } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_len( - lhs_values, - rhs_values, - lhs_pos * size, - rhs_pos * size, - size, // 1 * size since we are comparing a single entry - ) - }) - } -} diff --git a/rust/arrow/src/array/equal/dictionary.rs b/rust/arrow/src/array/equal/dictionary.rs deleted file mode 100644 index 22add2494d2..00000000000 --- a/rust/arrow/src/array/equal/dictionary.rs +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
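// Editorial sketch, not part of the original diff: `decimal_equal` above
// treats every `Decimal` value as a fixed 16-byte slot, so slot `i` of the
// values buffer covers bytes `[i * 16, (i + 1) * 16)`. A minimal illustration
// of that index arithmetic, using a hypothetical helper name:
fn decimal_value_byte_range(i: usize) -> std::ops::Range<usize> {
    const DECIMAL_BYTE_WIDTH: usize = 16; // matches the `size` chosen above
    i * DECIMAL_BYTE_WIDTH..(i + 1) * DECIMAL_BYTE_WIDTH
}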
- -use crate::array::{data::count_nulls, ArrayData}; -use crate::buffer::Buffer; -use crate::datatypes::ArrowNativeType; -use crate::util::bit_util::get_bit; - -use super::equal_range; - -pub(super) fn dictionary_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let lhs_keys = lhs.buffer::(0); - let rhs_keys = rhs.buffer::(0); - - let lhs_values = &lhs.child_data()[0]; - let rhs_values = &rhs.child_data()[0]; - - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - - if lhs_null_count == 0 && rhs_null_count == 0 { - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - - equal_range( - lhs_values, - rhs_values, - lhs_values.null_buffer(), - rhs_values.null_buffer(), - lhs_keys[lhs_pos].to_usize().unwrap(), - rhs_keys[rhs_pos].to_usize().unwrap(), - 1, - ) - }) - } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_range( - lhs_values, - rhs_values, - lhs_values.null_buffer(), - rhs_values.null_buffer(), - lhs_keys[lhs_pos].to_usize().unwrap(), - rhs_keys[rhs_pos].to_usize().unwrap(), - 1, - ) - }) - } -} diff --git a/rust/arrow/src/array/equal/fixed_binary.rs b/rust/arrow/src/array/equal/fixed_binary.rs deleted file mode 100644 index 5f8f93232d5..00000000000 --- a/rust/arrow/src/array/equal/fixed_binary.rs +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
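// Editorial sketch, not part of the original diff: `dictionary_equal` above
// compares the values that the keys resolve to, one slot at a time, rather
// than the raw key buffers, which is why arrays with differently ordered
// dictionaries can still be equal (see `test_dictionary_equal` further down).
// A simplified version of that resolution step over plain slices, with
// hypothetical names standing in for the real ArrayData plumbing:
fn resolved_values_equal(
    lhs_keys: &[usize],
    rhs_keys: &[usize],
    lhs_values: &[&str],
    rhs_values: &[&str],
) -> bool {
    lhs_keys.len() == rhs_keys.len()
        && lhs_keys
            .iter()
            .zip(rhs_keys)
            .all(|(l, r)| lhs_values[*l] == rhs_values[*r])
}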
- -use crate::array::{data::count_nulls, ArrayData}; -use crate::buffer::Buffer; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; - -use super::utils::equal_len; - -pub(super) fn fixed_binary_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let size = match lhs.data_type() { - DataType::FixedSizeBinary(i) => *i as usize, - _ => unreachable!(), - }; - - let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * size..]; - let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * size..]; - - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - - if lhs_null_count == 0 && rhs_null_count == 0 { - equal_len( - lhs_values, - rhs_values, - size * lhs_start, - size * rhs_start, - size * len, - ) - } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_len( - lhs_values, - rhs_values, - lhs_pos * size, - rhs_pos * size, - size, // 1 * size since we are comparing a single entry - ) - }) - } -} diff --git a/rust/arrow/src/array/equal/fixed_list.rs b/rust/arrow/src/array/equal/fixed_list.rs deleted file mode 100644 index e708a06efcd..00000000000 --- a/rust/arrow/src/array/equal/fixed_list.rs +++ /dev/null @@ -1,80 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
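// Editorial sketch, not part of the original diff: with `fixed_binary_equal`
// above, every slot is exactly `byte_width` bytes, so comparing a non-null
// range is one contiguous byte comparison. A usage sketch assuming the
// `FixedSizeBinaryBuilder` API exercised by the tests later in this diff:
use arrow::array::FixedSizeBinaryBuilder;
use arrow::error::Result;

fn fixed_size_binary_equality() -> Result<()> {
    let mut a = FixedSizeBinaryBuilder::new(10, 5);
    a.append_value(b"hello")?;
    a.append_value(b"world")?;
    let mut b = FixedSizeBinaryBuilder::new(10, 5);
    b.append_value(b"hello")?;
    b.append_value(b"world")?;
    // PartialEq for FixedSizeBinaryArray delegates to the byte-wise `equal` above.
    assert!(a.finish() == b.finish());
    Ok(())
}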
- -use crate::array::{data::count_nulls, ArrayData}; -use crate::buffer::Buffer; -use crate::datatypes::DataType; -use crate::util::bit_util::get_bit; - -use super::equal_range; - -pub(super) fn fixed_list_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let size = match lhs.data_type() { - DataType::FixedSizeList(_, i) => *i as usize, - _ => unreachable!(), - }; - - let lhs_values = &lhs.child_data()[0]; - let rhs_values = &rhs.child_data()[0]; - - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - - if lhs_null_count == 0 && rhs_null_count == 0 { - equal_range( - lhs_values, - rhs_values, - lhs_values.null_buffer(), - rhs_values.null_buffer(), - size * lhs_start, - size * rhs_start, - size * len, - ) - } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_range( - lhs_values, - rhs_values, - lhs_values.null_buffer(), - rhs_values.null_buffer(), - lhs_pos * size, - rhs_pos * size, - size, // 1 * size since we are comparing a single entry - ) - }) - } -} diff --git a/rust/arrow/src/array/equal/list.rs b/rust/arrow/src/array/equal/list.rs deleted file mode 100644 index 331cdc7c614..00000000000 --- a/rust/arrow/src/array/equal/list.rs +++ /dev/null @@ -1,172 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::{ - array::ArrayData, - array::{data::count_nulls, OffsetSizeTrait}, - buffer::Buffer, - util::bit_util::get_bit, -}; - -use super::{equal_range, utils::child_logical_null_buffer}; - -fn lengths_equal(lhs: &[T], rhs: &[T]) -> bool { - // invariant from `base_equal` - debug_assert_eq!(lhs.len(), rhs.len()); - - if lhs.is_empty() { - return true; - } - - if lhs[0] == T::zero() && rhs[0] == T::zero() { - return lhs == rhs; - }; - - // The expensive case, e.g. 
- // [0, 2, 4, 6, 9] == [4, 6, 8, 10, 13] - lhs.windows(2) - .zip(rhs.windows(2)) - .all(|(lhs_offsets, rhs_offsets)| { - // length of left == length of right - (lhs_offsets[1] - lhs_offsets[0]) == (rhs_offsets[1] - rhs_offsets[0]) - }) -} - -#[allow(clippy::too_many_arguments)] -#[inline] -fn offset_value_equal( - lhs_values: &ArrayData, - rhs_values: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_offsets: &[T], - rhs_offsets: &[T], - lhs_pos: usize, - rhs_pos: usize, - len: usize, -) -> bool { - let lhs_start = lhs_offsets[lhs_pos].to_usize().unwrap(); - let rhs_start = rhs_offsets[rhs_pos].to_usize().unwrap(); - let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos]; - let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos]; - - lhs_len == rhs_len - && equal_range( - lhs_values, - rhs_values, - lhs_nulls, - rhs_nulls, - lhs_start, - rhs_start, - lhs_len.to_usize().unwrap(), - ) -} - -pub(super) fn list_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let lhs_offsets = lhs.buffer::(0); - let rhs_offsets = rhs.buffer::(0); - - // There is an edge-case where a n-length list that has 0 children, results in panics. - // For example; an array with offsets [0, 0, 0, 0, 0] has 4 slots, but will have - // no valid children. - // Under logical equality, the child null bitmap will be an empty buffer, as there are - // no child values. This causes panics when trying to count set bits. - // - // We caught this by chance from an accidental test-case, but due to the nature of this - // crash only occuring on list equality checks, we are adding a check here, instead of - // on the buffer/bitmap utilities, as a length check would incur a penalty for almost all - // other use-cases. - // - // The solution is to check the number of child values from offsets, and return `true` if - // they = 0. Empty arrays are equal, so this is correct. - // - // It's unlikely that one would create a n-length list array with no values, where n > 0, - // however, one is more likely to slice into a list array and get a region that has 0 - // child values. - // The test that triggered this behaviour had [4, 4] as a slice of 1 value slot. 
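// Editorial illustration, not part of the original diff: for the case
// described above, a 1-slot slice whose offsets are `[4, 4]` has
// `4 - 4 = 0` child values, so both child lengths computed below are zero
// and the comparison returns `true` without touching the empty child bitmap.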
- let lhs_child_length = lhs_offsets.get(len).unwrap().to_usize().unwrap() - - lhs_offsets.first().unwrap().to_usize().unwrap(); - let rhs_child_length = rhs_offsets.get(len).unwrap().to_usize().unwrap() - - rhs_offsets.first().unwrap().to_usize().unwrap(); - - if lhs_child_length == 0 && lhs_child_length == rhs_child_length { - return true; - } - - let lhs_values = &lhs.child_data()[0]; - let rhs_values = &rhs.child_data()[0]; - - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - - // compute the child logical bitmap - let child_lhs_nulls = - child_logical_null_buffer(lhs, lhs_nulls, lhs.child_data().get(0).unwrap()); - let child_rhs_nulls = - child_logical_null_buffer(rhs, rhs_nulls, rhs.child_data().get(0).unwrap()); - - if lhs_null_count == 0 && rhs_null_count == 0 { - lengths_equal( - &lhs_offsets[lhs_start..lhs_start + len], - &rhs_offsets[rhs_start..rhs_start + len], - ) && equal_range( - lhs_values, - rhs_values, - child_lhs_nulls.as_ref(), - child_rhs_nulls.as_ref(), - lhs_offsets[lhs_start].to_usize().unwrap(), - rhs_offsets[rhs_start].to_usize().unwrap(), - (lhs_offsets[len] - lhs_offsets[lhs_start]) - .to_usize() - .unwrap(), - ) - } else { - // get a ref of the parent null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.unwrap().as_slice(); - let rhs_null_bytes = rhs_nulls.unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && offset_value_equal::( - lhs_values, - rhs_values, - child_lhs_nulls.as_ref(), - child_rhs_nulls.as_ref(), - lhs_offsets, - rhs_offsets, - lhs_pos, - rhs_pos, - 1, - ) - }) - } -} diff --git a/rust/arrow/src/array/equal/mod.rs b/rust/arrow/src/array/equal/mod.rs deleted file mode 100644 index 0924fc193a6..00000000000 --- a/rust/arrow/src/array/equal/mod.rs +++ /dev/null @@ -1,1277 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Module containing functionality to compute array equality. -//! This module uses [ArrayData] and does not -//! depend on dynamic casting of `Array`. 
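// Editorial sketch, not part of the original diff: how the logical equality
// implemented in this module is normally reached from user code, via the
// `PartialEq` impls declared below. Arrays with the same logical values
// compare equal regardless of how their buffers were produced.
use arrow::array::Int32Array;

fn logical_equality_example() {
    let a = Int32Array::from(vec![Some(1), None, Some(3)]);
    let b = Int32Array::from(vec![Some(1), None, Some(3)]);
    assert!(a == b); // delegates to `equal(a.data(), b.data())`
}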
- -use super::{ - Array, ArrayData, BinaryOffsetSizeTrait, BooleanArray, DecimalArray, - FixedSizeBinaryArray, FixedSizeListArray, GenericBinaryArray, GenericListArray, - GenericStringArray, NullArray, OffsetSizeTrait, PrimitiveArray, - StringOffsetSizeTrait, StructArray, -}; - -use crate::{ - buffer::Buffer, - datatypes::{ArrowPrimitiveType, DataType, IntervalUnit}, -}; - -mod boolean; -mod decimal; -mod dictionary; -mod fixed_binary; -mod fixed_list; -mod list; -mod null; -mod primitive; -mod structure; -mod utils; -mod variable_size; - -// these methods assume the same type, len and null count. -// For this reason, they are not exposed and are instead used -// to build the generic functions below (`equal_range` and `equal`). -use boolean::boolean_equal; -use decimal::decimal_equal; -use dictionary::dictionary_equal; -use fixed_binary::fixed_binary_equal; -use fixed_list::fixed_list_equal; -use list::list_equal; -use null::null_equal; -use primitive::primitive_equal; -use structure::struct_equal; -use variable_size::variable_sized_equal; - -impl PartialEq for dyn Array { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for dyn Array { - fn eq(&self, other: &T) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for NullArray { - fn eq(&self, other: &NullArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for PrimitiveArray { - fn eq(&self, other: &PrimitiveArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for BooleanArray { - fn eq(&self, other: &BooleanArray) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericStringArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericBinaryArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for FixedSizeBinaryArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for DecimalArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for GenericListArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for FixedSizeListArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -impl PartialEq for StructArray { - fn eq(&self, other: &Self) -> bool { - equal(self.data(), other.data()) - } -} - -/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively -/// for `len` slots. The null buffers `lhs_nulls` and `rhs_nulls` inherit parent nullability. -/// -/// If an array is a child of a struct or list, the array's nulls have to be merged with the parent. -/// This then affects the null count of the array, thus the merged nulls are passed separately -/// as `lhs_nulls` and `rhs_nulls` variables to functions. -/// The nulls are merged with a bitwise AND, and null counts are recomputed where necessary. 
-#[inline] -fn equal_values( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - match lhs.data_type() { - DataType::Null => null_equal(lhs, rhs, lhs_start, rhs_start, len), - DataType::Boolean => { - boolean_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - } - DataType::UInt8 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::UInt16 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::UInt32 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::UInt64 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Int8 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Int16 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Int32 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Int64 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Float32 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Float64 => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Date64 - | DataType::Interval(IntervalUnit::DayTime) - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => primitive_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Utf8 | DataType::Binary => variable_sized_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::LargeUtf8 | DataType::LargeBinary => variable_sized_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::FixedSizeBinary(_) => { - fixed_binary_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - } - DataType::Decimal(_, _) => { - decimal_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - } - DataType::List(_) => { - list_equal::(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - } - DataType::LargeList(_) => { - list_equal::(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - } - DataType::FixedSizeList(_, _) => { - fixed_list_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - } - DataType::Struct(_) => { - struct_equal(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - } - DataType::Union(_) => unimplemented!("See ARROW-8576"), - DataType::Dictionary(data_type, _) => match data_type.as_ref() { - DataType::Int8 => dictionary_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Int16 => dictionary_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Int32 => dictionary_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::Int64 => dictionary_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::UInt8 => dictionary_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::UInt16 => dictionary_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, 
rhs_start, len, - ), - DataType::UInt32 => dictionary_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - DataType::UInt64 => dictionary_equal::( - lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, - ), - _ => unreachable!(), - }, - DataType::Float16 => unreachable!(), - } -} - -fn equal_range( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - utils::base_equal(lhs, rhs) - && utils::equal_nulls(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - && equal_values(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) -} - -/// Logically compares two [ArrayData]. -/// Two arrays are logically equal if and only if: -/// * their data types are equal -/// * their lengths are equal -/// * their null counts are equal -/// * their null bitmaps are equal -/// * each of their items are equal -/// two items are equal when their in-memory representation is physically equal (i.e. same bit content). -/// The physical comparison depend on the data type. -/// # Panics -/// This function may panic whenever any of the [ArrayData] does not follow the Arrow specification. -/// (e.g. wrong number of buffers, buffer `len` does not correspond to the declared `len`) -pub fn equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { - let lhs_nulls = lhs.null_buffer(); - let rhs_nulls = rhs.null_buffer(); - utils::base_equal(lhs, rhs) - && lhs.null_count() == rhs.null_count() - && utils::equal_nulls(lhs, rhs, lhs_nulls, rhs_nulls, 0, 0, lhs.len()) - && equal_values(lhs, rhs, lhs_nulls, rhs_nulls, 0, 0, lhs.len()) -} - -#[cfg(test)] -mod tests { - use std::convert::TryFrom; - use std::sync::Arc; - - use crate::array::{ - array::Array, ArrayDataBuilder, ArrayRef, BinaryOffsetSizeTrait, BooleanArray, - DecimalBuilder, FixedSizeBinaryBuilder, FixedSizeListBuilder, GenericBinaryArray, - Int32Builder, ListBuilder, NullArray, PrimitiveBuilder, StringArray, - StringDictionaryBuilder, StringOffsetSizeTrait, StructArray, - }; - use crate::array::{GenericStringArray, Int32Array}; - use crate::buffer::Buffer; - use crate::datatypes::{Field, Int16Type, ToByteSlice}; - - use super::*; - - #[test] - fn test_null_equal() { - let a = NullArray::new(12); - let a = a.data(); - let b = NullArray::new(12); - let b = b.data(); - test_equal(&a, &b, true); - - let b = NullArray::new(10); - let b = b.data(); - test_equal(&a, &b, false); - - // Test the case where offset != 0 - - let a_slice = a.slice(2, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(5, 4); - let b_slice = b.slice(3, 3); - test_equal(&a_slice, &b_slice, false); - } - - #[test] - fn test_boolean_equal() { - let a = BooleanArray::from(vec![false, false, true]); - let a = a.data(); - let b = BooleanArray::from(vec![false, false, true]); - let b = b.data(); - test_equal(&a, &b, true); - - let b = BooleanArray::from(vec![false, false, false]); - let b = b.data(); - test_equal(&a, &b, false); - } - - #[test] - fn test_boolean_equal_nulls() { - let a = BooleanArray::from(vec![Some(false), None, None, Some(true)]); - let a = a.data(); - let b = BooleanArray::from(vec![Some(false), None, None, Some(true)]); - let b = b.data(); - test_equal(&a, &b, true); - - let b = BooleanArray::from(vec![None, None, None, Some(true)]); - let b = b.data(); - test_equal(&a, &b, false); - - let b = BooleanArray::from(vec![Some(true), None, None, Some(true)]); - let b = b.data(); - test_equal(&a, 
&b, false); - } - - #[test] - fn test_boolean_equal_offset() { - let a = BooleanArray::from(vec![false, true, false, true, false, false, true]); - let a = a.data(); - let b = - BooleanArray::from(vec![true, false, false, false, true, false, true, true]); - let b = b.data(); - assert_eq!(equal(a, b), false); - assert_eq!(equal(b, a), false); - - let a_slice = a.slice(2, 3); - let b_slice = b.slice(3, 3); - assert_eq!(equal(&a_slice, &b_slice), true); - assert_eq!(equal(&b_slice, &a_slice), true); - - let a_slice = a.slice(3, 4); - let b_slice = b.slice(4, 4); - assert_eq!(equal(&a_slice, &b_slice), false); - assert_eq!(equal(&b_slice, &a_slice), false); - - // Test the optimization cases where null_count == 0 and starts at 0 and len >= size_of(u8) - - // Elements fill in `u8`'s exactly. - let mut vector = vec![false, false, true, true, true, true, true, true]; - let a = BooleanArray::from(vector.clone()); - let a = a.data(); - let b = BooleanArray::from(vector.clone()); - let b = b.data(); - test_equal(&a, &b, true); - - // Elements fill in `u8`s + suffix bits. - vector.push(true); - let a = BooleanArray::from(vector.clone()); - let a = a.data(); - let b = BooleanArray::from(vector); - let b = b.data(); - test_equal(&a, &b, true); - } - - #[test] - fn test_primitive() { - let cases = vec![ - ( - vec![Some(1), Some(2), Some(3)], - vec![Some(1), Some(2), Some(3)], - true, - ), - ( - vec![Some(1), Some(2), Some(3)], - vec![Some(1), Some(2), Some(4)], - false, - ), - ( - vec![Some(1), Some(2), None], - vec![Some(1), Some(2), None], - true, - ), - ( - vec![Some(1), None, Some(3)], - vec![Some(1), Some(2), None], - false, - ), - ( - vec![Some(1), None, None], - vec![Some(1), Some(2), None], - false, - ), - ]; - - for (lhs, rhs, expected) in cases { - let lhs = Int32Array::from(lhs); - let lhs = lhs.data(); - let rhs = Int32Array::from(rhs); - let rhs = rhs.data(); - test_equal(&lhs, &rhs, expected); - } - } - - #[test] - fn test_primitive_slice() { - let cases = vec![ - ( - vec![Some(1), Some(2), Some(3)], - (0, 1), - vec![Some(1), Some(2), Some(3)], - (0, 1), - true, - ), - ( - vec![Some(1), Some(2), Some(3)], - (1, 1), - vec![Some(1), Some(2), Some(3)], - (2, 1), - false, - ), - ( - vec![Some(1), Some(2), None], - (1, 1), - vec![Some(1), None, Some(2)], - (2, 1), - true, - ), - ( - vec![None, Some(2), None], - (1, 1), - vec![None, None, Some(2)], - (2, 1), - true, - ), - ( - vec![Some(1), None, Some(2), None, Some(3)], - (2, 2), - vec![None, Some(2), None, Some(3)], - (1, 2), - true, - ), - ]; - - for (lhs, slice_lhs, rhs, slice_rhs, expected) in cases { - let lhs = Int32Array::from(lhs); - let lhs = lhs.data(); - let lhs = lhs.slice(slice_lhs.0, slice_lhs.1); - let rhs = Int32Array::from(rhs); - let rhs = rhs.data(); - let rhs = rhs.slice(slice_rhs.0, slice_rhs.1); - - test_equal(&lhs, &rhs, expected); - } - } - - fn test_equal(lhs: &ArrayData, rhs: &ArrayData, expected: bool) { - // equality is symmetric - assert_eq!(equal(lhs, lhs), true, "\n{:?}\n{:?}", lhs, lhs); - assert_eq!(equal(rhs, rhs), true, "\n{:?}\n{:?}", rhs, rhs); - - assert_eq!(equal(lhs, rhs), expected, "\n{:?}\n{:?}", lhs, rhs); - assert_eq!(equal(rhs, lhs), expected, "\n{:?}\n{:?}", rhs, lhs); - } - - fn binary_cases() -> Vec<(Vec>, Vec>, bool)> { - let base = vec![ - Some("hello".to_owned()), - None, - None, - Some("world".to_owned()), - None, - None, - ]; - let not_base = vec![ - Some("hello".to_owned()), - Some("foo".to_owned()), - None, - Some("world".to_owned()), - None, - None, - ]; - vec![ - ( - 
vec![Some("hello".to_owned()), Some("world".to_owned())], - vec![Some("hello".to_owned()), Some("world".to_owned())], - true, - ), - ( - vec![Some("hello".to_owned()), Some("world".to_owned())], - vec![Some("hello".to_owned()), Some("arrow".to_owned())], - false, - ), - (base.clone(), base.clone(), true), - (base, not_base, false), - ] - } - - fn test_generic_string_equal() { - let cases = binary_cases(); - - for (lhs, rhs, expected) in cases { - let lhs = lhs.iter().map(|x| x.as_deref()).collect(); - let rhs = rhs.iter().map(|x| x.as_deref()).collect(); - let lhs = GenericStringArray::::from_opt_vec(lhs); - let lhs = lhs.data(); - let rhs = GenericStringArray::::from_opt_vec(rhs); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); - } - } - - #[test] - fn test_string_equal() { - test_generic_string_equal::() - } - - #[test] - fn test_large_string_equal() { - test_generic_string_equal::() - } - - fn test_generic_binary_equal() { - let cases = binary_cases(); - - for (lhs, rhs, expected) in cases { - let lhs = lhs - .iter() - .map(|x| x.as_deref().map(|x| x.as_bytes())) - .collect(); - let rhs = rhs - .iter() - .map(|x| x.as_deref().map(|x| x.as_bytes())) - .collect(); - let lhs = GenericBinaryArray::::from_opt_vec(lhs); - let lhs = lhs.data(); - let rhs = GenericBinaryArray::::from_opt_vec(rhs); - let rhs = rhs.data(); - test_equal(lhs, rhs, expected); - } - } - - #[test] - fn test_binary_equal() { - test_generic_binary_equal::() - } - - #[test] - fn test_large_binary_equal() { - test_generic_binary_equal::() - } - - #[test] - fn test_string_offset() { - let a = StringArray::from(vec![Some("a"), None, Some("b")]); - let a = a.data(); - let a = a.slice(2, 1); - let b = StringArray::from(vec![Some("b")]); - let b = b.data(); - - test_equal(&a, &b, true); - } - - #[test] - fn test_string_offset_larger() { - let a = StringArray::from(vec![Some("a"), None, Some("b"), None, Some("c")]); - let a = a.data(); - let b = StringArray::from(vec![None, Some("b"), None, Some("c")]); - let b = b.data(); - - test_equal(&a.slice(2, 2), &b.slice(0, 2), false); - test_equal(&a.slice(2, 2), &b.slice(1, 2), true); - test_equal(&a.slice(2, 2), &b.slice(2, 2), false); - } - - #[test] - fn test_null() { - let a = NullArray::new(2); - let a = a.data(); - let b = NullArray::new(2); - let b = b.data(); - test_equal(&a, &b, true); - - let b = NullArray::new(1); - let b = b.data(); - test_equal(&a, &b, false); - } - - fn create_list_array, T: AsRef<[Option]>>(data: T) -> ArrayData { - let mut builder = ListBuilder::new(Int32Builder::new(10)); - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref()).unwrap(); - builder.append(true).unwrap() - } else { - builder.append(false).unwrap() - } - } - builder.finish().data().clone() - } - - #[test] - fn test_list_equal() { - let a = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - test_equal(&a, &b, true); - - let b = create_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_list_null() { - let a = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - let b = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - test_equal(&a, &b, true); - - let b = create_list_array(&[ - Some(&[1, 2]), - None, - Some(&[5, 6]), - Some(&[3, 4]), - None, - None, - ]); - test_equal(&a, &b, false); - - let b = - 
create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); - test_equal(&a, &b, false); - - // a list where the nullness of values is determined by the list's bitmap - let c_values = Int32Array::from(vec![1, 2, -1, -2, 3, 4, -3, -4]); - let c = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(6) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data(c_values.data().clone()) - .null_bit_buffer(Buffer::from(vec![0b00001001])) - .build(); - - let d_values = Int32Array::from(vec![ - Some(1), - Some(2), - None, - None, - Some(3), - Some(4), - None, - None, - ]); - let d = ArrayDataBuilder::new(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - true, - )))) - .len(6) - .add_buffer(Buffer::from(vec![0i32, 2, 3, 4, 6, 7, 8].to_byte_slice())) - .add_child_data(d_values.data().clone()) - .null_bit_buffer(Buffer::from(vec![0b00001001])) - .build(); - test_equal(&c, &d, true); - } - - // Test the case where offset != 0 - #[test] - fn test_list_offsets() { - let a = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 4]), None, None]); - let b = - create_list_array(&[Some(&[1, 2]), None, None, Some(&[3, 5]), None, None]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - } - - fn create_fixed_size_binary_array, T: AsRef<[Option]>>( - data: T, - ) -> ArrayData { - let mut builder = FixedSizeBinaryBuilder::new(15, 5); - - for d in data.as_ref() { - if let Some(v) = d { - builder.append_value(v.as_ref()).unwrap(); - } else { - builder.append_null().unwrap(); - } - } - builder.finish().data().clone() - } - - #[test] - fn test_fixed_size_binary_equal() { - let a = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world")]); - test_equal(&a, &b, true); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"arrow")]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_fixed_size_binary_null() { - let a = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); - let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"world")]); - test_equal(&a, &b, true); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), Some(b"world"), None]); - test_equal(&a, &b, false); - - let b = create_fixed_size_binary_array(&[Some(b"hello"), None, Some(b"arrow")]); - test_equal(&a, &b, false); - } - - #[test] - fn test_fixed_size_binary_offsets() { - // Test the case where offset != 0 - let a = create_fixed_size_binary_array(&[ - Some(b"hello"), - None, - None, - Some(b"world"), - None, - None, - ]); - let b = create_fixed_size_binary_array(&[ - Some(b"hello"), - None, - None, - Some(b"arrow"), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(3, 1); - let b_slice = b.slice(3, 1); - test_equal(&a_slice, &b_slice, false); - } - - fn create_decimal_array(data: 
&[Option]) -> ArrayData { - let mut builder = DecimalBuilder::new(20, 23, 6); - - for d in data { - if let Some(v) = d { - builder.append_value(*v).unwrap(); - } else { - builder.append_null().unwrap(); - } - } - builder.finish().data().clone() - } - - #[test] - fn test_decimal_equal() { - let a = create_decimal_array(&[Some(8_887_000_000), Some(-8_887_000_000)]); - let b = create_decimal_array(&[Some(8_887_000_000), Some(-8_887_000_000)]); - test_equal(&a, &b, true); - - let b = create_decimal_array(&[Some(15_887_000_000), Some(-8_887_000_000)]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_decimal_null() { - let a = create_decimal_array(&[Some(8_887_000_000), None, Some(-8_887_000_000)]); - let b = create_decimal_array(&[Some(8_887_000_000), None, Some(-8_887_000_000)]); - test_equal(&a, &b, true); - - let b = create_decimal_array(&[Some(8_887_000_000), Some(-8_887_000_000), None]); - test_equal(&a, &b, false); - - let b = create_decimal_array(&[Some(15_887_000_000), None, Some(-8_887_000_000)]); - test_equal(&a, &b, false); - } - - #[test] - fn test_decimal_offsets() { - // Test the case where offset != 0 - let a = create_decimal_array(&[ - Some(8_887_000_000), - None, - None, - Some(-8_887_000_000), - None, - None, - ]); - let b = create_decimal_array(&[ - None, - Some(8_887_000_000), - None, - None, - Some(15_887_000_000), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(1, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(5, 1); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(3, 3); - let b_slice = b.slice(4, 3); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(1, 3); - let b_slice = b.slice(2, 3); - test_equal(&a_slice, &b_slice, false); - - let b = create_decimal_array(&[ - None, - None, - None, - Some(-8_887_000_000), - Some(-3_000), - None, - ]); - let a_slice = a.slice(1, 3); - let b_slice = b.slice(1, 3); - test_equal(&a_slice, &b_slice, true); - } - - /// Create a fixed size list of 2 value lengths - fn create_fixed_size_list_array, T: AsRef<[Option]>>( - data: T, - ) -> ArrayData { - let mut builder = FixedSizeListBuilder::new(Int32Builder::new(10), 3); - - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref()).unwrap(); - builder.append(true).unwrap() - } else { - for _ in 0..builder.value_length() { - builder.values().append_null().unwrap(); - } - builder.append(false).unwrap() - } - } - builder.finish().data().clone() - } - - #[test] - fn test_fixed_size_list_equal() { - let a = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 6])]); - test_equal(&a, &b, true); - - let b = create_fixed_size_list_array(&[Some(&[1, 2, 3]), Some(&[4, 5, 7])]); - test_equal(&a, &b, false); - } - - // Test the case where null_count > 0 - #[test] - fn test_fixed_list_null() { - let a = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - test_equal(&a, &b, true); - - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - Some(&[7, 8, 9]), - Some(&[4, 5, 6]), - None, - None, - ]); - test_equal(&a, &b, false); - - 
let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[3, 6, 9]), - None, - None, - ]); - test_equal(&a, &b, false); - } - - #[test] - fn test_fixed_list_offsets() { - // Test the case where offset != 0 - let a = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[4, 5, 6]), - None, - None, - ]); - let b = create_fixed_size_list_array(&[ - Some(&[1, 2, 3]), - None, - None, - Some(&[3, 6, 9]), - None, - None, - ]); - - let a_slice = a.slice(0, 3); - let b_slice = b.slice(0, 3); - test_equal(&a_slice, &b_slice, true); - - let a_slice = a.slice(0, 5); - let b_slice = b.slice(0, 5); - test_equal(&a_slice, &b_slice, false); - - let a_slice = a.slice(4, 1); - let b_slice = b.slice(4, 1); - test_equal(&a_slice, &b_slice, true); - } - - #[test] - fn test_struct_equal() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let a = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let a = a.data(); - - let b = StructArray::try_from(vec![("f1", strings), ("f2", ints)]).unwrap(); - let b = b.data(); - - test_equal(&a, &b, true); - } - - #[test] - fn test_struct_equal_null() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - let ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 0])); - - let a = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Buffer::from(vec![0b00001011])) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints.data_ref().clone()) - .build(); - let a = crate::array::make_array(a); - - let b = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Buffer::from(vec![0b00001011])) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints_non_null.data_ref().clone()) - .build(); - let b = crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - - // test with arrays that are not equal - let c_ints_non_null: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 0, 4])); - let c = ArrayData::builder(DataType::Struct(vec![ - Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Buffer::from(vec![0b00001011])) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(c_ints_non_null.data_ref().clone()) - .build(); - let c = crate::array::make_array(c); - - test_equal(a.data_ref(), c.data_ref(), false); - - // test a nested struct - let a = ArrayData::builder(DataType::Struct(vec![Field::new( - "f3", - a.data_type().clone(), - true, - )])) - .null_bit_buffer(Buffer::from(vec![0b00011110])) - .len(5) - .add_child_data(a.data_ref().clone()) - .build(); - let a = crate::array::make_array(a); - - // reconstruct b, but with different data where the first struct is null - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joanne"), // difference - None, - None, - Some("mark"), - Some("doe"), - ])); - let b = ArrayData::builder(DataType::Struct(vec![ - 
Field::new("f1", DataType::Utf8, true), - Field::new("f2", DataType::Int32, true), - ])) - .null_bit_buffer(Buffer::from(vec![0b00001011])) - .len(5) - .add_child_data(strings.data_ref().clone()) - .add_child_data(ints_non_null.data_ref().clone()) - .build(); - - let b = ArrayData::builder(DataType::Struct(vec![Field::new( - "f3", - b.data_type().clone(), - true, - )])) - .null_bit_buffer(Buffer::from(vec![0b00011110])) - .len(5) - .add_child_data(b) - .build(); - let b = crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - } - - #[test] - fn test_struct_equal_null_variable_size() { - // the string arrays differ, but where the struct array is null - let strings1: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doel"), - ])); - let strings2: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joel"), - None, - None, - Some("mark"), - Some("doe"), - ])); - - let a = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Buffer::from(vec![0b00001010])) - .len(5) - .add_child_data(strings1.data_ref().clone()) - .build(); - let a = crate::array::make_array(a); - - let b = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Buffer::from(vec![0b00001010])) - .len(5) - .add_child_data(strings2.data_ref().clone()) - .build(); - let b = crate::array::make_array(b); - - test_equal(a.data_ref(), b.data_ref(), true); - - // test with arrays that are not equal - let strings3: ArrayRef = Arc::new(StringArray::from(vec![ - Some("mark"), - None, - None, - Some("doe"), - Some("joe"), - ])); - let c = ArrayData::builder(DataType::Struct(vec![Field::new( - "f1", - DataType::Utf8, - true, - )])) - .null_bit_buffer(Buffer::from(vec![0b00001011])) - .len(5) - .add_child_data(strings3.data_ref().clone()) - .build(); - let c = crate::array::make_array(c); - - test_equal(a.data_ref(), c.data_ref(), false); - } - - fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { - let values = StringArray::from(values.to_vec()); - let mut builder = StringDictionaryBuilder::new_with_dictionary( - PrimitiveBuilder::::new(3), - &values, - ) - .unwrap(); - for key in keys { - if let Some(v) = key { - builder.append(v).unwrap(); - } else { - builder.append_null().unwrap() - } - } - builder.finish().data().clone() - } - - #[test] - fn test_dictionary_equal() { - // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) - let a = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), Some("b"), Some("a"), Some("c")], - ); - // different representation (values and keys are swapped), same result - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), Some("c")], - ); - test_equal(&a, &b, true); - - // different len - let b = - create_dictionary_array(&["a", "c", "b"], &[Some("a"), Some("b"), Some("a")]); - test_equal(&a, &b, false); - - // different key - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), Some("a")], - ); - test_equal(&a, &b, false); - - // different values, same keys - let b = create_dictionary_array( - &["a", "b", "d"], - &[Some("a"), Some("b"), Some("a"), Some("d")], - ); - test_equal(&a, &b, false); - } - - #[test] - fn test_dictionary_equal_null() { - // (a, b, c), (1, 2, 1, 3) => (a, b, a, c) - let a = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), None, Some("a"), Some("c")], - ); - - // equal to self - 
test_equal(&a, &a, true); - - // different representation (values and keys are swapped), same result - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), None, Some("a"), Some("c")], - ); - test_equal(&a, &b, true); - - // different null position - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), Some("b"), Some("a"), None], - ); - test_equal(&a, &b, false); - - // different key - let b = create_dictionary_array( - &["a", "c", "b"], - &[Some("a"), None, Some("a"), Some("a")], - ); - test_equal(&a, &b, false); - - // different values, same keys - let b = create_dictionary_array( - &["a", "b", "d"], - &[Some("a"), None, Some("a"), Some("d")], - ); - test_equal(&a, &b, false); - } -} diff --git a/rust/arrow/src/array/equal/null.rs b/rust/arrow/src/array/equal/null.rs deleted file mode 100644 index f287a382507..00000000000 --- a/rust/arrow/src/array/equal/null.rs +++ /dev/null @@ -1,31 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::array::ArrayData; - -#[inline] -pub(super) fn null_equal( - _lhs: &ArrayData, - _rhs: &ArrayData, - _lhs_start: usize, - _rhs_start: usize, - _len: usize, -) -> bool { - // a null buffer's range is always true, as every element is by definition equal (to null). - // We only need to compare data_types - true -} diff --git a/rust/arrow/src/array/equal/primitive.rs b/rust/arrow/src/array/equal/primitive.rs deleted file mode 100644 index db7587915c8..00000000000 --- a/rust/arrow/src/array/equal/primitive.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
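// Editorial sketch, not part of the original diff: per `null_equal` above,
// every slot of a NullArray is null by definition, so equality reduces to
// matching data types and lengths.
use arrow::array::NullArray;

fn null_equality_example() {
    assert!(NullArray::new(3) == NullArray::new(3));
    assert!(NullArray::new(3) != NullArray::new(4));
}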
- -use std::mem::size_of; - -use crate::array::{data::count_nulls, ArrayData}; -use crate::buffer::Buffer; -use crate::util::bit_util::get_bit; - -use super::utils::equal_len; - -pub(super) fn primitive_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let byte_width = size_of::(); - let lhs_values = &lhs.buffers()[0].as_slice()[lhs.offset() * byte_width..]; - let rhs_values = &rhs.buffers()[0].as_slice()[rhs.offset() * byte_width..]; - - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - - if lhs_null_count == 0 && rhs_null_count == 0 { - // without nulls, we just need to compare slices - equal_len( - lhs_values, - rhs_values, - lhs_start * byte_width, - rhs_start * byte_width, - len * byte_width, - ) - } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_len( - lhs_values, - rhs_values, - lhs_pos * byte_width, - rhs_pos * byte_width, - byte_width, // 1 * byte_width since we are comparing a single entry - ) - }) - } -} diff --git a/rust/arrow/src/array/equal/structure.rs b/rust/arrow/src/array/equal/structure.rs deleted file mode 100644 index b3cc4029e9e..00000000000 --- a/rust/arrow/src/array/equal/structure.rs +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::{ - array::data::count_nulls, array::ArrayData, buffer::Buffer, util::bit_util::get_bit, -}; - -use super::{equal_range, utils::child_logical_null_buffer}; - -/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively -/// for `len` slots. The null buffers `lhs_nulls` and `rhs_nulls` inherit parent nullability. -/// -/// If an array is a child of a struct or list, the array's nulls have to be merged with the parent. -/// This then affects the null count of the array, thus the merged nulls are passed separately -/// as `lhs_nulls` and `rhs_nulls` variables to functions. -/// The nulls are merged with a bitwise AND, and null counts are recomputed where necessary. 
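// Editor's illustrative sketch (not the arrow-rs implementation) of the rule the
// doc comment above describes: a child's *logical* validity is the bitwise AND of
// the parent's validity bitmap and the child's own bitmap, and the null count is
// then recomputed from the merged bits. The helper names below are hypothetical.
fn merge_validity(parent: &[u8], child: &[u8]) -> Vec<u8> {
    // byte-wise AND; assumes both bitmaps cover the same number of slots
    parent.iter().zip(child).map(|(p, c)| p & c).collect()
}

fn count_set_bits(bits: &[u8], len: usize) -> usize {
    (0..len)
        .filter(|i| bits[i / 8] & (1 << (i % 8)) != 0)
        .count()
}

#[test]
fn merged_validity_recomputes_null_count() {
    // parent: slots 0..5 valid except slot 1; child: all valid except slot 3
    let parent = vec![0b0001_1101u8];
    let child = vec![0b0001_0111u8];
    let merged = merge_validity(&parent, &child);
    assert_eq!(merged, vec![0b0001_0101]);
    // 5 slots, 3 merged valid bits => 2 logical nulls in the child
    assert_eq!(5 - count_set_bits(&merged, 5), 2);
}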
-fn equal_values( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - lhs.child_data() - .iter() - .zip(rhs.child_data()) - .all(|(lhs_values, rhs_values)| { - // merge the null data - let lhs_merged_nulls = child_logical_null_buffer(lhs, lhs_nulls, lhs_values); - let rhs_merged_nulls = child_logical_null_buffer(rhs, rhs_nulls, rhs_values); - equal_range( - lhs_values, - rhs_values, - lhs_merged_nulls.as_ref(), - rhs_merged_nulls.as_ref(), - lhs_start, - rhs_start, - len, - ) - }) -} - -pub(super) fn struct_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - // we have to recalculate null counts from the null buffers - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - if lhs_null_count == 0 && rhs_null_count == 0 { - equal_values(lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len) - } else { - // get a ref of the null buffer bytes, to use in testing for nullness - let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice(); - let rhs_null_bytes = rhs_nulls.as_ref().unwrap().as_slice(); - // with nulls, we need to compare item by item whenever it is not null - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - // if both struct and child had no null buffers, - let lhs_is_null = !get_bit(lhs_null_bytes, lhs_pos + lhs.offset()); - let rhs_is_null = !get_bit(rhs_null_bytes, rhs_pos + rhs.offset()); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && equal_values(lhs, rhs, lhs_nulls, rhs_nulls, lhs_pos, rhs_pos, 1) - }) - } -} diff --git a/rust/arrow/src/array/equal/utils.rs b/rust/arrow/src/array/equal/utils.rs deleted file mode 100644 index d0108d23649..00000000000 --- a/rust/arrow/src/array/equal/utils.rs +++ /dev/null @@ -1,264 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::array::{data::count_nulls, ArrayData, OffsetSizeTrait}; -use crate::bitmap::Bitmap; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::datatypes::DataType; -use crate::util::bit_util; - -// whether bits along the positions are equal -// `lhs_start`, `rhs_start` and `len` are _measured in bits_. 
-#[inline] -pub(super) fn equal_bits( - lhs_values: &[u8], - rhs_values: &[u8], - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - (0..len).all(|i| { - bit_util::get_bit(lhs_values, lhs_start + i) - == bit_util::get_bit(rhs_values, rhs_start + i) - }) -} - -#[inline] -pub(super) fn equal_nulls( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - if lhs_null_count > 0 || rhs_null_count > 0 { - let lhs_values = lhs_nulls.unwrap().as_slice(); - let rhs_values = rhs_nulls.unwrap().as_slice(); - equal_bits( - lhs_values, - rhs_values, - lhs_start + lhs.offset(), - rhs_start + rhs.offset(), - len, - ) - } else { - true - } -} - -#[inline] -pub(super) fn base_equal(lhs: &ArrayData, rhs: &ArrayData) -> bool { - lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() -} - -// whether the two memory regions are equal -#[inline] -pub(super) fn equal_len( - lhs_values: &[u8], - rhs_values: &[u8], - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - lhs_values[lhs_start..(lhs_start + len)] == rhs_values[rhs_start..(rhs_start + len)] -} - -/// Computes the logical validity bitmap of the array data using the -/// parent's array data. The parent should be a list or struct, else -/// the logical bitmap of the array is returned unaltered. -/// -/// Parent data is passed along with the parent's logical bitmap, as -/// nested arrays could have a logical bitmap different to the physical -/// one on the `ArrayData`. -pub(super) fn child_logical_null_buffer( - parent_data: &ArrayData, - logical_null_buffer: Option<&Buffer>, - child_data: &ArrayData, -) -> Option { - let parent_len = parent_data.len(); - let parent_bitmap = logical_null_buffer - .cloned() - .map(Bitmap::from) - .unwrap_or_else(|| { - let ceil = bit_util::ceil(parent_len, 8); - Bitmap::from(Buffer::from(vec![0b11111111; ceil])) - }); - let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| { - let ceil = bit_util::ceil(child_data.len(), 8); - Bitmap::from(Buffer::from(vec![0b11111111; ceil])) - }); - match parent_data.data_type() { - DataType::List(_) => Some(logical_list_bitmap::( - parent_data, - parent_bitmap, - self_null_bitmap, - )), - DataType::LargeList(_) => Some(logical_list_bitmap::( - parent_data, - parent_bitmap, - self_null_bitmap, - )), - DataType::FixedSizeList(_, len) => { - let len = *len as usize; - let array_offset = parent_data.offset(); - let bitmap_len = bit_util::ceil(parent_len * len, 8); - let mut buffer = MutableBuffer::from_len_zeroed(bitmap_len); - let mut null_slice = buffer.as_slice_mut(); - (array_offset..parent_len + array_offset).for_each(|index| { - let start = index * len; - let end = start + len; - let mask = parent_bitmap.is_set(index); - (start..end).for_each(|child_index| { - if mask && self_null_bitmap.is_set(child_index) { - bit_util::set_bit(&mut null_slice, child_index); - } - }); - }); - Some(buffer.into()) - } - DataType::Struct(_) => { - // Arrow implementations are free to pad data, which can result in null buffers not - // having the same length. - // Rust bitwise comparisons will return an error if left AND right is performed on - // buffers of different length. - // This might be a valid case during integration testing, where we read Arrow arrays - // from IPC data, which has padding. 
- // - // We first perform a bitwise comparison, and if there is an error, we revert to a - // slower method that indexes into the buffers one-by-one. - let result = &parent_bitmap & &self_null_bitmap; - if let Ok(bitmap) = result { - return Some(bitmap.bits); - } - // slow path - let array_offset = parent_data.offset(); - let mut buffer = MutableBuffer::new_null(parent_len); - let mut null_slice = buffer.as_slice_mut(); - (0..parent_len).for_each(|index| { - if parent_bitmap.is_set(index + array_offset) - && self_null_bitmap.is_set(index + array_offset) - { - bit_util::set_bit(&mut null_slice, index); - } - }); - Some(buffer.into()) - } - DataType::Union(_) => { - unimplemented!("Logical equality not yet implemented for union arrays") - } - DataType::Dictionary(_, _) => { - unimplemented!("Logical equality not yet implemented for nested dictionaries") - } - data_type => panic!("Data type {:?} is not a supported nested type", data_type), - } -} - -// Calculate a list child's logical bitmap/buffer -#[inline] -fn logical_list_bitmap( - parent_data: &ArrayData, - parent_bitmap: Bitmap, - child_bitmap: Bitmap, -) -> Buffer { - let offsets = parent_data.buffer::(0); - let offset_start = offsets.first().unwrap().to_usize().unwrap(); - let offset_len = offsets.get(parent_data.len()).unwrap().to_usize().unwrap(); - let mut buffer = MutableBuffer::new_null(offset_len - offset_start); - let mut null_slice = buffer.as_slice_mut(); - - offsets - .windows(2) - .enumerate() - .take(offset_len - offset_start) - .for_each(|(index, window)| { - let start = window[0].to_usize().unwrap(); - let end = window[1].to_usize().unwrap(); - let mask = parent_bitmap.is_set(index); - (start..end).for_each(|child_index| { - if mask && child_bitmap.is_set(child_index) { - bit_util::set_bit(&mut null_slice, child_index - offset_start); - } - }); - }); - buffer.into() -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::datatypes::{Field, ToByteSlice}; - - #[test] - fn test_logical_null_buffer() { - let child_data = ArrayData::builder(DataType::Int32) - .len(11) - .add_buffer(Buffer::from( - vec![1i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11].to_byte_slice(), - )) - .build(); - - let data = ArrayData::builder(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - false, - )))) - .len(7) - .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) - .null_bit_buffer(Buffer::from(vec![0b01011010])) - .add_child_data(child_data.clone()) - .build(); - - // Get the child logical null buffer. The child is non-nullable, but because the list has nulls, - // we expect the child to logically have some nulls, inherited from the parent: - // [1, 2, 3, null, null, 6, 7, 8, 9, null, 11] - let nulls = child_logical_null_buffer( - &data, - data.null_buffer(), - data.child_data().get(0).unwrap(), - ); - let expected = Some(Buffer::from(vec![0b11100111, 0b00000101])); - assert_eq!(nulls, expected); - - // test with offset - let data = ArrayData::builder(DataType::List(Box::new(Field::new( - "item", - DataType::Int32, - false, - )))) - .len(4) - .offset(3) - .add_buffer(Buffer::from(vec![0, 0, 3, 5, 6, 9, 10, 11].to_byte_slice())) - // the null_bit_buffer doesn't have an offset, i.e. 
cleared the 3 offset bits 0b[---]01011[010] - .null_bit_buffer(Buffer::from(vec![0b00001011])) - .add_child_data(child_data) - .build(); - - let nulls = child_logical_null_buffer( - &data, - data.null_buffer(), - data.child_data().get(0).unwrap(), - ); - - let expected = Some(Buffer::from(vec![0b00101111])); - assert_eq!(nulls, expected); - } -} diff --git a/rust/arrow/src/array/equal/variable_size.rs b/rust/arrow/src/array/equal/variable_size.rs deleted file mode 100644 index ecb3bc2a3c2..00000000000 --- a/rust/arrow/src/array/equal/variable_size.rs +++ /dev/null @@ -1,110 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::buffer::Buffer; -use crate::util::bit_util::get_bit; -use crate::{ - array::data::count_nulls, - array::{ArrayData, OffsetSizeTrait}, -}; - -use super::utils::equal_len; - -fn offset_value_equal( - lhs_values: &[u8], - rhs_values: &[u8], - lhs_offsets: &[T], - rhs_offsets: &[T], - lhs_pos: usize, - rhs_pos: usize, - len: usize, -) -> bool { - let lhs_start = lhs_offsets[lhs_pos].to_usize().unwrap(); - let rhs_start = rhs_offsets[rhs_pos].to_usize().unwrap(); - let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos]; - let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos]; - - lhs_len == rhs_len - && equal_len( - lhs_values, - rhs_values, - lhs_start, - rhs_start, - lhs_len.to_usize().unwrap(), - ) -} - -pub(super) fn variable_sized_equal( - lhs: &ArrayData, - rhs: &ArrayData, - lhs_nulls: Option<&Buffer>, - rhs_nulls: Option<&Buffer>, - lhs_start: usize, - rhs_start: usize, - len: usize, -) -> bool { - let lhs_offsets = lhs.buffer::(0); - let rhs_offsets = rhs.buffer::(0); - - // the offsets of the `ArrayData` are ignored as they are only applied to the offset buffer. 
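// Editor's minimal sketch of the offset-based comparison used below (illustrative
// only, not the arrow-rs code): variable-sized elements such as strings live in one
// contiguous `values` buffer, and element `i` is the byte range
// `offsets[i]..offsets[i + 1]`. Two elements are equal when the referenced byte
// ranges are equal. The helper names are hypothetical.
fn value_at<'a>(values: &'a [u8], offsets: &[i32], i: usize) -> &'a [u8] {
    &values[offsets[i] as usize..offsets[i + 1] as usize]
}

fn variable_size_eq(
    lhs_values: &[u8],
    lhs_offsets: &[i32],
    rhs_values: &[u8],
    rhs_offsets: &[i32],
    lhs_pos: usize,
    rhs_pos: usize,
) -> bool {
    value_at(lhs_values, lhs_offsets, lhs_pos) == value_at(rhs_values, rhs_offsets, rhs_pos)
}

#[test]
fn offsets_select_the_compared_bytes() {
    // lhs encodes ["hi", "arrow"], rhs encodes ["arrow", "hi"]
    let (lhs, lhs_off) = (b"hiarrow".to_vec(), vec![0, 2, 7]);
    let (rhs, rhs_off) = (b"arrowhi".to_vec(), vec![0, 5, 7]);
    assert!(variable_size_eq(&lhs, &lhs_off, &rhs, &rhs_off, 1, 0)); // "arrow" == "arrow"
    assert!(!variable_size_eq(&lhs, &lhs_off, &rhs, &rhs_off, 0, 0)); // "hi" != "arrow"
}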
- let lhs_values = lhs.buffers()[1].as_slice(); - let rhs_values = rhs.buffers()[1].as_slice(); - - let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len); - let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len); - - if lhs_null_count == 0 - && rhs_null_count == 0 - && !lhs_values.is_empty() - && !rhs_values.is_empty() - { - offset_value_equal( - lhs_values, - rhs_values, - lhs_offsets, - rhs_offsets, - lhs_start, - rhs_start, - len, - ) - } else { - (0..len).all(|i| { - let lhs_pos = lhs_start + i; - let rhs_pos = rhs_start + i; - - // the null bits can still be `None`, so we don't unwrap - let lhs_is_null = !lhs_nulls - .map(|v| get_bit(v.as_slice(), lhs.offset() + lhs_pos)) - .unwrap_or(false); - let rhs_is_null = !rhs_nulls - .map(|v| get_bit(v.as_slice(), rhs.offset() + rhs_pos)) - .unwrap_or(false); - - lhs_is_null - || (lhs_is_null == rhs_is_null) - && offset_value_equal( - lhs_values, - rhs_values, - lhs_offsets, - rhs_offsets, - lhs_pos, - rhs_pos, - 1, - ) - }) - } -} diff --git a/rust/arrow/src/array/equal_json.rs b/rust/arrow/src/array/equal_json.rs deleted file mode 100644 index 043174b9ac8..00000000000 --- a/rust/arrow/src/array/equal_json.rs +++ /dev/null @@ -1,1113 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use super::*; -use crate::datatypes::*; -use array::Array; -use hex::FromHex; -use serde_json::value::Value::{Null as JNull, Object, String as JString}; -use serde_json::Value; - -/// Trait for comparing arrow array with json array -pub trait JsonEqual { - /// Checks whether arrow array equals to json array. - fn equals_json(&self, json: &[&Value]) -> bool; - - /// Checks whether arrow array equals to json array. 
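// Editor's standalone sketch of the comparison rule this trait encodes (assumes the
// serde_json crate, which this module already uses): lengths must match, a JSON
// `null` must line up with a null slot, and any other JSON value must equal the
// slot's value. `nullable_ints_eq_json` is a hypothetical helper, not an arrow API.
fn nullable_ints_eq_json(values: &[Option<i64>], json: &serde_json::Value) -> bool {
    match json.as_array() {
        Some(items) if items.len() == values.len() => {
            values.iter().zip(items).all(|(v, j)| match (v, j) {
                (None, serde_json::Value::Null) => true,
                (Some(x), j) => j.as_i64() == Some(*x),
                _ => false,
            })
        }
        _ => false,
    }
}

#[test]
fn nulls_must_line_up() {
    let json = serde_json::json!([1, null, 3]);
    assert!(nullable_ints_eq_json(&[Some(1), None, Some(3)], &json));
    assert!(!nullable_ints_eq_json(&[Some(1), Some(2), Some(3)], &json));
}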
- fn equals_json_values(&self, json: &[Value]) -> bool { - let refs = json.iter().collect::>(); - - self.equals_json(&refs) - } -} - -/// Implement array equals for numeric type -impl JsonEqual for PrimitiveArray { - fn equals_json(&self, json: &[&Value]) -> bool { - self.len() == json.len() - && (0..self.len()).all(|i| match json[i] { - Value::Null => self.is_null(i), - v => { - self.is_valid(i) - && Some(v) == self.value(i).into_json_value().as_ref() - } - }) - } -} - -/// Implement array equals for numeric type -impl JsonEqual for BooleanArray { - fn equals_json(&self, json: &[&Value]) -> bool { - self.len() == json.len() - && (0..self.len()).all(|i| match json[i] { - Value::Null => self.is_null(i), - v => { - self.is_valid(i) - && Some(v) == self.value(i).into_json_value().as_ref() - } - }) - } -} - -impl PartialEq for PrimitiveArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(array) => self.equals_json_values(&array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &PrimitiveArray) -> bool { - match self { - Value::Array(array) => arrow.equals_json_values(&array), - _ => false, - } - } -} - -impl JsonEqual for GenericListArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - Value::Array(v) => self.is_valid(i) && self.value(i).equals_json_values(v), - Value::Null => self.is_null(i) || self.value_length(i).is_zero(), - _ => false, - }) - } -} - -impl PartialEq for GenericListArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &GenericListArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for DictionaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - // todo: this is wrong: we must test the values also - self.keys().equals_json(json) - } -} - -impl PartialEq for DictionaryArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &DictionaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for FixedSizeListArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - Value::Array(v) => self.is_valid(i) && self.value(i).equals_json_values(v), - Value::Null => self.is_null(i) || self.value_length() == 0, - _ => false, - }) - } -} - -impl PartialEq for FixedSizeListArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &FixedSizeListArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for StructArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - let all_object = json.iter().all(|v| matches!(v, Object(_) | JNull)); - - if !all_object { - return false; - } - - for column_name in self.column_names() { - let json_values = json - .iter() - .map(|obj| 
obj.get(column_name).unwrap_or(&Value::Null)) - .collect::>(); - - if !self - .column_by_name(column_name) - .map(|arr| arr.equals_json(&json_values)) - .unwrap_or(false) - { - return false; - } - } - - true - } -} - -impl PartialEq for StructArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(&json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &StructArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(&json_array), - _ => false, - } - } -} - -impl JsonEqual for GenericBinaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - // binary data is sometimes hex encoded, this checks if bytes are equal, - // and if not converting to hex is attempted - self.is_valid(i) - && (s.as_str().as_bytes() == self.value(i) - || Vec::from_hex(s.as_str()) == Ok(self.value(i).to_vec())) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq - for GenericBinaryArray -{ - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(&json_array), - _ => false, - } - } -} - -impl PartialEq> - for Value -{ - fn eq(&self, arrow: &GenericBinaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(&json_array), - _ => false, - } - } -} - -impl JsonEqual for GenericStringArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => self.is_valid(i) && s.as_str() == self.value(i), - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq - for GenericStringArray -{ - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(&json_array), - _ => false, - } - } -} - -impl PartialEq> - for Value -{ - fn eq(&self, arrow: &GenericStringArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(&json_array), - _ => false, - } - } -} - -impl JsonEqual for FixedSizeBinaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - // binary data is sometimes hex encoded, this checks if bytes are equal, - // and if not converting to hex is attempted - self.is_valid(i) - && (s.as_str().as_bytes() == self.value(i) - || Vec::from_hex(s.as_str()) == Ok(self.value(i).to_vec())) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for FixedSizeBinaryArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(&json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &FixedSizeBinaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(&json_array), - _ => false, - } - } -} - -impl JsonEqual for DecimalArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - self.is_valid(i) - && (s - .parse::() - .map_or_else(|_| false, |v| v == self.value(i))) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for DecimalArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) 
=> self.equals_json_values(&json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &DecimalArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(&json_array), - _ => false, - } - } -} - -impl JsonEqual for UnionArray { - fn equals_json(&self, _json: &[&Value]) -> bool { - unimplemented!( - "Added to allow UnionArray to implement the Array trait: see ARROW-8547" - ) - } -} - -impl JsonEqual for NullArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - // all JSON values must be nulls - json.iter().all(|&v| v == &JNull) - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &NullArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(&json_array), - _ => false, - } - } -} - -impl PartialEq for NullArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(&json_array), - _ => false, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::error::Result; - use std::{convert::TryFrom, sync::Arc}; - - fn create_list_array, T: AsRef<[Option]>>( - builder: &mut ListBuilder, - data: T, - ) -> Result { - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref())?; - builder.append(true)? - } else { - builder.append(false)? - } - } - Ok(builder.finish()) - } - - /// Create a fixed size list of 2 value lengths - fn create_fixed_size_list_array, T: AsRef<[Option]>>( - builder: &mut FixedSizeListBuilder, - data: T, - ) -> Result { - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref())?; - builder.append(true)? - } else { - for _ in 0..builder.value_length() { - builder.values().append_null()?; - } - builder.append(false)? 
- } - } - Ok(builder.finish()) - } - - #[test] - fn test_primitive_json_equal() { - // Test equaled array - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, null, 2, 3 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequaled array - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, 1, 2, 3 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test not json array type case - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_list_json_equal() { - // Test equal case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - null, - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - [7, 8], - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_fixed_size_list_json_equal() { - // Test equal case - let arrow_array = create_fixed_size_list_array( - &mut FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - null, - [4, 5, 6] - ] - "#, - ) - .unwrap(); - println!("{:?}", arrow_array); - println!("{:?}", json_array); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = create_fixed_size_list_array( - &mut FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - [7, 8, 9], - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = create_fixed_size_list_array( - &mut FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 
- } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_string_json_equal() { - // Test the equal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - 1, - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_binary_json_equal() { - // Test the equal case - let mut builder = BinaryBuilder::new(6); - builder.append_value(b"hello").unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - builder.append_value(b"world").unwrap(); - builder.append_null().unwrap(); - builder.append_null().unwrap(); - let arrow_array = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - 
let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - 1, - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_fixed_size_binary_json_equal() { - // Test the equal case - let mut builder = FixedSizeBinaryBuilder::new(15, 5); - builder.append_value(b"hello").unwrap(); - builder.append_null().unwrap(); - builder.append_value(b"world").unwrap(); - let arrow_array: FixedSizeBinaryArray = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - "world" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - builder.append_value(b"hello").unwrap(); - builder.append_null().unwrap(); - builder.append_value(b"world").unwrap(); - let arrow_array: FixedSizeBinaryArray = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - "arrow" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_decimal_json_equal() { - // Test the equal case - let mut builder = DecimalBuilder::new(30, 23, 6); - builder.append_value(1_000).unwrap(); - builder.append_null().unwrap(); - builder.append_value(-250).unwrap(); - let arrow_array: DecimalArray = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - "-250" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - builder.append_value(1_000).unwrap(); - builder.append_null().unwrap(); - builder.append_value(55).unwrap(); - let arrow_array: DecimalArray = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - "-250" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - null, - "55" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_struct_json_equal() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - 
Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let arrow_array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - { - "f2": 2 - }, - null, - { - "f1": "mark", - "f2": 4 - }, - { - "f1": "doe", - "f2": 5 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - { - "f2": 2 - }, - null, - { - "f1": "mark", - "f2": 4 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "f1": "joe", - "f2": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test not all object case - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - 2, - null, - { - "f1": "mark", - "f2": 4 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_null_json_equal() { - // Test equaled array - let arrow_array = NullArray::new(4); - let json_array: Value = serde_json::from_str( - r#" - [ - null, null, null, null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequaled array - let arrow_array = NullArray::new(2); - let json_array: Value = serde_json::from_str( - r#" - [ - null, null, null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } -} diff --git a/rust/arrow/src/array/ffi.rs b/rust/arrow/src/array/ffi.rs deleted file mode 100644 index 450685bf522..00000000000 --- a/rust/arrow/src/array/ffi.rs +++ /dev/null @@ -1,168 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Contains functionality to load an ArrayData from the C Data Interface - -use std::convert::TryFrom; - -use crate::{ - error::{ArrowError, Result}, - ffi, -}; - -use super::ArrayData; -use crate::datatypes::DataType; -use crate::ffi::ArrowArray; - -impl TryFrom for ArrayData { - type Error = ArrowError; - - fn try_from(value: ffi::ArrowArray) -> Result { - let child_data = value.children()?; - - let child_type = if !child_data.is_empty() { - Some(child_data[0].data_type().clone()) - } else { - None - }; - - let data_type = value.data_type(child_type)?; - - let len = value.len(); - let offset = value.offset(); - let null_count = value.null_count(); - let buffers = value.buffers()?; - let null_bit_buffer = value.null_bit_buffer(); - - Ok(ArrayData::new( - data_type, - len, - Some(null_count), - null_bit_buffer, - offset, - buffers, - child_data, - )) - } -} - -impl TryFrom for ffi::ArrowArray { - type Error = ArrowError; - - fn try_from(value: ArrayData) -> Result { - // If parent is nullable, then children also must be nullable - // so we pass this nullable to the creation of hte child data - let nullable = match value.data_type() { - DataType::List(field) => field.is_nullable(), - DataType::LargeList(field) => field.is_nullable(), - _ => false, - }; - - let len = value.len(); - let offset = value.offset() as usize; - let null_count = value.null_count(); - let buffers = value.buffers().to_vec(); - let null_buffer = value.null_buffer().cloned(); - let child_data = value - .child_data() - .iter() - .map(|arr| { - let len = arr.len(); - let offset = arr.offset() as usize; - let null_count = arr.null_count(); - let buffers = arr.buffers().to_vec(); - let null_buffer = arr.null_buffer().cloned(); - - // Note: the nullable comes from the parent data. - unsafe { - ArrowArray::try_new( - arr.data_type(), - len, - null_count, - null_buffer, - offset, - buffers, - vec![], - nullable, - ) - .expect("infallible") - } - }) - .collect::>(); - - unsafe { - ffi::ArrowArray::try_new( - value.data_type(), - len, - null_count, - null_buffer, - offset, - buffers, - child_data, - nullable, - ) - } - } -} - -#[cfg(test)] -mod tests { - use crate::error::Result; - use crate::{ - array::{Array, ArrayData, Int64Array, UInt32Array, UInt64Array}, - ffi::ArrowArray, - }; - use std::convert::TryFrom; - - fn test_round_trip(expected: &ArrayData) -> Result<()> { - // create a `ArrowArray` from the data. - let d1 = ArrowArray::try_from(expected.clone())?; - - // here we export the array as 2 pointers. We would have no control over ownership if it was not for - // the release mechanism. 
- let (array, schema) = ArrowArray::into_raw(d1); - - // simulate an external consumer by being the consumer - let d1 = unsafe { ArrowArray::try_from_raw(array, schema) }?; - - let result = &ArrayData::try_from(d1)?; - - assert_eq!(result, expected); - Ok(()) - } - - #[test] - fn test_u32() -> Result<()> { - let array = UInt32Array::from(vec![Some(2), None, Some(1), None]); - let data = array.data(); - test_round_trip(data) - } - - #[test] - fn test_u64() -> Result<()> { - let array = UInt64Array::from(vec![Some(2), None, Some(1), None]); - let data = array.data(); - test_round_trip(data) - } - - #[test] - fn test_i64() -> Result<()> { - let array = Int64Array::from(vec![Some(2), None, Some(1), None]); - let data = array.data(); - test_round_trip(data) - } -} diff --git a/rust/arrow/src/array/iterator.rs b/rust/arrow/src/array/iterator.rs deleted file mode 100644 index d97aa16744c..00000000000 --- a/rust/arrow/src/array/iterator.rs +++ /dev/null @@ -1,527 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::datatypes::ArrowPrimitiveType; - -use super::{ - Array, ArrayRef, BinaryOffsetSizeTrait, BooleanArray, GenericBinaryArray, - GenericListArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, - StringOffsetSizeTrait, -}; - -/// an iterator that returns Some(T) or None, that can be used on any PrimitiveArray -// Note: This implementation is based on std's [Vec]s' [IntoIter]. 
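// Editor's minimal sketch of the `current`/`current_end` cursor pattern noted above,
// applied to a plain slice instead of an arrow array (illustrative names, no unsafe):
// the front cursor advances on `next`, the back cursor retreats on `next_back`, and
// the iterator is exhausted when the two cursors meet.
struct NullableIter<'a, T> {
    values: &'a [Option<T>],
    current: usize,
    current_end: usize,
}

impl<'a, T> NullableIter<'a, T> {
    fn new(values: &'a [Option<T>]) -> Self {
        Self { values, current: 0, current_end: values.len() }
    }
}

impl<'a, T: Copy> Iterator for NullableIter<'a, T> {
    type Item = Option<T>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.current == self.current_end {
            None
        } else {
            let old = self.current;
            self.current += 1;
            Some(self.values[old])
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.current_end - self.current;
        (remaining, Some(remaining))
    }
}

impl<'a, T: Copy> DoubleEndedIterator for NullableIter<'a, T> {
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.current == self.current_end {
            None
        } else {
            self.current_end -= 1;
            Some(self.values[self.current_end])
        }
    }
}

impl<'a, T: Copy> ExactSizeIterator for NullableIter<'a, T> {}

#[test]
fn cursors_meet_in_the_middle() {
    let data = [Some(0), None, Some(2)];
    let mut it = NullableIter::new(&data);
    assert_eq!(it.next(), Some(Some(0)));
    assert_eq!(it.next_back(), Some(Some(2)));
    assert_eq!(it.next(), Some(None));
    assert_eq!(it.next(), None);
    assert_eq!(it.next_back(), None);
}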
-#[derive(Debug)] -pub struct PrimitiveIter<'a, T: ArrowPrimitiveType> { - array: &'a PrimitiveArray, - current: usize, - current_end: usize, -} - -impl<'a, T: ArrowPrimitiveType> PrimitiveIter<'a, T> { - /// create a new iterator - pub fn new(array: &'a PrimitiveArray) -> Self { - PrimitiveIter:: { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a, T: ArrowPrimitiveType> std::iter::Iterator for PrimitiveIter<'a, T> { - type Item = Option; - - #[inline] - fn next(&mut self) -> Option { - if self.current == self.current_end { - None - } else if self.array.is_null(self.current) { - self.current += 1; - Some(None) - } else { - let old = self.current; - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(old))) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.current, - Some(self.array.len() - self.current), - ) - } -} - -impl<'a, T: ArrowPrimitiveType> std::iter::DoubleEndedIterator for PrimitiveIter<'a, T> { - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. -impl<'a, T: ArrowPrimitiveType> std::iter::ExactSizeIterator for PrimitiveIter<'a, T> {} - -/// an iterator that returns Some(bool) or None. -// Note: This implementation is based on std's [Vec]s' [IntoIter]. 
-#[derive(Debug)] -pub struct BooleanIter<'a> { - array: &'a BooleanArray, - current: usize, - current_end: usize, -} - -impl<'a> BooleanIter<'a> { - /// create a new iterator - pub fn new(array: &'a BooleanArray) -> Self { - BooleanIter { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a> std::iter::Iterator for BooleanIter<'a> { - type Item = Option; - - fn next(&mut self) -> Option { - if self.current == self.current_end { - None - } else if self.array.is_null(self.current) { - self.current += 1; - Some(None) - } else { - let old = self.current; - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(old))) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.array.len() - self.current, - Some(self.array.len() - self.current), - ) - } -} - -impl<'a> std::iter::DoubleEndedIterator for BooleanIter<'a> { - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. -impl<'a> std::iter::ExactSizeIterator for BooleanIter<'a> {} - -/// an iterator that returns `Some(&str)` or `None`, for string arrays -#[derive(Debug)] -pub struct GenericStringIter<'a, T> -where - T: StringOffsetSizeTrait, -{ - array: &'a GenericStringArray, - current: usize, - current_end: usize, -} - -impl<'a, T: StringOffsetSizeTrait> GenericStringIter<'a, T> { - /// create a new iterator - pub fn new(array: &'a GenericStringArray) -> Self { - GenericStringIter:: { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a, T: StringOffsetSizeTrait> std::iter::Iterator for GenericStringIter<'a, T> { - type Item = Option<&'a str>; - - fn next(&mut self) -> Option { - let i = self.current; - if i >= self.current_end { - None - } else if self.array.is_null(i) { - self.current += 1; - Some(None) - } else { - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(i))) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.current_end - self.current, - Some(self.current_end - self.current), - ) - } -} - -impl<'a, T: StringOffsetSizeTrait> std::iter::DoubleEndedIterator - for GenericStringIter<'a, T> -{ - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. 
-impl<'a, T: StringOffsetSizeTrait> std::iter::ExactSizeIterator - for GenericStringIter<'a, T> -{ -} - -/// an iterator that returns `Some(&[u8])` or `None`, for binary arrays -#[derive(Debug)] -pub struct GenericBinaryIter<'a, T> -where - T: BinaryOffsetSizeTrait, -{ - array: &'a GenericBinaryArray, - current: usize, - current_end: usize, -} - -impl<'a, T: BinaryOffsetSizeTrait> GenericBinaryIter<'a, T> { - /// create a new iterator - pub fn new(array: &'a GenericBinaryArray) -> Self { - GenericBinaryIter:: { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a, T: BinaryOffsetSizeTrait> std::iter::Iterator for GenericBinaryIter<'a, T> { - type Item = Option<&'a [u8]>; - - fn next(&mut self) -> Option { - let i = self.current; - if i >= self.current_end { - None - } else if self.array.is_null(i) { - self.current += 1; - Some(None) - } else { - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(i))) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.current_end - self.current, - Some(self.current_end - self.current), - ) - } -} - -impl<'a, T: BinaryOffsetSizeTrait> std::iter::DoubleEndedIterator - for GenericBinaryIter<'a, T> -{ - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. 
-impl<'a, T: BinaryOffsetSizeTrait> std::iter::ExactSizeIterator - for GenericBinaryIter<'a, T> -{ -} - -#[derive(Debug)] -pub struct GenericListArrayIter<'a, S> -where - S: OffsetSizeTrait, -{ - array: &'a GenericListArray, - current: usize, - current_end: usize, -} - -impl<'a, S: OffsetSizeTrait> GenericListArrayIter<'a, S> { - pub fn new(array: &'a GenericListArray) -> Self { - GenericListArrayIter:: { - array, - current: 0, - current_end: array.len(), - } - } -} - -impl<'a, S: OffsetSizeTrait> std::iter::Iterator for GenericListArrayIter<'a, S> { - type Item = Option; - - fn next(&mut self) -> Option { - let i = self.current; - if i >= self.current_end { - None - } else if self.array.is_null(i) { - self.current += 1; - Some(None) - } else { - self.current += 1; - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(Some(self.array.value_unchecked(i))) } - } - } - - fn size_hint(&self) -> (usize, Option) { - ( - self.current_end - self.current, - Some(self.current_end - self.current), - ) - } -} - -impl<'a, S: OffsetSizeTrait> std::iter::DoubleEndedIterator - for GenericListArrayIter<'a, S> -{ - fn next_back(&mut self) -> Option { - if self.current_end == self.current { - None - } else { - self.current_end -= 1; - Some(if self.array.is_null(self.current_end) { - None - } else { - // Safety: - // we just checked bounds in `self.current_end == self.current` - // this is safe on the premise that this struct is initialized with - // current = array.len() - // and that current_end is ever only decremented - unsafe { Some(self.array.value_unchecked(self.current_end)) } - }) - } - } -} - -/// all arrays have known size. 
-impl<'a, S: OffsetSizeTrait> std::iter::ExactSizeIterator - for GenericListArrayIter<'a, S> -{ -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use crate::array::{ArrayRef, BinaryArray, BooleanArray, Int32Array, StringArray}; - - #[test] - fn test_primitive_array_iter_round_trip() { - let array = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); - let array = Arc::new(array) as ArrayRef; - - let array = array.as_any().downcast_ref::().unwrap(); - - // to and from iter, with a +1 - let result: Int32Array = array.iter().map(|e| e.map(|e| e + 1)).collect(); - - let expected = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); - assert_eq!(result, expected); - - // check if DoubleEndedIterator is implemented - let result: Int32Array = array.iter().rev().collect(); - let rev_array = Int32Array::from(vec![Some(4), None, Some(2), None, Some(0)]); - assert_eq!(result, rev_array); - // check if ExactSizeIterator is implemented - let _ = array.iter().rposition(|opt_b| opt_b == Some(1)); - } - - #[test] - fn test_double_ended() { - let array = Int32Array::from(vec![Some(0), None, Some(2), None, Some(4)]); - let mut a = array.iter(); - assert_eq!(a.next(), Some(Some(0))); - assert_eq!(a.next(), Some(None)); - assert_eq!(a.next_back(), Some(Some(4))); - assert_eq!(a.next_back(), Some(None)); - assert_eq!(a.next_back(), Some(Some(2))); - // the two sides have met: None is returned by both - assert_eq!(a.next_back(), None); - assert_eq!(a.next(), None); - } - - #[test] - fn test_string_array_iter_round_trip() { - let array = - StringArray::from(vec![Some("a"), None, Some("aaa"), None, Some("aaaaa")]); - let array = Arc::new(array) as ArrayRef; - - let array = array.as_any().downcast_ref::().unwrap(); - - // to and from iter, with a +1 - let result: StringArray = array - .iter() - .map(|e| { - e.map(|e| { - let mut a = e.to_string(); - a.push('b'); - a - }) - }) - .collect(); - - let expected = - StringArray::from(vec![Some("ab"), None, Some("aaab"), None, Some("aaaaab")]); - assert_eq!(result, expected); - - // check if DoubleEndedIterator is implemented - let result: StringArray = array.iter().rev().collect(); - let rev_array = - StringArray::from(vec![Some("aaaaa"), None, Some("aaa"), None, Some("a")]); - assert_eq!(result, rev_array); - // check if ExactSizeIterator is implemented - let _ = array.iter().rposition(|opt_b| opt_b == Some("a")); - } - - #[test] - fn test_binary_array_iter_round_trip() { - let array = BinaryArray::from(vec![ - Some(b"a" as &[u8]), - None, - Some(b"aaa"), - None, - Some(b"aaaaa"), - ]); - - // to and from iter - let result: BinaryArray = array.iter().collect(); - - assert_eq!(result, array); - - // check if DoubleEndedIterator is implemented - let result: BinaryArray = array.iter().rev().collect(); - let rev_array = BinaryArray::from(vec![ - Some(b"aaaaa" as &[u8]), - None, - Some(b"aaa"), - None, - Some(b"a"), - ]); - assert_eq!(result, rev_array); - - // check if ExactSizeIterator is implemented - let _ = array.iter().rposition(|opt_b| opt_b == Some(&[9])); - } - - #[test] - fn test_boolean_array_iter_round_trip() { - let array = BooleanArray::from(vec![Some(true), None, Some(false)]); - - // to and from iter - let result: BooleanArray = array.iter().collect(); - - assert_eq!(result, array); - - // check if DoubleEndedIterator is implemented - let result: BooleanArray = array.iter().rev().collect(); - let rev_array = BooleanArray::from(vec![Some(false), None, Some(true)]); - assert_eq!(result, rev_array); - - // check if 
ExactSizeIterator is implemented - let _ = array.iter().rposition(|opt_b| opt_b == Some(true)); - } -} diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs deleted file mode 100644 index 65cf30832e2..00000000000 --- a/rust/arrow/src/array/mod.rs +++ /dev/null @@ -1,283 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! The central type in Apache Arrow are arrays, represented -//! by the [`Array` trait](crate::array::Array). -//! An array represents a known-length sequence of values all -//! having the same type. -//! -//! Internally, those values are represented by one or several -//! [buffers](crate::buffer::Buffer), the number and meaning -//! of which depend on the array’s data type, as documented in -//! [the Arrow data layout specification](https://arrow.apache.org/docs/format/Columnar.html). -//! For example, the type `Int16Array` represents an Apache -//! Arrow array of 16-bit integers. -//! -//! Those buffers consist of the value data itself and an -//! optional [bitmap buffer](crate::bitmap::Bitmap) that -//! indicates which array entries are null values. -//! The bitmap buffer can be entirely omitted if the array is -//! known to have zero null values. -//! -//! There are concrete implementations of this trait for each -//! data type, that help you access individual values of the -//! array. -//! -//! # Building an Array -//! -//! Arrow's `Arrays` are immutable, but there is the trait -//! [`ArrayBuilder`](crate::array::ArrayBuilder) -//! that helps you with constructing new `Arrays`. As with the -//! `Array` trait, there are builder implementations for all -//! concrete array types. -//! -//! # Example -//! ``` -//! extern crate arrow; -//! -//! use arrow::array::Int16Array; -//! -//! // Create a new builder with a capacity of 100 -//! let mut builder = Int16Array::builder(100); -//! -//! // Append a single primitive value -//! builder.append_value(1).unwrap(); -//! -//! // Append a null value -//! builder.append_null().unwrap(); -//! -//! // Append a slice of primitive values -//! builder.append_slice(&[2, 3, 4]).unwrap(); -//! -//! // Build the array -//! let array = builder.finish(); -//! -//! assert_eq!( -//! 5, -//! array.len(), -//! "The array has 5 values, counting the null value" -//! ); -//! -//! assert_eq!(2, array.value(2), "Get the value with index 2"); -//! -//! assert_eq!( -//! &array.values()[3..5], -//! &[3, 4], -//! "Get slice of len 2 starting at idx 3" -//! ) -//! 
``` - -#[allow(clippy::module_inception)] -mod array; -mod array_binary; -mod array_boolean; -mod array_dictionary; -mod array_list; -mod array_primitive; -mod array_string; -mod array_struct; -mod array_union; -mod builder; -mod cast; -mod data; -mod equal; -mod equal_json; -mod ffi; -mod iterator; -mod null; -mod ord; -mod raw_pointer; -mod transform; - -use crate::datatypes::*; - -// --------------------- Array & ArrayData --------------------- - -pub use self::array::Array; -pub use self::array::ArrayRef; -pub use self::data::ArrayData; -pub use self::data::ArrayDataBuilder; -pub use self::data::ArrayDataRef; - -pub use self::array_binary::BinaryArray; -pub use self::array_binary::DecimalArray; -pub use self::array_binary::FixedSizeBinaryArray; -pub use self::array_binary::LargeBinaryArray; -pub use self::array_boolean::BooleanArray; -pub use self::array_dictionary::DictionaryArray; -pub use self::array_list::FixedSizeListArray; -pub use self::array_list::LargeListArray; -pub use self::array_list::ListArray; -pub use self::array_primitive::PrimitiveArray; -pub use self::array_string::LargeStringArray; -pub use self::array_string::StringArray; -pub use self::array_struct::StructArray; -pub use self::array_union::UnionArray; -pub use self::null::NullArray; - -pub use self::array::make_array; -pub use self::array::new_empty_array; -pub use self::array::new_null_array; - -pub type Int8Array = PrimitiveArray; -pub type Int16Array = PrimitiveArray; -pub type Int32Array = PrimitiveArray; -pub type Int64Array = PrimitiveArray; -pub type UInt8Array = PrimitiveArray; -pub type UInt16Array = PrimitiveArray; -pub type UInt32Array = PrimitiveArray; -pub type UInt64Array = PrimitiveArray; -pub type Float32Array = PrimitiveArray; -pub type Float64Array = PrimitiveArray; - -pub type Int8DictionaryArray = DictionaryArray; -pub type Int16DictionaryArray = DictionaryArray; -pub type Int32DictionaryArray = DictionaryArray; -pub type Int64DictionaryArray = DictionaryArray; -pub type UInt8DictionaryArray = DictionaryArray; -pub type UInt16DictionaryArray = DictionaryArray; -pub type UInt32DictionaryArray = DictionaryArray; -pub type UInt64DictionaryArray = DictionaryArray; - -pub type TimestampSecondArray = PrimitiveArray; -pub type TimestampMillisecondArray = PrimitiveArray; -pub type TimestampMicrosecondArray = PrimitiveArray; -pub type TimestampNanosecondArray = PrimitiveArray; -pub type Date32Array = PrimitiveArray; -pub type Date64Array = PrimitiveArray; -pub type Time32SecondArray = PrimitiveArray; -pub type Time32MillisecondArray = PrimitiveArray; -pub type Time64MicrosecondArray = PrimitiveArray; -pub type Time64NanosecondArray = PrimitiveArray; -pub type IntervalYearMonthArray = PrimitiveArray; -pub type IntervalDayTimeArray = PrimitiveArray; -pub type DurationSecondArray = PrimitiveArray; -pub type DurationMillisecondArray = PrimitiveArray; -pub type DurationMicrosecondArray = PrimitiveArray; -pub type DurationNanosecondArray = PrimitiveArray; - -pub use self::array_binary::BinaryOffsetSizeTrait; -pub use self::array_binary::GenericBinaryArray; -pub use self::array_list::GenericListArray; -pub use self::array_list::OffsetSizeTrait; -pub use self::array_string::GenericStringArray; -pub use self::array_string::StringOffsetSizeTrait; - -// --------------------- Array Builder --------------------- - -pub use self::builder::BooleanBufferBuilder; -pub use self::builder::BufferBuilder; - -pub type Int8BufferBuilder = BufferBuilder; -pub type Int16BufferBuilder = BufferBuilder; -pub type 
Int32BufferBuilder = BufferBuilder; -pub type Int64BufferBuilder = BufferBuilder; -pub type UInt8BufferBuilder = BufferBuilder; -pub type UInt16BufferBuilder = BufferBuilder; -pub type UInt32BufferBuilder = BufferBuilder; -pub type UInt64BufferBuilder = BufferBuilder; -pub type Float32BufferBuilder = BufferBuilder; -pub type Float64BufferBuilder = BufferBuilder; - -pub type TimestampSecondBufferBuilder = BufferBuilder; -pub type TimestampMillisecondBufferBuilder = BufferBuilder; -pub type TimestampMicrosecondBufferBuilder = BufferBuilder; -pub type TimestampNanosecondBufferBuilder = BufferBuilder; -pub type Date32BufferBuilder = BufferBuilder; -pub type Date64BufferBuilder = BufferBuilder; -pub type Time32SecondBufferBuilder = BufferBuilder; -pub type Time32MillisecondBufferBuilder = BufferBuilder; -pub type Time64MicrosecondBufferBuilder = BufferBuilder; -pub type Time64NanosecondBufferBuilder = BufferBuilder; -pub type IntervalYearMonthBufferBuilder = BufferBuilder; -pub type IntervalDayTimeBufferBuilder = BufferBuilder; -pub type DurationSecondBufferBuilder = BufferBuilder; -pub type DurationMillisecondBufferBuilder = BufferBuilder; -pub type DurationMicrosecondBufferBuilder = BufferBuilder; -pub type DurationNanosecondBufferBuilder = BufferBuilder; - -pub use self::builder::ArrayBuilder; -pub use self::builder::BinaryBuilder; -pub use self::builder::BooleanBuilder; -pub use self::builder::DecimalBuilder; -pub use self::builder::FixedSizeBinaryBuilder; -pub use self::builder::FixedSizeListBuilder; -pub use self::builder::GenericStringBuilder; -pub use self::builder::LargeBinaryBuilder; -pub use self::builder::LargeListBuilder; -pub use self::builder::LargeStringBuilder; -pub use self::builder::ListBuilder; -pub use self::builder::PrimitiveBuilder; -pub use self::builder::PrimitiveDictionaryBuilder; -pub use self::builder::StringBuilder; -pub use self::builder::StringDictionaryBuilder; -pub use self::builder::StructBuilder; -pub use self::builder::UnionBuilder; - -pub type Int8Builder = PrimitiveBuilder; -pub type Int16Builder = PrimitiveBuilder; -pub type Int32Builder = PrimitiveBuilder; -pub type Int64Builder = PrimitiveBuilder; -pub type UInt8Builder = PrimitiveBuilder; -pub type UInt16Builder = PrimitiveBuilder; -pub type UInt32Builder = PrimitiveBuilder; -pub type UInt64Builder = PrimitiveBuilder; -pub type Float32Builder = PrimitiveBuilder; -pub type Float64Builder = PrimitiveBuilder; - -pub type TimestampSecondBuilder = PrimitiveBuilder; -pub type TimestampMillisecondBuilder = PrimitiveBuilder; -pub type TimestampMicrosecondBuilder = PrimitiveBuilder; -pub type TimestampNanosecondBuilder = PrimitiveBuilder; -pub type Date32Builder = PrimitiveBuilder; -pub type Date64Builder = PrimitiveBuilder; -pub type Time32SecondBuilder = PrimitiveBuilder; -pub type Time32MillisecondBuilder = PrimitiveBuilder; -pub type Time64MicrosecondBuilder = PrimitiveBuilder; -pub type Time64NanosecondBuilder = PrimitiveBuilder; -pub type IntervalYearMonthBuilder = PrimitiveBuilder; -pub type IntervalDayTimeBuilder = PrimitiveBuilder; -pub type DurationSecondBuilder = PrimitiveBuilder; -pub type DurationMillisecondBuilder = PrimitiveBuilder; -pub type DurationMicrosecondBuilder = PrimitiveBuilder; -pub type DurationNanosecondBuilder = PrimitiveBuilder; - -pub use self::transform::MutableArrayData; - -// --------------------- Array Iterator --------------------- - -pub use self::iterator::*; - -// --------------------- Array Equality --------------------- - -pub use self::equal_json::JsonEqual; - -// 
--------------------- Array's values comparison --------------------- - -pub use self::ord::{build_compare, DynComparator}; - -// --------------------- Array downcast helper functions --------------------- - -pub use self::cast::{ - as_boolean_array, as_dictionary_array, as_generic_list_array, as_large_list_array, - as_largestring_array, as_list_array, as_null_array, as_primitive_array, - as_string_array, as_struct_array, -}; - -// ------------------------------ C Data Interface --------------------------- - -pub use self::array::make_array_from_raw; diff --git a/rust/arrow/src/array/null.rs b/rust/arrow/src/array/null.rs deleted file mode 100644 index 8e95bb00ed1..00000000000 --- a/rust/arrow/src/array/null.rs +++ /dev/null @@ -1,155 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Contains the `NullArray` type. -//! -//! A `NullArray` is a simplified array where all values are null. -//! -//! # Example: Create an array -//! -//! ``` -//! use arrow::array::{Array, NullArray}; -//! -//! # fn main() -> arrow::error::Result<()> { -//! let array = NullArray::new(10); -//! -//! assert_eq!(array.len(), 10); -//! assert_eq!(array.null_count(), 10); -//! -//! # Ok(()) -//! # } -//! ``` - -use std::any::Any; -use std::fmt; -use std::mem; - -use crate::array::{Array, ArrayData}; -use crate::datatypes::*; - -/// An Array where all elements are nulls -pub struct NullArray { - data: ArrayData, -} - -impl NullArray { - /// Create a new null array of the specified length - pub fn new(length: usize) -> Self { - let array_data = ArrayData::builder(DataType::Null).len(length).build(); - NullArray::from(array_data) - } -} - -impl Array for NullArray { - fn as_any(&self) -> &Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - /// Returns whether the element at `index` is null. - /// All elements of a `NullArray` are always null. - fn is_null(&self, _index: usize) -> bool { - true - } - - /// Returns whether the element at `index` is valid. - /// All elements of a `NullArray` are always invalid. - fn is_valid(&self, _index: usize) -> bool { - false - } - - /// Returns the total number of null values in this array. - /// The null count of a `NullArray` always equals its length. - fn null_count(&self) -> usize { - self.data_ref().len() - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [NullArray]. - fn get_buffer_memory_size(&self) -> usize { - self.data.get_buffer_memory_size() - } - - /// Returns the total number of bytes of memory occupied physically by this [NullArray]. 
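Because a `NullArray` stores no value buffers and no validity bitmap, its buffer memory is always zero and its total footprint is just the struct itself, which is what the deleted test further down also checks. A small sketch against the public API used throughout this file (a hedged illustration, not part of the original sources):

```rust
use arrow::array::{Array, NullArray};

fn main() {
    let array = NullArray::new(1024);

    // Every slot is null by definition.
    assert_eq!(array.len(), 1024);
    assert_eq!(array.null_count(), 1024);
    assert!(!array.is_valid(0));

    // No buffers are allocated, so only the struct itself occupies memory.
    assert_eq!(array.get_buffer_memory_size(), 0);
    assert_eq!(
        array.get_array_memory_size(),
        std::mem::size_of::<NullArray>()
    );
}
```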
- fn get_array_memory_size(&self) -> usize { - mem::size_of_val(self) - } -} - -impl From for NullArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.data_type(), - &DataType::Null, - "NullArray data type should be Null" - ); - assert_eq!( - data.buffers().len(), - 0, - "NullArray data should contain 0 buffers" - ); - assert!( - data.null_buffer().is_none(), - "NullArray data should not contain a null buffer, as no buffers are required" - ); - Self { data } - } -} - -impl fmt::Debug for NullArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "NullArray({})", self.len()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_null_array() { - let null_arr = NullArray::new(32); - - assert_eq!(null_arr.len(), 32); - assert_eq!(null_arr.null_count(), 32); - assert_eq!(null_arr.is_valid(0), false); - - assert_eq!(0, null_arr.get_buffer_memory_size()); - assert_eq!( - null_arr.get_buffer_memory_size() + std::mem::size_of::(), - null_arr.get_array_memory_size() - ); - } - - #[test] - fn test_null_array_slice() { - let array1 = NullArray::new(32); - - let array2 = array1.slice(8, 16); - assert_eq!(array2.len(), 16); - assert_eq!(array2.null_count(), 16); - assert_eq!(array2.offset(), 8); - } - - #[test] - fn test_debug_null_array() { - let array = NullArray::new(1024 * 1024); - assert_eq!(format!("{:?}", array), "NullArray(1048576)"); - } -} diff --git a/rust/arrow/src/array/ord.rs b/rust/arrow/src/array/ord.rs deleted file mode 100644 index efd68b12264..00000000000 --- a/rust/arrow/src/array/ord.rs +++ /dev/null @@ -1,310 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Contains functions and function factories to compare arrays. - -use std::cmp::Ordering; - -use crate::array::*; -use crate::datatypes::TimeUnit; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; - -use num::Float; - -/// Compare the values at two arbitrary indices in two arrays. 
-pub type DynComparator<'a> = Box Ordering + 'a>; - -/// compares two floats, placing NaNs at last -fn cmp_nans_last(a: &T, b: &T) -> Ordering { - match (a.is_nan(), b.is_nan()) { - (true, true) => Ordering::Equal, - (true, false) => Ordering::Greater, - (false, true) => Ordering::Less, - _ => a.partial_cmp(b).unwrap(), - } -} - -fn compare_primitives<'a, T: ArrowPrimitiveType>( - left: &'a Array, - right: &'a Array, -) -> DynComparator<'a> -where - T::Native: Ord, -{ - let left = left.as_any().downcast_ref::>().unwrap(); - let right = right.as_any().downcast_ref::>().unwrap(); - Box::new(move |i, j| left.value(i).cmp(&right.value(j))) -} - -fn compare_boolean<'a>(left: &'a Array, right: &'a Array) -> DynComparator<'a> { - let left = left.as_any().downcast_ref::().unwrap(); - let right = right.as_any().downcast_ref::().unwrap(); - Box::new(move |i, j| left.value(i).cmp(&right.value(j))) -} - -fn compare_float<'a, T: ArrowPrimitiveType>( - left: &'a Array, - right: &'a Array, -) -> DynComparator<'a> -where - T::Native: Float, -{ - let left = left.as_any().downcast_ref::>().unwrap(); - let right = right.as_any().downcast_ref::>().unwrap(); - Box::new(move |i, j| cmp_nans_last(&left.value(i), &right.value(j))) -} - -fn compare_string<'a, T>(left: &'a Array, right: &'a Array) -> DynComparator<'a> -where - T: StringOffsetSizeTrait, -{ - let left = left - .as_any() - .downcast_ref::>() - .unwrap(); - let right = right - .as_any() - .downcast_ref::>() - .unwrap(); - Box::new(move |i, j| left.value(i).cmp(&right.value(j))) -} - -fn compare_dict_string<'a, T>(left: &'a Array, right: &'a Array) -> DynComparator<'a> -where - T: ArrowDictionaryKeyType, -{ - let left = left.as_any().downcast_ref::>().unwrap(); - let right = right.as_any().downcast_ref::>().unwrap(); - let left_keys = left.keys_array(); - let right_keys = right.keys_array(); - - let left_values = StringArray::from(left.values().data().clone()); - let right_values = StringArray::from(left.values().data().clone()); - - Box::new(move |i: usize, j: usize| { - let key_left = left_keys.value(i).to_usize().unwrap(); - let key_right = right_keys.value(j).to_usize().unwrap(); - let left = left_values.value(key_left); - let right = right_values.value(key_right); - left.cmp(&right) - }) -} - -/// returns a comparison function that compares two values at two different positions -/// between the two arrays. -/// The arrays' types must be equal. -/// # Example -/// ``` -/// use arrow::array::{build_compare, Int32Array}; -/// -/// # fn main() -> arrow::error::Result<()> { -/// let array1 = Int32Array::from(vec![1, 2]); -/// let array2 = Int32Array::from(vec![3, 4]); -/// -/// let cmp = build_compare(&array1, &array2)?; -/// -/// // 1 (index 0 of array1) is smaller than 4 (index 1 of array2) -/// assert_eq!(std::cmp::Ordering::Less, (cmp)(0, 1)); -/// # Ok(()) -/// # } -/// ``` -// This is a factory of comparisons. -// The lifetime 'a enforces that we cannot use the closure beyond any of the array's lifetime. 
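The doc example above compares integers; for floating point arrays the comparator goes through `cmp_nans_last`, which turns `partial_cmp` into a total order by sorting NaN after every other value. A short sketch of what that means in practice, reusing only the `build_compare` and `Float64Array` API visible in this file (an illustration, not original code):

```rust
use arrow::array::{build_compare, Float64Array};
use std::cmp::Ordering;

fn main() -> arrow::error::Result<()> {
    let array = Float64Array::from(vec![1.0, f64::NAN]);

    let cmp = build_compare(&array, &array)?;

    // NaN sorts after every ordinary value and equal to itself, so the order is total.
    assert_eq!(Ordering::Less, (cmp)(0, 1));
    assert_eq!(Ordering::Greater, (cmp)(1, 0));
    assert_eq!(Ordering::Equal, (cmp)(1, 1));
    Ok(())
}
```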
-pub fn build_compare<'a>(left: &'a Array, right: &'a Array) -> Result> { - use DataType::*; - use IntervalUnit::*; - use TimeUnit::*; - Ok(match (left.data_type(), right.data_type()) { - (a, b) if a != b => { - return Err(ArrowError::InvalidArgumentError( - "Can't compare arrays of different types".to_string(), - )); - } - (Boolean, Boolean) => compare_boolean(left, right), - (UInt8, UInt8) => compare_primitives::(left, right), - (UInt16, UInt16) => compare_primitives::(left, right), - (UInt32, UInt32) => compare_primitives::(left, right), - (UInt64, UInt64) => compare_primitives::(left, right), - (Int8, Int8) => compare_primitives::(left, right), - (Int16, Int16) => compare_primitives::(left, right), - (Int32, Int32) => compare_primitives::(left, right), - (Int64, Int64) => compare_primitives::(left, right), - (Float32, Float32) => compare_float::(left, right), - (Float64, Float64) => compare_float::(left, right), - (Date32, Date32) => compare_primitives::(left, right), - (Date64, Date64) => compare_primitives::(left, right), - (Time32(Second), Time32(Second)) => { - compare_primitives::(left, right) - } - (Time32(Millisecond), Time32(Millisecond)) => { - compare_primitives::(left, right) - } - (Time64(Microsecond), Time64(Microsecond)) => { - compare_primitives::(left, right) - } - (Time64(Nanosecond), Time64(Nanosecond)) => { - compare_primitives::(left, right) - } - (Timestamp(Second, _), Timestamp(Second, _)) => { - compare_primitives::(left, right) - } - (Timestamp(Millisecond, _), Timestamp(Millisecond, _)) => { - compare_primitives::(left, right) - } - (Timestamp(Microsecond, _), Timestamp(Microsecond, _)) => { - compare_primitives::(left, right) - } - (Timestamp(Nanosecond, _), Timestamp(Nanosecond, _)) => { - compare_primitives::(left, right) - } - (Interval(YearMonth), Interval(YearMonth)) => { - compare_primitives::(left, right) - } - (Interval(DayTime), Interval(DayTime)) => { - compare_primitives::(left, right) - } - (Duration(Second), Duration(Second)) => { - compare_primitives::(left, right) - } - (Duration(Millisecond), Duration(Millisecond)) => { - compare_primitives::(left, right) - } - (Duration(Microsecond), Duration(Microsecond)) => { - compare_primitives::(left, right) - } - (Duration(Nanosecond), Duration(Nanosecond)) => { - compare_primitives::(left, right) - } - (Utf8, Utf8) => compare_string::(left, right), - (LargeUtf8, LargeUtf8) => compare_string::(left, right), - ( - Dictionary(key_type_lhs, value_type_lhs), - Dictionary(key_type_rhs, value_type_rhs), - ) => { - if value_type_lhs.as_ref() != &DataType::Utf8 - || value_type_rhs.as_ref() != &DataType::Utf8 - { - return Err(ArrowError::InvalidArgumentError( - "Arrow still does not support comparisons of non-string dictionary arrays" - .to_string(), - )); - } - match (key_type_lhs.as_ref(), key_type_rhs.as_ref()) { - (a, b) if a != b => { - return Err(ArrowError::InvalidArgumentError( - "Can't compare arrays of different types".to_string(), - )); - } - (UInt8, UInt8) => compare_dict_string::(left, right), - (UInt16, UInt16) => compare_dict_string::(left, right), - (UInt32, UInt32) => compare_dict_string::(left, right), - (UInt64, UInt64) => compare_dict_string::(left, right), - (Int8, Int8) => compare_dict_string::(left, right), - (Int16, Int16) => compare_dict_string::(left, right), - (Int32, Int32) => compare_dict_string::(left, right), - (Int64, Int64) => compare_dict_string::(left, right), - (lhs, _) => { - return Err(ArrowError::InvalidArgumentError(format!( - "Dictionaries do not support keys of type 
{:?}", - lhs - ))) - } - } - } - (lhs, _) => { - return Err(ArrowError::InvalidArgumentError(format!( - "The data type type {:?} has no natural order", - lhs - ))) - } - }) -} - -#[cfg(test)] -pub mod tests { - use super::*; - use crate::array::{Float64Array, Int32Array}; - use crate::error::Result; - use std::cmp::Ordering; - use std::iter::FromIterator; - - #[test] - fn test_i32() -> Result<()> { - let array = Int32Array::from(vec![1, 2]); - - let cmp = build_compare(&array, &array)?; - - assert_eq!(Ordering::Less, (cmp)(0, 1)); - Ok(()) - } - - #[test] - fn test_i32_i32() -> Result<()> { - let array1 = Int32Array::from(vec![1]); - let array2 = Int32Array::from(vec![2]); - - let cmp = build_compare(&array1, &array2)?; - - assert_eq!(Ordering::Less, (cmp)(0, 0)); - Ok(()) - } - - #[test] - fn test_f64() -> Result<()> { - let array = Float64Array::from(vec![1.0, 2.0]); - - let cmp = build_compare(&array, &array)?; - - assert_eq!(Ordering::Less, (cmp)(0, 1)); - Ok(()) - } - - #[test] - fn test_f64_nan() -> Result<()> { - let array = Float64Array::from(vec![1.0, f64::NAN]); - - let cmp = build_compare(&array, &array)?; - - assert_eq!(Ordering::Less, (cmp)(0, 1)); - Ok(()) - } - - #[test] - fn test_f64_zeros() -> Result<()> { - let array = Float64Array::from(vec![-0.0, 0.0]); - - let cmp = build_compare(&array, &array)?; - - assert_eq!(Ordering::Equal, (cmp)(0, 1)); - assert_eq!(Ordering::Equal, (cmp)(1, 0)); - Ok(()) - } - - #[test] - fn test_dict() -> Result<()> { - let data = vec!["a", "b", "c", "a", "a", "c", "c"]; - let array = DictionaryArray::::from_iter(data.into_iter()); - - let cmp = build_compare(&array, &array)?; - - assert_eq!(Ordering::Less, (cmp)(0, 1)); - assert_eq!(Ordering::Equal, (cmp)(3, 4)); - assert_eq!(Ordering::Greater, (cmp)(2, 3)); - Ok(()) - } -} diff --git a/rust/arrow/src/array/raw_pointer.rs b/rust/arrow/src/array/raw_pointer.rs deleted file mode 100644 index 185e1cbe98a..00000000000 --- a/rust/arrow/src/array/raw_pointer.rs +++ /dev/null @@ -1,64 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::ptr::NonNull; - -/// This struct is highly `unsafe` and offers the possibility to self-reference a [arrow::Buffer] from [arrow::array::ArrayData]. -/// as a pointer to the beginning of its contents. -pub(super) struct RawPtrBox { - ptr: NonNull, -} - -impl RawPtrBox { - /// # Safety - /// The user must guarantee that: - /// * the contents where `ptr` points to are never `moved`. This is guaranteed when they are Pinned. - /// * the lifetime of this struct does not outlive the lifetime of `ptr`. - /// Failure to fulfill any the above conditions results in undefined behavior. - /// # Panic - /// This function panics if: - /// * `ptr` is null - /// * `ptr` is not aligned to a slice of type `T`. 
This is guaranteed if it was built from a slice of type `T`. - pub(super) unsafe fn new(ptr: *const u8) -> Self { - let ptr = NonNull::new(ptr as *mut u8).expect("Pointer cannot be null"); - assert_eq!( - ptr.as_ptr().align_offset(std::mem::align_of::()), - 0, - "memory is not aligned" - ); - Self { ptr: ptr.cast() } - } - - pub(super) fn as_ptr(&self) -> *const T { - self.ptr.as_ptr() - } -} - -unsafe impl Send for RawPtrBox {} -unsafe impl Sync for RawPtrBox {} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - #[should_panic(expected = "memory is not aligned")] - fn test_primitive_array_alignment() { - let bytes = vec![0u8, 1u8]; - unsafe { RawPtrBox::::new(bytes.as_ptr().offset(1)) }; - } -} diff --git a/rust/arrow/src/array/transform/boolean.rs b/rust/arrow/src/array/transform/boolean.rs deleted file mode 100644 index 18291497173..00000000000 --- a/rust/arrow/src/array/transform/boolean.rs +++ /dev/null @@ -1,45 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::array::ArrayData; - -use super::{ - Extend, _MutableArrayData, - utils::{resize_for_bits, set_bits}, -}; - -pub(super) fn build_extend(array: &ArrayData) -> Extend { - let values = array.buffers()[0].as_slice(); - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - let buffer = &mut mutable.buffer1; - resize_for_bits(buffer, mutable.len + len); - set_bits( - &mut buffer.as_slice_mut(), - values, - mutable.len, - array.offset() + start, - len, - ); - }, - ) -} - -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { - let buffer = &mut mutable.buffer1; - resize_for_bits(buffer, mutable.len + len); -} diff --git a/rust/arrow/src/array/transform/fixed_binary.rs b/rust/arrow/src/array/transform/fixed_binary.rs deleted file mode 100644 index 36952d46a4d..00000000000 --- a/rust/arrow/src/array/transform/fixed_binary.rs +++ /dev/null @@ -1,65 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::{array::ArrayData, datatypes::DataType}; - -use super::{Extend, _MutableArrayData}; - -pub(super) fn build_extend(array: &ArrayData) -> Extend { - let size = match array.data_type() { - DataType::FixedSizeBinary(i) => *i as usize, - _ => unreachable!(), - }; - - let values = &array.buffers()[0].as_slice()[array.offset() * size..]; - if array.null_count() == 0 { - // fast case where we can copy regions without null issues - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - let buffer = &mut mutable.buffer1; - buffer.extend_from_slice(&values[start * size..(start + len) * size]); - }, - ) - } else { - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - // nulls present: append item by item, ignoring null entries - let values_buffer = &mut mutable.buffer1; - - (start..start + len).for_each(|i| { - if array.is_valid(i) { - // append value - let bytes = &values[i * size..(i + 1) * size]; - values_buffer.extend_from_slice(bytes); - } else { - values_buffer.extend_zeros(size); - } - }) - }, - ) - } -} - -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { - let size = match mutable.data_type { - DataType::FixedSizeBinary(i) => i as usize, - _ => unreachable!(), - }; - - let values_buffer = &mut mutable.buffer1; - values_buffer.extend_zeros(len * size); -} diff --git a/rust/arrow/src/array/transform/list.rs b/rust/arrow/src/array/transform/list.rs deleted file mode 100644 index 8eb2bd1778d..00000000000 --- a/rust/arrow/src/array/transform/list.rs +++ /dev/null @@ -1,99 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::array::{ArrayData, OffsetSizeTrait}; - -use super::{ - Extend, _MutableArrayData, - utils::{extend_offsets, get_last_offset}, -}; - -pub(super) fn build_extend(array: &ArrayData) -> Extend { - let offsets = array.buffer::(0); - if array.null_count() == 0 { - // fast case where we can copy regions without nullability checks - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - let offset_buffer = &mut mutable.buffer1; - - // this is safe due to how offset is built. See details on `get_last_offset` - let last_offset: T = unsafe { get_last_offset(offset_buffer) }; - - // offsets - extend_offsets::( - offset_buffer, - last_offset, - &offsets[start..start + len + 1], - ); - - mutable.child_data[0].extend( - index, - offsets[start].to_usize().unwrap(), - offsets[start + len].to_usize().unwrap(), - ) - }, - ) - } else { - // nulls present: append item by item, ignoring null entries - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - let offset_buffer = &mut mutable.buffer1; - - // this is safe due to how offset is built. 
See details on `get_last_offset` - let mut last_offset: T = unsafe { get_last_offset(offset_buffer) }; - - let delta_len = array.len() - array.null_count(); - offset_buffer.reserve(delta_len * std::mem::size_of::()); - - let child = &mut mutable.child_data[0]; - (start..start + len).for_each(|i| { - if array.is_valid(i) { - // compute the new offset - last_offset += offsets[i + 1] - offsets[i]; - - // append value - child.extend( - index, - offsets[i].to_usize().unwrap(), - offsets[i + 1].to_usize().unwrap(), - ); - } - // append offset - offset_buffer.push(last_offset); - }) - }, - ) - } -} - -pub(super) fn extend_nulls( - mutable: &mut _MutableArrayData, - len: usize, -) { - let offset_buffer = &mut mutable.buffer1; - - // this is safe due to how offset is built. See details on `get_last_offset` - let last_offset: T = unsafe { get_last_offset(offset_buffer) }; - - (0..len).for_each(|_| offset_buffer.push(last_offset)) -} diff --git a/rust/arrow/src/array/transform/mod.rs b/rust/arrow/src/array/transform/mod.rs deleted file mode 100644 index 4dc7b56d1c3..00000000000 --- a/rust/arrow/src/array/transform/mod.rs +++ /dev/null @@ -1,1206 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::{buffer::MutableBuffer, datatypes::DataType, util::bit_util}; - -use super::{ - data::{into_buffers, new_buffers}, - ArrayData, -}; - -mod boolean; -mod fixed_binary; -mod list; -mod null; -mod primitive; -mod structure; -mod utils; -mod variable_size; - -type ExtendNullBits<'a> = Box; -// function that extends `[start..start+len]` to the mutable array. -// this is dynamic because different data_types influence how buffers and childs are extended. -type Extend<'a> = Box; - -type ExtendNulls = Box ()>; - -/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData]. -/// This is just a data container. -#[derive(Debug)] -struct _MutableArrayData<'a> { - pub data_type: DataType, - pub null_count: usize, - - pub len: usize, - pub null_buffer: MutableBuffer, - - // arrow specification only allows up to 3 buffers (2 ignoring the nulls above). - // Thus, we place them in the stack to avoid bound checks and greater data locality. 
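In `list::build_extend` above, the null branch rebuilds the offsets buffer by hand: a valid slot advances the running `last_offset` by that slot's length and copies the child range, while a null slot simply repeats the previous offset so no child values are appended. A standalone sketch of that offset rewrite with plain vectors (hypothetical code, not from the deleted file):

```rust
fn main() {
    // Four source lists with lengths 3, 2, 0, 4; the second one is null.
    let offsets = [0i32, 3, 5, 5, 9];
    let valid = [true, false, true, true];

    let mut new_offsets = vec![0i32];
    let mut last_offset = 0i32;
    for i in 0..valid.len() {
        if valid[i] {
            // Copying the child range offsets[i]..offsets[i + 1] advances the offset.
            last_offset += offsets[i + 1] - offsets[i];
        }
        // A null slot pushes the same offset again, i.e. an empty range.
        new_offsets.push(last_offset);
    }
    assert_eq!(new_offsets, vec![0, 3, 3, 3, 7]);
}
```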
- pub buffer1: MutableBuffer, - pub buffer2: MutableBuffer, - pub child_data: Vec>, -} - -impl<'a> _MutableArrayData<'a> { - fn freeze(self, dictionary: Option) -> ArrayData { - let buffers = into_buffers(&self.data_type, self.buffer1, self.buffer2); - - let child_data = match self.data_type { - DataType::Dictionary(_, _) => vec![dictionary.unwrap()], - _ => { - let mut child_data = Vec::with_capacity(self.child_data.len()); - for child in self.child_data { - child_data.push(child.freeze()); - } - child_data - } - }; - ArrayData::new( - self.data_type, - self.len, - Some(self.null_count), - if self.null_count > 0 { - Some(self.null_buffer.into()) - } else { - None - }, - 0, - buffers, - child_data, - ) - } -} - -fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits { - if let Some(bitmap) = array.null_bitmap() { - let bytes = bitmap.bits.as_slice(); - Box::new(move |mutable, start, len| { - utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); - mutable.null_count += utils::set_bits( - mutable.null_buffer.as_slice_mut(), - bytes, - mutable.len, - array.offset() + start, - len, - ); - }) - } else if use_nulls { - Box::new(|mutable, _, len| { - utils::resize_for_bits(&mut mutable.null_buffer, mutable.len + len); - let write_data = mutable.null_buffer.as_slice_mut(); - let offset = mutable.len; - (0..len).for_each(|i| { - bit_util::set_bit(write_data, offset + i); - }); - }) - } else { - Box::new(|_, _, _| {}) - } -} - -/// Struct to efficiently and interactively create an [ArrayData] from an existing [ArrayData] by -/// copying chunks. -/// The main use case of this struct is to perform unary operations to arrays of arbitrary types, such as `filter` and `take`. -/// # Example: -/// -/// ``` -/// use arrow::{array::{Int32Array, Array, MutableArrayData}}; -/// -/// let array = Int32Array::from(vec![1, 2, 3, 4, 5]); -/// let array = array.data(); -/// // Create a new `MutableArrayData` from an array and with a capacity of 4. -/// // Capacity here is equivalent to `Vec::with_capacity` -/// let arrays = vec![array]; -/// let mut mutable = MutableArrayData::new(arrays, false, 4); -/// mutable.extend(0, 1, 3); // extend from the slice [1..3], [2,3] -/// mutable.extend(0, 0, 3); // extend from the slice [0..3], [1,2,3] -/// // `.freeze()` to convert `MutableArrayData` into a `ArrayData`. -/// let new_array = Int32Array::from(mutable.freeze()); -/// assert_eq!(Int32Array::from(vec![2, 3, 1, 2, 3]), new_array); -/// ``` -pub struct MutableArrayData<'a> { - arrays: Vec<&'a ArrayData>, - // The attributes in [_MutableArrayData] cannot be in [MutableArrayData] due to - // mutability invariants (interior mutability): - // [MutableArrayData] contains a function that can only mutate [_MutableArrayData], not - // [MutableArrayData] itself - data: _MutableArrayData<'a>, - - // the child data of the `Array` in Dictionary arrays. - // This is not stored in `MutableArrayData` because these values constant and only needed - // at the end, when freezing [_MutableArrayData]. - dictionary: Option, - - // function used to extend values from arrays. This function's lifetime is bound to the array - // because it reads values from it. - extend_values: Vec>, - // function used to extend nulls from arrays. This function's lifetime is bound to the array - // because it reads nulls from it. - extend_null_bits: Vec>, - - // function used to extend nulls. - // this is independent of the arrays and therefore has no lifetime. 
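The `extend_values` and `extend_null_bits` fields above hold one boxed closure per source array; each closure borrows its own source, which is why their lifetimes are tied to `'a`, and `extend` can then dispatch purely on the source index. A stripped-down sketch of that pattern, with hypothetical names and a plain `Vec<i32>` standing in for the mutable buffers:

```rust
// One specialized "extend" closure per source, each borrowing its own source slice.
type ExtendFn<'a> = Box<dyn Fn(&mut Vec<i32>, usize, usize) + 'a>;

fn build_extend<'a>(source: &'a [i32]) -> ExtendFn<'a> {
    Box::new(move |dst, start, len| dst.extend_from_slice(&source[start..start + len]))
}

fn main() {
    let a = vec![1, 2, 3];
    let b = vec![10, 20, 30];

    // Mirrors `extend_values`: index 0 copies from `a`, index 1 copies from `b`.
    let extend_values = vec![build_extend(&a), build_extend(&b)];

    let mut dst = Vec::new();
    (extend_values[0])(&mut dst, 0, 2); // a[0..2]
    (extend_values[1])(&mut dst, 1, 2); // b[1..3]
    assert_eq!(dst, vec![1, 2, 20, 30]);
}
```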
- extend_nulls: ExtendNulls, -} - -impl<'a> std::fmt::Debug for MutableArrayData<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - // ignores the closures. - f.debug_struct("MutableArrayData") - .field("data", &self.data) - .finish() - } -} - -fn build_extend(array: &ArrayData) -> Extend { - use crate::datatypes::*; - match array.data_type() { - DataType::Null => null::build_extend(array), - DataType::Boolean => boolean::build_extend(array), - DataType::UInt8 => primitive::build_extend::(array), - DataType::UInt16 => primitive::build_extend::(array), - DataType::UInt32 => primitive::build_extend::(array), - DataType::UInt64 => primitive::build_extend::(array), - DataType::Int8 => primitive::build_extend::(array), - DataType::Int16 => primitive::build_extend::(array), - DataType::Int32 => primitive::build_extend::(array), - DataType::Int64 => primitive::build_extend::(array), - DataType::Float32 => primitive::build_extend::(array), - DataType::Float64 => primitive::build_extend::(array), - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - primitive::build_extend::(array) - } - DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => { - primitive::build_extend::(array) - } - DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), - DataType::LargeUtf8 | DataType::LargeBinary => { - variable_size::build_extend::(array) - } - DataType::List(_) => list::build_extend::(array), - DataType::LargeList(_) => list::build_extend::(array), - DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { - DataType::UInt8 => primitive::build_extend::(array), - DataType::UInt16 => primitive::build_extend::(array), - DataType::UInt32 => primitive::build_extend::(array), - DataType::UInt64 => primitive::build_extend::(array), - DataType::Int8 => primitive::build_extend::(array), - DataType::Int16 => primitive::build_extend::(array), - DataType::Int32 => primitive::build_extend::(array), - DataType::Int64 => primitive::build_extend::(array), - _ => unreachable!(), - }, - DataType::Struct(_) => structure::build_extend(array), - DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array), - DataType::Float16 => unreachable!(), - /* - DataType::FixedSizeList(_, _) => {} - DataType::Union(_) => {} - */ - _ => todo!("Take and filter operations still not supported for this datatype"), - } -} - -fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { - use crate::datatypes::*; - Box::new(match data_type { - DataType::Null => null::extend_nulls, - DataType::Boolean => boolean::extend_nulls, - DataType::UInt8 => primitive::extend_nulls::, - DataType::UInt16 => primitive::extend_nulls::, - DataType::UInt32 => primitive::extend_nulls::, - DataType::UInt64 => primitive::extend_nulls::, - DataType::Int8 => primitive::extend_nulls::, - DataType::Int16 => primitive::extend_nulls::, - DataType::Int32 => primitive::extend_nulls::, - DataType::Int64 => primitive::extend_nulls::, - DataType::Float32 => primitive::extend_nulls::, - DataType::Float64 => primitive::extend_nulls::, - DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => primitive::extend_nulls::, - DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, - DataType::Utf8 | DataType::Binary => 
variable_size::extend_nulls::, - DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, - DataType::List(_) => list::extend_nulls::, - DataType::LargeList(_) => list::extend_nulls::, - DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { - DataType::UInt8 => primitive::extend_nulls::, - DataType::UInt16 => primitive::extend_nulls::, - DataType::UInt32 => primitive::extend_nulls::, - DataType::UInt64 => primitive::extend_nulls::, - DataType::Int8 => primitive::extend_nulls::, - DataType::Int16 => primitive::extend_nulls::, - DataType::Int32 => primitive::extend_nulls::, - DataType::Int64 => primitive::extend_nulls::, - _ => unreachable!(), - }, - DataType::Struct(_) => structure::extend_nulls, - DataType::FixedSizeBinary(_) => fixed_binary::extend_nulls, - DataType::Float16 => unreachable!(), - /* - DataType::FixedSizeList(_, _) => {} - DataType::Union(_) => {} - */ - _ => todo!("Take and filter operations still not supported for this datatype"), - }) -} - -impl<'a> MutableArrayData<'a> { - /// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an - /// [ArrayData] from multiple `arrays`. - /// - /// `use_nulls` is a flag used to optimize insertions. It should be `false` if the only source of nulls - /// are the arrays themselves and `true` if the user plans to call [MutableArrayData::extend_nulls]. - /// In other words, if `use_nulls` is `false`, calling [MutableArrayData::extend_nulls] should not be used. - pub fn new(arrays: Vec<&'a ArrayData>, mut use_nulls: bool, capacity: usize) -> Self { - let data_type = arrays[0].data_type(); - use crate::datatypes::*; - - // if any of the arrays has nulls, insertions from any array requires setting bits - // as there is at least one array with nulls. - if arrays.iter().any(|array| array.null_count() > 0) { - use_nulls = true; - }; - - let [buffer1, buffer2] = new_buffers(data_type, capacity); - - let child_data = match &data_type { - DataType::Null - | DataType::Boolean - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float32 - | DataType::Float64 - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Timestamp(_, _) - | DataType::Utf8 - | DataType::Binary - | DataType::LargeUtf8 - | DataType::LargeBinary - | DataType::Interval(_) - | DataType::FixedSizeBinary(_) => vec![], - DataType::List(_) | DataType::LargeList(_) => { - let childs = arrays - .iter() - .map(|array| &array.child_data()[0]) - .collect::>(); - vec![MutableArrayData::new(childs, use_nulls, capacity)] - } - // the dictionary type just appends keys and clones the values. 
- DataType::Dictionary(_, _) => vec![], - DataType::Float16 => unreachable!(), - DataType::Struct(fields) => (0..fields.len()) - .map(|i| { - let child_arrays = arrays - .iter() - .map(|array| &array.child_data()[i]) - .collect::>(); - MutableArrayData::new(child_arrays, use_nulls, capacity) - }) - .collect::>(), - _ => { - todo!("Take and filter operations still not supported for this datatype") - } - }; - - let dictionary = match &data_type { - DataType::Dictionary(_, _) => Some(arrays[0].child_data()[0].clone()), - _ => None, - }; - - let extend_nulls = build_extend_nulls(data_type); - - let extend_null_bits = arrays - .iter() - .map(|array| build_extend_null_bits(array, use_nulls)) - .collect(); - - let null_bytes = bit_util::ceil(capacity, 8); - let null_buffer = MutableBuffer::from_len_zeroed(null_bytes); - - let extend_values = arrays.iter().map(|array| build_extend(array)).collect(); - - let data = _MutableArrayData { - data_type: data_type.clone(), - len: 0, - null_count: 0, - null_buffer, - buffer1, - buffer2, - child_data, - }; - Self { - arrays, - data, - dictionary, - extend_values, - extend_null_bits, - extend_nulls, - } - } - - /// Extends this [MutableArrayData] with elements from the bounded [ArrayData] at `start` - /// and for a size of `len`. - /// # Panic - /// This function panics if the range is out of bounds, i.e. if `start + len >= array.len()`. - pub fn extend(&mut self, index: usize, start: usize, end: usize) { - let len = end - start; - (self.extend_null_bits[index])(&mut self.data, start, len); - (self.extend_values[index])(&mut self.data, index, start, len); - self.data.len += len; - } - - /// Extends this [MutableArrayData] with null elements, disregarding the bound arrays - pub fn extend_nulls(&mut self, len: usize) { - self.data.null_count += len; - (self.extend_nulls)(&mut self.data, len); - self.data.len += len; - } - - /// Creates a [ArrayData] from the pushed regions up to this point, consuming `self`. 
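Putting `extend`, `extend_nulls`, and `freeze` (defined just below) together over the public API that the doc comment and tests in this file already exercise, a round trip across two source arrays might look like the following sketch (an illustration, not part of the deleted sources):

```rust
use arrow::array::{Array, Int32Array, MutableArrayData};

fn main() {
    let a = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
    let b = Int32Array::from(vec![Some(10), None, Some(30)]);

    // `use_nulls` must be true because extend_nulls is called below.
    let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], true, 8);
    mutable.extend(0, 0, 2); // a[0..2]  -> 1, 2
    mutable.extend_nulls(1); //          -> null
    mutable.extend(1, 1, 3); // b[1..3]  -> null, 30

    let result = Int32Array::from(mutable.freeze());
    let expected = Int32Array::from(vec![Some(1), Some(2), None, None, Some(30)]);
    assert_eq!(result, expected);
}
```

The index passed to `extend` selects which of the bound source arrays to copy from, while `extend_nulls` appends nulls independently of any source.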
- pub fn freeze(self) -> ArrayData { - self.data.freeze(self.dictionary) - } -} - -#[cfg(test)] -mod tests { - use std::{convert::TryFrom, sync::Arc}; - - use super::*; - - use crate::{ - array::{ - Array, ArrayData, ArrayRef, BooleanArray, DictionaryArray, - FixedSizeBinaryArray, Int16Array, Int16Type, Int32Array, Int64Array, - Int64Builder, ListBuilder, NullArray, PrimitiveBuilder, StringArray, - StringDictionaryBuilder, StructArray, UInt8Array, - }, - buffer::Buffer, - datatypes::Field, - }; - use crate::{ - array::{ListArray, StringBuilder}, - error::Result, - }; - - /// tests extending from a primitive array w/ offset nor nulls - #[test] - fn test_primitive() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 3); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![Some(1), Some(2)]); - assert_eq!(array, expected); - } - - /// tests extending from a primitive array with offset w/ nulls - #[test] - fn test_primitive_offset() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 2); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![Some(2), Some(3)]); - assert_eq!(array, expected); - } - - /// tests extending from a primitive array with offset and nulls - #[test] - fn test_primitive_null_offset() { - let b = UInt8Array::from(vec![Some(1), None, Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, false, 2); - a.extend(0, 0, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = UInt8Array::from(vec![None, Some(3)]); - assert_eq!(array, expected); - } - - #[test] - fn test_primitive_null_offset_nulls() { - let b = UInt8Array::from(vec![Some(1), Some(2), Some(3)]); - let b = b.slice(1, 2); - let arrays = vec![b.data()]; - let mut a = MutableArrayData::new(arrays, true, 2); - a.extend(0, 0, 2); - a.extend_nulls(3); - a.extend(0, 1, 2); - let result = a.freeze(); - let array = UInt8Array::from(result); - let expected = - UInt8Array::from(vec![Some(2), Some(3), None, None, None, Some(3)]); - assert_eq!(array, expected); - } - - #[test] - fn test_list_null_offset() -> Result<()> { - let int_builder = Int64Builder::new(24); - let mut builder = ListBuilder::::new(int_builder); - builder.values().append_slice(&[1, 2, 3])?; - builder.append(true)?; - builder.values().append_slice(&[4, 5])?; - builder.append(true)?; - builder.values().append_slice(&[6, 7, 8])?; - builder.append(true)?; - let array = builder.finish(); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - mutable.extend(0, 0, 1); - - let result = mutable.freeze(); - let array = ListArray::from(result); - - let int_builder = Int64Builder::new(24); - let mut builder = ListBuilder::::new(int_builder); - builder.values().append_slice(&[1, 2, 3])?; - builder.append(true)?; - let expected = builder.finish(); - - assert_eq!(array, expected); - - Ok(()) - } - - /// tests extending from a variable-sized (strings and binary) array w/ offset with nulls - #[test] - fn test_variable_sized_nulls() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let arrays = vec![array.data()]; - - let mut mutable = 
MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None]); - assert_eq!(result, expected); - } - - /// tests extending from a variable-sized (strings and binary) array - /// with an offset and nulls - #[test] - fn test_variable_sized_offsets() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 0, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); - assert_eq!(result, expected); - } - - #[test] - fn test_string_offsets() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 0, 3); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![Some("bc"), None, Some("defh")]); - assert_eq!(result, expected); - } - - #[test] - fn test_multiple_with_nulls() { - let array1 = StringArray::from(vec!["hello", "world"]); - let array2 = StringArray::from(vec![Some("1"), None]); - - let arrays = vec![array1.data(), array2.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 5); - - mutable.extend(0, 0, 2); - mutable.extend(1, 0, 2); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = - StringArray::from(vec![Some("hello"), Some("world"), Some("1"), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_string_null_offset_nulls() { - let array = StringArray::from(vec![Some("a"), Some("bc"), None, Some("defh")]); - let array = array.slice(1, 3); - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, true, 0); - - mutable.extend(0, 1, 3); - mutable.extend_nulls(1); - - let result = mutable.freeze(); - let result = StringArray::from(result); - - let expected = StringArray::from(vec![None, Some("defh"), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_bool() { - let array = BooleanArray::from(vec![Some(false), Some(true), None, Some(false)]); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = BooleanArray::from(result); - - let expected = BooleanArray::from(vec![Some(true), None]); - assert_eq!(result, expected); - } - - #[test] - fn test_null() { - let array1 = NullArray::new(10); - let array2 = NullArray::new(5); - let arrays = vec![array1.data(), array2.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - mutable.extend(1, 0, 1); - - let result = mutable.freeze(); - let result = NullArray::from(result); - - let expected = NullArray::new(3); - assert_eq!(result, expected); - } - - fn create_dictionary_array(values: &[&str], keys: &[Option<&str>]) -> ArrayData { - let values = StringArray::from(values.to_vec()); - let mut builder = StringDictionaryBuilder::new_with_dictionary( - PrimitiveBuilder::::new(3), - &values, - ) - .unwrap(); - for key in keys { - if let Some(v) = key { - builder.append(v).unwrap(); - } else { - 
builder.append_null().unwrap() - } - } - builder.finish().data().clone() - } - - #[test] - fn test_dictionary() { - // (a, b, c), (0, 1, 0, 2) => (a, b, a, c) - let array = create_dictionary_array( - &["a", "b", "c"], - &[Some("a"), Some("b"), None, Some("c")], - ); - let arrays = vec![&array]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - - let result = mutable.freeze(); - let result = DictionaryArray::from(result); - - let expected = Int16Array::from(vec![Some(1), None]); - assert_eq!(result.keys(), &expected); - } - - #[test] - fn test_struct() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected = StructArray::try_from(vec![ - ("f1", strings.slice(1, 2)), - ("f2", ints.slice(1, 2)), - ]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_struct_offset() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap() - .slice(1, 3); - let arrays = vec![array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_strings: ArrayRef = - Arc::new(StringArray::from(vec![None, Some("mark")])); - let expected = StructArray::try_from(vec![ - ("f1", expected_strings), - ("f2", ints.slice(2, 2)), - ]) - .unwrap(); - - assert_eq!(array, expected); - } - - #[test] - fn test_struct_nulls() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 3); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_string = Arc::new(StringArray::from(vec![None, None])) as ArrayRef; - let expected_int = Arc::new(Int32Array::from(vec![Some(2), None])) as ArrayRef; - - let expected = - StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_struct_many() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - Some(4), - Some(5), - ])); - - let array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - let arrays = vec![array.data(), array.data()]; - let mut mutable = MutableArrayData::new(arrays, false, 0); - - 
mutable.extend(0, 1, 3); - mutable.extend(1, 0, 2); - let data = mutable.freeze(); - let array = StructArray::from(data); - - let expected_string = - Arc::new(StringArray::from(vec![None, None, Some("joe"), None])) as ArrayRef; - let expected_int = - Arc::new(Int32Array::from(vec![Some(2), None, Some(1), Some(2)])) as ArrayRef; - - let expected = - StructArray::try_from(vec![("f1", expected_string), ("f2", expected_int)]) - .unwrap(); - assert_eq!(array, expected) - } - - #[test] - fn test_binary_fixed_sized_offsets() { - let array = FixedSizeBinaryArray::try_from_iter( - vec![vec![0, 0], vec![0, 1], vec![0, 2]].into_iter(), - ) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - let array = array.slice(1, 2); - // = [[0, 1], [0, 2]] due to the offset = 1 - - let arrays = vec![array.data()]; - - let mut mutable = MutableArrayData::new(arrays, false, 0); - - mutable.extend(0, 1, 2); - mutable.extend(0, 0, 1); - - let result = mutable.freeze(); - let result = FixedSizeBinaryArray::from(result); - - let expected = - FixedSizeBinaryArray::try_from_iter(vec![vec![0, 2], vec![0, 1]].into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - assert_eq!(result, expected); - } - - #[test] - fn test_list_append() -> Result<()> { - let mut builder = ListBuilder::::new(Int64Builder::new(24)); - builder.values().append_slice(&[1, 2, 3])?; - builder.append(true)?; - builder.values().append_slice(&[4, 5])?; - builder.append(true)?; - builder.values().append_slice(&[6, 7, 8])?; - builder.values().append_slice(&[9, 10, 11])?; - builder.append(true)?; - let a = builder.finish(); - - let a_builder = Int64Builder::new(24); - let mut a_builder = ListBuilder::::new(a_builder); - a_builder.values().append_slice(&[12, 13])?; - a_builder.append(true)?; - a_builder.append(true)?; - a_builder.values().append_slice(&[14, 15])?; - a_builder.append(true)?; - let b = a_builder.finish(); - - let c = b.slice(1, 2); - - let mut mutable = - MutableArrayData::new(vec![a.data(), b.data(), c.data()], false, 1); - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - - let finished = mutable.freeze(); - - let expected_int_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - Some(9), - Some(10), - Some(11), - // append first array - Some(12), - Some(13), - Some(14), - Some(15), - // append second array - Some(14), - Some(15), - ]); - let list_value_offsets = - Buffer::from_slice_ref(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17]); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 8, - None, - None, - 0, - vec![list_value_offsets], - vec![expected_int_array.data().clone()], - ); - assert_eq!(finished, expected_list_data); - - Ok(()) - } - - #[test] - fn test_list_nulls_append() -> Result<()> { - let mut builder = ListBuilder::::new(Int64Builder::new(32)); - builder.values().append_slice(&[1, 2, 3])?; - builder.append(true)?; - builder.values().append_slice(&[4, 5])?; - builder.append(true)?; - builder.append(false)?; - builder.values().append_slice(&[6, 7, 8])?; - builder.values().append_null()?; - builder.values().append_null()?; - builder.values().append_slice(&[9, 10, 11])?; - builder.append(true)?; - let a = builder.finish(); - let a = a.data(); - - let mut builder = ListBuilder::::new(Int64Builder::new(32)); - builder.values().append_slice(&[12, 13])?; - builder.append(true)?; - builder.append(false)?; - 
builder.append(true)?; - builder.values().append_null()?; - builder.values().append_null()?; - builder.values().append_slice(&[14, 15])?; - builder.append(true)?; - let b = builder.finish(); - let b = b.data(); - let c = b.slice(1, 2); - let d = b.slice(2, 2); - - let mut mutable = MutableArrayData::new(vec![a, b, &c, &d], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(2, 0, c.len()); - mutable.extend(3, 0, d.len()); - let result = mutable.freeze(); - - let expected_int_array = Int64Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - Some(7), - Some(8), - None, - None, - Some(9), - Some(10), - Some(11), - // second array - Some(12), - Some(13), - None, - None, - Some(14), - Some(15), - // slice(1, 2) results in no values added - None, - None, - Some(14), - Some(15), - ]); - let list_value_offsets = - Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 12, - None, - Some(Buffer::from(&[0b11011011, 0b1110])), - 0, - vec![list_value_offsets], - vec![expected_int_array.data().clone()], - ); - assert_eq!(result, expected_list_data); - - Ok(()) - } - - #[test] - fn test_list_of_strings_append() -> Result<()> { - // [["alpha", "beta", None]] - let mut builder = ListBuilder::new(StringBuilder::new(32)); - builder.values().append_value("Hello")?; - builder.values().append_value("Arrow")?; - builder.values().append_null()?; - builder.append(true)?; - let a = builder.finish(); - - // [["alpha", "beta"], [None], ["gamma", "delta", None]] - let mut builder = ListBuilder::new(StringBuilder::new(32)); - builder.values().append_value("alpha")?; - builder.values().append_value("beta")?; - builder.append(true)?; - builder.values().append_null()?; - builder.append(true)?; - builder.values().append_value("gamma")?; - builder.values().append_value("delta")?; - builder.values().append_null()?; - builder.append(true)?; - let b = builder.finish(); - - let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(1, 1, 3); - mutable.extend(1, 0, 0); - let result = mutable.freeze(); - - let expected_string_array = StringArray::from(vec![ - // extend a[0..a.len()] - // a[0] - Some("Hello"), - Some("Arrow"), - None, - // extend b[0..b.len()] - // b[0] - Some("alpha"), - Some("beta"), - // b[1] - None, - // b[2] - Some("gamma"), - Some("delta"), - None, - // extend b[1..3] - // b[1] - None, - // b[2] - Some("gamma"), - Some("delta"), - None, - // extend b[0..0] - ]); - let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 6, 9, 10, 13]); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - 6, - None, - None, - 0, - vec![list_value_offsets], - vec![expected_string_array.data().clone()], - ); - assert_eq!(result, expected_list_data); - Ok(()) - } - - #[test] - fn test_fixed_size_binary_append() { - let a = vec![Some(vec![1, 2]), Some(vec![3, 4]), Some(vec![5, 6])]; - let a = FixedSizeBinaryArray::try_from_sparse_iter(a.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - - let b = vec![ - None, - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - Some(vec![13, 14]), - None, - ]; - let b = FixedSizeBinaryArray::try_from_sparse_iter(b.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); 
- - let mut mutable = MutableArrayData::new(vec![a.data(), b.data()], false, 10); - - mutable.extend(0, 0, a.len()); - mutable.extend(1, 0, b.len()); - mutable.extend(1, 1, 4); - mutable.extend(1, 2, 3); - mutable.extend(1, 5, 5); - let result = mutable.freeze(); - - let expected = vec![ - // a - Some(vec![1, 2]), - Some(vec![3, 4]), - Some(vec![5, 6]), - // b - None, - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - Some(vec![13, 14]), - None, - // b[1..4] - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - // b[2..3] - Some(vec![9, 10]), - // b[4..4] - ]; - let expected = FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter()) - .expect("Failed to create FixedSizeBinaryArray from iterable"); - assert_eq!(&result, expected.data()); - } - - /* - // this is an old test used on a meanwhile removed dead code - // that is still useful when `MutableArrayData` supports fixed-size lists. - #[test] - fn test_fixed_size_list_append() -> Result<()> { - let int_builder = UInt16Builder::new(64); - let mut builder = FixedSizeListBuilder::::new(int_builder, 2); - builder.values().append_slice(&[1, 2])?; - builder.append(true)?; - builder.values().append_slice(&[3, 4])?; - builder.append(false)?; - builder.values().append_slice(&[5, 6])?; - builder.append(true)?; - - let a_builder = UInt16Builder::new(64); - let mut a_builder = FixedSizeListBuilder::::new(a_builder, 2); - a_builder.values().append_slice(&[7, 8])?; - a_builder.append(true)?; - a_builder.values().append_slice(&[9, 10])?; - a_builder.append(true)?; - a_builder.values().append_slice(&[11, 12])?; - a_builder.append(false)?; - a_builder.values().append_slice(&[13, 14])?; - a_builder.append(true)?; - a_builder.values().append_null()?; - a_builder.values().append_null()?; - a_builder.append(true)?; - let a = a_builder.finish(); - - // append array - builder.append_data(&[ - a.data(), - a.slice(1, 3).data(), - a.slice(2, 1).data(), - a.slice(5, 0).data(), - ])?; - let finished = builder.finish(); - - let expected_int_array = UInt16Array::from(vec![ - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - // append first array - Some(7), - Some(8), - Some(9), - Some(10), - Some(11), - Some(12), - Some(13), - Some(14), - None, - None, - // append slice(1, 3) - Some(9), - Some(10), - Some(11), - Some(12), - Some(13), - Some(14), - // append slice(2, 1) - Some(11), - Some(12), - ]); - let expected_list_data = ArrayData::new( - DataType::FixedSizeList( - Box::new(Field::new("item", DataType::UInt16, true)), - 2, - ), - 12, - None, - None, - 0, - vec![], - vec![expected_int_array.data()], - ); - let expected_list = - FixedSizeListArray::from(Arc::new(expected_list_data) as ArrayData); - assert_eq!(&expected_list.values(), &finished.values()); - assert_eq!(expected_list.len(), finished.len()); - - Ok(()) - } - */ -} diff --git a/rust/arrow/src/array/transform/null.rs b/rust/arrow/src/array/transform/null.rs deleted file mode 100644 index e1335e17971..00000000000 --- a/rust/arrow/src/array/transform/null.rs +++ /dev/null @@ -1,26 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::array::ArrayData; - -use super::{Extend, _MutableArrayData}; - -pub(super) fn build_extend(_: &ArrayData) -> Extend { - Box::new(move |_, _, _, _| {}) -} - -pub(super) fn extend_nulls(_: &mut _MutableArrayData, _: usize) {} diff --git a/rust/arrow/src/array/transform/primitive.rs b/rust/arrow/src/array/transform/primitive.rs deleted file mode 100644 index 032bb4a8779..00000000000 --- a/rust/arrow/src/array/transform/primitive.rs +++ /dev/null @@ -1,40 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::mem::size_of; - -use crate::{array::ArrayData, datatypes::ArrowNativeType}; - -use super::{Extend, _MutableArrayData}; - -pub(super) fn build_extend<T: ArrowNativeType>(array: &ArrayData) -> Extend { - let values = array.buffer::<T>(0); - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - mutable - .buffer1 - .extend_from_slice(&values[start..start + len]); - }, - ) -} - -pub(super) fn extend_nulls<T: ArrowNativeType>( - mutable: &mut _MutableArrayData, - len: usize, -) { - mutable.buffer1.extend_zeros(len * size_of::<T>()); -} diff --git a/rust/arrow/src/array/transform/structure.rs b/rust/arrow/src/array/transform/structure.rs deleted file mode 100644 index c019f5ac6a9..00000000000 --- a/rust/arrow/src/array/transform/structure.rs +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License.
- -use crate::array::ArrayData; - -use super::{Extend, _MutableArrayData}; - -pub(super) fn build_extend(array: &ArrayData) -> Extend { - if array.null_count() == 0 { - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - mutable.child_data.iter_mut().for_each(|child| { - child.extend( - index, - array.offset() + start, - array.offset() + start + len, - ) - }) - }, - ) - } else { - Box::new( - move |mutable: &mut _MutableArrayData, - index: usize, - start: usize, - len: usize| { - (array.offset() + start..array.offset() + start + len).for_each(|i| { - if array.is_valid(i) { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend(index, i, i + 1)) - } else { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend_nulls(1)) - } - }) - }, - ) - } -} - -pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { - mutable - .child_data - .iter_mut() - .for_each(|child| child.extend_nulls(len)) -} diff --git a/rust/arrow/src/array/transform/utils.rs b/rust/arrow/src/array/transform/utils.rs deleted file mode 100644 index 8c718c70c17..00000000000 --- a/rust/arrow/src/array/transform/utils.rs +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::{array::OffsetSizeTrait, buffer::MutableBuffer, util::bit_util}; - -/// extends the `buffer` to be able to hold `len` bits, setting all bits of the new size to zero. 
-#[inline] -pub(super) fn resize_for_bits(buffer: &mut MutableBuffer, len: usize) { - let needed_bytes = bit_util::ceil(len, 8); - if buffer.len() < needed_bytes { - buffer.resize(needed_bytes, 0); - } -} - -/// sets all bits on `write_data` on the range `[offset_write..offset_write+len]` to be equal to the -/// bits on `data` on the range `[offset_read..offset_read+len]` -pub(super) fn set_bits( - write_data: &mut [u8], - data: &[u8], - offset_write: usize, - offset_read: usize, - len: usize, -) -> usize { - let mut count = 0; - (0..len).for_each(|i| { - if bit_util::get_bit(data, offset_read + i) { - bit_util::set_bit(write_data, offset_write + i); - } else { - count += 1; - } - }); - count -} - -pub(super) fn extend_offsets<T: OffsetSizeTrait>( - buffer: &mut MutableBuffer, - mut last_offset: T, - offsets: &[T], -) { - buffer.reserve(offsets.len() * std::mem::size_of::<T>()); - offsets.windows(2).for_each(|offsets| { - // compute the new offset - let length = offsets[1] - offsets[0]; - last_offset += length; - buffer.push(last_offset); - }); -} - -#[inline] -pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>( - offset_buffer: &MutableBuffer, -) -> T { - // JUSTIFICATION - // Benefit - // 20% performance improvement extend of variable sized arrays (see bench `mutable_array`) - // Soundness - // * offset buffer is always extended in slices of T and aligned accordingly. - // * Buffer[0] is initialized with one element, 0, and thus `mutable_offsets.len() - 1` is always valid. - let (prefix, offsets, suffix) = offset_buffer.as_slice().align_to::<T>(); - debug_assert!(prefix.is_empty() && suffix.is_empty()); - *offsets.get_unchecked(offsets.len() - 1) -} diff --git a/rust/arrow/src/array/transform/variable_size.rs b/rust/arrow/src/array/transform/variable_size.rs deleted file mode 100644 index c9304dbca20..00000000000 --- a/rust/arrow/src/array/transform/variable_size.rs +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License.
- -use crate::{ - array::{ArrayData, OffsetSizeTrait}, - buffer::MutableBuffer, -}; - -use super::{ - Extend, _MutableArrayData, - utils::{extend_offsets, get_last_offset}, -}; - -#[inline] -fn extend_offset_values<T: OffsetSizeTrait>( - buffer: &mut MutableBuffer, - offsets: &[T], - values: &[u8], - start: usize, - len: usize, -) { - let start_values = offsets[start].to_usize().unwrap(); - let end_values = offsets[start + len].to_usize().unwrap(); - let new_values = &values[start_values..end_values]; - buffer.extend_from_slice(new_values); -} - -pub(super) fn build_extend<T: OffsetSizeTrait>(array: &ArrayData) -> Extend { - let offsets = array.buffer::<T>(0); - let values = array.buffers()[1].as_slice(); - if array.null_count() == 0 { - // fast case where we can copy regions without null issues - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - let offset_buffer = &mut mutable.buffer1; - let values_buffer = &mut mutable.buffer2; - - // this is safe due to how offset is built. See details on `get_last_offset` - let last_offset = unsafe { get_last_offset(offset_buffer) }; - - extend_offsets::<T>( - offset_buffer, - last_offset, - &offsets[start..start + len + 1], - ); - // values - extend_offset_values::<T>(values_buffer, offsets, values, start, len); - }, - ) - } else { - Box::new( - move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| { - let offset_buffer = &mut mutable.buffer1; - let values_buffer = &mut mutable.buffer2; - - // this is safe due to how offset is built. See details on `get_last_offset` - let mut last_offset: T = unsafe { get_last_offset(offset_buffer) }; - - // nulls present: append item by item, ignoring null entries - offset_buffer.reserve(len * std::mem::size_of::<T>()); - - (start..start + len).for_each(|i| { - if array.is_valid(i) { - // compute the new offset - let length = offsets[i + 1] - offsets[i]; - last_offset += length; - - // append value - let bytes = &values[offsets[i].to_usize().unwrap() - ..offsets[i + 1].to_usize().unwrap()]; - values_buffer.extend_from_slice(bytes); - } - // offsets are always present - offset_buffer.push(last_offset); - }) - }, - ) - } -} - -pub(super) fn extend_nulls<T: OffsetSizeTrait>( - mutable: &mut _MutableArrayData, - len: usize, -) { - let offset_buffer = &mut mutable.buffer1; - - // this is safe due to how offset is built. See details on `get_last_offset` - let last_offset: T = unsafe { get_last_offset(offset_buffer) }; - - (0..len).for_each(|_| offset_buffer.push(last_offset)) -} diff --git a/rust/arrow/src/bitmap.rs b/rust/arrow/src/bitmap.rs deleted file mode 100644 index b977f550999..00000000000 --- a/rust/arrow/src/bitmap.rs +++ /dev/null @@ -1,157 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines a bitmap, which is used to track which values in an Arrow array are null.
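// Illustrative sketch, not part of the removed sources: the offset bookkeeping that
// `extend_offsets` / `get_last_offset` above perform, written against a plain Vec<i32>
// instead of MutableBuffer. Appending a slice of a variable-size array re-bases the
// source offsets onto the destination's last offset: slot lengths are preserved,
// absolute positions are not. `append_offsets` and the sample values are assumed names
// chosen for the sketch only.
fn append_offsets(dst: &mut Vec<i32>, src: &[i32]) {
    let mut last = *dst.last().unwrap_or(&0);
    for w in src.windows(2) {
        last += w[1] - w[0]; // length of this slot
        dst.push(last);
    }
}

fn main() {
    let mut dst = vec![0]; // a fresh offsets buffer always starts with a single 0
    append_offsets(&mut dst, &[0, 3, 5]); // two slots, lengths 3 and 2
    append_offsets(&mut dst, &[5, 9]);    // one slot, length 4, arbitrary base
    assert_eq!(dst, vec![0, 3, 5, 9]);
}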
-//! This is called a "validity bitmap" in the Arrow documentation. - -use crate::buffer::Buffer; -use crate::error::Result; -use crate::util::bit_util; -use std::mem; - -use std::ops::{BitAnd, BitOr}; - -#[derive(Debug, Clone)] -pub struct Bitmap { - pub(crate) bits: Buffer, -} - -impl Bitmap { - pub fn new(num_bits: usize) -> Self { - let num_bytes = num_bits / 8 + if num_bits % 8 > 0 { 1 } else { 0 }; - let r = num_bytes % 64; - let len = if r == 0 { - num_bytes - } else { - num_bytes + 64 - r - }; - Bitmap { - bits: Buffer::from(&vec![0xFF; len]), - } - } - - pub fn len(&self) -> usize { - self.bits.len() - } - - pub fn is_empty(&self) -> bool { - self.bits.is_empty() - } - - pub fn is_set(&self, i: usize) -> bool { - assert!(i < (self.bits.len() << 3)); - unsafe { bit_util::get_bit_raw(self.bits.as_ptr(), i) } - } - - pub fn buffer_ref(&self) -> &Buffer { - &self.bits - } - - pub fn into_buffer(self) -> Buffer { - self.bits - } - - /// Returns the total number of bytes of memory occupied by the buffers owned by this [Bitmap]. - pub fn get_buffer_memory_size(&self) -> usize { - self.bits.capacity() - } - - /// Returns the total number of bytes of memory occupied physically by this [Bitmap]. - pub fn get_array_memory_size(&self) -> usize { - self.bits.capacity() + mem::size_of_val(self) - } -} - -impl<'a, 'b> BitAnd<&'b Bitmap> for &'a Bitmap { - type Output = Result; - - fn bitand(self, rhs: &'b Bitmap) -> Result { - Ok(Bitmap::from((&self.bits & &rhs.bits)?)) - } -} - -impl<'a, 'b> BitOr<&'b Bitmap> for &'a Bitmap { - type Output = Result; - - fn bitor(self, rhs: &'b Bitmap) -> Result { - Ok(Bitmap::from((&self.bits | &rhs.bits)?)) - } -} - -impl From for Bitmap { - fn from(buf: Buffer) -> Self { - Self { bits: buf } - } -} - -impl PartialEq for Bitmap { - fn eq(&self, other: &Self) -> bool { - // buffer equality considers capacity, but here we want to only compare - // actual data contents - let self_len = self.bits.len(); - let other_len = other.bits.len(); - if self_len != other_len { - return false; - } - self.bits.as_slice()[..self_len] == other.bits.as_slice()[..self_len] - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_bitmap_length() { - assert_eq!(64, Bitmap::new(63 * 8).len()); - assert_eq!(64, Bitmap::new(64 * 8).len()); - assert_eq!(128, Bitmap::new(65 * 8).len()); - } - - #[test] - fn test_bitwise_and() { - let bitmap1 = Bitmap::from(Buffer::from([0b01101010])); - let bitmap2 = Bitmap::from(Buffer::from([0b01001110])); - assert_eq!( - Bitmap::from(Buffer::from([0b01001010])), - (&bitmap1 & &bitmap2).unwrap() - ); - } - - #[test] - fn test_bitwise_or() { - let bitmap1 = Bitmap::from(Buffer::from([0b01101010])); - let bitmap2 = Bitmap::from(Buffer::from([0b01001110])); - assert_eq!( - Bitmap::from(Buffer::from([0b01101110])), - (&bitmap1 | &bitmap2).unwrap() - ); - } - - #[test] - fn test_bitmap_is_set() { - let bitmap = Bitmap::from(Buffer::from([0b01001010])); - assert_eq!(false, bitmap.is_set(0)); - assert_eq!(true, bitmap.is_set(1)); - assert_eq!(false, bitmap.is_set(2)); - assert_eq!(true, bitmap.is_set(3)); - assert_eq!(false, bitmap.is_set(4)); - assert_eq!(false, bitmap.is_set(5)); - assert_eq!(true, bitmap.is_set(6)); - assert_eq!(false, bitmap.is_set(7)); - } -} diff --git a/rust/arrow/src/buffer/immutable.rs b/rust/arrow/src/buffer/immutable.rs deleted file mode 100644 index cd6a2a3c130..00000000000 --- a/rust/arrow/src/buffer/immutable.rs +++ /dev/null @@ -1,541 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// 
or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::fmt::Debug; -use std::iter::FromIterator; -use std::ptr::NonNull; -use std::sync::Arc; -use std::{convert::AsRef, usize}; - -use crate::util::bit_chunk_iterator::BitChunks; -use crate::{ - bytes::{Bytes, Deallocation}, - datatypes::ArrowNativeType, - ffi, -}; - -use super::ops::bitwise_unary_op_helper; -use super::MutableBuffer; - -/// Buffer represents a contiguous memory region that can be shared with other buffers and across -/// thread boundaries. -#[derive(Clone, PartialEq, Debug)] -pub struct Buffer { - /// the internal byte buffer. - data: Arc, - - /// The offset into the buffer. - offset: usize, -} - -impl Buffer { - /// Auxiliary method to create a new Buffer - #[inline] - pub fn from_bytes(bytes: Bytes) -> Self { - Buffer { - data: Arc::new(bytes), - offset: 0, - } - } - - /// Initializes a [Buffer] from a slice of items. - pub fn from_slice_ref>(items: &T) -> Self { - let slice = items.as_ref(); - let len = slice.len(); - let mut buffer = MutableBuffer::with_capacity(len); - buffer.extend_from_slice(slice); - buffer.into() - } - - /// Creates a buffer from an existing memory region (must already be byte-aligned), this - /// `Buffer` will free this piece of memory when dropped. - /// - /// # Arguments - /// - /// * `ptr` - Pointer to raw parts - /// * `len` - Length of raw parts in **bytes** - /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` - /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. - pub unsafe fn from_raw_parts(ptr: NonNull, len: usize, capacity: usize) -> Self { - assert!(len <= capacity); - Buffer::build_with_arguments(ptr, len, Deallocation::Native(capacity)) - } - - /// Creates a buffer from an existing memory region (must already be byte-aligned), this - /// `Buffer` **does not** free this piece of memory when dropped. - /// - /// # Arguments - /// - /// * `ptr` - Pointer to raw parts - /// * `len` - Length of raw parts in **bytes** - /// * `data` - An [ffi::FFI_ArrowArray] with the data - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` - /// bytes and that the foreign deallocator frees the region. 
- pub unsafe fn from_unowned( - ptr: NonNull, - len: usize, - data: Arc, - ) -> Self { - Buffer::build_with_arguments(ptr, len, Deallocation::Foreign(data)) - } - - /// Auxiliary method to create a new Buffer - unsafe fn build_with_arguments( - ptr: NonNull, - len: usize, - deallocation: Deallocation, - ) -> Self { - let bytes = Bytes::new(ptr, len, deallocation); - Buffer { - data: Arc::new(bytes), - offset: 0, - } - } - - /// Returns the number of bytes in the buffer - pub fn len(&self) -> usize { - self.data.len() - self.offset - } - - /// Returns the capacity of this buffer. - /// For exernally owned buffers, this returns zero - pub fn capacity(&self) -> usize { - self.data.capacity() - } - - /// Returns whether the buffer is empty. - pub fn is_empty(&self) -> bool { - self.data.len() - self.offset == 0 - } - - /// Returns the byte slice stored in this buffer - pub fn as_slice(&self) -> &[u8] { - &self.data[self.offset..] - } - - /// Returns a new [Buffer] that is a slice of this buffer starting at `offset`. - /// Doing so allows the same memory region to be shared between buffers. - /// # Panics - /// Panics iff `offset` is larger than `len`. - pub fn slice(&self, offset: usize) -> Self { - assert!( - offset <= self.len(), - "the offset of the new Buffer cannot exceed the existing length" - ); - Self { - data: self.data.clone(), - offset: self.offset + offset, - } - } - - /// Returns a pointer to the start of this buffer. - /// - /// Note that this should be used cautiously, and the returned pointer should not be - /// stored anywhere, to avoid dangling pointers. - pub fn as_ptr(&self) -> *const u8 { - unsafe { self.data.ptr().as_ptr().add(self.offset) } - } - - /// View buffer as typed slice. - /// - /// # Safety - /// - /// `ArrowNativeType` is public so that it can be used as a trait bound for other public - /// components, such as the `ToByteSlice` trait. However, this means that it can be - /// implemented by user defined types, which it is not intended for. - /// - /// Also `typed_data::` is unsafe as `0x00` and `0x01` are the only valid values for - /// `bool` in Rust. However, `bool` arrays in Arrow are bit-packed which breaks this condition. - /// View buffer as typed slice. - pub unsafe fn typed_data(&self) -> &[T] { - // JUSTIFICATION - // Benefit - // Many of the buffers represent specific types, and consumers of `Buffer` often need to re-interpret them. - // Soundness - // * The pointer is non-null by construction - // * alignment asserted below. - let (prefix, offsets, suffix) = self.as_slice().align_to::(); - assert!(prefix.is_empty() && suffix.is_empty()); - offsets - } - - /// Returns a slice of this buffer starting at a certain bit offset. - /// If the offset is byte-aligned the returned buffer is a shallow clone, - /// otherwise a new buffer is allocated and filled with a copy of the bits in the range. - pub fn bit_slice(&self, offset: usize, len: usize) -> Self { - if offset % 8 == 0 && len % 8 == 0 { - return self.slice(offset / 8); - } - - bitwise_unary_op_helper(&self, offset, len, |a| a) - } - - /// Returns a `BitChunks` instance which can be used to iterate over this buffers bits - /// in larger chunks and starting at arbitrary bit offsets. - /// Note that both `offset` and `length` are measured in bits. - pub fn bit_chunks(&self, offset: usize, len: usize) -> BitChunks { - BitChunks::new(&self.as_slice(), offset, len) - } - - /// Returns the number of 1-bits in this buffer. 
- pub fn count_set_bits(&self) -> usize { - let len_in_bits = self.len() * 8; - // self.offset is already taken into consideration by the bit_chunks implementation - self.count_set_bits_offset(0, len_in_bits) - } - - /// Returns the number of 1-bits in this buffer, starting from `offset` with `length` bits - /// inspected. Note that both `offset` and `length` are measured in bits. - pub fn count_set_bits_offset(&self, offset: usize, len: usize) -> usize { - let chunks = self.bit_chunks(offset, len); - let mut count = chunks.iter().map(|c| c.count_ones() as usize).sum(); - count += chunks.remainder_bits().count_ones() as usize; - - count - } -} - -/// Creating a `Buffer` instance by copying the memory from a `AsRef<[u8]>` into a newly -/// allocated memory region. -impl> From for Buffer { - fn from(p: T) -> Self { - // allocate aligned memory buffer - let slice = p.as_ref(); - let len = slice.len(); - let mut buffer = MutableBuffer::new(len); - buffer.extend_from_slice(slice); - buffer.into() - } -} - -/// Creating a `Buffer` instance by storing the boolean values into the buffer -impl std::iter::FromIterator for Buffer { - fn from_iter(iter: I) -> Self - where - I: IntoIterator, - { - MutableBuffer::from_iter(iter).into() - } -} - -impl std::ops::Deref for Buffer { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - unsafe { std::slice::from_raw_parts(self.as_ptr(), self.len()) } - } -} - -unsafe impl Sync for Buffer {} -unsafe impl Send for Buffer {} - -impl From for Buffer { - #[inline] - fn from(buffer: MutableBuffer) -> Self { - buffer.into_buffer() - } -} - -impl Buffer { - /// Creates a [`Buffer`] from an [`Iterator`] with a trusted (upper) length. - /// Prefer this to `collect` whenever possible, as it is ~60% faster. - /// # Example - /// ``` - /// # use arrow::buffer::Buffer; - /// let v = vec![1u32]; - /// let iter = v.iter().map(|x| x * 2); - /// let buffer = unsafe { Buffer::from_trusted_len_iter(iter) }; - /// assert_eq!(buffer.len(), 4) // u32 has 4 bytes - /// ``` - /// # Safety - /// This method assumes that the iterator's size is correct and is undefined behavior - /// to use it on an iterator that reports an incorrect length. - // This implementation is required for two reasons: - // 1. there is no trait `TrustedLen` in stable rust and therefore - // we can't specialize `extend` for `TrustedLen` like `Vec` does. - // 2. `from_trusted_len_iter` is faster. - #[inline] - pub unsafe fn from_trusted_len_iter>( - iterator: I, - ) -> Self { - MutableBuffer::from_trusted_len_iter(iterator).into() - } - - /// Creates a [`Buffer`] from an [`Iterator`] with a trusted (upper) length or errors - /// if any of the items of the iterator is an error. - /// Prefer this to `collect` whenever possible, as it is ~60% faster. - /// # Safety - /// This method assumes that the iterator's size is correct and is undefined behavior - /// to use it on an iterator that reports an incorrect length. - #[inline] - pub unsafe fn try_from_trusted_len_iter< - E, - T: ArrowNativeType, - I: Iterator>, - >( - iterator: I, - ) -> std::result::Result { - Ok(MutableBuffer::try_from_trusted_len_iter(iterator)?.into()) - } -} - -impl FromIterator for Buffer { - fn from_iter>(iter: I) -> Self { - let mut iterator = iter.into_iter(); - let size = std::mem::size_of::(); - - // first iteration, which will likely reserve sufficient space for the buffer. 
- let mut buffer = match iterator.next() { - None => MutableBuffer::new(0), - Some(element) => { - let (lower, _) = iterator.size_hint(); - let mut buffer = MutableBuffer::new(lower.saturating_add(1) * size); - unsafe { - std::ptr::write(buffer.as_mut_ptr() as *mut T, element); - buffer.set_len(size); - } - buffer - } - }; - - buffer.extend_from_iter(iterator); - buffer.into() - } -} - -#[cfg(test)] -mod tests { - use std::thread; - - use super::*; - - #[test] - fn test_buffer_data_equality() { - let buf1 = Buffer::from(&[0, 1, 2, 3, 4]); - let buf2 = Buffer::from(&[0, 1, 2, 3, 4]); - assert_eq!(buf1, buf2); - - // slice with same offset should still preserve equality - let buf3 = buf1.slice(2); - assert_ne!(buf1, buf3); - let buf4 = buf2.slice(2); - assert_eq!(buf3, buf4); - - // Different capacities should still preserve equality - let mut buf2 = MutableBuffer::new(65); - buf2.extend_from_slice(&[0u8, 1, 2, 3, 4]); - - let buf2 = buf2.into(); - assert_eq!(buf1, buf2); - - // unequal because of different elements - let buf2 = Buffer::from(&[0, 0, 2, 3, 4]); - assert_ne!(buf1, buf2); - - // unequal because of different length - let buf2 = Buffer::from(&[0, 1, 2, 3]); - assert_ne!(buf1, buf2); - } - - #[test] - fn test_from_raw_parts() { - let buf = Buffer::from(&[0, 1, 2, 3, 4]); - assert_eq!(5, buf.len()); - assert!(!buf.as_ptr().is_null()); - assert_eq!([0, 1, 2, 3, 4], buf.as_slice()); - } - - #[test] - fn test_from_vec() { - let buf = Buffer::from(&[0, 1, 2, 3, 4]); - assert_eq!(5, buf.len()); - assert!(!buf.as_ptr().is_null()); - assert_eq!([0, 1, 2, 3, 4], buf.as_slice()); - } - - #[test] - fn test_copy() { - let buf = Buffer::from(&[0, 1, 2, 3, 4]); - let buf2 = buf; - assert_eq!(5, buf2.len()); - assert_eq!(64, buf2.capacity()); - assert!(!buf2.as_ptr().is_null()); - assert_eq!([0, 1, 2, 3, 4], buf2.as_slice()); - } - - #[test] - fn test_slice() { - let buf = Buffer::from(&[2, 4, 6, 8, 10]); - let buf2 = buf.slice(2); - - assert_eq!([6, 8, 10], buf2.as_slice()); - assert_eq!(3, buf2.len()); - assert_eq!(unsafe { buf.as_ptr().offset(2) }, buf2.as_ptr()); - - let buf3 = buf2.slice(1); - assert_eq!([8, 10], buf3.as_slice()); - assert_eq!(2, buf3.len()); - assert_eq!(unsafe { buf.as_ptr().offset(3) }, buf3.as_ptr()); - - let buf4 = buf.slice(5); - let empty_slice: [u8; 0] = []; - assert_eq!(empty_slice, buf4.as_slice()); - assert_eq!(0, buf4.len()); - assert!(buf4.is_empty()); - assert_eq!(buf2.slice(2).as_slice(), &[10]); - } - - #[test] - #[should_panic( - expected = "the offset of the new Buffer cannot exceed the existing length" - )] - fn test_slice_offset_out_of_bound() { - let buf = Buffer::from(&[2, 4, 6, 8, 10]); - buf.slice(6); - } - - #[test] - fn test_access_concurrently() { - let buffer = Buffer::from(vec![1, 2, 3, 4, 5]); - let buffer2 = buffer.clone(); - assert_eq!([1, 2, 3, 4, 5], buffer.as_slice()); - - let buffer_copy = thread::spawn(move || { - // access buffer in another thread. - buffer - }) - .join(); - - assert!(buffer_copy.is_ok()); - assert_eq!(buffer2, buffer_copy.ok().unwrap()); - } - - macro_rules! 
check_as_typed_data { - ($input: expr, $native_t: ty) => {{ - let buffer = Buffer::from_slice_ref($input); - let slice: &[$native_t] = unsafe { buffer.typed_data::<$native_t>() }; - assert_eq!($input, slice); - }}; - } - - #[test] - #[allow(clippy::float_cmp)] - fn test_as_typed_data() { - check_as_typed_data!(&[1i8, 3i8, 6i8], i8); - check_as_typed_data!(&[1u8, 3u8, 6u8], u8); - check_as_typed_data!(&[1i16, 3i16, 6i16], i16); - check_as_typed_data!(&[1i32, 3i32, 6i32], i32); - check_as_typed_data!(&[1i64, 3i64, 6i64], i64); - check_as_typed_data!(&[1u16, 3u16, 6u16], u16); - check_as_typed_data!(&[1u32, 3u32, 6u32], u32); - check_as_typed_data!(&[1u64, 3u64, 6u64], u64); - check_as_typed_data!(&[1f32, 3f32, 6f32], f32); - check_as_typed_data!(&[1f64, 3f64, 6f64], f64); - } - - #[test] - fn test_count_bits() { - assert_eq!(0, Buffer::from(&[0b00000000]).count_set_bits()); - assert_eq!(8, Buffer::from(&[0b11111111]).count_set_bits()); - assert_eq!(3, Buffer::from(&[0b00001101]).count_set_bits()); - assert_eq!(6, Buffer::from(&[0b01001001, 0b01010010]).count_set_bits()); - assert_eq!(16, Buffer::from(&[0b11111111, 0b11111111]).count_set_bits()); - } - - #[test] - fn test_count_bits_slice() { - assert_eq!( - 0, - Buffer::from(&[0b11111111, 0b00000000]) - .slice(1) - .count_set_bits() - ); - assert_eq!( - 8, - Buffer::from(&[0b11111111, 0b11111111]) - .slice(1) - .count_set_bits() - ); - assert_eq!( - 3, - Buffer::from(&[0b11111111, 0b11111111, 0b00001101]) - .slice(2) - .count_set_bits() - ); - assert_eq!( - 6, - Buffer::from(&[0b11111111, 0b01001001, 0b01010010]) - .slice(1) - .count_set_bits() - ); - assert_eq!( - 16, - Buffer::from(&[0b11111111, 0b11111111, 0b11111111, 0b11111111]) - .slice(2) - .count_set_bits() - ); - } - - #[test] - fn test_count_bits_offset_slice() { - assert_eq!(8, Buffer::from(&[0b11111111]).count_set_bits_offset(0, 8)); - assert_eq!(3, Buffer::from(&[0b11111111]).count_set_bits_offset(0, 3)); - assert_eq!(5, Buffer::from(&[0b11111111]).count_set_bits_offset(3, 5)); - assert_eq!(1, Buffer::from(&[0b11111111]).count_set_bits_offset(3, 1)); - assert_eq!(0, Buffer::from(&[0b11111111]).count_set_bits_offset(8, 0)); - assert_eq!(2, Buffer::from(&[0b01010101]).count_set_bits_offset(0, 3)); - assert_eq!( - 16, - Buffer::from(&[0b11111111, 0b11111111]).count_set_bits_offset(0, 16) - ); - assert_eq!( - 10, - Buffer::from(&[0b11111111, 0b11111111]).count_set_bits_offset(0, 10) - ); - assert_eq!( - 10, - Buffer::from(&[0b11111111, 0b11111111]).count_set_bits_offset(3, 10) - ); - assert_eq!( - 8, - Buffer::from(&[0b11111111, 0b11111111]).count_set_bits_offset(8, 8) - ); - assert_eq!( - 5, - Buffer::from(&[0b11111111, 0b11111111]).count_set_bits_offset(11, 5) - ); - assert_eq!( - 0, - Buffer::from(&[0b11111111, 0b11111111]).count_set_bits_offset(16, 0) - ); - assert_eq!( - 2, - Buffer::from(&[0b01101101, 0b10101010]).count_set_bits_offset(7, 5) - ); - assert_eq!( - 4, - Buffer::from(&[0b01101101, 0b10101010]).count_set_bits_offset(7, 9) - ); - } -} diff --git a/rust/arrow/src/buffer/mod.rs b/rust/arrow/src/buffer/mod.rs deleted file mode 100644 index cc5c63b1c37..00000000000 --- a/rust/arrow/src/buffer/mod.rs +++ /dev/null @@ -1,69 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This module contains two main structs: [Buffer] and [MutableBuffer]. A buffer represents -//! a contiguous memory region that can be shared via `offsets`. - -mod immutable; -pub use immutable::*; -mod mutable; -pub use mutable::*; -mod ops; -pub(super) use ops::*; - -use crate::error::{ArrowError, Result}; -use std::ops::{BitAnd, BitOr, Not}; - -impl<'a, 'b> BitAnd<&'b Buffer> for &'a Buffer { - type Output = Result; - - fn bitand(self, rhs: &'b Buffer) -> Result { - if self.len() != rhs.len() { - return Err(ArrowError::ComputeError( - "Buffers must be the same size to apply Bitwise AND.".to_string(), - )); - } - - let len_in_bits = self.len() * 8; - Ok(buffer_bin_and(&self, 0, &rhs, 0, len_in_bits)) - } -} - -impl<'a, 'b> BitOr<&'b Buffer> for &'a Buffer { - type Output = Result; - - fn bitor(self, rhs: &'b Buffer) -> Result { - if self.len() != rhs.len() { - return Err(ArrowError::ComputeError( - "Buffers must be the same size to apply Bitwise OR.".to_string(), - )); - } - - let len_in_bits = self.len() * 8; - - Ok(buffer_bin_or(&self, 0, &rhs, 0, len_in_bits)) - } -} - -impl Not for &Buffer { - type Output = Buffer; - - fn not(self) -> Buffer { - let len_in_bits = self.len() * 8; - buffer_unary_not(&self, 0, len_in_bits) - } -} diff --git a/rust/arrow/src/buffer/mutable.rs b/rust/arrow/src/buffer/mutable.rs deleted file mode 100644 index d7fd5b9d200..00000000000 --- a/rust/arrow/src/buffer/mutable.rs +++ /dev/null @@ -1,749 +0,0 @@ -use std::ptr::NonNull; - -use crate::{ - alloc, - bytes::{Bytes, Deallocation}, - datatypes::{ArrowNativeType, ToByteSlice}, - util::bit_util, -}; - -use super::Buffer; - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -/// A [`MutableBuffer`] is Arrow's interface to build a [`Buffer`] out of items or slices of items. -/// [`Buffer`]s created from [`MutableBuffer`] (via `into`) are guaranteed to have its pointer aligned -/// along cache lines and in multiple of 64 bytes. -/// Use [MutableBuffer::push] to insert an item, [MutableBuffer::extend_from_slice] -/// to insert many items, and `into` to convert it to [`Buffer`]. 
-/// # Example -/// ``` -/// # use arrow::buffer::{Buffer, MutableBuffer}; -/// let mut buffer = MutableBuffer::new(0); -/// buffer.push(256u32); -/// buffer.extend_from_slice(&[1u32]); -/// let buffer: Buffer = buffer.into(); -/// assert_eq!(buffer.as_slice(), &[0u8, 1, 0, 0, 1, 0, 0, 0]) -/// ``` -#[derive(Debug)] -pub struct MutableBuffer { - // dangling iff capacity = 0 - data: NonNull, - // invariant: len <= capacity - len: usize, - capacity: usize, -} - -impl MutableBuffer { - /// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`. - #[inline] - pub fn new(capacity: usize) -> Self { - Self::with_capacity(capacity) - } - - /// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`. - #[inline] - pub fn with_capacity(capacity: usize) -> Self { - let capacity = bit_util::round_upto_multiple_of_64(capacity); - let ptr = alloc::allocate_aligned(capacity); - Self { - data: ptr, - len: 0, - capacity, - } - } - - /// Allocates a new [MutableBuffer] with `len` and capacity to be at least `len` where - /// all bytes are guaranteed to be `0u8`. - /// # Example - /// ``` - /// # use arrow::buffer::{Buffer, MutableBuffer}; - /// let mut buffer = MutableBuffer::from_len_zeroed(127); - /// assert_eq!(buffer.len(), 127); - /// assert!(buffer.capacity() >= 127); - /// let data = buffer.as_slice_mut(); - /// assert_eq!(data[126], 0u8); - /// ``` - pub fn from_len_zeroed(len: usize) -> Self { - let new_capacity = bit_util::round_upto_multiple_of_64(len); - let ptr = alloc::allocate_aligned_zeroed(new_capacity); - Self { - data: ptr, - len, - capacity: new_capacity, - } - } - - /// creates a new [MutableBuffer] with capacity and length capable of holding `len` bits. - /// This is useful to create a buffer for packed bitmaps. - pub fn new_null(len: usize) -> Self { - let num_bytes = bit_util::ceil(len, 8); - MutableBuffer::from_len_zeroed(num_bytes) - } - - /// Set the bits in the range of `[0, end)` to 0 (if `val` is false), or 1 (if `val` - /// is true). Also extend the length of this buffer to be `end`. - /// - /// This is useful when one wants to clear (or set) the bits and then manipulate - /// the buffer directly (e.g., modifying the buffer by holding a mutable reference - /// from `data_mut()`). - pub fn with_bitset(mut self, end: usize, val: bool) -> Self { - assert!(end <= self.capacity); - let v = if val { 255 } else { 0 }; - unsafe { - std::ptr::write_bytes(self.data.as_ptr(), v, end); - self.len = end; - } - self - } - - /// Ensure that `count` bytes from `start` contain zero bits - /// - /// This is used to initialize the bits in a buffer, however, it has no impact on the - /// `len` of the buffer and so can be used to initialize the memory region from - /// `len` to `capacity`. - pub fn set_null_bits(&mut self, start: usize, count: usize) { - assert!(start + count <= self.capacity); - unsafe { - std::ptr::write_bytes(self.data.as_ptr().add(start), 0, count); - } - } - - /// Ensures that this buffer has at least `self.len + additional` bytes. This re-allocates iff - /// `self.len + additional > capacity`. 
- /// # Example - /// ``` - /// # use arrow::buffer::{Buffer, MutableBuffer}; - /// let mut buffer = MutableBuffer::new(0); - /// buffer.reserve(253); // allocates for the first time - /// (0..253u8).for_each(|i| buffer.push(i)); // no reallocation - /// let buffer: Buffer = buffer.into(); - /// assert_eq!(buffer.len(), 253); - /// ``` - // For performance reasons, this must be inlined so that the `if` is executed inside the caller, and not as an extra call that just - // exits. - #[inline(always)] - pub fn reserve(&mut self, additional: usize) { - let required_cap = self.len + additional; - if required_cap > self.capacity { - // JUSTIFICATION - // Benefit - // necessity - // Soundness - // `self.data` is valid for `self.capacity`. - let (ptr, new_capacity) = - unsafe { reallocate(self.data, self.capacity, required_cap) }; - self.data = ptr; - self.capacity = new_capacity; - } - } - - /// Resizes the buffer, either truncating its contents (with no change in capacity), or - /// growing it (potentially reallocating it) and writing `value` in the newly available bytes. - /// # Example - /// ``` - /// # use arrow::buffer::{Buffer, MutableBuffer}; - /// let mut buffer = MutableBuffer::new(0); - /// buffer.resize(253, 2); // allocates for the first time - /// assert_eq!(buffer.as_slice()[252], 2u8); - /// ``` - // For performance reasons, this must be inlined so that the `if` is executed inside the caller, and not as an extra call that just - // exits. - #[inline(always)] - pub fn resize(&mut self, new_len: usize, value: u8) { - if new_len > self.len { - let diff = new_len - self.len; - self.reserve(diff); - // write the value - unsafe { self.data.as_ptr().add(self.len).write_bytes(value, diff) }; - } - // this truncates the buffer when new_len < self.len - self.len = new_len; - } - - /// Returns whether this buffer is empty or not. - #[inline] - pub const fn is_empty(&self) -> bool { - self.len == 0 - } - - /// Returns the length (the number of bytes written) in this buffer. - /// The invariant `buffer.len() <= buffer.capacity()` is always upheld. - #[inline] - pub const fn len(&self) -> usize { - self.len - } - - /// Returns the total capacity in this buffer. - /// The invariant `buffer.len() <= buffer.capacity()` is always upheld. - #[inline] - pub const fn capacity(&self) -> usize { - self.capacity - } - - /// Clear all existing data from this buffer. - pub fn clear(&mut self) { - self.len = 0 - } - - /// Returns the data stored in this buffer as a slice. - pub fn as_slice(&self) -> &[u8] { - self - } - - /// Returns the data stored in this buffer as a mutable slice. - pub fn as_slice_mut(&mut self) -> &mut [u8] { - self - } - - /// Returns a raw pointer to this buffer's internal memory - /// This pointer is guaranteed to be aligned along cache-lines. - #[inline] - pub const fn as_ptr(&self) -> *const u8 { - self.data.as_ptr() - } - - /// Returns a mutable raw pointer to this buffer's internal memory - /// This pointer is guaranteed to be aligned along cache-lines. - #[inline] - pub fn as_mut_ptr(&mut self) -> *mut u8 { - self.data.as_ptr() - } - - #[deprecated( - since = "2.0.0", - note = "This method is deprecated in favour of `into` from the trait `Into`." - )] - /// Freezes this buffer and return an immutable version of it. 
- pub fn freeze(self) -> Buffer { - self.into_buffer() - } - - #[inline] - pub(super) fn into_buffer(self) -> Buffer { - let bytes = unsafe { - Bytes::new(self.data, self.len, Deallocation::Native(self.capacity)) - }; - std::mem::forget(self); - Buffer::from_bytes(bytes) - } - - /// View this buffer asa slice of a specific type. - /// # Safety - /// This function must only be used when this buffer was extended with items of type `T`. - /// Failure to do so results in undefined behavior. - pub fn typed_data_mut(&mut self) -> &mut [T] { - unsafe { - let (prefix, offsets, suffix) = self.as_slice_mut().align_to_mut::(); - assert!(prefix.is_empty() && suffix.is_empty()); - offsets - } - } - - /// Extends this buffer from a slice of items that can be represented in bytes, increasing its capacity if needed. - /// # Example - /// ``` - /// # use arrow::buffer::MutableBuffer; - /// let mut buffer = MutableBuffer::new(0); - /// buffer.extend_from_slice(&[2u32, 0]); - /// assert_eq!(buffer.len(), 8) // u32 has 4 bytes - /// ``` - #[inline] - pub fn extend_from_slice(&mut self, items: &[T]) { - let len = items.len(); - let additional = len * std::mem::size_of::(); - self.reserve(additional); - unsafe { - let dst = self.data.as_ptr().add(self.len); - let src = items.as_ptr() as *const u8; - std::ptr::copy_nonoverlapping(src, dst, additional) - } - self.len += additional; - } - - /// Extends the buffer with a new item, increasing its capacity if needed. - /// # Example - /// ``` - /// # use arrow::buffer::MutableBuffer; - /// let mut buffer = MutableBuffer::new(0); - /// buffer.push(256u32); - /// assert_eq!(buffer.len(), 4) // u32 has 4 bytes - /// ``` - #[inline] - pub fn push(&mut self, item: T) { - let additional = std::mem::size_of::(); - self.reserve(additional); - unsafe { - let dst = self.data.as_ptr().add(self.len) as *mut T; - std::ptr::write(dst, item); - } - self.len += additional; - } - - /// Extends the buffer with a new item, without checking for sufficient capacity - /// Safety - /// Caller must ensure that the capacity()-len()>=size_of() - #[inline] - unsafe fn push_unchecked(&mut self, item: T) { - let additional = std::mem::size_of::(); - let dst = self.data.as_ptr().add(self.len) as *mut T; - std::ptr::write(dst, item); - self.len += additional; - } - - /// Extends the buffer by `additional` bytes equal to `0u8`, incrementing its capacity if needed. - #[inline] - pub fn extend_zeros(&mut self, additional: usize) { - self.resize(self.len + additional, 0); - } - - /// # Safety - /// The caller must ensure that the buffer was properly initialized up to `len`. - #[inline] - pub(crate) unsafe fn set_len(&mut self, len: usize) { - assert!(len <= self.capacity()); - self.len = len; - } -} - -/// # Safety -/// `ptr` must be allocated for `old_capacity`. 
-#[inline] -unsafe fn reallocate( - ptr: NonNull, - old_capacity: usize, - new_capacity: usize, -) -> (NonNull, usize) { - let new_capacity = bit_util::round_upto_multiple_of_64(new_capacity); - let new_capacity = std::cmp::max(new_capacity, old_capacity * 2); - let ptr = alloc::reallocate(ptr, old_capacity, new_capacity); - (ptr, new_capacity) -} - -impl Extend for MutableBuffer { - #[inline] - fn extend>(&mut self, iter: T) { - let iterator = iter.into_iter(); - self.extend_from_iter(iterator) - } -} - -impl MutableBuffer { - #[inline] - pub(super) fn extend_from_iter>( - &mut self, - mut iterator: I, - ) { - let size = std::mem::size_of::(); - let (lower, _) = iterator.size_hint(); - let additional = lower * size; - self.reserve(additional); - - // this is necessary because of https://github.com/rust-lang/rust/issues/32155 - let mut len = SetLenOnDrop::new(&mut self.len); - let mut dst = unsafe { self.data.as_ptr().add(len.local_len) as *mut T }; - let capacity = self.capacity; - - while len.local_len + size <= capacity { - if let Some(item) = iterator.next() { - unsafe { - std::ptr::write(dst, item); - dst = dst.add(1); - } - len.local_len += size; - } else { - break; - } - } - drop(len); - - iterator.for_each(|item| self.push(item)); - } - - /// Creates a [`MutableBuffer`] from an [`Iterator`] with a trusted (upper) length. - /// Prefer this to `collect` whenever possible, as it is faster ~60% faster. - /// # Example - /// ``` - /// # use arrow::buffer::MutableBuffer; - /// let v = vec![1u32]; - /// let iter = v.iter().map(|x| x * 2); - /// let buffer = unsafe { MutableBuffer::from_trusted_len_iter(iter) }; - /// assert_eq!(buffer.len(), 4) // u32 has 4 bytes - /// ``` - /// # Safety - /// This method assumes that the iterator's size is correct and is undefined behavior - /// to use it on an iterator that reports an incorrect length. - // This implementation is required for two reasons: - // 1. there is no trait `TrustedLen` in stable rust and therefore - // we can't specialize `extend` for `TrustedLen` like `Vec` does. - // 2. `from_trusted_len_iter` is faster. - #[inline] - pub unsafe fn from_trusted_len_iter>( - iterator: I, - ) -> Self { - let (_, upper) = iterator.size_hint(); - let upper = upper.expect("from_trusted_len_iter requires an upper limit"); - let len = upper * std::mem::size_of::(); - - let mut buffer = MutableBuffer::new(len); - - let mut dst = buffer.data.as_ptr() as *mut T; - for item in iterator { - // note how there is no reserve here (compared with `extend_from_iter`) - std::ptr::write(dst, item); - dst = dst.add(1); - } - assert_eq!( - dst.offset_from(buffer.data.as_ptr() as *mut T) as usize, - upper, - "Trusted iterator length was not accurately reported" - ); - buffer.len = len; - buffer - } - - /// Creates a [`MutableBuffer`] from a boolean [`Iterator`] with a trusted (upper) length. - /// # use arrow::buffer::MutableBuffer; - /// # Example - /// ``` - /// # use arrow::buffer::MutableBuffer; - /// let v = vec![false, true, false]; - /// let iter = v.iter().map(|x| *x || true); - /// let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(iter) }; - /// assert_eq!(buffer.len(), 1) // 3 booleans have 1 byte - /// ``` - /// # Safety - /// This method assumes that the iterator's size is correct and is undefined behavior - /// to use it on an iterator that reports an incorrect length. - // This implementation is required for two reasons: - // 1. 
there is no trait `TrustedLen` in stable rust and therefore - // we can't specialize `extend` for `TrustedLen` like `Vec` does. - // 2. `from_trusted_len_iter_bool` is faster. - #[inline] - pub unsafe fn from_trusted_len_iter_bool>( - mut iterator: I, - ) -> Self { - let (_, upper) = iterator.size_hint(); - let upper = upper.expect("from_trusted_len_iter requires an upper limit"); - - let mut result = { - let byte_capacity: usize = upper.saturating_add(7) / 8; - MutableBuffer::new(byte_capacity) - }; - - 'a: loop { - let mut byte_accum: u8 = 0; - let mut mask: u8 = 1; - - //collect (up to) 8 bits into a byte - while mask != 0 { - if let Some(value) = iterator.next() { - byte_accum |= match value { - true => mask, - false => 0, - }; - mask <<= 1; - } else { - if mask != 1 { - // Add last byte - result.push_unchecked(byte_accum); - } - break 'a; - } - } - - // Soundness: from_trusted_len - result.push_unchecked(byte_accum); - } - result - } - - /// Creates a [`MutableBuffer`] from an [`Iterator`] with a trusted (upper) length or errors - /// if any of the items of the iterator is an error. - /// Prefer this to `collect` whenever possible, as it is faster ~60% faster. - /// # Safety - /// This method assumes that the iterator's size is correct and is undefined behavior - /// to use it on an iterator that reports an incorrect length. - #[inline] - pub unsafe fn try_from_trusted_len_iter< - E, - T: ArrowNativeType, - I: Iterator>, - >( - iterator: I, - ) -> std::result::Result { - let (_, upper) = iterator.size_hint(); - let upper = upper.expect("try_from_trusted_len_iter requires an upper limit"); - let len = upper * std::mem::size_of::(); - - let mut buffer = MutableBuffer::new(len); - - let mut dst = buffer.data.as_ptr() as *mut T; - for item in iterator { - // note how there is no reserve here (compared with `extend_from_iter`) - std::ptr::write(dst, item?); - dst = dst.add(1); - } - assert_eq!( - dst.offset_from(buffer.data.as_ptr() as *mut T) as usize, - upper, - "Trusted iterator length was not accurately reported" - ); - buffer.len = len; - Ok(buffer) - } -} - -impl std::ops::Deref for MutableBuffer { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - unsafe { std::slice::from_raw_parts(self.as_ptr(), self.len) } - } -} - -impl std::ops::DerefMut for MutableBuffer { - fn deref_mut(&mut self) -> &mut [u8] { - unsafe { std::slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) } - } -} - -impl Drop for MutableBuffer { - fn drop(&mut self) { - unsafe { alloc::free_aligned(self.data, self.capacity) }; - } -} - -impl PartialEq for MutableBuffer { - fn eq(&self, other: &MutableBuffer) -> bool { - if self.len != other.len { - return false; - } - if self.capacity != other.capacity { - return false; - } - self.as_slice() == other.as_slice() - } -} - -unsafe impl Sync for MutableBuffer {} -unsafe impl Send for MutableBuffer {} - -struct SetLenOnDrop<'a> { - len: &'a mut usize, - local_len: usize, -} - -impl<'a> SetLenOnDrop<'a> { - #[inline] - fn new(len: &'a mut usize) -> Self { - SetLenOnDrop { - local_len: *len, - len, - } - } -} - -impl Drop for SetLenOnDrop<'_> { - #[inline] - fn drop(&mut self) { - *self.len = self.local_len; - } -} - -/// Creating a `MutableBuffer` instance by setting bits according to the boolean values -impl std::iter::FromIterator for MutableBuffer { - fn from_iter(iter: I) -> Self - where - I: IntoIterator, - { - let mut iterator = iter.into_iter(); - let mut result = { - let byte_capacity: usize = iterator.size_hint().0.saturating_add(7) / 8; - 
MutableBuffer::new(byte_capacity) - }; - - loop { - let mut exhausted = false; - let mut byte_accum: u8 = 0; - let mut mask: u8 = 1; - - //collect (up to) 8 bits into a byte - while mask != 0 { - if let Some(value) = iterator.next() { - byte_accum |= match value { - true => mask, - false => 0, - }; - mask <<= 1; - } else { - exhausted = true; - break; - } - } - - // break if the iterator was exhausted before it provided a bool for this byte - if exhausted && mask == 1 { - break; - } - - //ensure we have capacity to write the byte - if result.len() == result.capacity() { - //no capacity for new byte, allocate 1 byte more (plus however many more the iterator advertises) - let additional_byte_capacity = 1usize.saturating_add( - iterator.size_hint().0.saturating_add(7) / 8, //convert bit count to byte count, rounding up - ); - result.reserve(additional_byte_capacity) - } - - // Soundness: capacity was allocated above - unsafe { result.push_unchecked(byte_accum) }; - if exhausted { - break; - } - } - result - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_mutable_new() { - let buf = MutableBuffer::new(63); - assert_eq!(64, buf.capacity()); - assert_eq!(0, buf.len()); - assert!(buf.is_empty()); - } - - #[test] - fn test_mutable_extend_from_slice() { - let mut buf = MutableBuffer::new(100); - buf.extend_from_slice(b"hello"); - assert_eq!(5, buf.len()); - assert_eq!(b"hello", buf.as_slice()); - - buf.extend_from_slice(b" world"); - assert_eq!(11, buf.len()); - assert_eq!(b"hello world", buf.as_slice()); - - buf.clear(); - assert_eq!(0, buf.len()); - buf.extend_from_slice(b"hello arrow"); - assert_eq!(11, buf.len()); - assert_eq!(b"hello arrow", buf.as_slice()); - } - - #[test] - fn mutable_extend_from_iter() { - let mut buf = MutableBuffer::new(0); - buf.extend(vec![1u32, 2]); - assert_eq!(8, buf.len()); - assert_eq!(&[1u8, 0, 0, 0, 2, 0, 0, 0], buf.as_slice()); - - buf.extend(vec![3u32, 4]); - assert_eq!(16, buf.len()); - assert_eq!( - &[1u8, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0], - buf.as_slice() - ); - } - - #[test] - fn test_from_trusted_len_iter() { - let iter = vec![1u32, 2].into_iter(); - let buf = unsafe { Buffer::from_trusted_len_iter(iter) }; - assert_eq!(8, buf.len()); - assert_eq!(&[1u8, 0, 0, 0, 2, 0, 0, 0], buf.as_slice()); - } - - #[test] - fn test_mutable_reserve() { - let mut buf = MutableBuffer::new(1); - assert_eq!(64, buf.capacity()); - - // Reserving a smaller capacity should have no effect. 
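Both `from_trusted_len_iter_bool` and the `FromIterator<bool>` impl above pack booleans into bytes the same way: accumulate up to eight bits into `byte_accum` with a shifting `mask`, push the completed byte, and emit a trailing partial byte only if at least one bit was collected. Below is a standalone sketch of that packing loop over a plain `Vec<u8>` (the name `pack_bits` is hypothetical, not part of the crate); the `reserve` test that the comment above introduces continues right after it.

```rust
/// Pack booleans LSB-first into bytes, mirroring the mask/accumulator loop above.
fn pack_bits<I: Iterator<Item = bool>>(mut bits: I) -> Vec<u8> {
    let mut out = Vec::new();
    loop {
        let mut byte_accum: u8 = 0;
        let mut mask: u8 = 1;
        while mask != 0 {
            match bits.next() {
                Some(true) => byte_accum |= mask,
                Some(false) => {}
                None => {
                    // Emit a partial trailing byte only if at least one bit was collected.
                    if mask != 1 {
                        out.push(byte_accum);
                    }
                    return out;
                }
            }
            mask <<= 1;
        }
        out.push(byte_accum); // a full byte of 8 bits
    }
}

fn main() {
    // 10 bits pack into 2 bytes; the first byte is 0b0000_0101 (bits 0 and 2 set).
    let bits = [true, false, true, false, false, false, false, false, true, false];
    assert_eq!(pack_bits(bits.iter().copied()), vec![0b0000_0101, 0b0000_0001]);
}
```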
- buf.reserve(10); - assert_eq!(64, buf.capacity()); - - buf.reserve(80); - assert_eq!(128, buf.capacity()); - - buf.reserve(129); - assert_eq!(256, buf.capacity()); - } - - #[test] - fn test_mutable_resize() { - let mut buf = MutableBuffer::new(1); - assert_eq!(64, buf.capacity()); - assert_eq!(0, buf.len()); - - buf.resize(20, 0); - assert_eq!(64, buf.capacity()); - assert_eq!(20, buf.len()); - - buf.resize(10, 0); - assert_eq!(64, buf.capacity()); - assert_eq!(10, buf.len()); - - buf.resize(100, 0); - assert_eq!(128, buf.capacity()); - assert_eq!(100, buf.len()); - - buf.resize(30, 0); - assert_eq!(128, buf.capacity()); - assert_eq!(30, buf.len()); - - buf.resize(0, 0); - assert_eq!(128, buf.capacity()); - assert_eq!(0, buf.len()); - } - - #[test] - fn test_mutable_into() { - let mut buf = MutableBuffer::new(1); - buf.extend_from_slice(b"aaaa bbbb cccc dddd"); - assert_eq!(19, buf.len()); - assert_eq!(64, buf.capacity()); - assert_eq!(b"aaaa bbbb cccc dddd", buf.as_slice()); - - let immutable_buf: Buffer = buf.into(); - assert_eq!(19, immutable_buf.len()); - assert_eq!(64, immutable_buf.capacity()); - assert_eq!(b"aaaa bbbb cccc dddd", immutable_buf.as_slice()); - } - - #[test] - fn test_mutable_equal() { - let mut buf = MutableBuffer::new(1); - let mut buf2 = MutableBuffer::new(1); - - buf.extend_from_slice(&[0xaa]); - buf2.extend_from_slice(&[0xaa, 0xbb]); - assert!(buf != buf2); - - buf.extend_from_slice(&[0xbb]); - assert_eq!(buf, buf2); - - buf2.reserve(65); - assert!(buf != buf2); - } -} diff --git a/rust/arrow/src/buffer/ops.rs b/rust/arrow/src/buffer/ops.rs deleted file mode 100644 index fbcb9510944..00000000000 --- a/rust/arrow/src/buffer/ops.rs +++ /dev/null @@ -1,429 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#[cfg(feature = "simd")] -use crate::util::bit_util; -#[cfg(feature = "simd")] -use packed_simd::u8x64; - -#[cfg(feature = "avx512")] -use crate::arch::avx512::*; -use crate::util::bit_util::ceil; -#[cfg(any(feature = "simd", feature = "avx512"))] -use std::borrow::BorrowMut; - -use super::{Buffer, MutableBuffer}; - -/// Apply a bitwise operation `simd_op` / `scalar_op` to two inputs using simd instructions and return the result as a Buffer. -/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a time -/// and the `scalar_op` gets applied to remaining bytes. -/// Contrary to the non-simd version `bitwise_bin_op_helper`, the offset and length is specified in bytes -/// and this version does not support operations starting at arbitrary bit offsets. 
-#[cfg(simd)] -pub fn bitwise_bin_op_simd_helper( - left: &Buffer, - left_offset: usize, - right: &Buffer, - right_offset: usize, - len: usize, - simd_op: F_SIMD, - scalar_op: F_SCALAR, -) -> Buffer -where - F_SIMD: Fn(u8x64, u8x64) -> u8x64, - F_SCALAR: Fn(u8, u8) -> u8, -{ - let mut result = MutableBuffer::new(len).with_bitset(len, false); - let lanes = u8x64::lanes(); - - let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes); - let mut right_chunks = right.as_slice()[right_offset..].chunks_exact(lanes); - let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| { - unsafe { bit_util::bitwise_bin_op_simd(&left, &right, res, &simd_op) }; - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = scalar_op(*left, *right); - }); - - result.into() -} - -/// Apply a bitwise operation `simd_op` / `scalar_op` to one input using simd instructions and return the result as a Buffer. -/// The `simd_op` functions gets applied on chunks of 64 bytes (512 bits) at a time -/// and the `scalar_op` gets applied to remaining bytes. -/// Contrary to the non-simd version `bitwise_unary_op_helper`, the offset and length is specified in bytes -/// and this version does not support operations starting at arbitrary bit offsets. -#[cfg(simd)] -pub fn bitwise_unary_op_simd_helper( - left: &Buffer, - left_offset: usize, - len: usize, - simd_op: F_SIMD, - scalar_op: F_SCALAR, -) -> Buffer -where - F_SIMD: Fn(u8x64) -> u8x64, - F_SCALAR: Fn(u8) -> u8, -{ - let mut result = MutableBuffer::new(len).with_bitset(len, false); - let lanes = u8x64::lanes(); - - let mut left_chunks = left.as_slice()[left_offset..].chunks_exact(lanes); - let mut result_chunks = result.as_slice_mut().chunks_exact_mut(lanes); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut()) - .for_each(|(res, left)| unsafe { - let data_simd = u8x64::from_slice_unaligned_unchecked(left); - let simd_result = simd_op(data_simd); - simd_result.write_to_slice_unaligned_unchecked(res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip(left_chunks.remainder().iter()) - .for_each(|(res, left)| { - *res = scalar_op(*left); - }); - - result.into() -} - -/// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. -/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. 
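`bitwise_bin_op_helper`, whose doc comment closes the hunk above and whose body begins right below, walks the two packed bitmaps in 64-bit chunks and then patches up the leftover bits from `remainder_bits`. As a standalone illustration of that chunk-plus-remainder technique (not the arrow API itself; the helper name `bitmap_and` is hypothetical), a minimal sketch that ANDs two byte-packed bitmaps:

```rust
use std::convert::TryInto;

/// AND two bit-packed bitmaps of equal byte length, processing 8 bytes (one u64) at a time.
fn bitmap_and(left: &[u8], right: &[u8]) -> Vec<u8> {
    assert_eq!(left.len(), right.len());
    let mut out = Vec::with_capacity(left.len());

    let mut l_chunks = left.chunks_exact(8);
    let mut r_chunks = right.chunks_exact(8);
    for (l, r) in l_chunks.by_ref().zip(r_chunks.by_ref()) {
        let l = u64::from_le_bytes(l.try_into().unwrap());
        let r = u64::from_le_bytes(r.try_into().unwrap());
        out.extend_from_slice(&(l & r).to_le_bytes());
    }
    // Remainder: fewer than 8 bytes left, fall back to per-byte operations.
    for (l, r) in l_chunks.remainder().iter().zip(r_chunks.remainder()) {
        out.push(l & r);
    }
    out
}

fn main() {
    let a = vec![0b1010_1010u8; 11];
    let b = vec![0b1100_1100u8; 11];
    assert!(bitmap_and(&a, &b).iter().all(|&x| x == 0b1000_1000));
}
```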
-pub fn bitwise_bin_op_helper( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, - op: F, -) -> Buffer -where - F: Fn(u64, u64) -> u64, -{ - let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits); - let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits); - - let chunks = left_chunks - .iter() - .zip(right_chunks.iter()) - .map(|(left, right)| op(left, right)); - // Soundness: `BitChunks` is a trusted len iterator - let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; - - let remainder_bytes = ceil(left_chunks.remainder_len(), 8); - let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); - // we are counting its starting from the least significant bit, to to_le_bytes should be correct - let rem = &rem.to_le_bytes()[0..remainder_bytes]; - buffer.extend_from_slice(rem); - - buffer.into() -} - -/// Apply a bitwise operation `op` to one input and return the result as a Buffer. -/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. -pub fn bitwise_unary_op_helper( - left: &Buffer, - offset_in_bits: usize, - len_in_bits: usize, - op: F, -) -> Buffer -where - F: Fn(u64) -> u64, -{ - // reserve capacity and set length so we can get a typed view of u64 chunks - let mut result = - MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false); - - let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits); - let result_chunks = result.typed_data_mut::().iter_mut(); - - result_chunks - .zip(left_chunks.iter()) - .for_each(|(res, left)| { - *res = op(left); - }); - - let remainder_bytes = ceil(left_chunks.remainder_len(), 8); - let rem = op(left_chunks.remainder_bits()); - // we are counting its starting from the least significant bit, to to_le_bytes should be correct - let rem = &rem.to_le_bytes()[0..remainder_bytes]; - result.extend_from_slice(rem); - - result.into() -} - -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub fn buffer_bin_and( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - let len = len_in_bits / 8; - let left_offset = left_offset_in_bits / 8; - let right_offset = right_offset_in_bits / 8; - - let mut result = MutableBuffer::new(len).with_bitset(len, false); - - let mut left_chunks = - left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut right_chunks = - right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut result_chunks = - result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| unsafe { - avx512_bin_and(left, right, res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = *left & *right; - }); - - result.into() - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & b, - ) - } -} - -#[cfg(all(feature = "simd", not(feature = "avx512")))] -pub fn buffer_bin_and( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - 
if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - bitwise_bin_op_simd_helper( - &left, - left_offset_in_bits / 8, - &right, - right_offset_in_bits / 8, - len_in_bits / 8, - |a, b| a & b, - |a, b| a & b, - ) - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & b, - ) - } -} - -// Note: do not target specific features like x86 without considering -// other targets like wasm32, as those would fail to build -#[cfg(all(not(any(feature = "simd", feature = "avx512"))))] -pub fn buffer_bin_and( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a & b, - ) -} - -#[cfg(all(target_arch = "x86_64", feature = "avx512"))] -pub fn buffer_bin_or( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - let len = len_in_bits / 8; - let left_offset = left_offset_in_bits / 8; - let right_offset = right_offset_in_bits / 8; - - let mut result = MutableBuffer::new(len).with_bitset(len, false); - - let mut left_chunks = - left.as_slice()[left_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut right_chunks = - right.as_slice()[right_offset..].chunks_exact(AVX512_U8X64_LANES); - let mut result_chunks = - result.as_slice_mut().chunks_exact_mut(AVX512_U8X64_LANES); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(res, (left, right))| unsafe { - avx512_bin_or(left, right, res); - }); - - result_chunks - .into_remainder() - .iter_mut() - .zip( - left_chunks - .remainder() - .iter() - .zip(right_chunks.remainder().iter()), - ) - .for_each(|(res, (left, right))| { - *res = *left | *right; - }); - - result.into() - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a | b, - ) - } -} - -#[cfg(all(feature = "simd", not(feature = "avx512")))] -pub fn buffer_bin_or( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - if left_offset_in_bits % 8 == 0 - && right_offset_in_bits % 8 == 0 - && len_in_bits % 8 == 0 - { - bitwise_bin_op_simd_helper( - &left, - left_offset_in_bits / 8, - &right, - right_offset_in_bits / 8, - len_in_bits / 8, - |a, b| a | b, - |a, b| a | b, - ) - } else { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a | b, - ) - } -} - -#[cfg(all(not(any(feature = "simd", feature = "avx512"))))] -pub fn buffer_bin_or( - left: &Buffer, - left_offset_in_bits: usize, - right: &Buffer, - right_offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - bitwise_bin_op_helper( - &left, - left_offset_in_bits, - right, - right_offset_in_bits, - len_in_bits, - |a, b| a | b, - ) -} - -pub fn buffer_unary_not( - left: &Buffer, - offset_in_bits: usize, - len_in_bits: usize, -) -> Buffer { - // SIMD implementation if available and byte-aligned - #[cfg(simd)] - if offset_in_bits % 8 == 0 && len_in_bits % 8 == 0 { - return bitwise_unary_op_simd_helper( - &left, - offset_in_bits / 8, - len_in_bits / 8, - |a| !a, - |a| !a, - ); - } - // Default implementation 
- #[allow(unreachable_code)] - { - bitwise_unary_op_helper(&left, offset_in_bits, len_in_bits, |a| !a) - } -} diff --git a/rust/arrow/src/bytes.rs b/rust/arrow/src/bytes.rs deleted file mode 100644 index 38fa4439b42..00000000000 --- a/rust/arrow/src/bytes.rs +++ /dev/null @@ -1,159 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This module contains an implementation of a contiguous immutable memory region that knows -//! how to de-allocate itself, [`Bytes`]. -//! Note that this is a low-level functionality of this crate. - -use core::slice; -use std::ptr::NonNull; -use std::sync::Arc; -use std::{fmt::Debug, fmt::Formatter}; - -use crate::{alloc, ffi}; - -/// Mode of deallocating memory regions -pub enum Deallocation { - /// Native deallocation, using Rust deallocator with Arrow-specific memory aligment - Native(usize), - /// Foreign interface, via a callback - Foreign(Arc), -} - -impl Debug for Deallocation { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - match self { - Deallocation::Native(capacity) => { - write!(f, "Deallocation::Native {{ capacity: {} }}", capacity) - } - Deallocation::Foreign(_) => { - write!(f, "Deallocation::Foreign {{ capacity: unknown }}") - } - } - } -} - -/// A continuous, fixed-size, immutable memory region that knows how to de-allocate itself. -/// This structs' API is inspired by the `bytes::Bytes`, but it is not limited to using rust's -/// global allocator nor u8 aligmnent. -/// -/// In the most common case, this buffer is allocated using [`allocate_aligned`](memory::allocate_aligned) -/// and deallocated accordingly [`free_aligned`](memory::free_aligned). -/// When the region is allocated by an foreign allocator, [Deallocation::Foreign], this calls the -/// foreign deallocator to deallocate the region when it is no longer needed. -pub struct Bytes { - /// The raw pointer to be begining of the region - ptr: NonNull, - - /// The number of bytes visible to this region. This is always smaller than its capacity (when avaliable). - len: usize, - - /// how to deallocate this region - deallocation: Deallocation, -} - -impl Bytes { - /// Takes ownership of an allocated memory region, - /// - /// # Arguments - /// - /// * `ptr` - Pointer to raw parts - /// * `len` - Length of raw parts in **bytes** - /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` - /// bytes. If the `ptr` and `capacity` come from a `Buffer`, then this is guaranteed. 
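`Bytes` couples a raw pointer with a `Deallocation` mode so that natively allocated regions are released through the crate's aligned allocator while FFI-owned regions are released by their foreign owner. The standalone sketch below mirrors that ownership pattern with `std::alloc` in place of the crate-internal `alloc` module; the struct and names (`OwnedBytes`, `Dealloc`) are hypothetical stand-ins. `Bytes::new`, whose safety contract is documented just above, continues immediately below.

```rust
use std::alloc::{alloc_zeroed, dealloc, Layout};
use std::ptr::NonNull;

/// How a region should be released when dropped (hypothetical mirror of `Deallocation`).
enum Dealloc {
    /// Allocated by us: free with the recorded layout.
    Native(Layout),
    /// Owned by someone else (e.g. an FFI producer): run their release callback instead.
    Foreign(Box<dyn Fn(*mut u8)>),
}

struct OwnedBytes {
    ptr: NonNull<u8>,
    len: usize,
    dealloc: Dealloc,
}

impl Drop for OwnedBytes {
    fn drop(&mut self) {
        match &self.dealloc {
            Dealloc::Native(layout) => unsafe { dealloc(self.ptr.as_ptr(), *layout) },
            Dealloc::Foreign(release) => release(self.ptr.as_ptr()),
        }
    }
}

fn main() {
    let layout = Layout::from_size_align(64, 64).unwrap();
    let ptr = NonNull::new(unsafe { alloc_zeroed(layout) }).expect("allocation failed");
    let bytes = OwnedBytes { ptr, len: 64, dealloc: Dealloc::Native(layout) };
    assert_eq!(bytes.len, 64);
    // Dropping `bytes` frees the region with the matching layout.
}
```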
- #[inline] - pub unsafe fn new( - ptr: std::ptr::NonNull, - len: usize, - deallocation: Deallocation, - ) -> Bytes { - Bytes { - ptr, - len, - deallocation, - } - } - - fn as_slice(&self) -> &[u8] { - self - } - - #[inline] - pub fn len(&self) -> usize { - self.len - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.len == 0 - } - - #[inline] - pub fn ptr(&self) -> NonNull { - self.ptr - } - - pub fn capacity(&self) -> usize { - match self.deallocation { - Deallocation::Native(capacity) => capacity, - // we cannot determine this in general, - // and thus we state that this is externally-owned memory - Deallocation::Foreign(_) => 0, - } - } -} - -impl Drop for Bytes { - #[inline] - fn drop(&mut self) { - match &self.deallocation { - Deallocation::Native(capacity) => { - unsafe { alloc::free_aligned::(self.ptr, *capacity) }; - } - // foreign interface knows how to deallocate itself. - Deallocation::Foreign(_) => (), - } - } -} - -impl std::ops::Deref for Bytes { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - unsafe { slice::from_raw_parts(self.ptr.as_ptr(), self.len) } - } -} - -impl PartialEq for Bytes { - fn eq(&self, other: &Bytes) -> bool { - self.as_slice() == other.as_slice() - } -} - -impl Debug for Bytes { - fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { - write!(f, "Bytes {{ ptr: {:?}, len: {}, data: ", self.ptr, self.len,)?; - - f.debug_list().entries(self.iter()).finish()?; - - write!(f, " }}") - } -} diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs deleted file mode 100644 index d0e3f22f541..00000000000 --- a/rust/arrow/src/compute/kernels/aggregate.rs +++ /dev/null @@ -1,975 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines aggregations over Arrow arrays. - -use std::ops::Add; - -use crate::array::{ - Array, BooleanArray, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait, -}; -use crate::datatypes::{ArrowNativeType, ArrowNumericType}; - -/// Generic test for NaN, the optimizer should be able to remove this for integer types. 
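The `is_nan` helper defined right below relies on the IEEE 754 rule that NaN compares unequal to everything, including itself, so `!(a == a)` is a type-generic NaN test that is trivially `false` (and optimized away) for integers. A quick standalone check of that property, mirroring the helper:

```rust
fn is_nan<T: PartialEq>(a: T) -> bool {
    // NaN is the only value for which `a == a` is false.
    a != a
}

fn main() {
    assert!(is_nan(f64::NAN));
    assert!(!is_nan(1.0_f64));
    assert!(!is_nan(42_i32)); // integers are never NaN; the comparison folds to false
}
```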
-#[inline] -fn is_nan(a: T) -> bool { - #[allow(clippy::eq_op)] - !(a == a) -} - -/// Helper macro to perform min/max of strings -fn min_max_string bool>( - array: &GenericStringArray, - cmp: F, -) -> Option<&str> { - let null_count = array.null_count(); - - if null_count == array.len() { - return None; - } - let data = array.data(); - let mut n; - if null_count == 0 { - n = array.value(0); - for i in 1..data.len() { - let item = array.value(i); - if cmp(&n, item) { - n = item; - } - } - } else { - n = ""; - let mut has_value = false; - - for i in 0..data.len() { - let item = array.value(i); - if data.is_valid(i) && (!has_value || cmp(&n, item)) { - has_value = true; - n = item; - } - } - } - Some(n) -} - -/// Returns the minimum value in the array, according to the natural order. -/// For floating point arrays any NaN values are considered to be greater than any other non-null value -#[cfg(not(simd))] -pub fn min(array: &PrimitiveArray) -> Option -where - T: ArrowNumericType, - T::Native: ArrowNativeType, -{ - min_max_helper(array, |a, b| (is_nan(*a) & !is_nan(*b)) || a > b) -} - -/// Returns the maximum value in the array, according to the natural order. -/// For floating point arrays any NaN values are considered to be greater than any other non-null value -#[cfg(not(simd))] -pub fn max(array: &PrimitiveArray) -> Option -where - T: ArrowNumericType, - T::Native: ArrowNativeType, -{ - min_max_helper(array, |a, b| (!is_nan(*a) & is_nan(*b)) || a < b) -} - -/// Returns the maximum value in the string array, according to the natural order. -pub fn max_string( - array: &GenericStringArray, -) -> Option<&str> { - min_max_string(array, |a, b| a < b) -} - -/// Returns the minimum value in the string array, according to the natural order. -pub fn min_string( - array: &GenericStringArray, -) -> Option<&str> { - min_max_string(array, |a, b| a > b) -} - -/// Helper function to perform min/max lambda function on values from a numeric array. -fn min_max_helper(array: &PrimitiveArray, cmp: F) -> Option -where - T: ArrowNumericType, - F: Fn(&T::Native, &T::Native) -> bool, -{ - let null_count = array.null_count(); - - // Includes case array.len() == 0 - if null_count == array.len() { - return None; - } - - let data = array.data(); - let m = array.values(); - let mut n; - - if null_count == 0 { - // optimized path for arrays without null values - n = m[1..] - .iter() - .fold(m[0], |max, item| if cmp(&max, item) { *item } else { max }); - } else { - n = T::default_value(); - let mut has_value = false; - for (i, item) in m.iter().enumerate() { - if data.is_valid(i) && (!has_value || cmp(&n, item)) { - has_value = true; - n = *item - } - } - } - Some(n) -} - -/// Returns the minimum value in the boolean array. 
-/// -/// ``` -/// use arrow::{ -/// array::BooleanArray, -/// compute::min_boolean, -/// }; -/// -/// let a = BooleanArray::from(vec![Some(true), None, Some(false)]); -/// assert_eq!(min_boolean(&a), Some(false)) -/// ``` -pub fn min_boolean(array: &BooleanArray) -> Option { - // short circuit if all nulls / zero length array - if array.null_count() == array.len() { - return None; - } - - // Note the min bool is false (0), so short circuit as soon as we see it - array - .iter() - .find(|&b| b == Some(false)) - .flatten() - .or(Some(true)) -} - -/// Returns the maximum value in the boolean array -/// -/// ``` -/// use arrow::{ -/// array::BooleanArray, -/// compute::max_boolean, -/// }; -/// -/// let a = BooleanArray::from(vec![Some(true), None, Some(false)]); -/// assert_eq!(max_boolean(&a), Some(true)) -/// ``` -pub fn max_boolean(array: &BooleanArray) -> Option { - // short circuit if all nulls / zero length array - if array.null_count() == array.len() { - return None; - } - - // Note the max bool is true (1), so short circuit as soon as we see it - array - .iter() - .find(|&b| b == Some(true)) - .flatten() - .or(Some(false)) -} - -/// Returns the sum of values in the array. -/// -/// Returns `None` if the array is empty or only contains null values. -#[cfg(not(simd))] -pub fn sum(array: &PrimitiveArray) -> Option -where - T: ArrowNumericType, - T::Native: Add, -{ - let null_count = array.null_count(); - - if null_count == array.len() { - return None; - } - - let data: &[T::Native] = array.values(); - - match array.data().null_buffer() { - None => { - let sum = data.iter().fold(T::default_value(), |accumulator, value| { - accumulator + *value - }); - - Some(sum) - } - Some(buffer) => { - let mut sum = T::default_value(); - let data_chunks = data.chunks_exact(64); - let remainder = data_chunks.remainder(); - - let bit_chunks = buffer.bit_chunks(array.offset(), array.len()); - data_chunks - .zip(bit_chunks.iter()) - .for_each(|(chunk, mask)| { - // index_mask has value 1 << i in the loop - let mut index_mask = 1; - chunk.iter().for_each(|value| { - if (mask & index_mask) != 0 { - sum = sum + *value; - } - index_mask <<= 1; - }); - }); - - let remainder_bits = bit_chunks.remainder_bits(); - - remainder.iter().enumerate().for_each(|(i, value)| { - if remainder_bits & (1 << i) != 0 { - sum = sum + *value; - } - }); - - Some(sum) - } - } -} - -#[cfg(simd)] -mod simd { - use super::is_nan; - use crate::array::{Array, PrimitiveArray}; - use crate::datatypes::ArrowNumericType; - use std::marker::PhantomData; - use std::ops::Add; - - pub(super) trait SimdAggregate { - type ScalarAccumulator; - type SimdAccumulator; - - /// Returns the accumulator for aggregating scalar values - fn init_accumulator_scalar() -> Self::ScalarAccumulator; - - /// Returns the accumulator for aggregating simd chunks of values - fn init_accumulator_chunk() -> Self::SimdAccumulator; - - /// Updates the accumulator with the values of one chunk - fn accumulate_chunk_non_null( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - ); - - /// Updates the accumulator with the values of one chunk according to the given vector mask - fn accumulate_chunk_nullable( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - mask: T::SimdMask, - ); - - /// Updates the accumulator with one value - fn accumulate_scalar(accumulator: &mut Self::ScalarAccumulator, value: T::Native); - - /// Reduces the vector lanes of the simd accumulator and the scalar accumulator to a single value - fn reduce( - simd_accumulator: 
Self::SimdAccumulator, - scalar_accumulator: Self::ScalarAccumulator, - ) -> Option; - } - - pub(super) struct SumAggregate { - phantom: PhantomData, - } - - impl SimdAggregate for SumAggregate - where - T::Native: Add, - { - type ScalarAccumulator = T::Native; - type SimdAccumulator = T::Simd; - - fn init_accumulator_scalar() -> Self::ScalarAccumulator { - T::default_value() - } - - fn init_accumulator_chunk() -> Self::SimdAccumulator { - T::init(Self::init_accumulator_scalar()) - } - - fn accumulate_chunk_non_null(accumulator: &mut T::Simd, chunk: T::Simd) { - *accumulator = *accumulator + chunk; - } - - fn accumulate_chunk_nullable( - accumulator: &mut T::Simd, - chunk: T::Simd, - vecmask: T::SimdMask, - ) { - let zero = T::init(T::default_value()); - let blended = T::mask_select(vecmask, chunk, zero); - - *accumulator = *accumulator + blended; - } - - fn accumulate_scalar(accumulator: &mut T::Native, value: T::Native) { - *accumulator = *accumulator + value - } - - fn reduce( - simd_accumulator: Self::SimdAccumulator, - scalar_accumulator: Self::ScalarAccumulator, - ) -> Option { - // we can't use T::lanes() as the slice len because it is not const, - // instead always reserve the maximum number of lanes - let mut tmp = [T::default_value(); 64]; - let slice = &mut tmp[0..T::lanes()]; - T::write(simd_accumulator, slice); - - let mut reduced = Self::init_accumulator_scalar(); - slice - .iter() - .for_each(|value| Self::accumulate_scalar(&mut reduced, *value)); - - Self::accumulate_scalar(&mut reduced, scalar_accumulator); - - // result can not be None because we checked earlier for the null count - Some(reduced) - } - } - - pub(super) struct MinAggregate { - phantom: PhantomData, - } - - impl SimdAggregate for MinAggregate - where - T::Native: PartialOrd, - { - type ScalarAccumulator = (T::Native, bool); - type SimdAccumulator = (T::Simd, T::SimdMask); - - fn init_accumulator_scalar() -> Self::ScalarAccumulator { - (T::default_value(), false) - } - - fn init_accumulator_chunk() -> Self::SimdAccumulator { - (T::init(T::default_value()), T::mask_init(false)) - } - - fn accumulate_chunk_non_null( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - ) { - let acc_is_nan = !T::eq(accumulator.0, accumulator.0); - let is_lt = acc_is_nan | T::lt(chunk, accumulator.0); - let first_or_lt = !accumulator.1 | is_lt; - - accumulator.0 = T::mask_select(first_or_lt, chunk, accumulator.0); - accumulator.1 = T::mask_init(true); - } - - fn accumulate_chunk_nullable( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - vecmask: T::SimdMask, - ) { - let acc_is_nan = !T::eq(accumulator.0, accumulator.0); - let is_lt = vecmask & (acc_is_nan | T::lt(chunk, accumulator.0)); - let first_or_lt = !accumulator.1 | is_lt; - - accumulator.0 = T::mask_select(first_or_lt, chunk, accumulator.0); - accumulator.1 |= vecmask; - } - - fn accumulate_scalar( - accumulator: &mut Self::ScalarAccumulator, - value: T::Native, - ) { - if !accumulator.1 { - accumulator.0 = value; - } else { - let acc_is_nan = is_nan(accumulator.0); - if acc_is_nan || value < accumulator.0 { - accumulator.0 = value - } - } - accumulator.1 = true - } - - fn reduce( - simd_accumulator: Self::SimdAccumulator, - scalar_accumulator: Self::ScalarAccumulator, - ) -> Option { - // we can't use T::lanes() as the slice len because it is not const, - // instead always reserve the maximum number of lanes - let mut tmp = [T::default_value(); 64]; - let slice = &mut tmp[0..T::lanes()]; - T::write(simd_accumulator.0, slice); - - let mut reduced 
= Self::init_accumulator_scalar(); - slice - .iter() - .enumerate() - .filter(|(i, _value)| T::mask_get(&simd_accumulator.1, *i)) - .for_each(|(_i, value)| Self::accumulate_scalar(&mut reduced, *value)); - - if scalar_accumulator.1 { - Self::accumulate_scalar(&mut reduced, scalar_accumulator.0); - } - - if reduced.1 { - Some(reduced.0) - } else { - None - } - } - } - - pub(super) struct MaxAggregate { - phantom: PhantomData, - } - - impl SimdAggregate for MaxAggregate - where - T::Native: PartialOrd, - { - type ScalarAccumulator = (T::Native, bool); - type SimdAccumulator = (T::Simd, T::SimdMask); - - fn init_accumulator_scalar() -> Self::ScalarAccumulator { - (T::default_value(), false) - } - - fn init_accumulator_chunk() -> Self::SimdAccumulator { - (T::init(T::default_value()), T::mask_init(false)) - } - - fn accumulate_chunk_non_null( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - ) { - let chunk_is_nan = !T::eq(chunk, chunk); - let is_gt = chunk_is_nan | T::gt(chunk, accumulator.0); - let first_or_gt = !accumulator.1 | is_gt; - - accumulator.0 = T::mask_select(first_or_gt, chunk, accumulator.0); - accumulator.1 = T::mask_init(true); - } - - fn accumulate_chunk_nullable( - accumulator: &mut Self::SimdAccumulator, - chunk: T::Simd, - vecmask: T::SimdMask, - ) { - let chunk_is_nan = !T::eq(chunk, chunk); - let is_gt = vecmask & (chunk_is_nan | T::gt(chunk, accumulator.0)); - let first_or_gt = !accumulator.1 | is_gt; - - accumulator.0 = T::mask_select(first_or_gt, chunk, accumulator.0); - accumulator.1 |= vecmask; - } - - fn accumulate_scalar( - accumulator: &mut Self::ScalarAccumulator, - value: T::Native, - ) { - if !accumulator.1 { - accumulator.0 = value; - } else { - let value_is_nan = is_nan(value); - if value_is_nan || value > accumulator.0 { - accumulator.0 = value - } - } - accumulator.1 = true; - } - - fn reduce( - simd_accumulator: Self::SimdAccumulator, - scalar_accumulator: Self::ScalarAccumulator, - ) -> Option { - // we can't use T::lanes() as the slice len because it is not const, - // instead always reserve the maximum number of lanes - let mut tmp = [T::default_value(); 64]; - let slice = &mut tmp[0..T::lanes()]; - T::write(simd_accumulator.0, slice); - - let mut reduced = Self::init_accumulator_scalar(); - slice - .iter() - .enumerate() - .filter(|(i, _value)| T::mask_get(&simd_accumulator.1, *i)) - .for_each(|(_i, value)| Self::accumulate_scalar(&mut reduced, *value)); - - if scalar_accumulator.1 { - Self::accumulate_scalar(&mut reduced, scalar_accumulator.0); - } - - if reduced.1 { - Some(reduced.0) - } else { - None - } - } - } - - pub(super) fn simd_aggregation>( - array: &PrimitiveArray, - ) -> Option { - let null_count = array.null_count(); - - if null_count == array.len() { - return None; - } - - let data: &[T::Native] = array.values(); - - let mut chunk_acc = A::init_accumulator_chunk(); - let mut rem_acc = A::init_accumulator_scalar(); - - match array.data().null_buffer() { - None => { - let data_chunks = data.chunks_exact(64); - let remainder = data_chunks.remainder(); - - data_chunks.for_each(|chunk| { - chunk.chunks_exact(T::lanes()).for_each(|chunk| { - let chunk = T::load(&chunk); - A::accumulate_chunk_non_null(&mut chunk_acc, chunk); - }); - }); - - remainder.iter().for_each(|value| { - A::accumulate_scalar(&mut rem_acc, *value); - }); - } - Some(buffer) => { - // process data in chunks of 64 elements since we also get 64 bits of validity information at a time - let data_chunks = data.chunks_exact(64); - let remainder = 
data_chunks.remainder(); - - let bit_chunks = buffer.bit_chunks(array.offset(), array.len()); - let remainder_bits = bit_chunks.remainder_bits(); - - data_chunks.zip(bit_chunks).for_each(|(chunk, mut mask)| { - // split chunks further into slices corresponding to the vector length - // the compiler is able to unroll this inner loop and remove bounds checks - // since the outer chunk size (64) is always a multiple of the number of lanes - chunk.chunks_exact(T::lanes()).for_each(|chunk| { - let vecmask = T::mask_from_u64(mask); - let chunk = T::load(&chunk); - - A::accumulate_chunk_nullable(&mut chunk_acc, chunk, vecmask); - - // skip the shift and avoid overflow for u8 type, which uses 64 lanes. - mask >>= T::lanes() % 64; - }); - }); - - remainder.iter().enumerate().for_each(|(i, value)| { - if remainder_bits & (1 << i) != 0 { - A::accumulate_scalar(&mut rem_acc, *value) - } - }); - } - } - - A::reduce(chunk_acc, rem_acc) - } -} - -/// Returns the sum of values in the array. -/// -/// Returns `None` if the array is empty or only contains null values. -#[cfg(simd)] -pub fn sum(array: &PrimitiveArray) -> Option -where - T::Native: Add, -{ - use simd::*; - - simd::simd_aggregation::>(&array) -} - -#[cfg(simd)] -/// Returns the minimum value in the array, according to the natural order. -/// For floating point arrays any NaN values are considered to be greater than any other non-null value -pub fn min(array: &PrimitiveArray) -> Option -where - T::Native: PartialOrd, -{ - use simd::*; - - simd::simd_aggregation::>(&array) -} - -#[cfg(simd)] -/// Returns the maximum value in the array, according to the natural order. -/// For floating point arrays any NaN values are considered to be greater than any other non-null value -pub fn max(array: &PrimitiveArray) -> Option -where - T::Native: PartialOrd, -{ - use simd::*; - - simd::simd_aggregation::>(&array) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::*; - use crate::compute::add; - - #[test] - fn test_primitive_array_sum() { - let a = Int32Array::from(vec![1, 2, 3, 4, 5]); - assert_eq!(15, sum(&a).unwrap()); - } - - #[test] - fn test_primitive_array_float_sum() { - let a = Float64Array::from(vec![1.1, 2.2, 3.3, 4.4, 5.5]); - assert!(16.5 - sum(&a).unwrap() < f64::EPSILON); - } - - #[test] - fn test_primitive_array_sum_with_nulls() { - let a = Int32Array::from(vec![None, Some(2), Some(3), None, Some(5)]); - assert_eq!(10, sum(&a).unwrap()); - } - - #[test] - fn test_primitive_array_sum_all_nulls() { - let a = Int32Array::from(vec![None, None, None]); - assert_eq!(None, sum(&a)); - } - - #[test] - fn test_primitive_array_sum_large_64() { - let a: Int64Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(i) } else { None }) - .collect(); - let b: Int64Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(0) } else { Some(i) }) - .collect(); - // create an array that actually has non-zero values at the invalid indices - let c = add(&a, &b).unwrap(); - assert_eq!(Some((1..=100).filter(|i| i % 3 == 0).sum()), sum(&c)); - } - - #[test] - fn test_primitive_array_sum_large_32() { - let a: Int32Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(i) } else { None }) - .collect(); - let b: Int32Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(0) } else { Some(i) }) - .collect(); - // create an array that actually has non-zero values at the invalid indices - let c = add(&a, &b).unwrap(); - assert_eq!(Some((1..=100).filter(|i| i % 3 == 0).sum()), sum(&c)); - } - - #[test] - fn test_primitive_array_sum_large_16() { - let a: Int16Array = (1..=100) - 
.map(|i| if i % 3 == 0 { Some(i) } else { None }) - .collect(); - let b: Int16Array = (1..=100) - .map(|i| if i % 3 == 0 { Some(0) } else { Some(i) }) - .collect(); - // create an array that actually has non-zero values at the invalid indices - let c = add(&a, &b).unwrap(); - assert_eq!(Some((1..=100).filter(|i| i % 3 == 0).sum()), sum(&c)); - } - - #[test] - fn test_primitive_array_sum_large_8() { - // include fewer values than other large tests so the result does not overflow the u8 - let a: UInt8Array = (1..=100) - .map(|i| if i % 33 == 0 { Some(i) } else { None }) - .collect(); - let b: UInt8Array = (1..=100) - .map(|i| if i % 33 == 0 { Some(0) } else { Some(i) }) - .collect(); - // create an array that actually has non-zero values at the invalid indices - let c = add(&a, &b).unwrap(); - assert_eq!(Some((1..=100).filter(|i| i % 33 == 0).sum()), sum(&c)); - } - - #[test] - fn test_primitive_array_min_max() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - assert_eq!(5, min(&a).unwrap()); - assert_eq!(9, max(&a).unwrap()); - } - - #[test] - fn test_primitive_array_min_max_with_nulls() { - let a = Int32Array::from(vec![Some(5), None, None, Some(8), Some(9)]); - assert_eq!(5, min(&a).unwrap()); - assert_eq!(9, max(&a).unwrap()); - } - - #[test] - fn test_primitive_min_max_1() { - let a = Int32Array::from(vec![None, None, Some(5), Some(2)]); - assert_eq!(Some(2), min(&a)); - assert_eq!(Some(5), max(&a)); - } - - #[test] - fn test_primitive_min_max_float_large_nonnull_array() { - let a: Float64Array = (0..256).map(|i| Some((i + 1) as f64)).collect(); - // min/max are on boundaries of chunked data - assert_eq!(Some(1.0), min(&a)); - assert_eq!(Some(256.0), max(&a)); - - // max is last value in remainder after chunking - let a: Float64Array = (0..255).map(|i| Some((i + 1) as f64)).collect(); - assert_eq!(Some(255.0), max(&a)); - - // max is first value in remainder after chunking - let a: Float64Array = (0..257).map(|i| Some((i + 1) as f64)).collect(); - assert_eq!(Some(257.0), max(&a)); - } - - #[test] - fn test_primitive_min_max_float_large_nullable_array() { - let a: Float64Array = (0..256) - .map(|i| { - if (i + 1) % 3 == 0 { - None - } else { - Some((i + 1) as f64) - } - }) - .collect(); - // min/max are on boundaries of chunked data - assert_eq!(Some(1.0), min(&a)); - assert_eq!(Some(256.0), max(&a)); - - let a: Float64Array = (0..256) - .map(|i| { - if i == 0 || i == 255 { - None - } else { - Some((i + 1) as f64) - } - }) - .collect(); - // boundaries of chunked data are null - assert_eq!(Some(2.0), min(&a)); - assert_eq!(Some(255.0), max(&a)); - - let a: Float64Array = (0..256) - .map(|i| if i != 100 { None } else { Some((i) as f64) }) - .collect(); - // a single non-null value somewhere in the middle - assert_eq!(Some(100.0), min(&a)); - assert_eq!(Some(100.0), max(&a)); - - // max is last value in remainder after chunking - let a: Float64Array = (0..255).map(|i| Some((i + 1) as f64)).collect(); - assert_eq!(Some(255.0), max(&a)); - - // max is first value in remainder after chunking - let a: Float64Array = (0..257).map(|i| Some((i + 1) as f64)).collect(); - assert_eq!(Some(257.0), max(&a)); - } - - #[test] - fn test_primitive_min_max_float_edge_cases() { - let a: Float64Array = (0..100).map(|_| Some(f64::NEG_INFINITY)).collect(); - assert_eq!(Some(f64::NEG_INFINITY), min(&a)); - assert_eq!(Some(f64::NEG_INFINITY), max(&a)); - - let a: Float64Array = (0..100).map(|_| Some(f64::MIN)).collect(); - assert_eq!(Some(f64::MIN), min(&a)); - assert_eq!(Some(f64::MIN), max(&a)); - - let a: 
Float64Array = (0..100).map(|_| Some(f64::MAX)).collect(); - assert_eq!(Some(f64::MAX), min(&a)); - assert_eq!(Some(f64::MAX), max(&a)); - - let a: Float64Array = (0..100).map(|_| Some(f64::INFINITY)).collect(); - assert_eq!(Some(f64::INFINITY), min(&a)); - assert_eq!(Some(f64::INFINITY), max(&a)); - } - - #[test] - fn test_primitive_min_max_float_all_nans_non_null() { - let a: Float64Array = (0..100).map(|_| Some(f64::NAN)).collect(); - assert!(max(&a).unwrap().is_nan()); - assert!(min(&a).unwrap().is_nan()); - } - - #[test] - fn test_primitive_min_max_float_first_nan_nonnull() { - let a: Float64Array = (0..100) - .map(|i| { - if i == 0 { - Some(f64::NAN) - } else { - Some(i as f64) - } - }) - .collect(); - assert_eq!(Some(1.0), min(&a)); - assert!(max(&a).unwrap().is_nan()); - } - - #[test] - fn test_primitive_min_max_float_last_nan_nonnull() { - let a: Float64Array = (0..100) - .map(|i| { - if i == 99 { - Some(f64::NAN) - } else { - Some((i + 1) as f64) - } - }) - .collect(); - assert_eq!(Some(1.0), min(&a)); - assert!(max(&a).unwrap().is_nan()); - } - - #[test] - fn test_primitive_min_max_float_first_nan_nullable() { - let a: Float64Array = (0..100) - .map(|i| { - if i == 0 { - Some(f64::NAN) - } else if i % 2 == 0 { - None - } else { - Some(i as f64) - } - }) - .collect(); - assert_eq!(Some(1.0), min(&a)); - assert!(max(&a).unwrap().is_nan()); - } - - #[test] - fn test_primitive_min_max_float_last_nan_nullable() { - let a: Float64Array = (0..100) - .map(|i| { - if i == 99 { - Some(f64::NAN) - } else if i % 2 == 0 { - None - } else { - Some(i as f64) - } - }) - .collect(); - assert_eq!(Some(1.0), min(&a)); - assert!(max(&a).unwrap().is_nan()); - } - - #[test] - fn test_primitive_min_max_float_inf_and_nans() { - let a: Float64Array = (0..100) - .map(|i| { - let x = match i % 10 { - 0 => f64::NEG_INFINITY, - 1 => f64::MIN, - 2 => f64::MAX, - 4 => f64::INFINITY, - 5 => f64::NAN, - _ => i as f64, - }; - Some(x) - }) - .collect(); - assert_eq!(Some(f64::NEG_INFINITY), min(&a)); - assert!(max(&a).unwrap().is_nan()); - } - - #[test] - fn test_string_min_max_with_nulls() { - let a = StringArray::from(vec![Some("b"), None, None, Some("a"), Some("c")]); - assert_eq!("a", min_string(&a).unwrap()); - assert_eq!("c", max_string(&a).unwrap()); - } - - #[test] - fn test_string_min_max_all_nulls() { - let a = StringArray::from(vec![None, None]); - assert_eq!(None, min_string(&a)); - assert_eq!(None, max_string(&a)); - } - - #[test] - fn test_string_min_max_1() { - let a = StringArray::from(vec![None, None, Some("b"), Some("a")]); - assert_eq!(Some("a"), min_string(&a)); - assert_eq!(Some("b"), max_string(&a)); - } - - #[test] - fn test_boolean_min_max_empty() { - let a = BooleanArray::from(vec![] as Vec>); - assert_eq!(None, min_boolean(&a)); - assert_eq!(None, max_boolean(&a)); - } - - #[test] - fn test_boolean_min_max_all_null() { - let a = BooleanArray::from(vec![None, None]); - assert_eq!(None, min_boolean(&a)); - assert_eq!(None, max_boolean(&a)); - } - - #[test] - fn test_boolean_min_max_no_null() { - let a = BooleanArray::from(vec![Some(true), Some(false), Some(true)]); - assert_eq!(Some(false), min_boolean(&a)); - assert_eq!(Some(true), max_boolean(&a)); - } - - #[test] - fn test_boolean_min_max() { - let a = BooleanArray::from(vec![Some(true), Some(true), None, Some(false), None]); - assert_eq!(Some(false), min_boolean(&a)); - assert_eq!(Some(true), max_boolean(&a)); - - let a = BooleanArray::from(vec![None, Some(true), None, Some(false), None]); - assert_eq!(Some(false), min_boolean(&a)); - 
assert_eq!(Some(true), max_boolean(&a)); - - let a = - BooleanArray::from(vec![Some(false), Some(true), None, Some(false), None]); - assert_eq!(Some(false), min_boolean(&a)); - assert_eq!(Some(true), max_boolean(&a)); - } - - #[test] - fn test_boolean_min_max_smaller() { - let a = BooleanArray::from(vec![Some(false)]); - assert_eq!(Some(false), min_boolean(&a)); - assert_eq!(Some(false), max_boolean(&a)); - - let a = BooleanArray::from(vec![None, Some(false)]); - assert_eq!(Some(false), min_boolean(&a)); - assert_eq!(Some(false), max_boolean(&a)); - - let a = BooleanArray::from(vec![None, Some(true)]); - assert_eq!(Some(true), min_boolean(&a)); - assert_eq!(Some(true), max_boolean(&a)); - - let a = BooleanArray::from(vec![Some(true)]); - assert_eq!(Some(true), min_boolean(&a)); - assert_eq!(Some(true), max_boolean(&a)); - } -} diff --git a/rust/arrow/src/compute/kernels/arithmetic.rs b/rust/arrow/src/compute/kernels/arithmetic.rs deleted file mode 100644 index d7aadf144d4..00000000000 --- a/rust/arrow/src/compute/kernels/arithmetic.rs +++ /dev/null @@ -1,1009 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines basic arithmetic kernels for `PrimitiveArrays`. -//! -//! These kernels can leverage SIMD if available on your system. Currently no runtime -//! detection is provided, you should enable the specific SIMD intrinsics using -//! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation -//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. - -use std::ops::{Add, Div, Mul, Neg, Sub}; - -use num::{One, Zero}; - -use crate::buffer::Buffer; -#[cfg(simd)] -use crate::buffer::MutableBuffer; -#[cfg(not(simd))] -use crate::compute::kernels::arity::unary; -use crate::compute::util::combine_option_bitmap; -use crate::datatypes; -use crate::datatypes::ArrowNumericType; -use crate::error::{ArrowError, Result}; -use crate::{array::*, util::bit_util}; -use num::traits::Pow; -#[cfg(simd)] -use std::borrow::BorrowMut; -#[cfg(simd)] -use std::slice::{ChunksExact, ChunksExactMut}; - -/// SIMD vectorized version of `unary_math_op` above specialized for signed numerical values. 
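All of the SIMD kernels in this file follow the same shape: split the input into lane-sized pieces with `chunks_exact`, apply the vector op per chunk, then apply the scalar op to the `remainder()`. The sketch below shows that pattern with plain slices standing in for `packed_simd` vectors so it runs on stable Rust; the lane width of 8 is an arbitrary stand-in for `T::lanes()`. The `simd_signed_unary_math_op` helper that the doc comment above describes follows right after this aside.

```rust
const LANES: usize = 8; // stand-in for T::lanes()

/// Negate every element, processing LANES values per "vector" and the tail scalarly.
fn negate_all(input: &[i32], output: &mut [i32]) {
    assert_eq!(input.len(), output.len());

    let mut in_chunks = input.chunks_exact(LANES);
    let mut out_chunks = output.chunks_exact_mut(LANES);
    for (i, o) in in_chunks.by_ref().zip(out_chunks.by_ref()) {
        // With real SIMD this whole chunk would be a single vector negate.
        for (x, y) in i.iter().zip(o.iter_mut()) {
            *y = -*x;
        }
    }
    // Scalar tail, exactly like the `remainder()` / `into_remainder()` handling above.
    for (x, y) in in_chunks.remainder().iter().zip(out_chunks.into_remainder()) {
        *y = -*x;
    }
}

fn main() {
    let input: Vec<i32> = (0..19).collect();
    let mut output = vec![0; 19];
    negate_all(&input, &mut output);
    assert_eq!(output[18], -18);
}
```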
-#[cfg(simd)] -fn simd_signed_unary_math_op( - array: &PrimitiveArray, - simd_op: SIMD_OP, - scalar_op: SCALAR_OP, -) -> Result> -where - T: datatypes::ArrowSignedNumericType, - SIMD_OP: Fn(T::SignedSimd) -> T::SignedSimd, - SCALAR_OP: Fn(T::Native) -> T::Native, -{ - let lanes = T::lanes(); - let buffer_size = array.len() * std::mem::size_of::(); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut array_chunks = array.values().chunks_exact(lanes); - - result_chunks - .borrow_mut() - .zip(array_chunks.borrow_mut()) - .for_each(|(result_slice, input_slice)| { - let simd_input = T::load_signed(input_slice); - let simd_result = T::signed_unary_op(simd_input, &simd_op); - T::write_signed(simd_result, result_slice); - }); - - let result_remainder = result_chunks.into_remainder(); - let array_remainder = array_chunks.remainder(); - - result_remainder.into_iter().zip(array_remainder).for_each( - |(scalar_result, scalar_input)| { - *scalar_result = scalar_op(*scalar_input); - }, - ); - - let data = ArrayData::new( - T::DATA_TYPE, - array.len(), - None, - array.data_ref().null_buffer().cloned(), - 0, - vec![result.into()], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -#[cfg(simd)] -fn simd_float_unary_math_op( - array: &PrimitiveArray, - simd_op: SIMD_OP, - scalar_op: SCALAR_OP, -) -> Result> -where - T: datatypes::ArrowFloatNumericType, - SIMD_OP: Fn(T::Simd) -> T::Simd, - SCALAR_OP: Fn(T::Native) -> T::Native, -{ - let lanes = T::lanes(); - let buffer_size = array.len() * std::mem::size_of::(); - - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut array_chunks = array.values().chunks_exact(lanes); - - result_chunks - .borrow_mut() - .zip(array_chunks.borrow_mut()) - .for_each(|(result_slice, input_slice)| { - let simd_input = T::load(input_slice); - let simd_result = T::unary_op(simd_input, &simd_op); - T::write(simd_result, result_slice); - }); - - let result_remainder = result_chunks.into_remainder(); - let array_remainder = array_chunks.remainder(); - - result_remainder.into_iter().zip(array_remainder).for_each( - |(scalar_result, scalar_input)| { - *scalar_result = scalar_op(*scalar_input); - }, - ); - - let data = ArrayData::new( - T::DATA_TYPE, - array.len(), - None, - array.data_ref().null_buffer().cloned(), - 0, - vec![result.into()], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -/// Helper function to perform math lambda function on values from two arrays. If either -/// left or right value is null then the output value is also null, so `1 + null` is -/// `null`. -/// -/// # Errors -/// -/// This function errors if the arrays have different lengths -pub fn math_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result> -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> T::Native, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; - - let values = left - .values() - .iter() - .zip(right.values().iter()) - .map(|(l, r)| op(*l, *r)); - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` is an iterator with a known size. 
- let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -/// Helper function to divide two arrays. -/// -/// # Errors -/// -/// This function errors if: -/// * the arrays have different lengths -/// * a division by zero is found -fn math_divide( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result> -where - T: ArrowNumericType, - T::Native: Div + Zero, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; - - let buffer = if let Some(b) = &null_bit_buffer { - let values = left.values().iter().zip(right.values()).enumerate().map( - |(i, (left, right))| { - let is_valid = unsafe { bit_util::get_bit_raw(b.as_ptr(), i) }; - if is_valid { - if right.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(*left / *right) - } - } else { - Ok(T::default_value()) - } - }, - ); - unsafe { Buffer::try_from_trusted_len_iter(values) } - } else { - // no value is null - let values = left - .values() - .iter() - .zip(right.values()) - .map(|(left, right)| { - if right.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(*left / *right) - } - }); - unsafe { Buffer::try_from_trusted_len_iter(values) } - }?; - - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -/// Scalar-divisor version of `math_divide`. -fn math_divide_scalar( - array: &PrimitiveArray, - divisor: T::Native, -) -> Result> -where - T: ArrowNumericType, - T::Native: Div + Zero, -{ - if divisor.is_zero() { - return Err(ArrowError::DivideByZero); - } - - let values = array.values().iter().map(|value| *value / divisor); - let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - - let data = ArrayData::new( - T::DATA_TYPE, - array.len(), - None, - array.data_ref().null_buffer().cloned(), - 0, - vec![buffer], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -/// SIMD vectorized version of `math_op` above. 
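From the outside, `math_op` and `math_divide` surface as the element-wise arithmetic kernels of this crate: nulls propagate (so `1 + null` is null, as the `math_op` doc comment states) and a zero divisor in a valid slot is reported as `ArrowError::DivideByZero`. A small usage sketch follows, assuming the `add` and `divide` kernels re-exported under `arrow::compute` in the 4.x API (the same `add` the aggregation tests above import); the SIMD variant of `math_op` documented just above continues below.

```rust
use arrow::array::{Array, Int32Array};
use arrow::compute::{add, divide};

fn main() {
    let a = Int32Array::from(vec![Some(6), None, Some(9)]);
    let b = Int32Array::from(vec![Some(2), Some(3), Some(3)]);

    // Nulls propagate: the second slot stays null.
    let sum = add(&a, &b).unwrap();
    assert_eq!(sum.value(0), 8);
    assert!(sum.is_null(1));

    // A zero divisor in a *valid* slot is an error rather than a silent wrap.
    let zeros = Int32Array::from(vec![Some(1), Some(2), Some(0)]);
    assert!(divide(&a, &zeros).is_err());
}
```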
-#[cfg(simd)] -fn simd_math_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - simd_op: SIMD_OP, - scalar_op: SCALAR_OP, -) -> Result> -where - T: ArrowNumericType, - SIMD_OP: Fn(T::Simd, T::Simd) -> T::Simd, - SCALAR_OP: Fn(T::Native, T::Native) -> T::Native, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; - - let lanes = T::lanes(); - let buffer_size = left.len() * std::mem::size_of::(); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut left_chunks = left.values().chunks_exact(lanes); - let mut right_chunks = right.values().chunks_exact(lanes); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .for_each(|(result_slice, (left_slice, right_slice))| { - let simd_left = T::load(left_slice); - let simd_right = T::load(right_slice); - let simd_result = T::bin_op(simd_left, simd_right, &simd_op); - T::write(simd_result, result_slice); - }); - - let result_remainder = result_chunks.into_remainder(); - let left_remainder = left_chunks.remainder(); - let right_remainder = right_chunks.remainder(); - - result_remainder - .iter_mut() - .zip(left_remainder.iter().zip(right_remainder.iter())) - .for_each(|(scalar_result, (scalar_left, scalar_right))| { - *scalar_result = scalar_op(*scalar_left, *scalar_right); - }); - - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -/// SIMD vectorized implementation of `left / right`. -/// If any of the lanes marked as valid in `valid_mask` are `0` then an `ArrowError::DivideByZero` -/// is returned. The contents of no-valid lanes are undefined. -#[cfg(simd)] -#[inline] -fn simd_checked_divide( - valid_mask: Option, - left: T::Simd, - right: T::Simd, -) -> Result -where - T::Native: One + Zero, -{ - let zero = T::init(T::Native::zero()); - let one = T::init(T::Native::one()); - - let right_no_invalid_zeros = match valid_mask { - Some(mask) => { - let simd_mask = T::mask_from_u64(mask); - // select `1` for invalid lanes, which will be a no-op during division later - T::mask_select(simd_mask, right, one) - } - None => right, - }; - - let zero_mask = T::eq(right_no_invalid_zeros, zero); - - if T::mask_any(zero_mask) { - Err(ArrowError::DivideByZero) - } else { - Ok(T::bin_op(left, right_no_invalid_zeros, |a, b| a / b)) - } -} - -/// Scalar implementation of `left / right` for the remainder elements after complete chunks have been processed using SIMD. -/// If any of the values marked as valid in `valid_mask` are `0` then an `ArrowError::DivideByZero` is returned. 
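Every SIMD kernel above follows the same shape: cut the value buffers into `T::lanes()`-sized chunks, run the vector op over the complete chunks, then apply the scalar closure to the `remainder()`. The split itself is plain `chunks_exact`/`chunks_exact_mut` machinery; the sketch below shows it on ordinary slices with no `packed_simd` dependency (`LANES` and `binary_chunked` are illustrative names, not part of the crate):

```rust
// Illustrative only: the chunk/remainder split used by the SIMD kernels,
// written against plain slices so it runs on stable Rust.
const LANES: usize = 8; // stand-in for T::lanes()

fn binary_chunked(left: &[i32], right: &[i32], out: &mut [i32], op: impl Fn(i32, i32) -> i32) {
    let mut out_chunks = out.chunks_exact_mut(LANES);
    let mut left_chunks = left.chunks_exact(LANES);
    let mut right_chunks = right.chunks_exact(LANES);

    // Complete chunks: this is where the real kernel loads `T::Simd`
    // vectors and calls `T::bin_op` instead of a scalar loop.
    (&mut out_chunks)
        .zip((&mut left_chunks).zip(&mut right_chunks))
        .for_each(|(o, (l, r))| {
            for i in 0..LANES {
                o[i] = op(l[i], r[i]);
            }
        });

    // Remainder: fewer than LANES elements are left, handled scalar-wise.
    out_chunks
        .into_remainder()
        .iter_mut()
        .zip(left_chunks.remainder().iter().zip(right_chunks.remainder()))
        .for_each(|(o, (l, r))| *o = op(*l, *r));
}

fn main() {
    let left = [1i32; 19];
    let right: Vec<i32> = (0..19).collect();
    let mut out = [0i32; 19];
    binary_chunked(&left, &right, &mut out, |a, b| a + b);
    assert_eq!(out[18], 19); // the last element went through the remainder path
}
```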
-#[cfg(simd)] -#[inline] -fn simd_checked_divide_remainder( - valid_mask: Option, - left_chunks: ChunksExact, - right_chunks: ChunksExact, - result_chunks: ChunksExactMut, -) -> Result<()> -where - T::Native: Zero + Div, -{ - let result_remainder = result_chunks.into_remainder(); - let left_remainder = left_chunks.remainder(); - let right_remainder = right_chunks.remainder(); - - result_remainder - .iter_mut() - .zip(left_remainder.iter().zip(right_remainder.iter())) - .enumerate() - .try_for_each(|(i, (result_scalar, (left_scalar, right_scalar)))| { - if valid_mask.map(|mask| mask & (1 << i) != 0).unwrap_or(true) { - if *right_scalar == T::Native::zero() { - return Err(ArrowError::DivideByZero); - } - *result_scalar = *left_scalar / *right_scalar; - } - Ok(()) - })?; - - Ok(()) -} - -/// Scalar-divisor version of `simd_checked_divide_remainder`. -#[cfg(simd)] -#[inline] -fn simd_checked_divide_scalar_remainder( - array_chunks: ChunksExact, - divisor: T::Native, - result_chunks: ChunksExactMut, -) -> Result<()> -where - T::Native: Zero + Div, -{ - if divisor.is_zero() { - return Err(ArrowError::DivideByZero); - } - - let result_remainder = result_chunks.into_remainder(); - let array_remainder = array_chunks.remainder(); - - result_remainder - .iter_mut() - .zip(array_remainder.iter()) - .for_each(|(result_scalar, array_scalar)| { - *result_scalar = *array_scalar / divisor; - }); - - Ok(()) -} - -/// SIMD vectorized version of `divide`. -/// -/// The divide kernels need their own implementation as there is a need to handle situations -/// where a divide by `0` occurs. This is complicated by `NULL` slots and padding. -#[cfg(simd)] -fn simd_divide( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result> -where - T: ArrowNumericType, - T::Native: One + Zero + Div, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - // Create the combined `Bitmap` - let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; - - let lanes = T::lanes(); - let buffer_size = left.len() * std::mem::size_of::(); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - match &null_bit_buffer { - Some(b) => { - // combine_option_bitmap returns a slice or new buffer starting at 0 - let valid_chunks = b.bit_chunks(0, left.len()); - - // process data in chunks of 64 elements since we also get 64 bits of validity information at a time - let mut result_chunks = result.typed_data_mut().chunks_exact_mut(64); - let mut left_chunks = left.values().chunks_exact(64); - let mut right_chunks = right.values().chunks_exact(64); - - valid_chunks - .iter() - .zip( - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())), - ) - .try_for_each( - |(mut mask, (result_slice, (left_slice, right_slice)))| { - // split chunks further into slices corresponding to the vector length - // the compiler is able to unroll this inner loop and remove bounds checks - // since the outer chunk size (64) is always a multiple of the number of lanes - result_slice - .chunks_exact_mut(lanes) - .zip(left_slice.chunks_exact(lanes).zip(right_slice.chunks_exact(lanes))) - .try_for_each(|(result_slice, (left_slice, right_slice))| -> Result<()> { - let simd_left = T::load(left_slice); - let simd_right = T::load(right_slice); - - let simd_result = simd_checked_divide::(Some(mask), simd_left, simd_right)?; - - T::write(simd_result, 
result_slice); - - // skip the shift and avoid overflow for u8 type, which uses 64 lanes. - mask >>= T::lanes() % 64; - - Ok(()) - }) - }, - )?; - - let valid_remainder = valid_chunks.remainder_bits(); - - simd_checked_divide_remainder::( - Some(valid_remainder), - left_chunks, - right_chunks, - result_chunks, - )?; - } - None => { - let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut left_chunks = left.values().chunks_exact(lanes); - let mut right_chunks = right.values().chunks_exact(lanes); - - result_chunks - .borrow_mut() - .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut())) - .try_for_each( - |(result_slice, (left_slice, right_slice))| -> Result<()> { - let simd_left = T::load(left_slice); - let simd_right = T::load(right_slice); - - let simd_result = - simd_checked_divide::(None, simd_left, simd_right)?; - - T::write(simd_result, result_slice); - - Ok(()) - }, - )?; - - simd_checked_divide_remainder::( - None, - left_chunks, - right_chunks, - result_chunks, - )?; - } - } - - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -/// SIMD vectorized version of `divide_scalar`. -#[cfg(simd)] -fn simd_divide_scalar( - array: &PrimitiveArray, - divisor: T::Native, -) -> Result> -where - T: ArrowNumericType, - T::Native: One + Zero + Div, -{ - if divisor.is_zero() { - return Err(ArrowError::DivideByZero); - } - - let lanes = T::lanes(); - let buffer_size = array.len() * std::mem::size_of::(); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - let mut result_chunks = result.typed_data_mut().chunks_exact_mut(lanes); - let mut array_chunks = array.values().chunks_exact(lanes); - - result_chunks - .borrow_mut() - .zip(array_chunks.borrow_mut()) - .for_each(|(result_slice, array_slice)| { - let simd_left = T::load(array_slice); - let simd_right = T::init(divisor); - - let simd_result = T::bin_op(simd_left, simd_right, |a, b| a / b); - T::write(simd_result, result_slice); - }); - - simd_checked_divide_scalar_remainder::(array_chunks, divisor, result_chunks)?; - - let data = ArrayData::new( - T::DATA_TYPE, - array.len(), - None, - array.data_ref().null_buffer().cloned(), - 0, - vec![result.into()], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -/// Perform `left + right` operation on two arrays. If either left or right value is null -/// then the result is also null. -pub fn add( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result> -where - T: ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Zero, -{ - #[cfg(simd)] - return simd_math_op(&left, &right, |a, b| a + b, |a, b| a + b); - #[cfg(not(simd))] - return math_op(left, right, |a, b| a + b); -} - -/// Perform `left - right` operation on two arrays. If either left or right value is null -/// then the result is also null. -pub fn subtract( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result> -where - T: datatypes::ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Zero, -{ - #[cfg(simd)] - return simd_math_op(&left, &right, |a, b| a - b, |a, b| a - b); - #[cfg(not(simd))] - return math_op(left, right, |a, b| a - b); -} - -/// Perform `-` operation on an array. If value is null then the result is also null. 
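`add` and `subtract` above choose their implementation at compile time: with the `simd` cfg enabled the vectorized path is compiled in, otherwise the call falls through to the scalar `math_op` path, and the unused function never exists in the binary. The same statement-level `#[cfg]` dispatch in isolation (`fast_path` is a made-up cfg name used here only for illustration):

```rust
// Illustrative compile-time dispatch, mirroring the `#[cfg(simd)]` /
// `#[cfg(not(simd))]` pairs used by the arithmetic kernels.
fn sum(values: &[i64]) -> i64 {
    #[cfg(fast_path)]
    return sum_unrolled(values);
    #[cfg(not(fast_path))]
    return values.iter().sum();
}

// Only compiled when the `fast_path` cfg is set; a stand-in for the SIMD path.
#[cfg(fast_path)]
fn sum_unrolled(values: &[i64]) -> i64 {
    let chunks = values.chunks_exact(4);
    let tail: i64 = chunks.remainder().iter().sum();
    let body: i64 = chunks.map(|c| c[0] + c[1] + c[2] + c[3]).sum();
    body + tail
}

fn main() {
    assert_eq!(sum(&[1, 2, 3, 4, 5]), 15);
}
```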
-pub fn negate(array: &PrimitiveArray) -> Result> -where - T: datatypes::ArrowSignedNumericType, - T::Native: Neg, -{ - #[cfg(simd)] - return simd_signed_unary_math_op(array, |x| -x, |x| -x); - #[cfg(not(simd))] - return Ok(unary(array, |x| -x)); -} - -/// Raise array with floating point values to the power of a scalar. -pub fn powf_scalar( - array: &PrimitiveArray, - raise: T::Native, -) -> Result> -where - T: datatypes::ArrowFloatNumericType, - T::Native: Pow, -{ - #[cfg(simd)] - { - let raise_vector = T::init(raise); - return simd_float_unary_math_op( - array, - |x| T::pow(x, raise_vector), - |x| x.pow(raise), - ); - } - #[cfg(not(simd))] - return Ok(unary(array, |x| x.pow(raise))); -} - -/// Perform `left * right` operation on two arrays. If either left or right value is null -/// then the result is also null. -pub fn multiply( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result> -where - T: datatypes::ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Zero, -{ - #[cfg(simd)] - return simd_math_op(&left, &right, |a, b| a * b, |a, b| a * b); - #[cfg(not(simd))] - return math_op(left, right, |a, b| a * b); -} - -/// Perform `left / right` operation on two arrays. If either left or right value is null -/// then the result is also null. If any right hand value is zero then the result of this -/// operation will be `Err(ArrowError::DivideByZero)`. -pub fn divide( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result> -where - T: datatypes::ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Zero - + One, -{ - #[cfg(simd)] - return simd_divide(&left, &right); - #[cfg(not(simd))] - return math_divide(&left, &right); -} - -/// Divide every value in an array by a scalar. If any value in the array is null then the -/// result is also null. If the scalar is zero then the result of this operation will be -/// `Err(ArrowError::DivideByZero)`. 
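The divide kernels documented above are the one place where the values themselves can make the whole call fail: a zero divisor in a valid slot (or a zero scalar) returns `Err(ArrowError::DivideByZero)`, while zeros that sit under a null slot are masked out and ignored. A short usage sketch, assuming the crate layout in this file:

```rust
use arrow::array::Int32Array;
use arrow::compute::kernels::arithmetic::{divide, divide_scalar};
use arrow::error::ArrowError;

fn main() {
    let a = Int32Array::from(vec![Some(10), None, Some(9)]);
    let b = Int32Array::from(vec![Some(2), Some(0), Some(3)]);

    // The zero in `b` sits under a null slot of `a`, so it is masked out
    // and the division succeeds; the output keeps the combined nulls.
    let c = divide(&a, &b).unwrap();
    assert_eq!(c.value(0), 5);
    assert!(c.is_null(1));
    assert_eq!(c.value(2), 3);

    // A zero divisor in a valid slot, or a zero scalar, is an error.
    assert!(matches!(
        divide_scalar(&a, 0),
        Err(ArrowError::DivideByZero)
    ));
}
```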
-pub fn divide_scalar( - array: &PrimitiveArray, - divisor: T::Native, -) -> Result> -where - T: datatypes::ArrowNumericType, - T::Native: Add - + Sub - + Mul - + Div - + Zero - + One, -{ - #[cfg(simd)] - return simd_divide_scalar(&array, divisor); - #[cfg(not(simd))] - return math_divide_scalar(&array, divisor); -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::Int32Array; - - #[test] - fn test_primitive_array_add() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let b = Int32Array::from(vec![6, 7, 8, 9, 8]); - let c = add(&a, &b).unwrap(); - assert_eq!(11, c.value(0)); - assert_eq!(13, c.value(1)); - assert_eq!(15, c.value(2)); - assert_eq!(17, c.value(3)); - assert_eq!(17, c.value(4)); - } - - #[test] - fn test_primitive_array_add_sliced() { - let a = Int32Array::from(vec![0, 0, 0, 5, 6, 7, 8, 9, 0]); - let b = Int32Array::from(vec![0, 0, 0, 6, 7, 8, 9, 8, 0]); - let a = a.slice(3, 5); - let b = b.slice(3, 5); - let a = a.as_any().downcast_ref::().unwrap(); - let b = b.as_any().downcast_ref::().unwrap(); - - assert_eq!(5, a.value(0)); - assert_eq!(6, b.value(0)); - - let c = add(&a, &b).unwrap(); - assert_eq!(5, c.len()); - assert_eq!(11, c.value(0)); - assert_eq!(13, c.value(1)); - assert_eq!(15, c.value(2)); - assert_eq!(17, c.value(3)); - assert_eq!(17, c.value(4)); - } - - #[test] - fn test_primitive_array_add_mismatched_length() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let b = Int32Array::from(vec![6, 7, 8]); - let e = add(&a, &b) - .err() - .expect("should have failed due to different lengths"); - assert_eq!( - "ComputeError(\"Cannot perform math operation on arrays of different length\")", - format!("{:?}", e) - ); - } - - #[test] - fn test_primitive_array_subtract() { - let a = Int32Array::from(vec![1, 2, 3, 4, 5]); - let b = Int32Array::from(vec![5, 4, 3, 2, 1]); - let c = subtract(&a, &b).unwrap(); - assert_eq!(-4, c.value(0)); - assert_eq!(-2, c.value(1)); - assert_eq!(0, c.value(2)); - assert_eq!(2, c.value(3)); - assert_eq!(4, c.value(4)); - } - - #[test] - fn test_primitive_array_multiply() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let b = Int32Array::from(vec![6, 7, 8, 9, 8]); - let c = multiply(&a, &b).unwrap(); - assert_eq!(30, c.value(0)); - assert_eq!(42, c.value(1)); - assert_eq!(56, c.value(2)); - assert_eq!(72, c.value(3)); - assert_eq!(72, c.value(4)); - } - - #[test] - fn test_primitive_array_divide() { - let a = Int32Array::from(vec![15, 15, 8, 1, 9]); - let b = Int32Array::from(vec![5, 6, 8, 9, 1]); - let c = divide(&a, &b).unwrap(); - assert_eq!(3, c.value(0)); - assert_eq!(2, c.value(1)); - assert_eq!(1, c.value(2)); - assert_eq!(0, c.value(3)); - assert_eq!(9, c.value(4)); - } - - #[test] - fn test_primitive_array_divide_scalar() { - let a = Int32Array::from(vec![15, 14, 9, 8, 1]); - let b = 3; - let c = divide_scalar(&a, b).unwrap(); - let expected = Int32Array::from(vec![5, 4, 3, 2, 0]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_array_divide_sliced() { - let a = Int32Array::from(vec![0, 0, 0, 15, 15, 8, 1, 9, 0]); - let b = Int32Array::from(vec![0, 0, 0, 5, 6, 8, 9, 1, 0]); - let a = a.slice(3, 5); - let b = b.slice(3, 5); - let a = a.as_any().downcast_ref::().unwrap(); - let b = b.as_any().downcast_ref::().unwrap(); - - let c = divide(&a, &b).unwrap(); - assert_eq!(5, c.len()); - assert_eq!(3, c.value(0)); - assert_eq!(2, c.value(1)); - assert_eq!(1, c.value(2)); - assert_eq!(0, c.value(3)); - assert_eq!(9, c.value(4)); - } - - #[test] - fn test_primitive_array_divide_with_nulls() { - let 
a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9), None]); - let b = Int32Array::from(vec![Some(5), Some(6), Some(8), Some(9), None, None]); - let c = divide(&a, &b).unwrap(); - assert_eq!(3, c.value(0)); - assert_eq!(true, c.is_null(1)); - assert_eq!(1, c.value(2)); - assert_eq!(0, c.value(3)); - assert_eq!(true, c.is_null(4)); - assert_eq!(true, c.is_null(5)); - } - - #[test] - fn test_primitive_array_divide_scalar_with_nulls() { - let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9), None]); - let b = 3; - let c = divide_scalar(&a, b).unwrap(); - let expected = - Int32Array::from(vec![Some(5), None, Some(2), Some(0), Some(3), None]); - assert_eq!(c, expected); - } - - #[test] - fn test_primitive_array_divide_with_nulls_sliced() { - let a = Int32Array::from(vec![ - None, - None, - None, - None, - None, - None, - None, - None, - Some(15), - None, - Some(8), - Some(1), - Some(9), - None, - None, - ]); - let b = Int32Array::from(vec![ - None, - None, - None, - None, - None, - None, - None, - None, - Some(5), - Some(6), - Some(8), - Some(9), - None, - None, - None, - ]); - - let a = a.slice(8, 6); - let a = a.as_any().downcast_ref::().unwrap(); - - let b = b.slice(8, 6); - let b = b.as_any().downcast_ref::().unwrap(); - - let c = divide(&a, &b).unwrap(); - assert_eq!(6, c.len()); - assert_eq!(3, c.value(0)); - assert_eq!(true, c.is_null(1)); - assert_eq!(1, c.value(2)); - assert_eq!(0, c.value(3)); - assert_eq!(true, c.is_null(4)); - assert_eq!(true, c.is_null(5)); - } - - #[test] - #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_divide_by_zero() { - let a = Int32Array::from(vec![15]); - let b = Int32Array::from(vec![0]); - divide(&a, &b).unwrap(); - } - - #[test] - fn test_primitive_array_divide_f64() { - let a = Float64Array::from(vec![15.0, 15.0, 8.0]); - let b = Float64Array::from(vec![5.0, 6.0, 8.0]); - let c = divide(&a, &b).unwrap(); - assert!(3.0 - c.value(0) < f64::EPSILON); - assert!(2.5 - c.value(1) < f64::EPSILON); - assert!(1.0 - c.value(2) < f64::EPSILON); - } - - #[test] - fn test_primitive_array_add_with_nulls() { - let a = Int32Array::from(vec![Some(5), None, Some(7), None]); - let b = Int32Array::from(vec![None, None, Some(6), Some(7)]); - let c = add(&a, &b).unwrap(); - assert_eq!(true, c.is_null(0)); - assert_eq!(true, c.is_null(1)); - assert_eq!(false, c.is_null(2)); - assert_eq!(true, c.is_null(3)); - assert_eq!(13, c.value(2)); - } - - #[test] - fn test_primitive_array_negate() { - let a: Int64Array = (0..100).into_iter().map(Some).collect(); - let actual = negate(&a).unwrap(); - let expected: Int64Array = (0..100).into_iter().map(|i| Some(-i)).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn test_arithmetic_kernel_should_not_rely_on_padding() { - let a: UInt8Array = (0..128_u8).into_iter().map(Some).collect(); - let a = a.slice(63, 65); - let a = a.as_any().downcast_ref::().unwrap(); - - let b: UInt8Array = (0..128_u8).into_iter().map(Some).collect(); - let b = b.slice(63, 65); - let b = b.as_any().downcast_ref::().unwrap(); - - let actual = add(&a, &b).unwrap(); - let actual: Vec> = actual.iter().collect(); - let expected: Vec> = (63..63_u8 + 65_u8) - .into_iter() - .map(|i| Some(i + i)) - .collect(); - assert_eq!(expected, actual); - } - - #[test] - fn test_primitive_array_raise_power_scalar() { - let a = Float64Array::from(vec![1.0, 2.0, 3.0]); - let actual = powf_scalar(&a, 2.0).unwrap(); - let expected = Float64Array::from(vec![1.0, 4.0, 9.0]); - assert_eq!(expected, actual); - let a = 
Float64Array::from(vec![Some(1.0), None, Some(3.0)]); - let actual = powf_scalar(&a, 2.0).unwrap(); - let expected = Float64Array::from(vec![Some(1.0), None, Some(9.0)]); - assert_eq!(expected, actual); - } -} diff --git a/rust/arrow/src/compute/kernels/arity.rs b/rust/arrow/src/compute/kernels/arity.rs deleted file mode 100644 index 4aa7f3d6e5d..00000000000 --- a/rust/arrow/src/compute/kernels/arity.rs +++ /dev/null @@ -1,74 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines kernels suitable to perform operations to primitive arrays. - -use crate::array::{Array, ArrayData, PrimitiveArray}; -use crate::buffer::Buffer; -use crate::datatypes::ArrowPrimitiveType; - -#[inline] -fn into_primitive_array_data( - array: &PrimitiveArray, - buffer: Buffer, -) -> ArrayData { - ArrayData::new( - O::DATA_TYPE, - array.len(), - None, - array.data_ref().null_buffer().cloned(), - 0, - vec![buffer], - vec![], - ) -} - -/// Applies an unary and infalible function to a primitive array. -/// This is the fastest way to perform an operation on a primitive array when -/// the benefits of a vectorized operation outweights the cost of branching nulls and non-nulls. -/// # Implementation -/// This will apply the function for all values, including those on null slots. -/// This implies that the operation must be infalible for any value of the corresponding type -/// or this function may panic. -/// # Example -/// ```rust -/// # use arrow::array::Int32Array; -/// # use arrow::datatypes::Int32Type; -/// # use arrow::compute::kernels::arity::unary; -/// # fn main() { -/// let array = Int32Array::from(vec![Some(5), Some(7), None]); -/// let c = unary::<_, _, Int32Type>(&array, |x| x * 2 + 1); -/// assert_eq!(c, Int32Array::from(vec![Some(11), Some(15), None])); -/// # } -/// ``` -pub fn unary(array: &PrimitiveArray, op: F) -> PrimitiveArray -where - I: ArrowPrimitiveType, - O: ArrowPrimitiveType, - F: Fn(I::Native) -> O::Native, -{ - let values = array.values().iter().map(|v| op(*v)); - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` is an iterator with a known size because arrays are sized. - let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - - let data = into_primitive_array_data::<_, O>(array, buffer); - PrimitiveArray::::from(data) -} diff --git a/rust/arrow/src/compute/kernels/boolean.rs b/rust/arrow/src/compute/kernels/boolean.rs deleted file mode 100644 index e1d5592d423..00000000000 --- a/rust/arrow/src/compute/kernels/boolean.rs +++ /dev/null @@ -1,1146 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines boolean kernels on Arrow `BooleanArray`'s, e.g. `AND`, `OR` and `NOT`. -//! -//! These kernels can leverage SIMD if available on your system. Currently no runtime -//! detection is provided, you should enable the specific SIMD intrinsics using -//! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation -//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. - -use std::ops::Not; - -use crate::array::{Array, ArrayData, BooleanArray, PrimitiveArray}; -use crate::buffer::{ - buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, -}; -use crate::compute::util::combine_option_bitmap; -use crate::datatypes::{ArrowNumericType, DataType}; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util::{ceil, round_upto_multiple_of_64}; -use core::iter; -use lexical_core::Integer; - -fn binary_boolean_kleene_kernel( - left: &BooleanArray, - right: &BooleanArray, - op: F, -) -> Result -where - F: Fn(u64, u64, u64, u64) -> (u64, u64), -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform bitwise operation on arrays of different length".to_string(), - )); - } - - // length and offset of boolean array is measured in bits - let len = left.len(); - - // result length measured in bytes (incl. remainder) - let mut result_len = round_upto_multiple_of_64(len) / 8; - // The iterator that applies the kleene_op closure always chains an additional iteration - // for the remainder chunk, even without a remainder. If the remainder is absent - // (length % 64 == 0), kleene_op would resize the result buffers (value_buffer and - // valid_buffer) to store 8 additional bytes, because result_len wouldn't include a remainder - // chunk. The resizing is unnecessary and expensive. We can prevent it by adding 8 bytes to - // result_len here. Nonetheless, all bits of these 8 bytes will be 0. 
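The Kleene kernel never inspects individual booleans: each 64-slot chunk is reduced to a `(data, valid)` pair of `u64` words, the closure gets the derived `left_true`/`left_false`/`right_true`/`right_false` masks, and hands back one value word plus one validity word. A self-contained sketch of that word-level encoding, using the AND formula that `and_kleene` passes in further down (`kleene_and_words` is an illustrative name):

```rust
/// Word-level Kleene AND over 64 slots at a time, mirroring the
/// `(data, valid)` encoding used by `binary_boolean_kleene_kernel`.
fn kleene_and_words(
    (l_data, l_valid): (u64, u64),
    (r_data, r_valid): (u64, u64),
) -> (u64, u64) {
    let left_true = l_valid & l_data;
    let left_false = l_valid & !l_data;
    let right_true = r_valid & r_data;
    let right_false = r_valid & !r_data;

    // Same op that `and_kleene` supplies: a slot is valid whenever either
    // side is definitely false, or both sides are definitely true.
    let value = left_true & right_true;
    let valid = left_false | right_false | (left_true & right_true);
    (value, valid)
}

fn main() {
    // Slot 0: true  AND null -> null
    // Slot 1: false AND null -> false (still valid)
    // Slot 2: true  AND true -> true
    let (l_data, l_valid) = (0b101u64, 0b111u64); // left: [true, false, true], all valid
    let (r_data, r_valid) = (0b100u64, 0b100u64); // right: [null, null, true]

    let (value, valid) = kleene_and_words((l_data, l_valid), (r_data, r_valid));
    assert_eq!(valid & 0b001, 0); // slot 0 is null
    assert_eq!(valid & 0b010, 0b010); // slot 1 is valid ...
    assert_eq!(value & 0b010, 0); // ... and false
    assert_eq!(valid & 0b100, 0b100); // slot 2 is valid ...
    assert_eq!(value & 0b100, 0b100); // ... and true
}
```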
- if len % 64 == 0 { - result_len += 8; - } - - let mut value_buffer = MutableBuffer::new(result_len); - let mut valid_buffer = MutableBuffer::new(result_len); - - let kleene_op = |((left_data, left_valid), (right_data, right_valid)): ( - (u64, u64), - (u64, u64), - )| { - let left_true = left_valid & left_data; - let left_false = left_valid & !left_data; - - let right_true = right_valid & right_data; - let right_false = right_valid & !right_data; - - let (value, valid) = op(left_true, left_false, right_true, right_false); - - value_buffer.extend_from_slice(&[value]); - valid_buffer.extend_from_slice(&[valid]); - }; - - let left_offset = left.offset(); - let right_offset = right.offset(); - - let left_buffer = left.values(); - let right_buffer = right.values(); - - let left_chunks = left_buffer.bit_chunks(left_offset, len); - let right_chunks = right_buffer.bit_chunks(right_offset, len); - - let left_rem = left_chunks.remainder_bits(); - let right_rem = right_chunks.remainder_bits(); - - let opt_left_valid_chunks_and_rem = left - .data_ref() - .null_buffer() - .map(|b| b.bit_chunks(left_offset, len)) - .map(|chunks| (chunks.iter(), chunks.remainder_bits())); - let opt_right_valid_chunks_and_rem = right - .data_ref() - .null_buffer() - .map(|b| b.bit_chunks(right_offset, len)) - .map(|chunks| (chunks.iter(), chunks.remainder_bits())); - - match ( - opt_left_valid_chunks_and_rem, - opt_right_valid_chunks_and_rem, - ) { - ( - Some((left_valid_chunks, left_valid_rem)), - Some((right_valid_chunks, right_valid_rem)), - ) => { - left_chunks - .iter() - .zip(left_valid_chunks) - .zip(right_chunks.iter().zip(right_valid_chunks)) - .chain(iter::once(( - (left_rem, left_valid_rem), - (right_rem, right_valid_rem), - ))) - .for_each(kleene_op); - } - (Some((left_valid_chunks, left_valid_rem)), None) => { - left_chunks - .iter() - .zip(left_valid_chunks) - .zip(right_chunks.iter().zip(iter::repeat(u64::MAX))) - .chain(iter::once(( - (left_rem, left_valid_rem), - (right_rem, u64::MAX), - ))) - .for_each(kleene_op); - } - (None, Some((right_valid_chunks, right_valid_rem))) => { - left_chunks - .iter() - .zip(iter::repeat(u64::MAX)) - .zip(right_chunks.iter().zip(right_valid_chunks)) - .chain(iter::once(( - (left_rem, u64::MAX), - (right_rem, right_valid_rem), - ))) - .for_each(kleene_op); - } - (None, None) => { - left_chunks - .iter() - .zip(iter::repeat(u64::MAX)) - .zip(right_chunks.iter().zip(iter::repeat(u64::MAX))) - .chain(iter::once(((left_rem, u64::MAX), (right_rem, u64::MAX)))) - .for_each(kleene_op); - } - }; - - let bool_buffer: Buffer = value_buffer.into(); - let bool_valid_buffer: Buffer = valid_buffer.into(); - - let array_data = ArrayData::new( - DataType::Boolean, - len, - None, - Some(bool_valid_buffer), - left_offset, - vec![bool_buffer], - vec![], - ); - - Ok(BooleanArray::from(array_data)) -} - -/// Helper function to implement binary kernels -fn binary_boolean_kernel( - left: &BooleanArray, - right: &BooleanArray, - op: F, -) -> Result -where - F: Fn(&Buffer, usize, &Buffer, usize, usize) -> Buffer, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform bitwise operation on arrays of different length".to_string(), - )); - } - - let len = left.len(); - - let left_data = left.data_ref(); - let right_data = right.data_ref(); - let null_bit_buffer = combine_option_bitmap(&left_data, &right_data, len)?; - - let left_buffer = &left_data.buffers()[0]; - let right_buffer = &right_data.buffers()[0]; - let left_offset = left.offset(); - let right_offset = 
right.offset(); - - let values = op(&left_buffer, left_offset, &right_buffer, right_offset, len); - - let data = ArrayData::new( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![values], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -/// Performs `AND` operation on two arrays. If either left or right value is null then the -/// result is also null. -/// # Error -/// This function errors when the arrays have different lengths. -/// # Example -/// ```rust -/// use arrow::array::BooleanArray; -/// use arrow::error::Result; -/// use arrow::compute::kernels::boolean::and; -/// # fn main() -> Result<()> { -/// let a = BooleanArray::from(vec![Some(false), Some(true), None]); -/// let b = BooleanArray::from(vec![Some(true), Some(true), Some(false)]); -/// let and_ab = and(&a, &b)?; -/// assert_eq!(and_ab, BooleanArray::from(vec![Some(false), Some(true), None])); -/// # Ok(()) -/// # } -/// ``` -pub fn and(left: &BooleanArray, right: &BooleanArray) -> Result { - binary_boolean_kernel(&left, &right, buffer_bin_and) -} - -/// Logical 'and' boolean values with Kleene logic -/// -/// # Behavior -/// -/// This function behaves as follows with nulls: -/// -/// * `true` and `null` = `null` -/// * `null` and `true` = `null` -/// * `false` and `null` = `false` -/// * `null` and `false` = `false` -/// * `null` and `null` = `null` -/// -/// In other words, in this context a null value really means \"unknown\", -/// and an unknown value 'and' false is always false. -/// For a different null behavior, see function \"and\". -/// -/// # Example -/// -/// ```rust -/// use arrow::array::BooleanArray; -/// use arrow::error::Result; -/// use arrow::compute::kernels::boolean::and_kleene; -/// # fn main() -> Result<()> { -/// let a = BooleanArray::from(vec![Some(true), Some(false), None]); -/// let b = BooleanArray::from(vec![None, None, None]); -/// let and_ab = and_kleene(&a, &b)?; -/// assert_eq!(and_ab, BooleanArray::from(vec![None, Some(false), None])); -/// # Ok(()) -/// # } -/// ``` -/// -/// # Fails -/// -/// If the operands have different lengths -pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result { - if left.null_count().is_zero() && right.null_count().is_zero() { - return and(left, right); - } - - let op = |left_true, left_false, right_true, right_false| { - ( - left_true & right_true, - left_false | right_false | (left_true & right_true), - ) - }; - - binary_boolean_kleene_kernel(left, right, op) -} - -/// Performs `OR` operation on two arrays. If either left or right value is null then the -/// result is also null. -/// # Error -/// This function errors when the arrays have different lengths. 
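In practice the difference between the two AND kernels only shows up around nulls: `and` yields null whenever either input slot is null, while `and_kleene` can still return a definite `false`. A short comparison on the same inputs, assuming the crate layout in this file:

```rust
use arrow::array::BooleanArray;
use arrow::compute::kernels::boolean::{and, and_kleene};
use arrow::error::Result;

fn main() -> Result<()> {
    let a = BooleanArray::from(vec![Some(false), Some(true), None]);
    let b = BooleanArray::from(vec![None, None, None]);

    // Plain `and`: any null input slot makes the output slot null.
    assert_eq!(and(&a, &b)?, BooleanArray::from(vec![None, None, None]));

    // Kleene `and`: `false AND unknown` is still definitely `false`.
    assert_eq!(
        and_kleene(&a, &b)?,
        BooleanArray::from(vec![Some(false), None, None])
    );
    Ok(())
}
```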
-/// # Example -/// ```rust -/// use arrow::array::BooleanArray; -/// use arrow::error::Result; -/// use arrow::compute::kernels::boolean::or; -/// # fn main() -> Result<()> { -/// let a = BooleanArray::from(vec![Some(false), Some(true), None]); -/// let b = BooleanArray::from(vec![Some(true), Some(true), Some(false)]); -/// let or_ab = or(&a, &b)?; -/// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), Some(true), None])); -/// # Ok(()) -/// # } -/// ``` -pub fn or(left: &BooleanArray, right: &BooleanArray) -> Result { - binary_boolean_kernel(&left, &right, buffer_bin_or) -} - -/// Logical 'or' boolean values with Kleene logic -/// -/// # Behavior -/// -/// This function behaves as follows with nulls: -/// -/// * `true` or `null` = `true` -/// * `null` or `true` = `true` -/// * `false` or `null` = `null` -/// * `null` or `false` = `null` -/// * `null` or `null` = `null` -/// -/// In other words, in this context a null value really means \"unknown\", -/// and an unknown value 'or' true is always true. -/// For a different null behavior, see function \"or\". -/// -/// # Example -/// -/// ```rust -/// use arrow::array::BooleanArray; -/// use arrow::error::Result; -/// use arrow::compute::kernels::boolean::or_kleene; -/// # fn main() -> Result<()> { -/// let a = BooleanArray::from(vec![Some(true), Some(false), None]); -/// let b = BooleanArray::from(vec![None, None, None]); -/// let or_ab = or_kleene(&a, &b)?; -/// assert_eq!(or_ab, BooleanArray::from(vec![Some(true), None, None])); -/// # Ok(()) -/// # } -/// ``` -/// -/// # Fails -/// -/// If the operands have different lengths -pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result { - if left.null_count().is_zero() && right.null_count().is_zero() { - return or(left, right); - } - - let op = |left_true, left_false, right_true, right_false| { - ( - left_true | right_true, - left_true | right_true | (left_false & right_false), - ) - }; - - binary_boolean_kleene_kernel(left, right, op) -} - -/// Performs unary `NOT` operation on an arrays. If value is null then the result is also -/// null. -/// # Error -/// This function never errors. It returns an error for consistency. -/// # Example -/// ```rust -/// use arrow::array::BooleanArray; -/// use arrow::error::Result; -/// use arrow::compute::kernels::boolean::not; -/// # fn main() -> Result<()> { -/// let a = BooleanArray::from(vec![Some(false), Some(true), None]); -/// let not_a = not(&a)?; -/// assert_eq!(not_a, BooleanArray::from(vec![Some(true), Some(false), None])); -/// # Ok(()) -/// # } -/// ``` -pub fn not(left: &BooleanArray) -> Result { - let left_offset = left.offset(); - let len = left.len(); - - let data = left.data_ref(); - let null_bit_buffer = data - .null_bitmap() - .as_ref() - .map(|b| b.bits.slice(left_offset)); - - let values = buffer_unary_not(&data.buffers()[0], left_offset, len); - - let data = ArrayData::new( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![values], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -/// Returns a non-null [BooleanArray] with whether each value of the array is null. -/// # Error -/// This function never errors. 
-/// # Example -/// ```rust -/// # use arrow::error::Result; -/// use arrow::array::BooleanArray; -/// use arrow::compute::kernels::boolean::is_null; -/// # fn main() -> Result<()> { -/// let a = BooleanArray::from(vec![Some(false), Some(true), None]); -/// let a_is_null = is_null(&a)?; -/// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true])); -/// # Ok(()) -/// # } -/// ``` -pub fn is_null(input: &Array) -> Result { - let len = input.len(); - - let output = match input.data_ref().null_buffer() { - None => { - let len_bytes = ceil(len, 8); - MutableBuffer::from_len_zeroed(len_bytes).into() - } - Some(buffer) => buffer_unary_not(buffer, input.offset(), len), - }; - - let data = - ArrayData::new(DataType::Boolean, len, None, None, 0, vec![output], vec![]); - - Ok(BooleanArray::from(data)) -} - -/// Returns a non-null [BooleanArray] with whether each value of the array is not null. -/// # Error -/// This function never errors. -/// # Example -/// ```rust -/// # use arrow::error::Result; -/// use arrow::array::BooleanArray; -/// use arrow::compute::kernels::boolean::is_not_null; -/// # fn main() -> Result<()> { -/// let a = BooleanArray::from(vec![Some(false), Some(true), None]); -/// let a_is_not_null = is_not_null(&a)?; -/// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false])); -/// # Ok(()) -/// # } -/// ``` -pub fn is_not_null(input: &Array) -> Result { - let len = input.len(); - - let output = match input.data_ref().null_buffer() { - None => { - let len_bytes = ceil(len, 8); - MutableBuffer::new(len_bytes) - .with_bitset(len_bytes, true) - .into() - } - Some(buffer) => buffer.bit_slice(input.offset(), len), - }; - - let data = - ArrayData::new(DataType::Boolean, len, None, None, 0, vec![output], vec![]); - - Ok(BooleanArray::from(data)) -} - -/// Copies original array, setting null bit to true if a secondary comparison boolean array is set to true. -/// Typically used to implement NULLIF. -// NOTE: For now this only supports Primitive Arrays. Although the code could be made generic, the issue -// is that currently the bitmap operations result in a final bitmap which is aligned to bit 0, and thus -// the left array's data needs to be sliced to a new offset, and for non-primitive arrays shifting the -// data might be too complicated. In the future, to avoid shifting left array's data, we could instead -// shift the final bitbuffer to the right, prepending with 0's instead. 
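The NOTE above reduces `nullif` to a single bitmap identity: a slot stays valid only when the left slot was valid and the comparison is not a valid `true`, i.e. `left_valid & !(right_values & right_valid)`. A per-slot check of that identity against the five cases listed in the comment, with plain booleans standing in for the bitmap words (`keep_slot` is an illustrative name):

```rust
/// Per-slot form of the bitmap identity used by `nullif`: keep the value
/// only if the left slot is valid and the comparison is not a valid `true`.
fn keep_slot(left_valid: bool, comp_value: bool, comp_valid: bool) -> bool {
    left_valid && !(comp_value && comp_valid)
}

fn main() {
    // left null, comparison null or true -> output stays null
    assert!(!keep_slot(false, false, false));
    assert!(!keep_slot(false, true, true));
    // left set, comparison null          -> passthrough
    assert!(keep_slot(true, false, false));
    // left set, comparison valid true    -> output becomes null
    assert!(!keep_slot(true, true, true));
    // left set, comparison valid false   -> passthrough
    assert!(keep_slot(true, false, true));
}
```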
-pub fn nullif( - left: &PrimitiveArray, - right: &BooleanArray, -) -> Result> -where - T: ArrowNumericType, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - let left_data = left.data(); - let right_data = right.data(); - - // If left has no bitmap, create a new one with all values set for nullity op later - // left=0 (null) right=null output bitmap=null - // left=0 right=1 output bitmap=null - // left=1 (set) right=null output bitmap=set (passthrough) - // left=1 right=1 & comp=true output bitmap=null - // left=1 right=1 & comp=false output bitmap=set - // - // Thus: result = left null bitmap & (!right_values | !right_bitmap) - // OR left null bitmap & !(right_values & right_bitmap) - // - // Do the right expression !(right_values & right_bitmap) first since there are two steps - // TRICK: convert BooleanArray buffer as a bitmap for faster operation - let right_combo_buffer = match right.data().null_bitmap() { - Some(right_bitmap) => { - // NOTE: right values and bitmaps are combined and stay at bit offset right.offset() - (right.values() & &right_bitmap.bits).ok().map(|b| b.not()) - } - None => Some(!right.values()), - }; - - // AND of original left null bitmap with right expression - // Here we take care of the possible offsets of the left and right arrays all at once. - let modified_null_buffer = match left_data.null_bitmap() { - Some(left_null_bitmap) => match right_combo_buffer { - Some(rcb) => Some(buffer_bin_and( - &left_null_bitmap.bits, - left_data.offset(), - &rcb, - right_data.offset(), - left_data.len(), - )), - None => Some( - left_null_bitmap - .bits - .bit_slice(left_data.offset(), left.len()), - ), - }, - None => right_combo_buffer - .map(|rcb| rcb.bit_slice(right_data.offset(), right_data.len())), - }; - - // Align/shift left data on offset as needed, since new bitmaps are shifted and aligned to 0 already - // NOTE: this probably only works for primitive arrays. - let data_buffers = if left.offset() == 0 { - left_data.buffers().to_vec() - } else { - // Shift each data buffer by type's bit_width * offset. 
- left_data - .buffers() - .iter() - .map(|buf| buf.slice(left.offset() * T::get_byte_width())) - .collect::>() - }; - - // Construct new array with same values but modified null bitmap - // TODO: shift data buffer as needed - let data = ArrayData::new( - T::DATA_TYPE, - left.len(), - None, // force new to compute the number of null bits - modified_null_buffer, - 0, // No need for offset since left data has been shifted - data_buffers, - left_data.child_data().to_vec(), - ); - Ok(PrimitiveArray::::from(data)) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::{ArrayRef, Int32Array}; - use std::sync::Arc; - - #[test] - fn test_bool_array_and() { - let a = BooleanArray::from(vec![false, false, true, true]); - let b = BooleanArray::from(vec![false, true, false, true]); - let c = and(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![false, false, false, true]); - - assert_eq!(c, expected); - } - - #[test] - fn test_bool_array_or() { - let a = BooleanArray::from(vec![false, false, true, true]); - let b = BooleanArray::from(vec![false, true, false, true]); - let c = or(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![false, true, true, true]); - - assert_eq!(c, expected); - } - - #[test] - fn test_bool_array_or_nulls() { - let a = BooleanArray::from(vec![ - None, - None, - None, - Some(false), - Some(false), - Some(false), - Some(true), - Some(true), - Some(true), - ]); - let b = BooleanArray::from(vec![ - None, - Some(false), - Some(true), - None, - Some(false), - Some(true), - None, - Some(false), - Some(true), - ]); - let c = or(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![ - None, - None, - None, - None, - Some(false), - Some(true), - None, - Some(true), - Some(true), - ]); - - assert_eq!(c, expected); - } - - #[test] - fn test_binary_boolean_kleene_kernel() { - // the kleene kernel is based on chunking and we want to also create - // cases, where the number of values is not a multiple of 64 - for &value in [true, false].iter() { - for &is_valid in [true, false].iter() { - for &n in [0usize, 1, 63, 64, 65, 127, 128].iter() { - let a = BooleanArray::from(vec![Some(true); n]); - let b = BooleanArray::from(vec![None; n]); - - let result = binary_boolean_kleene_kernel(&a, &b, |_, _, _, _| { - let tmp_value = if value { u64::MAX } else { 0 }; - let tmp_is_valid = if is_valid { u64::MAX } else { 0 }; - (tmp_value, tmp_is_valid) - }) - .unwrap(); - - assert_eq!(result.len(), n); - (0..n).for_each(|idx| { - assert_eq!(value, result.value(idx)); - assert_eq!(is_valid, result.is_valid(idx)); - }); - } - } - } - } - - #[test] - fn test_boolean_array_kleene_no_remainder() { - let n = 1024; - let a = BooleanArray::from(vec![true; n]); - let b = BooleanArray::from(vec![None; n]); - let result = or_kleene(&a, &b).unwrap(); - - assert_eq!(result, a); - } - - #[test] - fn test_bool_array_and_kleene_nulls() { - let a = BooleanArray::from(vec![ - None, - None, - None, - Some(false), - Some(false), - Some(false), - Some(true), - Some(true), - Some(true), - ]); - let b = BooleanArray::from(vec![ - None, - Some(false), - Some(true), - None, - Some(false), - Some(true), - None, - Some(false), - Some(true), - ]); - let c = and_kleene(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![ - None, - Some(false), - None, - Some(false), - Some(false), - Some(false), - None, - Some(false), - Some(true), - ]); - - assert_eq!(c, expected); - } - - #[test] - fn test_bool_array_or_kleene_nulls() { - let a = BooleanArray::from(vec![ - None, - None, - None, - 
Some(false), - Some(false), - Some(false), - Some(true), - Some(true), - Some(true), - ]); - let b = BooleanArray::from(vec![ - None, - Some(false), - Some(true), - None, - Some(false), - Some(true), - None, - Some(false), - Some(true), - ]); - let c = or_kleene(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![ - None, - None, - Some(true), - None, - Some(false), - Some(true), - Some(true), - Some(true), - Some(true), - ]); - - assert_eq!(c, expected); - } - - #[test] - fn test_bool_array_or_kleene_right_sided_nulls() { - let a = BooleanArray::from(vec![false, false, false, true, true, true]); - - // ensure null bitmap of a is absent - assert!(a.data_ref().null_bitmap().is_none()); - - let b = BooleanArray::from(vec![ - Some(true), - Some(false), - None, - Some(true), - Some(false), - None, - ]); - - // ensure null bitmap of b is present - assert!(b.data_ref().null_bitmap().is_some()); - - let c = or_kleene(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![ - Some(true), - Some(false), - None, - Some(true), - Some(true), - Some(true), - ]); - - assert_eq!(c, expected); - } - - #[test] - fn test_bool_array_or_kleene_left_sided_nulls() { - let a = BooleanArray::from(vec![ - Some(true), - Some(false), - None, - Some(true), - Some(false), - None, - ]); - - // ensure null bitmap of b is absent - assert!(a.data_ref().null_bitmap().is_some()); - - let b = BooleanArray::from(vec![false, false, false, true, true, true]); - - // ensure null bitmap of a is present - assert!(b.data_ref().null_bitmap().is_none()); - - let c = or_kleene(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![ - Some(true), - Some(false), - None, - Some(true), - Some(true), - Some(true), - ]); - - assert_eq!(c, expected); - } - - #[test] - fn test_bool_array_not() { - let a = BooleanArray::from(vec![false, true]); - let c = not(&a).unwrap(); - - let expected = BooleanArray::from(vec![true, false]); - - assert_eq!(c, expected); - } - - #[test] - fn test_bool_array_and_nulls() { - let a = BooleanArray::from(vec![ - None, - None, - None, - Some(false), - Some(false), - Some(false), - Some(true), - Some(true), - Some(true), - ]); - let b = BooleanArray::from(vec![ - None, - Some(false), - Some(true), - None, - Some(false), - Some(true), - None, - Some(false), - Some(true), - ]); - let c = and(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![ - None, - None, - None, - None, - Some(false), - Some(false), - None, - Some(false), - Some(true), - ]); - - assert_eq!(c, expected); - } - - #[test] - fn test_bool_array_and_sliced_same_offset() { - let a = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, false, true, - true, - ]); - let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, - ]); - - let a = a.slice(8, 4); - let a = a.as_any().downcast_ref::().unwrap(); - let b = b.slice(8, 4); - let b = b.as_any().downcast_ref::().unwrap(); - - let c = and(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![false, false, false, true]); - - assert_eq!(expected, c); - } - - #[test] - fn test_bool_array_and_sliced_same_offset_mod8() { - let a = BooleanArray::from(vec![ - false, false, true, true, false, false, false, false, false, false, false, - false, - ]); - let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, - ]); - - let a = a.slice(0, 4); - let a = a.as_any().downcast_ref::().unwrap(); - let b = b.slice(8, 4); - let b 
= b.as_any().downcast_ref::().unwrap(); - - let c = and(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![false, false, false, true]); - - assert_eq!(expected, c); - } - - #[test] - fn test_bool_array_and_sliced_offset1() { - let a = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, false, true, - true, - ]); - let b = BooleanArray::from(vec![false, true, false, true]); - - let a = a.slice(8, 4); - let a = a.as_any().downcast_ref::().unwrap(); - - let c = and(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![false, false, false, true]); - - assert_eq!(expected, c); - } - - #[test] - fn test_bool_array_and_sliced_offset2() { - let a = BooleanArray::from(vec![false, false, true, true]); - let b = BooleanArray::from(vec![ - false, false, false, false, false, false, false, false, false, true, false, - true, - ]); - - let b = b.slice(8, 4); - let b = b.as_any().downcast_ref::().unwrap(); - - let c = and(&a, &b).unwrap(); - - let expected = BooleanArray::from(vec![false, false, false, true]); - - assert_eq!(expected, c); - } - - #[test] - fn test_bool_array_and_nulls_offset() { - let a = BooleanArray::from(vec![None, Some(false), Some(true), None, Some(true)]); - let a = a.slice(1, 4); - let a = a.as_any().downcast_ref::().unwrap(); - - let b = BooleanArray::from(vec![ - None, - None, - Some(true), - Some(false), - Some(true), - Some(true), - ]); - - let b = b.slice(2, 4); - let b = b.as_any().downcast_ref::().unwrap(); - - let c = and(&a, &b).unwrap(); - - let expected = - BooleanArray::from(vec![Some(false), Some(false), None, Some(true)]); - - assert_eq!(expected, c); - } - - #[test] - fn test_nonnull_array_is_null() { - let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); - - let res = is_null(a.as_ref()).unwrap(); - - let expected = BooleanArray::from(vec![false, false, false, false]); - - assert_eq!(expected, res); - assert_eq!(&None, res.data_ref().null_bitmap()); - } - - #[test] - fn test_nonnull_array_with_offset_is_null() { - let a = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1]); - let a = a.slice(8, 4); - - let res = is_null(a.as_ref()).unwrap(); - - let expected = BooleanArray::from(vec![false, false, false, false]); - - assert_eq!(expected, res); - assert_eq!(&None, res.data_ref().null_bitmap()); - } - - #[test] - fn test_nonnull_array_is_not_null() { - let a = Int32Array::from(vec![1, 2, 3, 4]); - - let res = is_not_null(&a).unwrap(); - - let expected = BooleanArray::from(vec![true, true, true, true]); - - assert_eq!(expected, res); - assert_eq!(&None, res.data_ref().null_bitmap()); - } - - #[test] - fn test_nonnull_array_with_offset_is_not_null() { - let a = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 7, 6, 5, 4, 3, 2, 1]); - let a = a.slice(8, 4); - - let res = is_not_null(a.as_ref()).unwrap(); - - let expected = BooleanArray::from(vec![true, true, true, true]); - - assert_eq!(expected, res); - assert_eq!(&None, res.data_ref().null_bitmap()); - } - - #[test] - fn test_nullable_array_is_null() { - let a = Int32Array::from(vec![Some(1), None, Some(3), None]); - - let res = is_null(&a).unwrap(); - - let expected = BooleanArray::from(vec![false, true, false, true]); - - assert_eq!(expected, res); - assert_eq!(&None, res.data_ref().null_bitmap()); - } - - #[test] - fn test_nullable_array_with_offset_is_null() { - let a = Int32Array::from(vec![ - None, - None, - None, - None, - None, - None, - None, - None, - // offset 8, previous None values are skipped by the slice - Some(1), - None, - 
Some(2), - None, - Some(3), - Some(4), - None, - None, - ]); - let a = a.slice(8, 4); - - let res = is_null(a.as_ref()).unwrap(); - - let expected = BooleanArray::from(vec![false, true, false, true]); - - assert_eq!(expected, res); - assert_eq!(&None, res.data_ref().null_bitmap()); - } - - #[test] - fn test_nullable_array_is_not_null() { - let a = Int32Array::from(vec![Some(1), None, Some(3), None]); - - let res = is_not_null(&a).unwrap(); - - let expected = BooleanArray::from(vec![true, false, true, false]); - - assert_eq!(expected, res); - assert_eq!(&None, res.data_ref().null_bitmap()); - } - - #[test] - fn test_nullable_array_with_offset_is_not_null() { - let a = Int32Array::from(vec![ - None, - None, - None, - None, - None, - None, - None, - None, - // offset 8, previous None values are skipped by the slice - Some(1), - None, - Some(2), - None, - Some(3), - Some(4), - None, - None, - ]); - let a = a.slice(8, 4); - - let res = is_not_null(a.as_ref()).unwrap(); - - let expected = BooleanArray::from(vec![true, false, true, false]); - - assert_eq!(expected, res); - assert_eq!(&None, res.data_ref().null_bitmap()); - } - - #[test] - fn test_nullif_int_array() { - let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]); - let comp = - BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); - let res = nullif(&a, &comp).unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), - None, - None, // comp true, slot 2 turned into null - Some(1), - // Even though comp array / right is null, should still pass through original value - // comp true, slot 2 turned into null - Some(9), - ]); - - assert_eq!(expected, res); - } - - #[test] - fn test_nullif_int_array_offset() { - let a = Int32Array::from(vec![None, Some(15), Some(8), Some(1), Some(9)]); - let a = a.slice(1, 3); // Some(15), Some(8), Some(1) - let a = a.as_any().downcast_ref::().unwrap(); - let comp = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&a, &comp).unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), // False => keep it - Some(8), // None => keep it - None, // true => None - ]); - assert_eq!(&expected, &res) - } -} diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs deleted file mode 100644 index de1516b0768..00000000000 --- a/rust/arrow/src/compute/kernels/cast.rs +++ /dev/null @@ -1,3843 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines cast kernels for `ArrayRef`, to convert `Array`s between -//! supported datatypes. -//! -//! Example: -//! -//! ``` -//! 
use arrow::array::*; -//! use arrow::compute::cast; -//! use arrow::datatypes::DataType; -//! use std::sync::Arc; -//! -//! let a = Int32Array::from(vec![5, 6, 7]); -//! let array = Arc::new(a) as ArrayRef; -//! let b = cast(&array, &DataType::Float64).unwrap(); -//! let c = b.as_any().downcast_ref::().unwrap(); -//! assert_eq!(5.0, c.value(0)); -//! assert_eq!(6.0, c.value(1)); -//! assert_eq!(7.0, c.value(2)); -//! ``` - -use std::str; -use std::sync::Arc; - -use crate::buffer::MutableBuffer; -use crate::compute::kernels::arithmetic::{divide, multiply}; -use crate::compute::kernels::arity::unary; -use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::{array::*, compute::take}; -use crate::{buffer::Buffer, util::serialization::lexical_to_string}; -use num::{NumCast, ToPrimitive}; - -/// CastOptions provides a way to override the default cast behaviors -#[derive(Debug)] -pub struct CastOptions { - /// how to handle cast failures, either return NULL (safe=true) or return ERR (safe=false) - pub safe: bool, -} - -pub const DEFAULT_CAST_OPTIONS: CastOptions = CastOptions { safe: true }; - -/// Return true if a value of type `from_type` can be cast into a -/// value of `to_type`. Note that such as cast may be lossy. -/// -/// If this function returns true to stay consistent with the `cast` kernel below. -pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { - use self::DataType::*; - if from_type == to_type { - return true; - } - - match (from_type, to_type) { - (Struct(_), _) => false, - (_, Struct(_)) => false, - (LargeList(list_from), LargeList(list_to)) => { - can_cast_types(list_from.data_type(), list_to.data_type()) - } - (List(list_from), List(list_to)) => { - can_cast_types(list_from.data_type(), list_to.data_type()) - } - (List(list_from), LargeList(list_to)) => { - list_from.data_type() == list_to.data_type() - } - (List(_), _) => false, - (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()), - (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()), - (Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => { - can_cast_types(from_value_type, to_value_type) - } - (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), - (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), - - (_, Boolean) => DataType::is_numeric(from_type), - (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8, - - (Utf8, LargeUtf8) => true, - (LargeUtf8, Utf8) => true, - (Utf8, Date32) => true, - (Utf8, Date64) => true, - (Utf8, Timestamp(TimeUnit::Nanosecond, None)) => true, - (Utf8, _) => DataType::is_numeric(to_type), - (LargeUtf8, Date32) => true, - (LargeUtf8, Date64) => true, - (LargeUtf8, Timestamp(TimeUnit::Nanosecond, None)) => true, - (LargeUtf8, _) => DataType::is_numeric(to_type), - (_, Utf8) | (_, LargeUtf8) => { - DataType::is_numeric(from_type) || from_type == &Binary - } - - // start numeric casts - (UInt8, UInt16) => true, - (UInt8, UInt32) => true, - (UInt8, UInt64) => true, - (UInt8, Int8) => true, - (UInt8, Int16) => true, - (UInt8, Int32) => true, - (UInt8, Int64) => true, - (UInt8, Float32) => true, - (UInt8, Float64) => true, - - (UInt16, UInt8) => true, - (UInt16, UInt32) => true, - (UInt16, UInt64) => true, - (UInt16, Int8) => true, - (UInt16, Int16) => true, - (UInt16, Int32) => true, - (UInt16, Int64) => true, - (UInt16, Float32) => true, - (UInt16, Float64) => true, - - (UInt32, UInt8) 
=> true, - (UInt32, UInt16) => true, - (UInt32, UInt64) => true, - (UInt32, Int8) => true, - (UInt32, Int16) => true, - (UInt32, Int32) => true, - (UInt32, Int64) => true, - (UInt32, Float32) => true, - (UInt32, Float64) => true, - - (UInt64, UInt8) => true, - (UInt64, UInt16) => true, - (UInt64, UInt32) => true, - (UInt64, Int8) => true, - (UInt64, Int16) => true, - (UInt64, Int32) => true, - (UInt64, Int64) => true, - (UInt64, Float32) => true, - (UInt64, Float64) => true, - - (Int8, UInt8) => true, - (Int8, UInt16) => true, - (Int8, UInt32) => true, - (Int8, UInt64) => true, - (Int8, Int16) => true, - (Int8, Int32) => true, - (Int8, Int64) => true, - (Int8, Float32) => true, - (Int8, Float64) => true, - - (Int16, UInt8) => true, - (Int16, UInt16) => true, - (Int16, UInt32) => true, - (Int16, UInt64) => true, - (Int16, Int8) => true, - (Int16, Int32) => true, - (Int16, Int64) => true, - (Int16, Float32) => true, - (Int16, Float64) => true, - - (Int32, UInt8) => true, - (Int32, UInt16) => true, - (Int32, UInt32) => true, - (Int32, UInt64) => true, - (Int32, Int8) => true, - (Int32, Int16) => true, - (Int32, Int64) => true, - (Int32, Float32) => true, - (Int32, Float64) => true, - - (Int64, UInt8) => true, - (Int64, UInt16) => true, - (Int64, UInt32) => true, - (Int64, UInt64) => true, - (Int64, Int8) => true, - (Int64, Int16) => true, - (Int64, Int32) => true, - (Int64, Float32) => true, - (Int64, Float64) => true, - - (Float32, UInt8) => true, - (Float32, UInt16) => true, - (Float32, UInt32) => true, - (Float32, UInt64) => true, - (Float32, Int8) => true, - (Float32, Int16) => true, - (Float32, Int32) => true, - (Float32, Int64) => true, - (Float32, Float64) => true, - - (Float64, UInt8) => true, - (Float64, UInt16) => true, - (Float64, UInt32) => true, - (Float64, UInt64) => true, - (Float64, Int8) => true, - (Float64, Int16) => true, - (Float64, Int32) => true, - (Float64, Int64) => true, - (Float64, Float32) => true, - // end numeric casts - - // temporal casts - (Int32, Date32) => true, - (Int32, Date64) => true, - (Int32, Time32(_)) => true, - (Date32, Int32) => true, - (Date32, Int64) => true, - (Time32(_), Int32) => true, - (Int64, Date64) => true, - (Int64, Date32) => true, - (Int64, Time64(_)) => true, - (Date64, Int64) => true, - (Date64, Int32) => true, - (Time64(_), Int64) => true, - (Date32, Date64) => true, - (Date64, Date32) => true, - (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => true, - (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => true, - (Time32(_), Time64(_)) => true, - (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => true, - (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => true, - (Time64(_), Time32(to_unit)) => { - matches!(to_unit, TimeUnit::Second | TimeUnit::Millisecond) - } - (Timestamp(_, _), Int64) => true, - (Int64, Timestamp(_, _)) => true, - (Timestamp(_, _), Timestamp(_, _)) => true, - (Timestamp(_, _), Date32) => true, - (Timestamp(_, _), Date64) => true, - // date64 to timestamp might not make sense, - (Int64, Duration(_)) => true, - (Null, Int32) => true, - (_, _) => false, - } -} - -/// Cast `array` to the provided data type and return a new Array with -/// type `to_type`, if possible. 
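`can_cast_types` above is the planning-time mirror of the `cast` kernel that follows: it answers whether a cast is representable at all, while `CastOptions::safe` decides what happens to individual values that do not survive the cast (null with `safe = true`, an error with `safe = false`, per the `CastOptions` doc above). A hedged sketch against the crate layout in this file; the same kernels now live in apache/arrow-rs:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, StringArray};
use arrow::compute::kernels::cast::{can_cast_types, cast, cast_with_options, CastOptions};
use arrow::datatypes::DataType;

fn main() {
    // Type-level check first: Utf8 -> Int32 is castable, Utf8 -> Boolean is not.
    assert!(can_cast_types(&DataType::Utf8, &DataType::Int32));
    assert!(!can_cast_types(&DataType::Utf8, &DataType::Boolean));

    let strings: ArrayRef = Arc::new(StringArray::from(vec!["5", "not-a-number"]));

    // Default (safe) cast: the unparseable value becomes null.
    let ints = cast(&strings, &DataType::Int32).unwrap();
    let ints = ints.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(ints.value(0), 5);
    assert!(ints.is_null(1));

    // safe = false: the same parse failure is reported as an error instead.
    let unsafe_opts = CastOptions { safe: false };
    assert!(cast_with_options(&strings, &DataType::Int32, &unsafe_opts).is_err());
}
```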
-/// -/// Behavior: -/// * Boolean to Utf8: `true` => '1', `false` => `0` -/// * Utf8 to numeric: strings that can't be parsed to numbers return null, float strings -/// in integer casts return null -/// * Numeric to boolean: 0 returns `false`, any other value returns `true` -/// * List to List: the underlying data type is cast -/// * Primitive to List: a list array with 1 value per slot is created -/// * Date32 and Date64: precision lost when going to higher interval -/// * Time32 and Time64: precision lost when going to higher interval -/// * Timestamp and Date{32|64}: precision lost when going to higher interval -/// * Temporal to/from backing primitive: zero-copy with data type change -/// -/// Unsupported Casts -/// * To or from `StructArray` -/// * List to primitive -/// * Utf8 to boolean -/// * Interval and duration -pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result { - cast_with_options(array, to_type, &DEFAULT_CAST_OPTIONS) -} - -/// Cast `array` to the provided data type and return a new Array with -/// type `to_type`, if possible. It accepts `CastOptions` to allow consumers -/// to configure cast behavior. -/// -/// Behavior: -/// * Boolean to Utf8: `true` => '1', `false` => `0` -/// * Utf8 to numeric: strings that can't be parsed to numbers return null, float strings -/// in integer casts return null -/// * Numeric to boolean: 0 returns `false`, any other value returns `true` -/// * List to List: the underlying data type is cast -/// * Primitive to List: a list array with 1 value per slot is created -/// * Date32 and Date64: precision lost when going to higher interval -/// * Time32 and Time64: precision lost when going to higher interval -/// * Timestamp and Date{32|64}: precision lost when going to higher interval -/// * Temporal to/from backing primitive: zero-copy with data type change -/// -/// Unsupported Casts -/// * To or from `StructArray` -/// * List to primitive -/// * Utf8 to boolean -/// * Interval and duration -pub fn cast_with_options( - array: &ArrayRef, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result { - use DataType::*; - let from_type = array.data_type(); - - // clone array if types are the same - if from_type == to_type { - return Ok(array.clone()); - } - match (from_type, to_type) { - (Struct(_), _) => Err(ArrowError::CastError( - "Cannot cast from struct to other types".to_string(), - )), - (_, Struct(_)) => Err(ArrowError::CastError( - "Cannot cast to struct from other types".to_string(), - )), - (List(_), List(ref to)) => { - cast_list_inner::(array, to, to_type, cast_options) - } - (LargeList(_), LargeList(ref to)) => { - cast_list_inner::(array, to, to_type, cast_options) - } - (List(list_from), LargeList(list_to)) => { - if list_to.data_type() != list_from.data_type() { - Err(ArrowError::CastError( - "cannot cast list to large-list with different child data".into(), - )) - } else { - cast_list_container::(&**array, cast_options) - } - } - (LargeList(list_from), List(list_to)) => { - if list_to.data_type() != list_from.data_type() { - Err(ArrowError::CastError( - "cannot cast large-list to list with different child data".into(), - )) - } else { - cast_list_container::(&**array, cast_options) - } - } - (List(_), _) => Err(ArrowError::CastError( - "Cannot cast list to non-list data types".to_string(), - )), - (_, List(ref to)) => { - cast_primitive_to_list::(array, to, to_type, cast_options) - } - (_, LargeList(ref to)) => { - cast_primitive_to_list::(array, to, to_type, cast_options) - } - (Dictionary(index_type, _), _) => 
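
`cast_with_options` is the knob for the behaviour listed above: with `safe: false`, an unparseable value is reported as a `CastError` instead of becoming null. A small sketch mirroring `test_cast_with_options_utf8_to_i32` below:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::kernels::cast::{cast_with_options, CastOptions};
use arrow::datatypes::DataType;

let array = Arc::new(StringArray::from(vec!["5", "seven"])) as ArrayRef;

// safe = false: the first unparseable string aborts the whole cast.
let result = cast_with_options(&array, &DataType::Int32, &CastOptions { safe: false });
assert!(result.is_err());
```
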
match **index_type { - DataType::Int8 => dictionary_cast::(array, to_type, cast_options), - DataType::Int16 => dictionary_cast::(array, to_type, cast_options), - DataType::Int32 => dictionary_cast::(array, to_type, cast_options), - DataType::Int64 => dictionary_cast::(array, to_type, cast_options), - DataType::UInt8 => dictionary_cast::(array, to_type, cast_options), - DataType::UInt16 => { - dictionary_cast::(array, to_type, cast_options) - } - DataType::UInt32 => { - dictionary_cast::(array, to_type, cast_options) - } - DataType::UInt64 => { - dictionary_cast::(array, to_type, cast_options) - } - _ => Err(ArrowError::CastError(format!( - "Casting from dictionary type {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (_, Dictionary(index_type, value_type)) => match **index_type { - DataType::Int8 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::Int16 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::Int32 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::Int64 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::UInt8 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::UInt16 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::UInt32 => { - cast_to_dictionary::(array, value_type, cast_options) - } - DataType::UInt64 => { - cast_to_dictionary::(array, value_type, cast_options) - } - _ => Err(ArrowError::CastError(format!( - "Casting from type {:?} to dictionary type {:?} not supported", - from_type, to_type, - ))), - }, - (_, Boolean) => match from_type { - UInt8 => cast_numeric_to_bool::(array), - UInt16 => cast_numeric_to_bool::(array), - UInt32 => cast_numeric_to_bool::(array), - UInt64 => cast_numeric_to_bool::(array), - Int8 => cast_numeric_to_bool::(array), - Int16 => cast_numeric_to_bool::(array), - Int32 => cast_numeric_to_bool::(array), - Int64 => cast_numeric_to_bool::(array), - Float32 => cast_numeric_to_bool::(array), - Float64 => cast_numeric_to_bool::(array), - Utf8 => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (Boolean, _) => match to_type { - UInt8 => cast_bool_to_numeric::(array, cast_options), - UInt16 => cast_bool_to_numeric::(array, cast_options), - UInt32 => cast_bool_to_numeric::(array, cast_options), - UInt64 => cast_bool_to_numeric::(array, cast_options), - Int8 => cast_bool_to_numeric::(array, cast_options), - Int16 => cast_bool_to_numeric::(array, cast_options), - Int32 => cast_bool_to_numeric::(array, cast_options), - Int64 => cast_bool_to_numeric::(array, cast_options), - Float32 => cast_bool_to_numeric::(array, cast_options), - Float64 => cast_bool_to_numeric::(array, cast_options), - Utf8 => { - let array = array.as_any().downcast_ref::().unwrap(); - Ok(Arc::new( - array - .iter() - .map(|value| value.map(|value| if value { "1" } else { "0" })) - .collect::(), - )) - } - _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (Utf8, _) => match to_type { - LargeUtf8 => cast_str_container::(&**array), - UInt8 => cast_string_to_numeric::(array, cast_options), - UInt16 => cast_string_to_numeric::(array, cast_options), - UInt32 => cast_string_to_numeric::(array, cast_options), - UInt64 => cast_string_to_numeric::(array, cast_options), - Int8 => 
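
The `(Boolean, Utf8)` arm above maps `true` to `"1"` and `false` to `"0"`, preserving nulls. A short sketch of that behaviour through the public `cast` entry point:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::DataType;

let array = Arc::new(BooleanArray::from(vec![Some(true), Some(false), None])) as ArrayRef;
let b = cast(&array, &DataType::Utf8).unwrap();
let c = b.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("1", c.value(0)); // true  => "1"
assert_eq!("0", c.value(1)); // false => "0"
assert!(c.is_null(2));
```
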
cast_string_to_numeric::(array, cast_options), - Int16 => cast_string_to_numeric::(array, cast_options), - Int32 => cast_string_to_numeric::(array, cast_options), - Int64 => cast_string_to_numeric::(array, cast_options), - Float32 => cast_string_to_numeric::(array, cast_options), - Float64 => cast_string_to_numeric::(array, cast_options), - Date32 => cast_string_to_date32::(&**array, cast_options), - Date64 => cast_string_to_date64::(&**array, cast_options), - Timestamp(TimeUnit::Nanosecond, None) => { - cast_string_to_timestamp_ns::(&**array, cast_options) - } - _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (_, Utf8) => match from_type { - LargeUtf8 => cast_str_container::(&**array), - UInt8 => cast_numeric_to_string::(array), - UInt16 => cast_numeric_to_string::(array), - UInt32 => cast_numeric_to_string::(array), - UInt64 => cast_numeric_to_string::(array), - Int8 => cast_numeric_to_string::(array), - Int16 => cast_numeric_to_string::(array), - Int32 => cast_numeric_to_string::(array), - Int64 => cast_numeric_to_string::(array), - Float32 => cast_numeric_to_string::(array), - Float64 => cast_numeric_to_string::(array), - Binary => { - let array = array.as_any().downcast_ref::().unwrap(); - Ok(Arc::new( - array - .iter() - .map(|maybe_value| match maybe_value { - Some(value) => { - let result = str::from_utf8(value); - if cast_options.safe { - Ok(result.ok()) - } else { - Some(result.map_err(|_| { - ArrowError::CastError( - "Cannot cast binary to string".to_string(), - ) - })) - .transpose() - } - } - None => Ok(None), - }) - .collect::>()?, - )) - } - _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (_, LargeUtf8) => match from_type { - UInt8 => cast_numeric_to_string::(array), - UInt16 => cast_numeric_to_string::(array), - UInt32 => cast_numeric_to_string::(array), - UInt64 => cast_numeric_to_string::(array), - Int8 => cast_numeric_to_string::(array), - Int16 => cast_numeric_to_string::(array), - Int32 => cast_numeric_to_string::(array), - Int64 => cast_numeric_to_string::(array), - Float32 => cast_numeric_to_string::(array), - Float64 => cast_numeric_to_string::(array), - Binary => { - let array = array.as_any().downcast_ref::().unwrap(); - Ok(Arc::new( - array - .iter() - .map(|maybe_value| match maybe_value { - Some(value) => { - let result = str::from_utf8(value); - if cast_options.safe { - Ok(result.ok()) - } else { - Some(result.map_err(|_| { - ArrowError::CastError( - "Cannot cast binary to string".to_string(), - ) - })) - .transpose() - } - } - None => Ok(None), - }) - .collect::>()?, - )) - } - _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - (LargeUtf8, _) => match to_type { - UInt8 => cast_string_to_numeric::(array, cast_options), - UInt16 => cast_string_to_numeric::(array, cast_options), - UInt32 => cast_string_to_numeric::(array, cast_options), - UInt64 => cast_string_to_numeric::(array, cast_options), - Int8 => cast_string_to_numeric::(array, cast_options), - Int16 => cast_string_to_numeric::(array, cast_options), - Int32 => cast_string_to_numeric::(array, cast_options), - Int64 => cast_string_to_numeric::(array, cast_options), - Float32 => cast_string_to_numeric::(array, cast_options), - Float64 => cast_string_to_numeric::(array, cast_options), - Date32 => cast_string_to_date32::(&**array, cast_options), - Date64 => cast_string_to_date64::(&**array, 
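
The Utf8 arms above cover numeric, date and nanosecond-timestamp parsing; with the default safe options, strings that fail to parse become nulls. A sketch mirroring `test_cast_string_to_timestamp` below:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::{DataType, TimeUnit};

let array = Arc::new(StringArray::from(vec![
    Some("2020-09-08T12:00:00+00:00"),
    Some("not a timestamp"),
    None,
])) as ArrayRef;
let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap();
let c = b
    .as_any()
    .downcast_ref::<TimestampNanosecondArray>()
    .unwrap();
assert_eq!(1_599_566_400_000_000_000, c.value(0));
assert!(c.is_null(1)); // unparseable strings become null under the safe default
```
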
cast_options), - Timestamp(TimeUnit::Nanosecond, None) => { - cast_string_to_timestamp_ns::(&**array, cast_options) - } - _ => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - }, - - // start numeric casts - (UInt8, UInt16) => cast_numeric_arrays::(array), - (UInt8, UInt32) => cast_numeric_arrays::(array), - (UInt8, UInt64) => cast_numeric_arrays::(array), - (UInt8, Int8) => cast_numeric_arrays::(array), - (UInt8, Int16) => cast_numeric_arrays::(array), - (UInt8, Int32) => cast_numeric_arrays::(array), - (UInt8, Int64) => cast_numeric_arrays::(array), - (UInt8, Float32) => cast_numeric_arrays::(array), - (UInt8, Float64) => cast_numeric_arrays::(array), - - (UInt16, UInt8) => cast_numeric_arrays::(array), - (UInt16, UInt32) => cast_numeric_arrays::(array), - (UInt16, UInt64) => cast_numeric_arrays::(array), - (UInt16, Int8) => cast_numeric_arrays::(array), - (UInt16, Int16) => cast_numeric_arrays::(array), - (UInt16, Int32) => cast_numeric_arrays::(array), - (UInt16, Int64) => cast_numeric_arrays::(array), - (UInt16, Float32) => cast_numeric_arrays::(array), - (UInt16, Float64) => cast_numeric_arrays::(array), - - (UInt32, UInt8) => cast_numeric_arrays::(array), - (UInt32, UInt16) => cast_numeric_arrays::(array), - (UInt32, UInt64) => cast_numeric_arrays::(array), - (UInt32, Int8) => cast_numeric_arrays::(array), - (UInt32, Int16) => cast_numeric_arrays::(array), - (UInt32, Int32) => cast_numeric_arrays::(array), - (UInt32, Int64) => cast_numeric_arrays::(array), - (UInt32, Float32) => cast_numeric_arrays::(array), - (UInt32, Float64) => cast_numeric_arrays::(array), - - (UInt64, UInt8) => cast_numeric_arrays::(array), - (UInt64, UInt16) => cast_numeric_arrays::(array), - (UInt64, UInt32) => cast_numeric_arrays::(array), - (UInt64, Int8) => cast_numeric_arrays::(array), - (UInt64, Int16) => cast_numeric_arrays::(array), - (UInt64, Int32) => cast_numeric_arrays::(array), - (UInt64, Int64) => cast_numeric_arrays::(array), - (UInt64, Float32) => cast_numeric_arrays::(array), - (UInt64, Float64) => cast_numeric_arrays::(array), - - (Int8, UInt8) => cast_numeric_arrays::(array), - (Int8, UInt16) => cast_numeric_arrays::(array), - (Int8, UInt32) => cast_numeric_arrays::(array), - (Int8, UInt64) => cast_numeric_arrays::(array), - (Int8, Int16) => cast_numeric_arrays::(array), - (Int8, Int32) => cast_numeric_arrays::(array), - (Int8, Int64) => cast_numeric_arrays::(array), - (Int8, Float32) => cast_numeric_arrays::(array), - (Int8, Float64) => cast_numeric_arrays::(array), - - (Int16, UInt8) => cast_numeric_arrays::(array), - (Int16, UInt16) => cast_numeric_arrays::(array), - (Int16, UInt32) => cast_numeric_arrays::(array), - (Int16, UInt64) => cast_numeric_arrays::(array), - (Int16, Int8) => cast_numeric_arrays::(array), - (Int16, Int32) => cast_numeric_arrays::(array), - (Int16, Int64) => cast_numeric_arrays::(array), - (Int16, Float32) => cast_numeric_arrays::(array), - (Int16, Float64) => cast_numeric_arrays::(array), - - (Int32, UInt8) => cast_numeric_arrays::(array), - (Int32, UInt16) => cast_numeric_arrays::(array), - (Int32, UInt32) => cast_numeric_arrays::(array), - (Int32, UInt64) => cast_numeric_arrays::(array), - (Int32, Int8) => cast_numeric_arrays::(array), - (Int32, Int16) => cast_numeric_arrays::(array), - (Int32, Int64) => cast_numeric_arrays::(array), - (Int32, Float32) => cast_numeric_arrays::(array), - (Int32, Float64) => cast_numeric_arrays::(array), - - (Int64, UInt8) => cast_numeric_arrays::(array), - (Int64, 
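
Each of these numeric arms ultimately relies on `num::cast::cast`, which returns `None` whenever a value does not fit the target type's range; that `None` is what surfaces as a null slot. A tiny sketch of the underlying `num` behaviour (the `num` crate is already imported at the top of this file):

```rust
// Out-of-range conversions yield None instead of wrapping or panicking.
assert_eq!(Some(1_234_i32), num::cast::cast::<i64, i32>(1_234));
assert_eq!(None::<i32>, num::cast::cast::<i64, i32>(5_000_000_000));
assert_eq!(None::<u8>, num::cast::cast::<f64, u8>(-1.0));
```
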
UInt16) => cast_numeric_arrays::(array), - (Int64, UInt32) => cast_numeric_arrays::(array), - (Int64, UInt64) => cast_numeric_arrays::(array), - (Int64, Int8) => cast_numeric_arrays::(array), - (Int64, Int16) => cast_numeric_arrays::(array), - (Int64, Int32) => cast_numeric_arrays::(array), - (Int64, Float32) => cast_numeric_arrays::(array), - (Int64, Float64) => cast_numeric_arrays::(array), - - (Float32, UInt8) => cast_numeric_arrays::(array), - (Float32, UInt16) => cast_numeric_arrays::(array), - (Float32, UInt32) => cast_numeric_arrays::(array), - (Float32, UInt64) => cast_numeric_arrays::(array), - (Float32, Int8) => cast_numeric_arrays::(array), - (Float32, Int16) => cast_numeric_arrays::(array), - (Float32, Int32) => cast_numeric_arrays::(array), - (Float32, Int64) => cast_numeric_arrays::(array), - (Float32, Float64) => cast_numeric_arrays::(array), - - (Float64, UInt8) => cast_numeric_arrays::(array), - (Float64, UInt16) => cast_numeric_arrays::(array), - (Float64, UInt32) => cast_numeric_arrays::(array), - (Float64, UInt64) => cast_numeric_arrays::(array), - (Float64, Int8) => cast_numeric_arrays::(array), - (Float64, Int16) => cast_numeric_arrays::(array), - (Float64, Int32) => cast_numeric_arrays::(array), - (Float64, Int64) => cast_numeric_arrays::(array), - (Float64, Float32) => cast_numeric_arrays::(array), - // end numeric casts - - // temporal casts - (Int32, Date32) => cast_array_data::(array, to_type.clone()), - (Int32, Date64) => cast_with_options( - &cast_with_options(array, &DataType::Date32, &cast_options)?, - &DataType::Date64, - &cast_options, - ), - (Int32, Time32(TimeUnit::Second)) => { - cast_array_data::(array, to_type.clone()) - } - (Int32, Time32(TimeUnit::Millisecond)) => { - cast_array_data::(array, to_type.clone()) - } - // No support for microsecond/nanosecond with i32 - (Date32, Int32) => cast_array_data::(array, to_type.clone()), - (Date32, Int64) => cast_with_options( - &cast_with_options(array, &DataType::Int32, cast_options)?, - &DataType::Int64, - &cast_options, - ), - (Time32(_), Int32) => cast_array_data::(array, to_type.clone()), - (Int64, Date64) => cast_array_data::(array, to_type.clone()), - (Int64, Date32) => cast_with_options( - &cast_with_options(array, &DataType::Int32, &cast_options)?, - &DataType::Date32, - &cast_options, - ), - // No support for second/milliseconds with i64 - (Int64, Time64(TimeUnit::Microsecond)) => { - cast_array_data::(array, to_type.clone()) - } - (Int64, Time64(TimeUnit::Nanosecond)) => { - cast_array_data::(array, to_type.clone()) - } - - (Date64, Int64) => cast_array_data::(array, to_type.clone()), - (Date64, Int32) => cast_with_options( - &cast_with_options(array, &DataType::Int64, &cast_options)?, - &DataType::Int32, - &cast_options, - ), - (Time64(_), Int64) => cast_array_data::(array, to_type.clone()), - (Date32, Date64) => { - let date_array = array.as_any().downcast_ref::().unwrap(); - - let values = - unary::<_, _, Date64Type>(date_array, |x| x as i64 * MILLISECONDS_IN_DAY); - - Ok(Arc::new(values) as ArrayRef) - } - (Date64, Date32) => { - let date_array = array.as_any().downcast_ref::().unwrap(); - - let values = unary::<_, _, Date32Type>(date_array, |x| { - (x / MILLISECONDS_IN_DAY) as i32 - }); - - Ok(Arc::new(values) as ArrayRef) - } - (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => { - let time_array = array.as_any().downcast_ref::().unwrap(); - - let values = unary::<_, _, Time32MillisecondType>(time_array, |x| { - x * MILLISECONDS as i32 - }); - - Ok(Arc::new(values) as ArrayRef) - } - 
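
The `(Date32, Date64)` arm above is a plain unary multiply by `MILLISECONDS_IN_DAY`, since Date32 stores days since the epoch and Date64 stores milliseconds. A sketch mirroring `test_cast_date32_to_date64` below:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::DataType;

let array = Arc::new(Date32Array::from(vec![10_000, 17_890])) as ArrayRef;
let b = cast(&array, &DataType::Date64).unwrap();
let c = b.as_any().downcast_ref::<Date64Array>().unwrap();
assert_eq!(864_000_000_000, c.value(0));      // 10_000 days * 86_400_000 ms/day
assert_eq!(1_545_696_000_000, c.value(1));    // 17_890 days * 86_400_000 ms/day
```
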
(Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => { - let time_array = array - .as_any() - .downcast_ref::() - .unwrap(); - - let values = unary::<_, _, Time32SecondType>(time_array, |x| { - x / (MILLISECONDS as i32) - }); - - Ok(Arc::new(values) as ArrayRef) - } - //(Time32(TimeUnit::Second), Time64(_)) => {}, - (Time32(from_unit), Time64(to_unit)) => { - let time_array = Int32Array::from(array.data().clone()); - // note: (numeric_cast + SIMD multiply) is faster than (cast & multiply) - let c: Int64Array = numeric_cast(&time_array); - let from_size = time_unit_multiple(&from_unit); - let to_size = time_unit_multiple(&to_unit); - // from is only smaller than to if 64milli/64second don't exist - let mult = Int64Array::from(vec![to_size / from_size; array.len()]); - let converted = multiply(&c, &mult)?; - let array_ref = Arc::new(converted) as ArrayRef; - use TimeUnit::*; - match to_unit { - Microsecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Nanosecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - _ => unreachable!("array type not supported"), - } - } - (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => { - let time_array = array - .as_any() - .downcast_ref::() - .unwrap(); - - let values = - unary::<_, _, Time64NanosecondType>(time_array, |x| x * MILLISECONDS); - Ok(Arc::new(values) as ArrayRef) - } - (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => { - let time_array = array - .as_any() - .downcast_ref::() - .unwrap(); - - let values = - unary::<_, _, Time64MicrosecondType>(time_array, |x| x / MILLISECONDS); - Ok(Arc::new(values) as ArrayRef) - } - (Time64(from_unit), Time32(to_unit)) => { - let time_array = Int64Array::from(array.data().clone()); - let from_size = time_unit_multiple(&from_unit); - let to_size = time_unit_multiple(&to_unit); - let divisor = from_size / to_size; - match to_unit { - TimeUnit::Second => { - let values = unary::<_, _, Time32SecondType>(&time_array, |x| { - (x as i64 / divisor) as i32 - }); - Ok(Arc::new(values) as ArrayRef) - } - TimeUnit::Millisecond => { - let values = unary::<_, _, Time32MillisecondType>(&time_array, |x| { - (x as i64 / divisor) as i32 - }); - Ok(Arc::new(values) as ArrayRef) - } - _ => unreachable!("array type not supported"), - } - } - (Timestamp(_, _), Int64) => cast_array_data::(array, to_type.clone()), - (Int64, Timestamp(to_unit, _)) => { - use TimeUnit::*; - match to_unit { - Second => cast_array_data::(array, to_type.clone()), - Millisecond => { - cast_array_data::(array, to_type.clone()) - } - Microsecond => { - cast_array_data::(array, to_type.clone()) - } - Nanosecond => { - cast_array_data::(array, to_type.clone()) - } - } - } - (Timestamp(from_unit, _), Timestamp(to_unit, _)) => { - let time_array = Int64Array::from(array.data().clone()); - let from_size = time_unit_multiple(&from_unit); - let to_size = time_unit_multiple(&to_unit); - // we either divide or multiply, depending on size of each unit - // units are never the same when the types are the same - let converted = if from_size >= to_size { - divide( - &time_array, - &Int64Array::from(vec![from_size / to_size; array.len()]), - )? - } else { - multiply( - &time_array, - &Int64Array::from(vec![to_size / from_size; array.len()]), - )? 
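
All of the time and timestamp conversions above reduce to one rule: compare the two units' ticks-per-second (via `time_unit_multiple`) and either multiply or divide the underlying i64 values by the ratio, using the arithmetic kernels. A standalone sketch of that arithmetic, with the constants copied from this file:

```rust
// Ticks per second for each unit, as defined further down in this file.
const MILLISECONDS: i64 = 1_000;
const MICROSECONDS: i64 = 1_000_000;

// Milliseconds -> seconds: the source unit is finer, so divide by the ratio.
let ms = 1_545_696_002_001_i64;
assert_eq!(1_545_696_002, ms / (MILLISECONDS / 1));

// Seconds -> microseconds: the target unit is finer, so multiply by the ratio.
let secs = 1_545_696_002_i64;
assert_eq!(1_545_696_002_000_000, secs * (MICROSECONDS / 1));
```

In the kernel itself the ratio is materialised as an `Int64Array` of constants so the existing `multiply`/`divide` compute kernels can do the work vectorised.
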
- }; - let array_ref = Arc::new(converted) as ArrayRef; - use TimeUnit::*; - match to_unit { - Second => { - cast_array_data::(&array_ref, to_type.clone()) - } - Millisecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Microsecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Nanosecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - } - } - (Timestamp(from_unit, _), Date32) => { - let time_array = Int64Array::from(array.data().clone()); - let from_size = time_unit_multiple(&from_unit) * SECONDS_IN_DAY; - let mut b = Date32Builder::new(array.len()); - for i in 0..array.len() { - if array.is_null(i) { - b.append_null()?; - } else { - b.append_value((time_array.value(i) / from_size) as i32)?; - } - } - - Ok(Arc::new(b.finish()) as ArrayRef) - } - (Timestamp(from_unit, _), Date64) => { - let from_size = time_unit_multiple(&from_unit); - let to_size = MILLISECONDS; - - // Scale time_array by (to_size / from_size) using a - // single integer operation, but need to avoid integer - // math rounding down to zero - - match to_size.cmp(&from_size) { - std::cmp::Ordering::Less => { - let time_array = Date64Array::from(array.data().clone()); - Ok(Arc::new(divide( - &time_array, - &Date64Array::from(vec![from_size / to_size; array.len()]), - )?) as ArrayRef) - } - std::cmp::Ordering::Equal => { - cast_array_data::(array, to_type.clone()) - } - std::cmp::Ordering::Greater => { - let time_array = Date64Array::from(array.data().clone()); - Ok(Arc::new(multiply( - &time_array, - &Date64Array::from(vec![to_size / from_size; array.len()]), - )?) as ArrayRef) - } - } - } - // date64 to timestamp might not make sense, - (Int64, Duration(to_unit)) => { - use TimeUnit::*; - match to_unit { - Second => cast_array_data::(array, to_type.clone()), - Millisecond => { - cast_array_data::(array, to_type.clone()) - } - Microsecond => { - cast_array_data::(array, to_type.clone()) - } - Nanosecond => { - cast_array_data::(array, to_type.clone()) - } - } - } - - // null to primitive/flat types - (Null, Int32) => Ok(Arc::new(Int32Array::from(vec![None; array.len()]))), - - (_, _) => Err(ArrowError::CastError(format!( - "Casting from {:?} to {:?} not supported", - from_type, to_type, - ))), - } -} - -/// Get the time unit as a multiple of a second -const fn time_unit_multiple(unit: &TimeUnit) -> i64 { - match unit { - TimeUnit::Second => 1, - TimeUnit::Millisecond => MILLISECONDS, - TimeUnit::Microsecond => MICROSECONDS, - TimeUnit::Nanosecond => NANOSECONDS, - } -} - -/// Number of seconds in a day -const SECONDS_IN_DAY: i64 = 86_400; -/// Number of milliseconds in a second -const MILLISECONDS: i64 = 1_000; -/// Number of microseconds in a second -const MICROSECONDS: i64 = 1_000_000; -/// Number of nanoseconds in a second -const NANOSECONDS: i64 = 1_000_000_000; -/// Number of milliseconds in a day -const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS; -/// Number of days between 0001-01-01 and 1970-01-01 -const EPOCH_DAYS_FROM_CE: i32 = 719_163; - -/// Cast an array by changing its array_data type to the desired type -/// -/// Arrays should have the same primitive data type, otherwise this should fail. -/// We do not perform this check on primitive data types as we only use this -/// function internally, where it is guaranteed to be infallible. 
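
The `(Timestamp(_, _), Date32)` arm above divides the raw values by the number of ticks per day for the source unit. A sketch mirroring `test_cast_timestamp_to_date32` below:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::DataType;

// Millisecond timestamps divided by 86_400 * 1_000 give whole days since the epoch.
let a = TimestampMillisecondArray::from_opt_vec(
    vec![Some(864_000_000_005), Some(1_545_696_000_001), None],
    None,
);
let array = Arc::new(a) as ArrayRef;
let b = cast(&array, &DataType::Date32).unwrap();
let c = b.as_any().downcast_ref::<Date32Array>().unwrap();
assert_eq!(10_000, c.value(0));
assert_eq!(17_890, c.value(1));
assert!(c.is_null(2));
```
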
-#[allow(clippy::unnecessary_wraps)] -fn cast_array_data(array: &ArrayRef, to_type: DataType) -> Result -where - TO: ArrowNumericType, -{ - let data = ArrayData::new( - to_type, - array.len(), - Some(array.null_count()), - array.data().null_bitmap().clone().map(|bitmap| bitmap.bits), - array.data().offset(), - array.data().buffers().to_vec(), - vec![], - ); - Ok(Arc::new(PrimitiveArray::::from(data)) as ArrayRef) -} - -/// Convert Array into a PrimitiveArray of type, and apply numeric cast -#[allow(clippy::unnecessary_wraps)] -fn cast_numeric_arrays(from: &ArrayRef) -> Result -where - FROM: ArrowNumericType, - TO: ArrowNumericType, - FROM::Native: num::NumCast, - TO::Native: num::NumCast, -{ - Ok(Arc::new(numeric_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), - ))) -} - -/// Natural cast between numeric types -fn numeric_cast(from: &PrimitiveArray) -> PrimitiveArray -where - T: ArrowNumericType, - R: ArrowNumericType, - T::Native: num::NumCast, - R::Native: num::NumCast, -{ - let iter = from - .iter() - .map(|v| v.and_then(num::cast::cast::)); - // Soundness: - // The iterator is trustedLen because it comes from an `PrimitiveArray`. - unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } -} - -/// Cast numeric types to Utf8 -#[allow(clippy::unnecessary_wraps)] -fn cast_numeric_to_string(array: &ArrayRef) -> Result -where - FROM: ArrowNumericType, - FROM::Native: lexical_core::ToLexical, - OffsetSize: StringOffsetSizeTrait, -{ - Ok(Arc::new(numeric_to_string_cast::( - array - .as_any() - .downcast_ref::>() - .unwrap(), - ))) -} - -fn numeric_to_string_cast( - from: &PrimitiveArray, -) -> GenericStringArray -where - T: ArrowPrimitiveType + ArrowNumericType, - T::Native: lexical_core::ToLexical, - OffsetSize: StringOffsetSizeTrait, -{ - from.iter() - .map(|maybe_value| maybe_value.map(lexical_to_string)) - .collect() -} - -/// Cast numeric types to Utf8 -#[allow(clippy::unnecessary_wraps)] -fn cast_string_to_numeric( - from: &ArrayRef, - cast_options: &CastOptions, -) -> Result -where - T: ArrowNumericType, - ::Native: lexical_core::FromLexical, -{ - Ok(Arc::new(string_to_numeric_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), - cast_options, - )?)) -} - -fn string_to_numeric_cast( - from: &GenericStringArray, - cast_options: &CastOptions, -) -> Result> -where - T: ArrowNumericType, - ::Native: lexical_core::FromLexical, -{ - if cast_options.safe { - let iter = (0..from.len()).map(|i| { - if from.is_null(i) { - None - } else { - lexical_core::parse(from.value(i).as_bytes()).ok() - } - }); - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }) - } else { - let vec = (0..from.len()) - .map(|i| { - if from.is_null(i) { - Ok(None) - } else { - let string = from.value(i); - let result = lexical_core::parse(string.as_bytes()); - Some(result.map_err(|_| { - ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {} type", - string, - std::any::type_name::() - )) - })) - .transpose() - } - }) - .collect::>>()?; - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. 
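
The safe path of `string_to_numeric_cast` turns parse failures into nulls, while the strict path surfaces an error for the first bad value. A plain-Rust sketch of that shape, using std's `str::parse` in place of `lexical_core` purely for brevity (an assumption for illustration, not the kernel's actual parser):

```rust
// Safe mode: anything that fails to parse becomes None, i.e. a null slot.
fn parse_safe(values: &[Option<&str>]) -> Vec<Option<i32>> {
    values
        .iter()
        .map(|v| v.and_then(|s| s.parse::<i32>().ok()))
        .collect()
}

// Strict mode: the first failure aborts the whole cast with an error.
fn parse_strict(values: &[Option<&str>]) -> Result<Vec<Option<i32>>, String> {
    values
        .iter()
        .map(|v| match v {
            None => Ok(None),
            Some(s) => s
                .parse::<i32>()
                .map(Some)
                .map_err(|_| format!("Cannot cast string '{}' to Int32", s)),
        })
        .collect()
}

assert_eq!(
    parse_safe(&[Some("5"), Some("seven"), None]),
    vec![Some(5), None, None]
);
assert!(parse_strict(&[Some("5"), Some("seven"), None]).is_err());
```
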
- Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(vec.iter()) }) - } -} - -/// Casts generic string arrays to Date32Array -#[allow(clippy::unnecessary_wraps)] -fn cast_string_to_date32( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - use chrono::Datelike; - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = (0..string_array.len()).map(|i| { - if string_array.is_null(i) { - None - } else { - string_array - .value(i) - .parse::() - .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - .ok() - } - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date32Array::from_trusted_len_iter(iter) } - } else { - let vec = (0..string_array.len()) - .map(|i| { - if string_array.is_null(i) { - Ok(None) - } else { - let string = string_array - .value(i); - - let result = string - .parse::() - .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE); - - Some(result.map_err(|_| { - ArrowError::CastError( - format!("Cannot cast string '{}' to value of arrow::datatypes::types::Date32Type type", string), - ) - })) - .transpose() - } - }) - .collect::>>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date32Array::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to Date64Array -#[allow(clippy::unnecessary_wraps)] -fn cast_string_to_date64( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = (0..string_array.len()).map(|i| { - if string_array.is_null(i) { - None - } else { - string_array - .value(i) - .parse::() - .map(|datetime| datetime.timestamp_millis()) - .ok() - } - }); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date64Array::from_trusted_len_iter(iter) } - } else { - let vec = (0..string_array.len()) - .map(|i| { - if string_array.is_null(i) { - Ok(None) - } else { - let string = string_array - .value(i); - - let result = string - .parse::() - .map(|datetime| datetime.timestamp_millis()); - - Some(result.map_err(|_| { - ArrowError::CastError( - format!("Cannot cast string '{}' to value of arrow::datatypes::types::Date64Type type", string), - ) - })) - .transpose() - } - }) - .collect::>>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { Date64Array::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Casts generic string arrays to TimeStampNanosecondArray -#[allow(clippy::unnecessary_wraps)] -fn cast_string_to_timestamp_ns( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result { - let string_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - let array = if cast_options.safe { - let iter = (0..string_array.len()).map(|i| { - if string_array.is_null(i) { - None - } else { - string_to_timestamp_nanos(string_array.value(i)).ok() - } - }); - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. 
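
The string-to-date kernels lean on chrono, which this module already depends on: a Date32 value is the chrono day count minus `EPOCH_DAYS_FROM_CE`, and a Date64 value is `timestamp_millis()`. A small standalone sketch of the Date32 arithmetic:

```rust
use chrono::Datelike;

// Days between 0001-01-01 and 1970-01-01, as defined earlier in this file.
const EPOCH_DAYS_FROM_CE: i32 = 719_163;

let date = "2018-12-25".parse::<chrono::NaiveDate>().unwrap();
let date32_value = date.num_days_from_ce() - EPOCH_DAYS_FROM_CE;
assert_eq!(17_890, date32_value); // the same day count used in the tests below
```
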
- unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) } - } else { - let vec = (0..string_array.len()) - .map(|i| { - if string_array.is_null(i) { - Ok(None) - } else { - let result = string_to_timestamp_nanos(string_array.value(i)); - Some(result).transpose() - } - }) - .collect::>>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { TimestampNanosecondArray::from_trusted_len_iter(vec.iter()) } - }; - - Ok(Arc::new(array) as ArrayRef) -} - -/// Cast numeric types to Boolean -/// -/// Any zero value returns `false` while non-zero returns `true` -fn cast_numeric_to_bool(from: &ArrayRef) -> Result -where - FROM: ArrowNumericType, -{ - numeric_to_bool_cast::( - from.as_any() - .downcast_ref::>() - .unwrap(), - ) - .map(|to| Arc::new(to) as ArrayRef) -} - -fn numeric_to_bool_cast(from: &PrimitiveArray) -> Result -where - T: ArrowPrimitiveType + ArrowNumericType, -{ - let mut b = BooleanBuilder::new(from.len()); - - for i in 0..from.len() { - if from.is_null(i) { - b.append_null()?; - } else if from.value(i) != T::default_value() { - b.append_value(true)?; - } else { - b.append_value(false)?; - } - } - - Ok(b.finish()) -} - -/// Cast Boolean types to numeric -/// -/// `false` returns 0 while `true` returns 1 -#[allow(clippy::unnecessary_wraps)] -fn cast_bool_to_numeric( - from: &ArrayRef, - cast_options: &CastOptions, -) -> Result -where - TO: ArrowNumericType, - TO::Native: num::cast::NumCast, -{ - Ok(Arc::new(bool_to_numeric_cast::( - from.as_any().downcast_ref::().unwrap(), - cast_options, - ))) -} - -fn bool_to_numeric_cast( - from: &BooleanArray, - _cast_options: &CastOptions, -) -> PrimitiveArray -where - T: ArrowNumericType, - T::Native: num::NumCast, -{ - let iter = (0..from.len()).map(|i| { - if from.is_null(i) { - None - } else if from.value(i) { - // a workaround to cast a primitive to T::Native, infallible - num::cast::cast(1) - } else { - Some(T::default_value()) - } - }); - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from a Range - unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } -} - -/// Attempts to cast an `ArrayDictionary` with index type K into -/// `to_type` for supported types. 
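
`numeric_to_bool_cast` above maps zero to `false` and every other value to `true`, preserving nulls. Through the public `cast` entry point that looks like:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::DataType;

let array = Arc::new(Int32Array::from(vec![Some(0), Some(5), Some(-1), None])) as ArrayRef;
let b = cast(&array, &DataType::Boolean).unwrap();
let c = b.as_any().downcast_ref::<BooleanArray>().unwrap();
assert!(!c.value(0)); // zero     => false
assert!(c.value(1));  // non-zero => true
assert!(c.value(2));
assert!(c.is_null(3));
```
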
-/// -/// K is the key type -fn dictionary_cast( - array: &ArrayRef, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result { - use DataType::*; - - match to_type { - Dictionary(to_index_type, to_value_type) => { - let dict_array = array - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(), - ) - })?; - - let keys_array: ArrayRef = Arc::new(dict_array.keys_array()); - let values_array: ArrayRef = dict_array.values(); - let cast_keys = cast_with_options(&keys_array, to_index_type, &cast_options)?; - let cast_values = - cast_with_options(&values_array, to_value_type, &cast_options)?; - - // Failure to cast keys (because they don't fit in the - // target type) results in NULL values; - if cast_keys.null_count() > keys_array.null_count() { - return Err(ArrowError::ComputeError(format!( - "Could not convert {} dictionary indexes from {:?} to {:?}", - cast_keys.null_count() - keys_array.null_count(), - keys_array.data_type(), - to_index_type - ))); - } - - // keys are data, child_data is values (dictionary) - let data = ArrayData::new( - to_type.clone(), - cast_keys.len(), - Some(cast_keys.null_count()), - cast_keys - .data() - .null_bitmap() - .clone() - .map(|bitmap| bitmap.bits), - cast_keys.data().offset(), - cast_keys.data().buffers().to_vec(), - vec![cast_values.data().clone()], - ); - - // create the appropriate array type - let new_array: ArrayRef = match **to_index_type { - Int8 => Arc::new(DictionaryArray::::from(data)), - Int16 => Arc::new(DictionaryArray::::from(data)), - Int32 => Arc::new(DictionaryArray::::from(data)), - Int64 => Arc::new(DictionaryArray::::from(data)), - UInt8 => Arc::new(DictionaryArray::::from(data)), - UInt16 => Arc::new(DictionaryArray::::from(data)), - UInt32 => Arc::new(DictionaryArray::::from(data)), - UInt64 => Arc::new(DictionaryArray::::from(data)), - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported type {:?} for dictionary index", - to_index_type - ))) - } - }; - - Ok(new_array) - } - _ => unpack_dictionary::(array, to_type, cast_options), - } -} - -// Unpack a dictionary where the keys are of type into a flattened array of type to_type -fn unpack_dictionary( - array: &ArrayRef, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result -where - K: ArrowDictionaryKeyType, -{ - let dict_array = array - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(), - ) - })?; - - // attempt to cast the dict values to the target type - // use the take kernel to expand out the dictionary - let cast_dict_values = - cast_with_options(&dict_array.values(), to_type, cast_options)?; - - // Note take requires first casting the indices to u32 - let keys_array: ArrayRef = Arc::new(dict_array.keys_array()); - let indicies = cast_with_options(&keys_array, &DataType::UInt32, cast_options)?; - let u32_indicies = - indicies - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::ComputeError( - "Internal Error: Cannot cast dict indices to UInt32".to_string(), - ) - })?; - - take(cast_dict_values.as_ref(), u32_indicies, None) -} - -/// Attempts to encode an array into an `ArrayDictionary` with index -/// type K and value (dictionary) type value_type -/// -/// K is the key type -fn cast_to_dictionary( - array: &ArrayRef, - dict_value_type: &DataType, - cast_options: &CastOptions, -) -> Result { - use 
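
`unpack_dictionary` above flattens a dictionary array by casting its values to the target type and then expanding them through the keys with the `take` kernel. A sketch, building the input with the same `StringDictionaryBuilder` this file uses below:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::{DataType, Int8Type};

// Build a small dictionary-encoded string column: ["hello", "world", "hello"].
let keys_builder = PrimitiveBuilder::<Int8Type>::new(3);
let values_builder = StringBuilder::new(3);
let mut builder = StringDictionaryBuilder::new(keys_builder, values_builder);
builder.append("hello").unwrap();
builder.append("world").unwrap();
builder.append("hello").unwrap();
let dict_array: ArrayRef = Arc::new(builder.finish());

// Unpack: cast the dictionary values, then expand them through the keys.
let flat = cast(&dict_array, &DataType::Utf8).unwrap();
let flat = flat.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!("hello", flat.value(0));
assert_eq!("world", flat.value(1));
assert_eq!("hello", flat.value(2));
```
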
DataType::*; - - match *dict_value_type { - Int8 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int16 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int32 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Int64 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt8 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt16 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt32 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - UInt64 => pack_numeric_to_dictionary::( - array, - dict_value_type, - cast_options, - ), - Utf8 => pack_string_to_dictionary::(array, cast_options), - _ => Err(ArrowError::CastError(format!( - "Unsupported output type for dictionary packing: {:?}", - dict_value_type - ))), - } -} - -// Packs the data from the primitive array of type to a -// DictionaryArray with keys of type K and values of value_type V -fn pack_numeric_to_dictionary( - array: &ArrayRef, - dict_value_type: &DataType, - cast_options: &CastOptions, -) -> Result -where - K: ArrowDictionaryKeyType, - V: ArrowNumericType, -{ - // attempt to cast the source array values to the target value type (the dictionary values type) - let cast_values = cast_with_options(array, &dict_value_type, cast_options)?; - let values = cast_values - .as_any() - .downcast_ref::>() - .unwrap(); - - let keys_builder = PrimitiveBuilder::::new(values.len()); - let values_builder = PrimitiveBuilder::::new(values.len()); - let mut b = PrimitiveDictionaryBuilder::new(keys_builder, values_builder); - - // copy each element one at a time - for i in 0..values.len() { - if values.is_null(i) { - b.append_null()?; - } else { - b.append(values.value(i))?; - } - } - Ok(Arc::new(b.finish())) -} - -// Packs the data as a StringDictionaryArray, if possible, with the -// key types of K -fn pack_string_to_dictionary( - array: &ArrayRef, - cast_options: &CastOptions, -) -> Result -where - K: ArrowDictionaryKeyType, -{ - let cast_values = cast_with_options(array, &DataType::Utf8, cast_options)?; - let values = cast_values.as_any().downcast_ref::().unwrap(); - - let keys_builder = PrimitiveBuilder::::new(values.len()); - let values_builder = StringBuilder::new(values.len()); - let mut b = StringDictionaryBuilder::new(keys_builder, values_builder); - - // copy each element one at a time - for i in 0..values.len() { - if values.is_null(i) { - b.append_null()?; - } else { - b.append(values.value(i))?; - } - } - Ok(Arc::new(b.finish())) -} - -/// Helper function that takes a primitive array and casts to a (generic) list array. -fn cast_primitive_to_list( - array: &ArrayRef, - to: &Field, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result { - // cast primitive to list's primitive - let cast_array = cast_with_options(array, to.data_type(), cast_options)?; - // create offsets, where if array.len() = 2, we have [0,1,2] - // Safety: - // Length of range can be trusted. - // Note: could not yet create a generic range in stable Rust. 
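
`cast_to_dictionary` goes the other way, packing a flat array into dictionary encoding; `pack_string_to_dictionary` deduplicates values as it appends them. A sketch of packing a Utf8 array into `Dictionary(Int8, Utf8)`:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::{DataType, Int8Type};

let array: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "a", "b"]));

let to_type = DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8));
let packed = cast(&array, &to_type).unwrap();
let packed = packed
    .as_any()
    .downcast_ref::<DictionaryArray<Int8Type>>()
    .unwrap();
assert_eq!(4, packed.len());           // one key per input slot
assert_eq!(2, packed.values().len());  // only the distinct values are stored
```
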
- let offsets = unsafe { - MutableBuffer::from_trusted_len_iter( - (0..=array.len()).map(|i| OffsetSize::from(i).expect("integer")), - ) - }; - - let list_data = ArrayData::new( - to_type.clone(), - array.len(), - Some(cast_array.null_count()), - cast_array - .data() - .null_bitmap() - .clone() - .map(|bitmap| bitmap.bits), - 0, - vec![offsets.into()], - vec![cast_array.data().clone()], - ); - let list_array = - Arc::new(GenericListArray::::from(list_data)) as ArrayRef; - - Ok(list_array) -} - -/// Helper function that takes an Generic list container and casts the inner datatype. -fn cast_list_inner( - array: &Arc, - to: &Field, - to_type: &DataType, - cast_options: &CastOptions, -) -> Result { - let data = array.data_ref(); - let underlying_array = make_array(data.child_data()[0].clone()); - let cast_array = cast_with_options(&underlying_array, to.data_type(), cast_options)?; - let array_data = ArrayData::new( - to_type.clone(), - array.len(), - Some(cast_array.null_count()), - cast_array - .data() - .null_bitmap() - .clone() - .map(|bitmap| bitmap.bits), - array.offset(), - // reuse offset buffer - data.buffers().to_vec(), - vec![cast_array.data().clone()], - ); - let list = GenericListArray::::from(array_data); - Ok(Arc::new(list) as ArrayRef) -} - -/// Helper function to cast from `Utf8` to `LargeUtf8` and vice versa. If the `LargeUtf8` is too large for -/// a `Utf8` array it will return an Error. -fn cast_str_container(array: &dyn Array) -> Result -where - OffsetSizeFrom: StringOffsetSizeTrait + ToPrimitive, - OffsetSizeTo: StringOffsetSizeTrait + NumCast + ArrowNativeType, -{ - let str_array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let list_data = array.data(); - let str_values_buf = str_array.value_data(); - - let offsets = unsafe { list_data.buffers()[0].typed_data::() }; - - let mut offset_builder = BufferBuilder::::new(offsets.len()); - offsets.iter().try_for_each::<_, Result<_>>(|offset| { - let offset = OffsetSizeTo::from(*offset).ok_or_else(|| { - ArrowError::ComputeError( - "large-utf8 array too large to cast to utf8-array".into(), - ) - })?; - offset_builder.append(offset); - Ok(()) - })?; - - let offset_buffer = offset_builder.finish(); - - let dtype = if matches!(std::mem::size_of::(), 8) { - DataType::LargeUtf8 - } else { - DataType::Utf8 - }; - - let mut builder = ArrayData::builder(dtype) - .len(array.len()) - .add_buffer(offset_buffer) - .add_buffer(str_values_buf); - - if let Some(buf) = list_data.null_buffer() { - builder = builder.null_bit_buffer(buf.clone()) - } - let data = builder.build(); - Ok(Arc::new(GenericStringArray::::from(data))) -} - -/// Cast the container type of List/Largelist array but not the inner types. -/// This function can leave the value data intact and only has to cast the offset dtypes. 
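
`cast_str_container` above only re-encodes the offsets buffer (i32 to i64 or back) and reuses the UTF-8 value buffer; going from `LargeUtf8` down to `Utf8` errors if the data is too large for i32 offsets. The happy path, mirroring `test_str_to_str_casts` below:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::DataType;

let array = Arc::new(StringArray::from(vec!["foo", "bar"])) as ArrayRef;
let b = cast(&array, &DataType::LargeUtf8).unwrap();
let c = b.as_any().downcast_ref::<LargeStringArray>().unwrap();
assert_eq!("foo", c.value(0));
assert_eq!("bar", c.value(1));
```
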
-fn cast_list_container( - array: &dyn Array, - _cast_options: &CastOptions, -) -> Result -where - OffsetSizeFrom: OffsetSizeTrait + ToPrimitive, - OffsetSizeTo: OffsetSizeTrait + NumCast, -{ - let data = array.data_ref(); - // the value data stored by the list - let value_data = data.child_data()[0].clone(); - - let out_dtype = match array.data_type() { - DataType::List(value_type) => { - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - DataType::LargeList(value_type.clone()) - } - DataType::LargeList(value_type) => { - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - assert_eq!( - std::mem::size_of::(), - std::mem::size_of::() - ); - if value_data.len() > i32::MAX as usize { - return Err(ArrowError::ComputeError( - "LargeList too large to cast to List".into(), - )); - } - DataType::List(value_type.clone()) - } - // implementation error - _ => unreachable!(), - }; - - let offsets = data.buffer::(0); - - let iter = offsets.iter().map(|idx| { - let idx: OffsetSizeTo = NumCast::from(*idx).unwrap(); - idx - }); - - // SAFETY - // A slice produces a trusted length iterator - let offset_buffer = unsafe { Buffer::from_trusted_len_iter(iter) }; - - // wrap up - let mut builder = ArrayData::builder(out_dtype) - .len(array.len()) - .add_buffer(offset_buffer) - .add_child_data(value_data); - - if let Some(buf) = data.null_buffer() { - builder = builder.null_bit_buffer(buf.clone()) - } - let data = builder.build(); - Ok(make_array(data)) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{buffer::Buffer, util::display::array_value_to_string}; - - #[test] - fn test_cast_i32_to_f64() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Float64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert!(5.0 - c.value(0) < f64::EPSILON); - assert!(6.0 - c.value(1) < f64::EPSILON); - assert!(7.0 - c.value(2) < f64::EPSILON); - assert!(8.0 - c.value(3) < f64::EPSILON); - assert!(9.0 - c.value(4) < f64::EPSILON); - } - - #[test] - fn test_cast_i32_to_u8() { - let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::UInt8).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(false, c.is_valid(0)); - assert_eq!(6, c.value(1)); - assert_eq!(false, c.is_valid(2)); - assert_eq!(8, c.value(3)); - // overflows return None - assert_eq!(false, c.is_valid(4)); - } - - #[test] - fn test_cast_i32_to_u8_sliced() { - let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]); - let array = Arc::new(a) as ArrayRef; - assert_eq!(0, array.offset()); - let array = array.slice(2, 3); - assert_eq!(2, array.offset()); - let b = cast(&array, &DataType::UInt8).unwrap(); - assert_eq!(3, b.len()); - assert_eq!(0, b.offset()); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(false, c.is_valid(0)); - assert_eq!(8, c.value(1)); - // overflows return None - assert_eq!(false, c.is_valid(2)); - } - - #[test] - fn test_cast_i32_to_i32() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Int32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(5, c.value(0)); - assert_eq!(6, c.value(1)); - assert_eq!(7, c.value(2)); - assert_eq!(8, c.value(3)); - assert_eq!(9, c.value(4)); - } - - #[test] - fn test_cast_i32_to_list_i32() { - let a = Int32Array::from(vec![5, 
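
`cast_list_container` does the analogous offset-only rewrite for `List` and `LargeList` while reusing the child data. A sketch that builds a small `ListArray` the same way the tests below do (assuming `ArrayData` and `Buffer` remain publicly reachable, as those tests imply) and widens it to `LargeList`:

```rust
use std::sync::Arc;
use arrow::array::*;
use arrow::buffer::Buffer;
use arrow::compute::cast;
use arrow::datatypes::{DataType, Field};

// Two lists over five values: [1, 2] and [3, 4, 5].
let value_data = Int32Array::from(vec![1, 2, 3, 4, 5]).data().clone();
let value_offsets = Buffer::from_slice_ref(&[0, 2, 5]);
let list_type = DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_type)
    .len(2)
    .add_buffer(value_offsets)
    .add_child_data(value_data)
    .build();
let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef;

// Only the offsets are rewritten (i32 -> i64); the child values are reused.
let large = cast(
    &list_array,
    &DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))),
)
.unwrap();
let large = large.as_any().downcast_ref::<LargeListArray>().unwrap();
assert_eq!(2, large.len());
assert_eq!(2, large.value_length(0));
assert_eq!(3, large.value_length(1));
```
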
6, 7, 8, 9]); - let array = Arc::new(a) as ArrayRef; - let b = cast( - &array, - &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), - ) - .unwrap(); - assert_eq!(5, b.len()); - let arr = b.as_any().downcast_ref::().unwrap(); - assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets()); - assert_eq!(1, arr.value_length(0)); - assert_eq!(1, arr.value_length(1)); - assert_eq!(1, arr.value_length(2)); - assert_eq!(1, arr.value_length(3)); - assert_eq!(1, arr.value_length(4)); - let values = arr.values(); - let c = values.as_any().downcast_ref::().unwrap(); - assert_eq!(5, c.value(0)); - assert_eq!(6, c.value(1)); - assert_eq!(7, c.value(2)); - assert_eq!(8, c.value(3)); - assert_eq!(9, c.value(4)); - } - - #[test] - fn test_cast_i32_to_list_i32_nullable() { - let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]); - let array = Arc::new(a) as ArrayRef; - let b = cast( - &array, - &DataType::List(Box::new(Field::new("item", DataType::Int32, true))), - ) - .unwrap(); - assert_eq!(5, b.len()); - assert_eq!(1, b.null_count()); - let arr = b.as_any().downcast_ref::().unwrap(); - assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets()); - assert_eq!(1, arr.value_length(0)); - assert_eq!(1, arr.value_length(1)); - assert_eq!(1, arr.value_length(2)); - assert_eq!(1, arr.value_length(3)); - assert_eq!(1, arr.value_length(4)); - let values = arr.values(); - let c = values.as_any().downcast_ref::().unwrap(); - assert_eq!(1, c.null_count()); - assert_eq!(5, c.value(0)); - assert_eq!(false, c.is_valid(1)); - assert_eq!(7, c.value(2)); - assert_eq!(8, c.value(3)); - assert_eq!(9, c.value(4)); - } - - #[test] - fn test_cast_i32_to_list_f64_nullable_sliced() { - let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]); - let array = Arc::new(a) as ArrayRef; - let array = array.slice(2, 4); - let b = cast( - &array, - &DataType::List(Box::new(Field::new("item", DataType::Float64, true))), - ) - .unwrap(); - assert_eq!(4, b.len()); - assert_eq!(1, b.null_count()); - let arr = b.as_any().downcast_ref::().unwrap(); - assert_eq!(&[0, 1, 2, 3, 4], arr.value_offsets()); - assert_eq!(1, arr.value_length(0)); - assert_eq!(1, arr.value_length(1)); - assert_eq!(1, arr.value_length(2)); - assert_eq!(1, arr.value_length(3)); - let values = arr.values(); - let c = values.as_any().downcast_ref::().unwrap(); - assert_eq!(1, c.null_count()); - assert!(7.0 - c.value(0) < f64::EPSILON); - assert!(8.0 - c.value(1) < f64::EPSILON); - assert_eq!(false, c.is_valid(2)); - assert!(10.0 - c.value(3) < f64::EPSILON); - } - - #[test] - fn test_cast_utf8_to_i32() { - let a = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Int32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(5, c.value(0)); - assert_eq!(6, c.value(1)); - assert_eq!(false, c.is_valid(2)); - assert_eq!(8, c.value(3)); - assert_eq!(false, c.is_valid(4)); - } - - #[test] - fn test_cast_with_options_utf8_to_i32() { - let a = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]); - let array = Arc::new(a) as ArrayRef; - let result = - cast_with_options(&array, &DataType::Int32, &CastOptions { safe: false }); - match result { - Ok(_) => panic!("expected error"), - Err(e) => { - assert!(e.to_string().contains( - "Cast error: Cannot cast string 'seven' to value of arrow::datatypes::types::Int32Type type" - )) - } - } - } - - #[test] - fn test_cast_bool_to_i32() { - let a = BooleanArray::from(vec![Some(true), Some(false), 
None]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Int32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(1, c.value(0)); - assert_eq!(0, c.value(1)); - assert_eq!(false, c.is_valid(2)); - } - - #[test] - fn test_cast_bool_to_f64() { - let a = BooleanArray::from(vec![Some(true), Some(false), None]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Float64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert!(1.0 - c.value(0) < f64::EPSILON); - assert!(0.0 - c.value(1) < f64::EPSILON); - assert_eq!(false, c.is_valid(2)); - } - - #[test] - #[should_panic( - expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported" - )] - fn test_cast_int32_to_timestamp() { - let a = Int32Array::from(vec![Some(2), Some(10), None]); - let array = Arc::new(a) as ArrayRef; - cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); - } - - #[test] - fn test_cast_list_i32_to_list_u16() { - // Construct a value array - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000]) - .data() - .clone(); - - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; - - let cast_array = cast( - &list_array, - &DataType::List(Box::new(Field::new("item", DataType::UInt16, true))), - ) - .unwrap(); - // 3 negative values should get lost when casting to unsigned, - // 1 value should overflow - assert_eq!(4, cast_array.null_count()); - // offsets should be the same - assert_eq!( - list_array.data().buffers().to_vec(), - cast_array.data().buffers().to_vec() - ); - let array = cast_array - .as_ref() - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(DataType::UInt16, array.value_type()); - assert_eq!(4, array.values().null_count()); - assert_eq!(3, array.value_length(0)); - assert_eq!(3, array.value_length(1)); - assert_eq!(2, array.value_length(2)); - let values = array.values(); - let u16arr = values.as_any().downcast_ref::().unwrap(); - assert_eq!(8, u16arr.len()); - assert_eq!(4, u16arr.null_count()); - - assert_eq!(0, u16arr.value(0)); - assert_eq!(0, u16arr.value(1)); - assert_eq!(0, u16arr.value(2)); - assert_eq!(false, u16arr.is_valid(3)); - assert_eq!(false, u16arr.is_valid(4)); - assert_eq!(false, u16arr.is_valid(5)); - assert_eq!(2, u16arr.value(6)); - assert_eq!(false, u16arr.is_valid(7)); - } - - #[test] - #[should_panic( - expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported" - )] - fn test_cast_list_i32_to_list_timestamp() { - // Construct a value array - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000]) - .data() - .clone(); - - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 9]); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef; - - cast( - &list_array, - &DataType::List(Box::new(Field::new( - "item", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - ))), - ) - 
.unwrap(); - } - - #[test] - fn test_cast_date32_to_date64() { - let a = Date32Array::from(vec![10000, 17890]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Date64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(864000000000, c.value(0)); - assert_eq!(1545696000000, c.value(1)); - } - - #[test] - fn test_cast_date64_to_date32() { - let a = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Date32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(10000, c.value(0)); - assert_eq!(17890, c.value(1)); - assert!(c.is_null(2)); - } - - #[test] - fn test_cast_string_to_timestamp() { - let a1 = Arc::new(StringArray::from(vec![ - Some("2020-09-08T12:00:00+00:00"), - Some("Not a valid date"), - None, - ])) as ArrayRef; - let a2 = Arc::new(LargeStringArray::from(vec![ - Some("2020-09-08T12:00:00+00:00"), - Some("Not a valid date"), - None, - ])) as ArrayRef; - for array in &[a1, a2] { - let b = - cast(array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); - let c = b - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1599566400000000000, c.value(0)); - assert!(c.is_null(1)); - assert!(c.is_null(2)); - } - } - - #[test] - fn test_cast_date32_to_int32() { - let a = Date32Array::from(vec![10000, 17890]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Int32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(10000, c.value(0)); - assert_eq!(17890, c.value(1)); - } - - #[test] - fn test_cast_int32_to_date32() { - let a = Int32Array::from(vec![10000, 17890]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Date32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(10000, c.value(0)); - assert_eq!(17890, c.value(1)); - } - - #[test] - fn test_cast_timestamp_to_date32() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000000005), Some(1545696000001), None], - Some(String::from("UTC")), - ); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Date32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(10000, c.value(0)); - assert_eq!(17890, c.value(1)); - assert!(c.is_null(2)); - } - - #[test] - fn test_cast_timestamp_to_date64() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000000005), Some(1545696000001), None], - None, - ); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Date64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(864000000005, c.value(0)); - assert_eq!(1545696000001, c.value(1)); - assert!(c.is_null(2)); - } - - #[test] - fn test_cast_timestamp_to_i64() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000000005), Some(1545696000001), None], - Some("UTC".to_string()), - ); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Int64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(&DataType::Int64, c.data_type()); - assert_eq!(864000000005, c.value(0)); - assert_eq!(1545696000001, c.value(1)); - assert!(c.is_null(2)); - } - - #[test] - fn test_cast_between_timestamps() { - let a = TimestampMillisecondArray::from_opt_vec( - vec![Some(864000003005), Some(1545696002001), None], - None, - ); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Timestamp(TimeUnit::Second, None)).unwrap(); - let c = 
b.as_any().downcast_ref::().unwrap(); - assert_eq!(864000003, c.value(0)); - assert_eq!(1545696002, c.value(1)); - assert!(c.is_null(2)); - } - - #[test] - fn test_cast_to_strings() { - let a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; - let out = cast(&a, &DataType::Utf8).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!(out, vec![Some("1"), Some("2"), Some("3")]); - let out = cast(&a, &DataType::LargeUtf8).unwrap(); - let out = out - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!(out, vec![Some("1"), Some("2"), Some("3")]); - } - - #[test] - fn test_str_to_str_casts() { - for data in vec![ - vec![Some("foo"), Some("bar"), Some("ham")], - vec![Some("foo"), None, Some("bar")], - ] { - let a = Arc::new(LargeStringArray::from(data.clone())) as ArrayRef; - let to = cast(&a, &DataType::Utf8).unwrap(); - let expect = a - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - let out = to - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!(expect, out); - - let a = Arc::new(StringArray::from(data)) as ArrayRef; - let to = cast(&a, &DataType::LargeUtf8).unwrap(); - let expect = a - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - let out = to - .as_any() - .downcast_ref::() - .unwrap() - .into_iter() - .collect::>(); - assert_eq!(expect, out); - } - } - - #[test] - fn test_cast_from_f64() { - let f64_values: Vec = vec![ - std::i64::MIN as f64, - std::i32::MIN as f64, - std::i16::MIN as f64, - std::i8::MIN as f64, - 0_f64, - std::u8::MAX as f64, - std::u16::MAX as f64, - std::u32::MAX as f64, - std::u64::MAX as f64, - ]; - let f64_array: ArrayRef = Arc::new(Float64Array::from(f64_values)); - - let f64_expected = vec![ - "-9223372036854776000.0", - "-2147483648.0", - "-32768.0", - "-128.0", - "0.0", - "255.0", - "65535.0", - "4294967295.0", - "18446744073709552000.0", - ]; - assert_eq!( - f64_expected, - get_cast_values::(&f64_array, &DataType::Float64) - ); - - let f32_expected = vec![ - "-9223372000000000000.0", - "-2147483600.0", - "-32768.0", - "-128.0", - "0.0", - "255.0", - "65535.0", - "4294967300.0", - "18446744000000000000.0", - ]; - assert_eq!( - f32_expected, - get_cast_values::(&f64_array, &DataType::Float32) - ); - - let i64_expected = vec![ - "-9223372036854775808", - "-2147483648", - "-32768", - "-128", - "0", - "255", - "65535", - "4294967295", - "null", - ]; - assert_eq!( - i64_expected, - get_cast_values::(&f64_array, &DataType::Int64) - ); - - let i32_expected = vec![ - "null", - "-2147483648", - "-32768", - "-128", - "0", - "255", - "65535", - "null", - "null", - ]; - assert_eq!( - i32_expected, - get_cast_values::(&f64_array, &DataType::Int32) - ); - - let i16_expected = vec![ - "null", "null", "-32768", "-128", "0", "255", "null", "null", "null", - ]; - assert_eq!( - i16_expected, - get_cast_values::(&f64_array, &DataType::Int16) - ); - - let i8_expected = vec![ - "null", "null", "null", "-128", "0", "null", "null", "null", "null", - ]; - assert_eq!( - i8_expected, - get_cast_values::(&f64_array, &DataType::Int8) - ); - - let u64_expected = vec![ - "null", - "null", - "null", - "null", - "0", - "255", - "65535", - "4294967295", - "null", - ]; - assert_eq!( - u64_expected, - get_cast_values::(&f64_array, &DataType::UInt64) - ); - - let u32_expected = vec![ - "null", - "null", - "null", - "null", - "0", - "255", - "65535", - "4294967295", - "null", - ]; - assert_eq!( - 
u32_expected, - get_cast_values::(&f64_array, &DataType::UInt32) - ); - - let u16_expected = vec![ - "null", "null", "null", "null", "0", "255", "65535", "null", "null", - ]; - assert_eq!( - u16_expected, - get_cast_values::(&f64_array, &DataType::UInt16) - ); - - let u8_expected = vec![ - "null", "null", "null", "null", "0", "255", "null", "null", "null", - ]; - assert_eq!( - u8_expected, - get_cast_values::(&f64_array, &DataType::UInt8) - ); - } - - #[test] - fn test_cast_from_f32() { - let f32_values: Vec = vec![ - std::i32::MIN as f32, - std::i32::MIN as f32, - std::i16::MIN as f32, - std::i8::MIN as f32, - 0_f32, - std::u8::MAX as f32, - std::u16::MAX as f32, - std::u32::MAX as f32, - std::u32::MAX as f32, - ]; - let f32_array: ArrayRef = Arc::new(Float32Array::from(f32_values)); - - let f64_expected = vec![ - "-2147483648.0", - "-2147483648.0", - "-32768.0", - "-128.0", - "0.0", - "255.0", - "65535.0", - "4294967296.0", - "4294967296.0", - ]; - assert_eq!( - f64_expected, - get_cast_values::(&f32_array, &DataType::Float64) - ); - - let f32_expected = vec![ - "-2147483600.0", - "-2147483600.0", - "-32768.0", - "-128.0", - "0.0", - "255.0", - "65535.0", - "4294967300.0", - "4294967300.0", - ]; - assert_eq!( - f32_expected, - get_cast_values::(&f32_array, &DataType::Float32) - ); - - let i64_expected = vec![ - "-2147483648", - "-2147483648", - "-32768", - "-128", - "0", - "255", - "65535", - "4294967296", - "4294967296", - ]; - assert_eq!( - i64_expected, - get_cast_values::(&f32_array, &DataType::Int64) - ); - - let i32_expected = vec![ - "-2147483648", - "-2147483648", - "-32768", - "-128", - "0", - "255", - "65535", - "null", - "null", - ]; - assert_eq!( - i32_expected, - get_cast_values::(&f32_array, &DataType::Int32) - ); - - let i16_expected = vec![ - "null", "null", "-32768", "-128", "0", "255", "null", "null", "null", - ]; - assert_eq!( - i16_expected, - get_cast_values::(&f32_array, &DataType::Int16) - ); - - let i8_expected = vec![ - "null", "null", "null", "-128", "0", "null", "null", "null", "null", - ]; - assert_eq!( - i8_expected, - get_cast_values::(&f32_array, &DataType::Int8) - ); - - let u64_expected = vec![ - "null", - "null", - "null", - "null", - "0", - "255", - "65535", - "4294967296", - "4294967296", - ]; - assert_eq!( - u64_expected, - get_cast_values::(&f32_array, &DataType::UInt64) - ); - - let u32_expected = vec![ - "null", "null", "null", "null", "0", "255", "65535", "null", "null", - ]; - assert_eq!( - u32_expected, - get_cast_values::(&f32_array, &DataType::UInt32) - ); - - let u16_expected = vec![ - "null", "null", "null", "null", "0", "255", "65535", "null", "null", - ]; - assert_eq!( - u16_expected, - get_cast_values::(&f32_array, &DataType::UInt16) - ); - - let u8_expected = vec![ - "null", "null", "null", "null", "0", "255", "null", "null", "null", - ]; - assert_eq!( - u8_expected, - get_cast_values::(&f32_array, &DataType::UInt8) - ); - } - - #[test] - fn test_cast_from_uint64() { - let u64_values: Vec = vec![ - 0, - std::u8::MAX as u64, - std::u16::MAX as u64, - std::u32::MAX as u64, - std::u64::MAX, - ]; - let u64_array: ArrayRef = Arc::new(UInt64Array::from(u64_values)); - - let f64_expected = vec![ - "0.0", - "255.0", - "65535.0", - "4294967295.0", - "18446744073709552000.0", - ]; - assert_eq!( - f64_expected, - get_cast_values::(&u64_array, &DataType::Float64) - ); - - let f32_expected = vec![ - "0.0", - "255.0", - "65535.0", - "4294967300.0", - "18446744000000000000.0", - ]; - assert_eq!( - f32_expected, - get_cast_values::(&u64_array, 
&DataType::Float32) - ); - - let i64_expected = vec!["0", "255", "65535", "4294967295", "null"]; - assert_eq!( - i64_expected, - get_cast_values::(&u64_array, &DataType::Int64) - ); - - let i32_expected = vec!["0", "255", "65535", "null", "null"]; - assert_eq!( - i32_expected, - get_cast_values::(&u64_array, &DataType::Int32) - ); - - let i16_expected = vec!["0", "255", "null", "null", "null"]; - assert_eq!( - i16_expected, - get_cast_values::(&u64_array, &DataType::Int16) - ); - - let i8_expected = vec!["0", "null", "null", "null", "null"]; - assert_eq!( - i8_expected, - get_cast_values::(&u64_array, &DataType::Int8) - ); - - let u64_expected = - vec!["0", "255", "65535", "4294967295", "18446744073709551615"]; - assert_eq!( - u64_expected, - get_cast_values::(&u64_array, &DataType::UInt64) - ); - - let u32_expected = vec!["0", "255", "65535", "4294967295", "null"]; - assert_eq!( - u32_expected, - get_cast_values::(&u64_array, &DataType::UInt32) - ); - - let u16_expected = vec!["0", "255", "65535", "null", "null"]; - assert_eq!( - u16_expected, - get_cast_values::(&u64_array, &DataType::UInt16) - ); - - let u8_expected = vec!["0", "255", "null", "null", "null"]; - assert_eq!( - u8_expected, - get_cast_values::(&u64_array, &DataType::UInt8) - ); - } - - #[test] - fn test_cast_from_uint32() { - let u32_values: Vec = vec![ - 0, - std::u8::MAX as u32, - std::u16::MAX as u32, - std::u32::MAX as u32, - ]; - let u32_array: ArrayRef = Arc::new(UInt32Array::from(u32_values)); - - let f64_expected = vec!["0.0", "255.0", "65535.0", "4294967295.0"]; - assert_eq!( - f64_expected, - get_cast_values::(&u32_array, &DataType::Float64) - ); - - let f32_expected = vec!["0.0", "255.0", "65535.0", "4294967300.0"]; - assert_eq!( - f32_expected, - get_cast_values::(&u32_array, &DataType::Float32) - ); - - let i64_expected = vec!["0", "255", "65535", "4294967295"]; - assert_eq!( - i64_expected, - get_cast_values::(&u32_array, &DataType::Int64) - ); - - let i32_expected = vec!["0", "255", "65535", "null"]; - assert_eq!( - i32_expected, - get_cast_values::(&u32_array, &DataType::Int32) - ); - - let i16_expected = vec!["0", "255", "null", "null"]; - assert_eq!( - i16_expected, - get_cast_values::(&u32_array, &DataType::Int16) - ); - - let i8_expected = vec!["0", "null", "null", "null"]; - assert_eq!( - i8_expected, - get_cast_values::(&u32_array, &DataType::Int8) - ); - - let u64_expected = vec!["0", "255", "65535", "4294967295"]; - assert_eq!( - u64_expected, - get_cast_values::(&u32_array, &DataType::UInt64) - ); - - let u32_expected = vec!["0", "255", "65535", "4294967295"]; - assert_eq!( - u32_expected, - get_cast_values::(&u32_array, &DataType::UInt32) - ); - - let u16_expected = vec!["0", "255", "65535", "null"]; - assert_eq!( - u16_expected, - get_cast_values::(&u32_array, &DataType::UInt16) - ); - - let u8_expected = vec!["0", "255", "null", "null"]; - assert_eq!( - u8_expected, - get_cast_values::(&u32_array, &DataType::UInt8) - ); - } - - #[test] - fn test_cast_from_uint16() { - let u16_values: Vec = vec![0, std::u8::MAX as u16, std::u16::MAX as u16]; - let u16_array: ArrayRef = Arc::new(UInt16Array::from(u16_values)); - - let f64_expected = vec!["0.0", "255.0", "65535.0"]; - assert_eq!( - f64_expected, - get_cast_values::(&u16_array, &DataType::Float64) - ); - - let f32_expected = vec!["0.0", "255.0", "65535.0"]; - assert_eq!( - f32_expected, - get_cast_values::(&u16_array, &DataType::Float32) - ); - - let i64_expected = vec!["0", "255", "65535"]; - assert_eq!( - i64_expected, - 
get_cast_values::(&u16_array, &DataType::Int64) - ); - - let i32_expected = vec!["0", "255", "65535"]; - assert_eq!( - i32_expected, - get_cast_values::(&u16_array, &DataType::Int32) - ); - - let i16_expected = vec!["0", "255", "null"]; - assert_eq!( - i16_expected, - get_cast_values::(&u16_array, &DataType::Int16) - ); - - let i8_expected = vec!["0", "null", "null"]; - assert_eq!( - i8_expected, - get_cast_values::(&u16_array, &DataType::Int8) - ); - - let u64_expected = vec!["0", "255", "65535"]; - assert_eq!( - u64_expected, - get_cast_values::(&u16_array, &DataType::UInt64) - ); - - let u32_expected = vec!["0", "255", "65535"]; - assert_eq!( - u32_expected, - get_cast_values::(&u16_array, &DataType::UInt32) - ); - - let u16_expected = vec!["0", "255", "65535"]; - assert_eq!( - u16_expected, - get_cast_values::(&u16_array, &DataType::UInt16) - ); - - let u8_expected = vec!["0", "255", "null"]; - assert_eq!( - u8_expected, - get_cast_values::(&u16_array, &DataType::UInt8) - ); - } - - #[test] - fn test_cast_from_uint8() { - let u8_values: Vec = vec![0, std::u8::MAX]; - let u8_array: ArrayRef = Arc::new(UInt8Array::from(u8_values)); - - let f64_expected = vec!["0.0", "255.0"]; - assert_eq!( - f64_expected, - get_cast_values::(&u8_array, &DataType::Float64) - ); - - let f32_expected = vec!["0.0", "255.0"]; - assert_eq!( - f32_expected, - get_cast_values::(&u8_array, &DataType::Float32) - ); - - let i64_expected = vec!["0", "255"]; - assert_eq!( - i64_expected, - get_cast_values::(&u8_array, &DataType::Int64) - ); - - let i32_expected = vec!["0", "255"]; - assert_eq!( - i32_expected, - get_cast_values::(&u8_array, &DataType::Int32) - ); - - let i16_expected = vec!["0", "255"]; - assert_eq!( - i16_expected, - get_cast_values::(&u8_array, &DataType::Int16) - ); - - let i8_expected = vec!["0", "null"]; - assert_eq!( - i8_expected, - get_cast_values::(&u8_array, &DataType::Int8) - ); - - let u64_expected = vec!["0", "255"]; - assert_eq!( - u64_expected, - get_cast_values::(&u8_array, &DataType::UInt64) - ); - - let u32_expected = vec!["0", "255"]; - assert_eq!( - u32_expected, - get_cast_values::(&u8_array, &DataType::UInt32) - ); - - let u16_expected = vec!["0", "255"]; - assert_eq!( - u16_expected, - get_cast_values::(&u8_array, &DataType::UInt16) - ); - - let u8_expected = vec!["0", "255"]; - assert_eq!( - u8_expected, - get_cast_values::(&u8_array, &DataType::UInt8) - ); - } - - #[test] - fn test_cast_from_int64() { - let i64_values: Vec = vec![ - std::i64::MIN, - std::i32::MIN as i64, - std::i16::MIN as i64, - std::i8::MIN as i64, - 0, - std::i8::MAX as i64, - std::i16::MAX as i64, - std::i32::MAX as i64, - std::i64::MAX, - ]; - let i64_array: ArrayRef = Arc::new(Int64Array::from(i64_values)); - - let f64_expected = vec![ - "-9223372036854776000.0", - "-2147483648.0", - "-32768.0", - "-128.0", - "0.0", - "127.0", - "32767.0", - "2147483647.0", - "9223372036854776000.0", - ]; - assert_eq!( - f64_expected, - get_cast_values::(&i64_array, &DataType::Float64) - ); - - let f32_expected = vec![ - "-9223372000000000000.0", - "-2147483600.0", - "-32768.0", - "-128.0", - "0.0", - "127.0", - "32767.0", - "2147483600.0", - "9223372000000000000.0", - ]; - assert_eq!( - f32_expected, - get_cast_values::(&i64_array, &DataType::Float32) - ); - - let i64_expected = vec![ - "-9223372036854775808", - "-2147483648", - "-32768", - "-128", - "0", - "127", - "32767", - "2147483647", - "9223372036854775807", - ]; - assert_eq!( - i64_expected, - get_cast_values::(&i64_array, &DataType::Int64) - ); - - let 
i32_expected = vec![ - "null", - "-2147483648", - "-32768", - "-128", - "0", - "127", - "32767", - "2147483647", - "null", - ]; - assert_eq!( - i32_expected, - get_cast_values::(&i64_array, &DataType::Int32) - ); - - assert_eq!( - i32_expected, - get_cast_values::(&i64_array, &DataType::Date32) - ); - - let i16_expected = vec![ - "null", "null", "-32768", "-128", "0", "127", "32767", "null", "null", - ]; - assert_eq!( - i16_expected, - get_cast_values::(&i64_array, &DataType::Int16) - ); - - let i8_expected = vec![ - "null", "null", "null", "-128", "0", "127", "null", "null", "null", - ]; - assert_eq!( - i8_expected, - get_cast_values::(&i64_array, &DataType::Int8) - ); - - let u64_expected = vec![ - "null", - "null", - "null", - "null", - "0", - "127", - "32767", - "2147483647", - "9223372036854775807", - ]; - assert_eq!( - u64_expected, - get_cast_values::(&i64_array, &DataType::UInt64) - ); - - let u32_expected = vec![ - "null", - "null", - "null", - "null", - "0", - "127", - "32767", - "2147483647", - "null", - ]; - assert_eq!( - u32_expected, - get_cast_values::(&i64_array, &DataType::UInt32) - ); - - let u16_expected = vec![ - "null", "null", "null", "null", "0", "127", "32767", "null", "null", - ]; - assert_eq!( - u16_expected, - get_cast_values::(&i64_array, &DataType::UInt16) - ); - - let u8_expected = vec![ - "null", "null", "null", "null", "0", "127", "null", "null", "null", - ]; - assert_eq!( - u8_expected, - get_cast_values::(&i64_array, &DataType::UInt8) - ); - } - - #[test] - fn test_cast_from_int32() { - let i32_values: Vec = vec![ - std::i32::MIN as i32, - std::i16::MIN as i32, - std::i8::MIN as i32, - 0, - std::i8::MAX as i32, - std::i16::MAX as i32, - std::i32::MAX as i32, - ]; - let i32_array: ArrayRef = Arc::new(Int32Array::from(i32_values)); - - let f64_expected = vec![ - "-2147483648.0", - "-32768.0", - "-128.0", - "0.0", - "127.0", - "32767.0", - "2147483647.0", - ]; - assert_eq!( - f64_expected, - get_cast_values::(&i32_array, &DataType::Float64) - ); - - let f32_expected = vec![ - "-2147483600.0", - "-32768.0", - "-128.0", - "0.0", - "127.0", - "32767.0", - "2147483600.0", - ]; - assert_eq!( - f32_expected, - get_cast_values::(&i32_array, &DataType::Float32) - ); - - let i16_expected = vec!["null", "-32768", "-128", "0", "127", "32767", "null"]; - assert_eq!( - i16_expected, - get_cast_values::(&i32_array, &DataType::Int16) - ); - - let i8_expected = vec!["null", "null", "-128", "0", "127", "null", "null"]; - assert_eq!( - i8_expected, - get_cast_values::(&i32_array, &DataType::Int8) - ); - - let u64_expected = - vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; - assert_eq!( - u64_expected, - get_cast_values::(&i32_array, &DataType::UInt64) - ); - - let u32_expected = - vec!["null", "null", "null", "0", "127", "32767", "2147483647"]; - assert_eq!( - u32_expected, - get_cast_values::(&i32_array, &DataType::UInt32) - ); - - let u16_expected = vec!["null", "null", "null", "0", "127", "32767", "null"]; - assert_eq!( - u16_expected, - get_cast_values::(&i32_array, &DataType::UInt16) - ); - - let u8_expected = vec!["null", "null", "null", "0", "127", "null", "null"]; - assert_eq!( - u8_expected, - get_cast_values::(&i32_array, &DataType::UInt8) - ); - - // The date32 to date64 cast increases the numerical values in order to keep the same dates. 
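
The expected values around this point follow two rules that the cast kernel applies throughout these tests: a numeric value that does not fit in the target type becomes null rather than wrapping, and the Date32 to Date64 cast (like the Int32 to Date64 cast checked just below) multiplies by 86,400,000 milliseconds per day so the same calendar date is preserved. A minimal sketch of both behaviours, assuming `cast` is reachable at `arrow::compute` as it is in the crate this diff removes:

```rust
// Sketch of the cast semantics exercised by the tests above: out-of-range
// values become nulls, and Date32 -> Date64 scales days to milliseconds.
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Date32Array, Date64Array, Float64Array, Int32Array};
use arrow::compute::cast;
use arrow::datatypes::DataType;

fn main() {
    // 3.5e9 does not fit in an i32, so the cast yields a null rather than wrapping.
    let floats = Arc::new(Float64Array::from(vec![1.0, 3.5e9])) as ArrayRef;
    let ints = cast(&floats, &DataType::Int32).unwrap();
    let ints = ints.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(ints.value(0), 1);
    assert!(ints.is_null(1));

    // Date32 stores days since the epoch, Date64 stores milliseconds,
    // so the cast multiplies each value by 86_400_000 to keep the same date.
    let days = Arc::new(Date32Array::from(vec![1, 10_000])) as ArrayRef;
    let millis = cast(&days, &DataType::Date64).unwrap();
    let millis = millis.as_any().downcast_ref::<Date64Array>().unwrap();
    assert_eq!(millis.value(0), 86_400_000);
    assert_eq!(millis.value(1), 864_000_000_000);
}
```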
- let i64_expected = vec![ - "-185542587187200000", - "-2831155200000", - "-11059200000", - "0", - "10972800000", - "2831068800000", - "185542587100800000", - ]; - assert_eq!( - i64_expected, - get_cast_values::(&i32_array, &DataType::Date64) - ); - } - - #[test] - fn test_cast_from_int16() { - let i16_values: Vec = vec![ - std::i16::MIN, - std::i8::MIN as i16, - 0, - std::i8::MAX as i16, - std::i16::MAX, - ]; - let i16_array: ArrayRef = Arc::new(Int16Array::from(i16_values)); - - let f64_expected = vec!["-32768.0", "-128.0", "0.0", "127.0", "32767.0"]; - assert_eq!( - f64_expected, - get_cast_values::(&i16_array, &DataType::Float64) - ); - - let f32_expected = vec!["-32768.0", "-128.0", "0.0", "127.0", "32767.0"]; - assert_eq!( - f32_expected, - get_cast_values::(&i16_array, &DataType::Float32) - ); - - let i64_expected = vec!["-32768", "-128", "0", "127", "32767"]; - assert_eq!( - i64_expected, - get_cast_values::(&i16_array, &DataType::Int64) - ); - - let i32_expected = vec!["-32768", "-128", "0", "127", "32767"]; - assert_eq!( - i32_expected, - get_cast_values::(&i16_array, &DataType::Int32) - ); - - let i16_expected = vec!["-32768", "-128", "0", "127", "32767"]; - assert_eq!( - i16_expected, - get_cast_values::(&i16_array, &DataType::Int16) - ); - - let i8_expected = vec!["null", "-128", "0", "127", "null"]; - assert_eq!( - i8_expected, - get_cast_values::(&i16_array, &DataType::Int8) - ); - - let u64_expected = vec!["null", "null", "0", "127", "32767"]; - assert_eq!( - u64_expected, - get_cast_values::(&i16_array, &DataType::UInt64) - ); - - let u32_expected = vec!["null", "null", "0", "127", "32767"]; - assert_eq!( - u32_expected, - get_cast_values::(&i16_array, &DataType::UInt32) - ); - - let u16_expected = vec!["null", "null", "0", "127", "32767"]; - assert_eq!( - u16_expected, - get_cast_values::(&i16_array, &DataType::UInt16) - ); - - let u8_expected = vec!["null", "null", "0", "127", "null"]; - assert_eq!( - u8_expected, - get_cast_values::(&i16_array, &DataType::UInt8) - ); - } - - #[test] - fn test_cast_from_date32() { - let i32_values: Vec = vec![ - std::i32::MIN as i32, - std::i16::MIN as i32, - std::i8::MIN as i32, - 0, - std::i8::MAX as i32, - std::i16::MAX as i32, - std::i32::MAX as i32, - ]; - let date32_array: ArrayRef = Arc::new(Date32Array::from(i32_values)); - - let i64_expected = vec![ - "-2147483648", - "-32768", - "-128", - "0", - "127", - "32767", - "2147483647", - ]; - assert_eq!( - i64_expected, - get_cast_values::(&date32_array, &DataType::Int64) - ); - } - - #[test] - fn test_cast_from_int8() { - let i8_values: Vec = vec![std::i8::MIN, 0, std::i8::MAX]; - let i8_array: ArrayRef = Arc::new(Int8Array::from(i8_values)); - - let f64_expected = vec!["-128.0", "0.0", "127.0"]; - assert_eq!( - f64_expected, - get_cast_values::(&i8_array, &DataType::Float64) - ); - - let f32_expected = vec!["-128.0", "0.0", "127.0"]; - assert_eq!( - f32_expected, - get_cast_values::(&i8_array, &DataType::Float32) - ); - - let i64_expected = vec!["-128", "0", "127"]; - assert_eq!( - i64_expected, - get_cast_values::(&i8_array, &DataType::Int64) - ); - - let i32_expected = vec!["-128", "0", "127"]; - assert_eq!( - i32_expected, - get_cast_values::(&i8_array, &DataType::Int32) - ); - - let i16_expected = vec!["-128", "0", "127"]; - assert_eq!( - i16_expected, - get_cast_values::(&i8_array, &DataType::Int16) - ); - - let i8_expected = vec!["-128", "0", "127"]; - assert_eq!( - i8_expected, - get_cast_values::(&i8_array, &DataType::Int8) - ); - - let u64_expected = vec!["null", "0", 
"127"]; - assert_eq!( - u64_expected, - get_cast_values::(&i8_array, &DataType::UInt64) - ); - - let u32_expected = vec!["null", "0", "127"]; - assert_eq!( - u32_expected, - get_cast_values::(&i8_array, &DataType::UInt32) - ); - - let u16_expected = vec!["null", "0", "127"]; - assert_eq!( - u16_expected, - get_cast_values::(&i8_array, &DataType::UInt16) - ); - - let u8_expected = vec!["null", "0", "127"]; - assert_eq!( - u8_expected, - get_cast_values::(&i8_array, &DataType::UInt8) - ); - } - - /// Convert `array` into a vector of strings by casting to data type dt - fn get_cast_values(array: &ArrayRef, dt: &DataType) -> Vec - where - T: ArrowNumericType, - { - let c = cast(&array, dt).unwrap(); - let a = c.as_any().downcast_ref::>().unwrap(); - let mut v: Vec = vec![]; - for i in 0..array.len() { - if a.is_null(i) { - v.push("null".to_string()) - } else { - v.push(format!("{:?}", a.value(i))); - } - } - v - } - - #[test] - fn test_cast_utf8_dict() { - // FROM a dictionary with of Utf8 values - use DataType::*; - - let keys_builder = PrimitiveBuilder::::new(10); - let values_builder = StringBuilder::new(10); - let mut builder = StringDictionaryBuilder::new(keys_builder, values_builder); - builder.append("one").unwrap(); - builder.append_null().unwrap(); - builder.append("three").unwrap(); - let array: ArrayRef = Arc::new(builder.finish()); - - let expected = vec!["one", "null", "three"]; - - // Test casting TO StringArray - let cast_type = Utf8; - let cast_array = cast(&array, &cast_type).expect("cast to UTF-8 failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - - // Test casting TO Dictionary (with different index sizes) - - let cast_type = Dictionary(Box::new(Int16), Box::new(Utf8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - - let cast_type = Dictionary(Box::new(Int32), Box::new(Utf8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - - let cast_type = Dictionary(Box::new(Int64), Box::new(Utf8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - - let cast_type = Dictionary(Box::new(UInt8), Box::new(Utf8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - - let cast_type = Dictionary(Box::new(UInt16), Box::new(Utf8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - - let cast_type = Dictionary(Box::new(UInt32), Box::new(Utf8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - - let cast_type = Dictionary(Box::new(UInt64), Box::new(Utf8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - } - - #[test] - fn test_cast_dict_to_dict_bad_index_value_primitive() { - use DataType::*; - // test converting from an array that has indexes of a type - // that are out of 
bounds for a particular other kind of - // index. - - let keys_builder = PrimitiveBuilder::::new(10); - let values_builder = PrimitiveBuilder::::new(10); - let mut builder = PrimitiveDictionaryBuilder::new(keys_builder, values_builder); - - // add 200 distinct values (which can be stored by a - // dictionary indexed by int32, but not a dictionary indexed - // with int8) - for i in 0..200 { - builder.append(i).unwrap(); - } - let array: ArrayRef = Arc::new(builder.finish()); - - let cast_type = Dictionary(Box::new(Int8), Box::new(Utf8)); - let res = cast(&array, &cast_type); - assert!(res.is_err()); - let actual_error = format!("{:?}", res); - let expected_error = "Could not convert 72 dictionary indexes from Int32 to Int8"; - assert!( - actual_error.contains(expected_error), - "did not find expected error '{}' in actual error '{}'", - actual_error, - expected_error - ); - } - - #[test] - fn test_cast_dict_to_dict_bad_index_value_utf8() { - use DataType::*; - // Same test as test_cast_dict_to_dict_bad_index_value but use - // string values (and encode the expected behavior here); - - let keys_builder = PrimitiveBuilder::::new(10); - let values_builder = StringBuilder::new(10); - let mut builder = StringDictionaryBuilder::new(keys_builder, values_builder); - - // add 200 distinct values (which can be stored by a - // dictionary indexed by int32, but not a dictionary indexed - // with int8) - for i in 0..200 { - let val = format!("val{}", i); - builder.append(&val).unwrap(); - } - let array: ArrayRef = Arc::new(builder.finish()); - - let cast_type = Dictionary(Box::new(Int8), Box::new(Utf8)); - let res = cast(&array, &cast_type); - assert!(res.is_err()); - let actual_error = format!("{:?}", res); - let expected_error = "Could not convert 72 dictionary indexes from Int32 to Int8"; - assert!( - actual_error.contains(expected_error), - "did not find expected error '{}' in actual error '{}'", - actual_error, - expected_error - ); - } - - #[test] - fn test_cast_primitive_dict() { - // FROM a dictionary with of INT32 values - use DataType::*; - - let keys_builder = PrimitiveBuilder::::new(10); - let values_builder = PrimitiveBuilder::::new(10); - let mut builder = PrimitiveDictionaryBuilder::new(keys_builder, values_builder); - builder.append(1).unwrap(); - builder.append_null().unwrap(); - builder.append(3).unwrap(); - let array: ArrayRef = Arc::new(builder.finish()); - - let expected = vec!["1", "null", "3"]; - - // Test casting TO PrimitiveArray, different dictionary type - let cast_array = cast(&array, &Utf8).expect("cast to UTF-8 failed"); - assert_eq!(array_to_strings(&cast_array), expected); - assert_eq!(cast_array.data_type(), &Utf8); - - let cast_array = cast(&array, &Int64).expect("cast to int64 failed"); - assert_eq!(array_to_strings(&cast_array), expected); - assert_eq!(cast_array.data_type(), &Int64); - } - - #[test] - fn test_cast_primitive_array_to_dict() { - use DataType::*; - - let mut builder = PrimitiveBuilder::::new(10); - builder.append_value(1).unwrap(); - builder.append_null().unwrap(); - builder.append_value(3).unwrap(); - let array: ArrayRef = Arc::new(builder.finish()); - - let expected = vec!["1", "null", "3"]; - - // Cast to a dictionary (same value type, Int32) - let cast_type = Dictionary(Box::new(UInt8), Box::new(Int32)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - - // Cast to a dictionary (different value type, Int8) - let cast_type = 
Dictionary(Box::new(UInt8), Box::new(Int8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - } - - #[test] - fn test_cast_string_array_to_dict() { - use DataType::*; - - let array = Arc::new(StringArray::from(vec![Some("one"), None, Some("three")])) - as ArrayRef; - - let expected = vec!["one", "null", "three"]; - - // Cast to a dictionary (same value type, Utf8) - let cast_type = Dictionary(Box::new(UInt8), Box::new(Utf8)); - let cast_array = cast(&array, &cast_type).expect("cast failed"); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(array_to_strings(&cast_array), expected); - } - - #[test] - fn test_cast_null_array_to_int32() { - let array = Arc::new(NullArray::new(6)) as ArrayRef; - - let expected = Int32Array::from(vec![None; 6]); - - // Cast to a dictionary (same value type, Utf8) - let cast_type = DataType::Int32; - let cast_array = cast(&array, &cast_type).expect("cast failed"); - let cast_array = as_primitive_array::(&cast_array); - assert_eq!(cast_array.data_type(), &cast_type); - assert_eq!(cast_array, &expected); - } - - /// Print the `DictionaryArray` `array` as a vector of strings - fn array_to_strings(array: &ArrayRef) -> Vec { - (0..array.len()) - .map(|i| { - if array.is_null(i) { - "null".to_string() - } else { - array_value_to_string(array, i).expect("Convert array to String") - } - }) - .collect() - } - - #[test] - fn test_cast_utf8_to_date32() { - use chrono::NaiveDate; - let from_ymd = chrono::NaiveDate::from_ymd; - let since = chrono::NaiveDate::signed_duration_since; - - let a = StringArray::from(vec![ - "2000-01-01", // valid date with leading 0s - "2000-2-2", // valid date without leading 0s - "2000-00-00", // invalid month and day - "2000-01-01T12:00:00", // date + time is invalid - "2000", // just a year is invalid - ]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Date32).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - - // test valid inputs - let date_value = since(NaiveDate::from_ymd(2000, 1, 1), from_ymd(1970, 1, 1)) - .num_days() as i32; - assert_eq!(true, c.is_valid(0)); // "2000-01-01" - assert_eq!(date_value, c.value(0)); - - let date_value = since(NaiveDate::from_ymd(2000, 2, 2), from_ymd(1970, 1, 1)) - .num_days() as i32; - assert_eq!(true, c.is_valid(1)); // "2000-2-2" - assert_eq!(date_value, c.value(1)); - - // test invalid inputs - assert_eq!(false, c.is_valid(2)); // "2000-00-00" - assert_eq!(false, c.is_valid(3)); // "2000-01-01T12:00:00" - assert_eq!(false, c.is_valid(4)); // "2000" - } - - #[test] - fn test_cast_utf8_to_date64() { - let a = StringArray::from(vec![ - "2000-01-01T12:00:00", // date + time valid - "2020-12-15T12:34:56", // date + time valid - "2020-2-2T12:34:56", // valid date time without leading 0s - "2000-00-00T12:00:00", // invalid month and day - "2000-01-01 12:00:00", // missing the 'T' - "2000-01-01", // just a date is invalid - ]); - let array = Arc::new(a) as ArrayRef; - let b = cast(&array, &DataType::Date64).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - - // test valid inputs - assert_eq!(true, c.is_valid(0)); // "2000-01-01T12:00:00" - assert_eq!(946728000000, c.value(0)); - assert_eq!(true, c.is_valid(1)); // "2020-12-15T12:34:56" - assert_eq!(1608035696000, c.value(1)); - assert_eq!(true, c.is_valid(2)); // "2020-2-2T12:34:56" - assert_eq!(1580646896000, c.value(2)); - - // test invalid inputs - assert_eq!(false, 
c.is_valid(3)); // "2000-00-00T12:00:00" - assert_eq!(false, c.is_valid(4)); // "2000-01-01 12:00:00" - assert_eq!(false, c.is_valid(5)); // "2000-01-01" - } - - #[test] - fn test_can_cast_types() { - // this function attempts to ensure that can_cast_types stays - // in sync with cast. It simply tries all combinations of - // types and makes sure that if `can_cast_types` returns - // true, so does `cast` - - let all_types = get_all_types(); - - for array in get_arrays_of_all_types() { - for to_type in &all_types { - println!("Test casting {:?} --> {:?}", array.data_type(), to_type); - let cast_result = cast(&array, &to_type); - let reported_cast_ability = can_cast_types(array.data_type(), to_type); - - // check for mismatch - match (cast_result, reported_cast_ability) { - (Ok(_), false) => { - panic!("Was able to cast array {:?} from {:?} to {:?} but can_cast_types reported false", - array, array.data_type(), to_type) - } - (Err(e), true) => { - panic!("Was not able to cast array {:?} from {:?} to {:?} but can_cast_types reported true. \ - Error was {:?}", - array, array.data_type(), to_type, e) - } - // otherwise it was a match - _ => {} - }; - } - } - } - - #[test] - fn test_cast_list_containers() { - // large-list to list - let array = Arc::new(make_large_list_array()) as ArrayRef; - let list_array = cast( - &array, - &DataType::List(Box::new(Field::new("", DataType::Int32, false))), - ) - .unwrap(); - let actual = list_array.as_any().downcast_ref::().unwrap(); - let expected = array.as_any().downcast_ref::().unwrap(); - - assert_eq!(&expected.value(0), &actual.value(0)); - assert_eq!(&expected.value(1), &actual.value(1)); - assert_eq!(&expected.value(2), &actual.value(2)); - - // list to large-list - let array = Arc::new(make_list_array()) as ArrayRef; - let large_list_array = cast( - &array, - &DataType::LargeList(Box::new(Field::new("", DataType::Int32, false))), - ) - .unwrap(); - let actual = large_list_array - .as_any() - .downcast_ref::() - .unwrap(); - let expected = array.as_any().downcast_ref::().unwrap(); - - assert_eq!(&expected.value(0), &actual.value(0)); - assert_eq!(&expected.value(1), &actual.value(1)); - assert_eq!(&expected.value(2), &actual.value(2)); - } - - /// Create instances of arrays with varying types for cast tests - fn get_arrays_of_all_types() -> Vec { - let tz_name = String::from("America/New_York"); - let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; - vec![ - Arc::new(BinaryArray::from(binary_data.clone())), - Arc::new(LargeBinaryArray::from(binary_data.clone())), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - Arc::new(make_list_array()), - Arc::new(make_large_list_array()), - Arc::new(make_fixed_size_list_array()), - Arc::new(make_fixed_size_binary_array()), - Arc::new(StructArray::from(vec![ - ( - Field::new("a", DataType::Boolean, false), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, - ), - ( - Field::new("b", DataType::Int32, false), - Arc::new(Int32Array::from(vec![42, 28, 19, 31])), - ), - ])), - //Arc::new(make_union_array()), - Arc::new(NullArray::new(10)), - 
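
The `test_can_cast_types` test above enforces an invariant worth spelling out: `can_cast_types` is only useful if it agrees with what `cast` actually does for every pair of types. A minimal standalone sketch of the same consistency check, assuming both functions are reachable under `arrow::compute`; the arrays and target types here are arbitrary stand-ins, not the full matrix the test builds:

```rust
// Sketch of the can_cast_types / cast agreement check.
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Int32Array, StringArray};
use arrow::compute::{can_cast_types, cast};
use arrow::datatypes::{DataType, TimeUnit};

fn main() {
    let arrays: Vec<ArrayRef> = vec![
        Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
        Arc::new(StringArray::from(vec!["1", "2", "3"])) as ArrayRef,
    ];
    let targets = vec![
        DataType::Int64,
        DataType::Utf8,
        DataType::Timestamp(TimeUnit::Nanosecond, None),
    ];

    for array in &arrays {
        for to_type in &targets {
            let reported = can_cast_types(array.data_type(), to_type);
            let actual = cast(array, to_type).is_ok();
            // The two must always agree; a mismatch means the cast kernel and
            // its capability check have drifted apart.
            assert_eq!(reported, actual, "{:?} -> {:?}", array.data_type(), to_type);
        }
    }
}
```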
Arc::new(StringArray::from(vec!["foo", "bar"])), - Arc::new(LargeStringArray::from(vec!["foo", "bar"])), - Arc::new(BooleanArray::from(vec![true, false])), - Arc::new(Int8Array::from(vec![1, 2])), - Arc::new(Int16Array::from(vec![1, 2])), - Arc::new(Int32Array::from(vec![1, 2])), - Arc::new(Int64Array::from(vec![1, 2])), - Arc::new(UInt8Array::from(vec![1, 2])), - Arc::new(UInt16Array::from(vec![1, 2])), - Arc::new(UInt32Array::from(vec![1, 2])), - Arc::new(UInt64Array::from(vec![1, 2])), - Arc::new(Float32Array::from(vec![1.0, 2.0])), - Arc::new(Float64Array::from(vec![1.0, 2.0])), - Arc::new(TimestampSecondArray::from_vec(vec![1000, 2000], None)), - Arc::new(TimestampMillisecondArray::from_vec(vec![1000, 2000], None)), - Arc::new(TimestampMicrosecondArray::from_vec(vec![1000, 2000], None)), - Arc::new(TimestampNanosecondArray::from_vec(vec![1000, 2000], None)), - Arc::new(TimestampSecondArray::from_vec( - vec![1000, 2000], - Some(tz_name.clone()), - )), - Arc::new(TimestampMillisecondArray::from_vec( - vec![1000, 2000], - Some(tz_name.clone()), - )), - Arc::new(TimestampMicrosecondArray::from_vec( - vec![1000, 2000], - Some(tz_name.clone()), - )), - Arc::new(TimestampNanosecondArray::from_vec( - vec![1000, 2000], - Some(tz_name), - )), - Arc::new(Date32Array::from(vec![1000, 2000])), - Arc::new(Date64Array::from(vec![1000, 2000])), - Arc::new(Time32SecondArray::from(vec![1000, 2000])), - Arc::new(Time32MillisecondArray::from(vec![1000, 2000])), - Arc::new(Time64MicrosecondArray::from(vec![1000, 2000])), - Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), - Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])), - Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), - Arc::new(DurationSecondArray::from(vec![1000, 2000])), - Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), - Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), - Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), - ] - } - - fn make_list_array() -> ListArray { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - ListArray::from(list_data) - } - - fn make_large_list_array() -> LargeListArray { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 8]); - - // Construct a list array from the above two - let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - LargeListArray::from(list_data) - } - - fn make_fixed_size_list_array() -> FixedSizeListArray { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 
3, 4, 5, 6, 7, 8, 9])) - .build(); - - // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, true)), - 2, - ); - let list_data = ArrayData::builder(list_data_type) - .len(5) - .add_child_data(value_data) - .build(); - FixedSizeListArray::from(list_data) - } - - fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { - let values: [u8; 15] = *b"hellotherearrow"; - - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(3) - .add_buffer(Buffer::from(&values[..])) - .build(); - FixedSizeBinaryArray::from(array_data) - } - - fn make_union_array() -> UnionArray { - let mut builder = UnionBuilder::new_dense(7); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.build().unwrap() - } - - /// Creates a dictionary with primitive dictionary values, and keys of type K - fn make_dictionary_primitive() -> ArrayRef { - let keys_builder = PrimitiveBuilder::::new(2); - // Pick Int32 arbitrarily for dictionary values - let values_builder = PrimitiveBuilder::::new(2); - let mut b = PrimitiveDictionaryBuilder::new(keys_builder, values_builder); - b.append(1).unwrap(); - b.append(2).unwrap(); - Arc::new(b.finish()) - } - - /// Creates a dictionary with utf8 values, and keys of type K - fn make_dictionary_utf8() -> ArrayRef { - let keys_builder = PrimitiveBuilder::::new(2); - // Pick Int32 arbitrarily for dictionary values - let values_builder = StringBuilder::new(2); - let mut b = StringDictionaryBuilder::new(keys_builder, values_builder); - b.append("foo").unwrap(); - b.append("bar").unwrap(); - Arc::new(b.finish()) - } - - // Get a selection of datatypes to try and cast to - fn get_all_types() -> Vec { - use DataType::*; - let tz_name = String::from("America/New_York"); - - vec![ - Null, - Boolean, - Int8, - Int16, - Int32, - UInt64, - UInt8, - UInt16, - UInt32, - UInt64, - Float16, - Float32, - Float64, - Timestamp(TimeUnit::Second, None), - Timestamp(TimeUnit::Millisecond, None), - Timestamp(TimeUnit::Microsecond, None), - Timestamp(TimeUnit::Nanosecond, None), - Timestamp(TimeUnit::Second, Some(tz_name.clone())), - Timestamp(TimeUnit::Millisecond, Some(tz_name.clone())), - Timestamp(TimeUnit::Microsecond, Some(tz_name.clone())), - Timestamp(TimeUnit::Nanosecond, Some(tz_name)), - Date32, - Date64, - Time32(TimeUnit::Second), - Time32(TimeUnit::Millisecond), - Time64(TimeUnit::Microsecond), - Time64(TimeUnit::Nanosecond), - Duration(TimeUnit::Second), - Duration(TimeUnit::Millisecond), - Duration(TimeUnit::Microsecond), - Duration(TimeUnit::Nanosecond), - Interval(IntervalUnit::YearMonth), - Interval(IntervalUnit::DayTime), - Binary, - FixedSizeBinary(10), - LargeBinary, - Utf8, - LargeUtf8, - List(Box::new(Field::new("item", DataType::Int8, true))), - List(Box::new(Field::new("item", DataType::Utf8, true))), - FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), - FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), - LargeList(Box::new(Field::new("item", DataType::Int8, true))), - LargeList(Box::new(Field::new("item", DataType::Utf8, false))), - Struct(vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ]), - Union(vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ]), - Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), - Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - 
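
The `make_dictionary_*` helpers here, and the dictionary index-overflow tests earlier in this file, hinge on how dictionary keys work: a dictionary with 200 distinct values needs key indexes 0..=199, and the 72 indexes from 128 to 199 cannot be represented as Int8, which is exactly the count in the expected error message above. A small sketch assuming the builder APIs shown in this diff:

```rust
// Sketch of building a string dictionary and narrowing / widening its key type.
use std::sync::Arc;

use arrow::array::{ArrayRef, PrimitiveBuilder, StringBuilder, StringDictionaryBuilder};
use arrow::compute::cast;
use arrow::datatypes::{DataType, Int32Type};

fn main() {
    // Utf8 dictionary keyed by Int32, mirroring `make_dictionary_utf8` above.
    let keys = PrimitiveBuilder::<Int32Type>::new(200);
    let values = StringBuilder::new(200);
    let mut builder = StringDictionaryBuilder::new(keys, values);
    for i in 0..200 {
        builder.append(&format!("val{}", i)).unwrap();
    }
    let array: ArrayRef = Arc::new(builder.finish());

    // Narrowing the keys to Int8 must fail: only indexes 0..=127 are representable.
    let narrowed = cast(
        &array,
        &DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)),
    );
    assert!(narrowed.is_err());

    // Widening the key type succeeds, since every existing index still fits.
    let widened = cast(
        &array,
        &DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)),
    );
    assert!(widened.is_ok());
}
```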
            Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)),
-        ]
-    }
-}
diff --git a/rust/arrow/src/compute/kernels/cast_utils.rs b/rust/arrow/src/compute/kernels/cast_utils.rs
deleted file mode 100644
index a06bf421ea4..00000000000
--- a/rust/arrow/src/compute/kernels/cast_utils.rs
+++ /dev/null
@@ -1,299 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use crate::error::{ArrowError, Result};
-use chrono::{prelude::*, LocalResult};
-
-/// Accepts a string in RFC3339 / ISO8601 standard format and some
-/// variants and converts it to a nanosecond precision timestamp.
-///
-/// Implements the `to_timestamp` function to convert a string to a
-/// timestamp, following the model of spark SQL’s to_`timestamp`.
-///
-/// In addition to RFC3339 / ISO8601 standard timestamps, it also
-/// accepts strings that use a space ` ` to separate the date and time
-/// as well as strings that have no explicit timezone offset.
-///
-/// Examples of accepted inputs:
-/// * `1997-01-31T09:26:56.123Z` # RCF3339
-/// * `1997-01-31T09:26:56.123-05:00` # RCF3339
-/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T
-/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified
-/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset
-/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds
-//
-/// Internally, this function uses the `chrono` library for the
-/// datetime parsing
-///
-/// We hope to extend this function in the future with a second
-/// parameter to specifying the format string.
-///
-/// ## Timestamp Precision
-///
-/// Function uses the maximum precision timestamps supported by
-/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This
-/// means the range of dates that timestamps can represent is ~1677 AD
-/// to 2262 AM
-///
-///
-/// ## Timezone / Offset Handling
-///
-/// Numerical values of timestamps are stored compared to offset UTC.
-///
-/// This function intertprets strings without an explicit time zone as
-/// timestamps with offsets of the local time on the machine
-///
-/// For example, `1997-01-31 09:26:56.123Z` is interpreted as UTC, as
-/// it has an explicit timezone specifier (“Z” for Zulu/UTC)
-///
-/// `1997-01-31T09:26:56.123` is interpreted as a local timestamp in
-/// the timezone of the machine.
For example, if -/// the system timezone is set to Americas/New_York (UTC-5) the -/// timestamp will be interpreted as though it were -/// `1997-01-31T09:26:56.123-05:00` -#[inline] -pub fn string_to_timestamp_nanos(s: &str) -> Result { - // Fast path: RFC3339 timestamp (with a T) - // Example: 2020-09-08T13:42:29.190855Z - if let Ok(ts) = DateTime::parse_from_rfc3339(s) { - return Ok(ts.timestamp_nanos()); - } - - // Implement quasi-RFC3339 support by trying to parse the - // timestamp with various other format specifiers to to support - // separating the date and time with a space ' ' rather than 'T' to be - // (more) compatible with Apache Spark SQL - - // timezone offset, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855-05:00 - if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") { - return Ok(ts.timestamp_nanos()); - } - - // with an explicit Z, using ' ' as a separator - // Example: 2020-09-08 13:42:29Z - if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") { - return Ok(ts.timestamp_nanos()); - } - - // Support timestamps without an explicit timezone offset, again - // to be compatible with what Apache Spark SQL does. - - // without a timezone specifier as a local time, using T as a separator - // Example: 2020-09-08T13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using T as a - // separator, no fractional seconds - // Example: 2020-09-08T13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S.%f") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using ' ' as a - // separator, no fractional seconds - // Example: 2020-09-08 13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); - } - - // Note we don't pass along the error message from the underlying - // chrono parsing because we tried several different format - // strings and we don't know which the user was trying to - // match. Ths any of the specific error messages is likely to be - // be more confusing than helpful - Err(ArrowError::CastError(format!( - "Error parsing '{}' as timestamp", - s - ))) -} - -/// Converts the naive datetime (which has no specific timezone) to a -/// nanosecond epoch timestamp relative to UTC. -fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result { - let l = Local {}; - - match l.from_local_datetime(&datetime) { - LocalResult::None => Err(ArrowError::CastError(format!( - "Error parsing '{}' as timestamp: local time representation is invalid", - s - ))), - LocalResult::Single(local_datetime) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - // Ambiguous times can happen if the timestamp is exactly when - // a daylight savings time transition occurs, for example, and - // so the datetime could validly be said to be in two - // potential offsets. 
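
The deleted function layers progressively looser parsers and, for strings that carry no offset, converts a local wall-clock time to UTC before producing nanoseconds. The sketch below re-creates that strategy with `chrono` alone; `to_timestamp_nanos` is a hypothetical stand-in that covers only some of the formats `string_to_timestamp_nanos` accepts, and the expected values are taken from the tests that follow:

```rust
// Minimal, self-contained sketch of the layered timestamp parsing strategy.
use chrono::{DateTime, Local, LocalResult, NaiveDateTime, TimeZone, Utc};

fn to_timestamp_nanos(s: &str) -> Option<i64> {
    // Fast path: full RFC3339, e.g. "2020-09-08T13:42:29.190855Z".
    if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
        return Some(ts.timestamp_nanos());
    }
    // Space instead of 'T' with an explicit offset,
    // e.g. "2020-09-08 13:42:29.190855-05:00".
    if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") {
        return Some(ts.timestamp_nanos());
    }
    // No offset at all: interpret in the machine's local timezone, then
    // normalise to UTC, as the deleted kernel does.
    for fmt in &["%Y-%m-%dT%H:%M:%S%.f", "%Y-%m-%d %H:%M:%S%.f"] {
        if let Ok(naive) = NaiveDateTime::parse_from_str(s, fmt) {
            return match Local.from_local_datetime(&naive) {
                LocalResult::Single(dt) | LocalResult::Ambiguous(dt, _) => {
                    Some(dt.with_timezone(&Utc).timestamp_nanos())
                }
                LocalResult::None => None,
            };
        }
    }
    None
}

fn main() {
    // Explicit offsets parse the same regardless of the local timezone.
    assert_eq!(
        to_timestamp_nanos("2020-09-08T13:42:29.190855+00:00"),
        Some(1_599_572_549_190_855_000)
    );
    assert_eq!(
        to_timestamp_nanos("2020-09-08 13:42:29.190855-05:00"),
        Some(1_599_590_549_190_855_000)
    );
    // Invalid input yields no timestamp rather than a panic.
    assert_eq!(to_timestamp_nanos("not a timestamp"), None);
}
```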
However, since we are about to convert - // to UTC anyways, we can pick one arbitrarily - LocalResult::Ambiguous(local_datetime, _) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn string_to_timestamp_timezone() -> Result<()> { - // Explicit timezone - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855+00:00")? - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855Z")? - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08T13:42:29Z")? - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08T13:42:29.190855-05:00")? - ); - Ok(()) - } - - #[test] - fn string_to_timestamp_timezone_space() -> Result<()> { - // Ensure space rather than T between time and date is accepted - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855+00:00")? - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855Z")? - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08 13:42:29Z")? - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08 13:42:29.190855-05:00")? - ); - Ok(()) - } - - /// Interprets a naive_datetime (with no explicit timzone offset) - /// using the local timezone and returns the timestamp in UTC (0 - /// offset) - fn naive_datetime_to_timestamp(naive_datetime: &NaiveDateTime) -> i64 { - // Note: Use chrono APIs that are different than - // naive_datetime_to_timestamp to compute the utc offset to - // try and double check the logic - let utc_offset_secs = match Local.offset_from_local_datetime(&naive_datetime) { - LocalResult::Single(local_offset) => { - local_offset.fix().local_minus_utc() as i64 - } - _ => panic!("Unexpected failure converting to local datetime"), - }; - let utc_offset_nanos = utc_offset_secs * 1_000_000_000; - naive_datetime.timestamp_nanos() - utc_offset_nanos - } - - #[test] - fn string_to_timestamp_no_timezone() -> Result<()> { - // This test is designed to succeed in regardless of the local - // timezone the test machine is running. Thus it is still - // somewhat suceptable to bugs in the use of chrono - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms_nano(13, 42, 29, 190855), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime), - parse_timestamp("2020-09-08T13:42:29.190855")? - ); - - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime), - parse_timestamp("2020-09-08 13:42:29.190855")? - ); - - // Also ensure that parsing timestamps with no fractional - // second part works as well - let naive_datetime_whole_secs = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms(13, 42, 29), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime_whole_secs), - parse_timestamp("2020-09-08T13:42:29")? - ); - - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime_whole_secs), - parse_timestamp("2020-09-08 13:42:29")? 
- ); - - Ok(()) - } - - #[test] - fn string_to_timestamp_invalid() { - // Test parsing invalid formats - - // It would be nice to make these messages better - expect_timestamp_parse_error("", "Error parsing '' as timestamp"); - expect_timestamp_parse_error("SS", "Error parsing 'SS' as timestamp"); - expect_timestamp_parse_error( - "Wed, 18 Feb 2015 23:16:09 GMT", - "Error parsing 'Wed, 18 Feb 2015 23:16:09 GMT' as timestamp", - ); - } - - // Parse a timestamp to timestamp int with a useful human readable error message - fn parse_timestamp(s: &str) -> Result { - let result = string_to_timestamp_nanos(s); - if let Err(e) = &result { - eprintln!("Error parsing timestamp '{}': {:?}", s, e); - } - result - } - - fn expect_timestamp_parse_error(s: &str, expected_err: &str) { - match string_to_timestamp_nanos(s) { - Ok(v) => panic!( - "Expected error '{}' while parsing '{}', but parsed {} instead", - expected_err, s, v - ), - Err(e) => { - assert!(e.to_string().contains(expected_err), - "Can not find expected error '{}' while parsing '{}'. Actual error '{}'", - expected_err, s, e); - } - } - } -} diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs deleted file mode 100644 index a770ede21dc..00000000000 --- a/rust/arrow/src/compute/kernels/comparison.rs +++ /dev/null @@ -1,1619 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines basic comparison kernels for [`PrimitiveArray`]s. -//! -//! These kernels can leverage SIMD if available on your system. Currently no runtime -//! detection is provided, you should enable the specific SIMD intrinsics using -//! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation -//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. - -use regex::Regex; -use std::collections::HashMap; - -use crate::array::*; -use crate::buffer::{Buffer, MutableBuffer}; -use crate::compute::util::combine_option_bitmap; -use crate::datatypes::{ArrowNumericType, DataType}; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; - -/// Helper function to perform boolean lambda function on values from two arrays, this -/// version does not attempt to use SIMD. -macro_rules! 
compare_op { - ($left: expr, $right:expr, $op:expr) => {{ - if $left.len() != $right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap($left.data_ref(), $right.data_ref(), $left.len())?; - - let comparison = (0..$left.len()).map(|i| $op($left.value(i), $right.value(i))); - // same size as $left.len() and $right.len() - let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; - - let data = ArrayData::new( - DataType::Boolean, - $left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ); - Ok(BooleanArray::from(data)) - }}; -} - -macro_rules! compare_op_primitive { - ($left: expr, $right:expr, $op:expr) => {{ - if $left.len() != $right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap($left.data_ref(), $right.data_ref(), $left.len())?; - - let mut values = MutableBuffer::from_len_zeroed(($left.len() + 7) / 8); - let lhs_chunks_iter = $left.values().chunks_exact(8); - let lhs_remainder = lhs_chunks_iter.remainder(); - let rhs_chunks_iter = $right.values().chunks_exact(8); - let rhs_remainder = rhs_chunks_iter.remainder(); - let chunks = $left.len() / 8; - - values[..chunks] - .iter_mut() - .zip(lhs_chunks_iter) - .zip(rhs_chunks_iter) - .for_each(|((byte, lhs), rhs)| { - lhs.iter() - .zip(rhs.iter()) - .enumerate() - .for_each(|(i, (&lhs, &rhs))| { - *byte |= if $op(lhs, rhs) { 1 << i } else { 0 }; - }); - }); - - if !lhs_remainder.is_empty() { - let last = &mut values[chunks]; - lhs_remainder - .iter() - .zip(rhs_remainder.iter()) - .enumerate() - .for_each(|(i, (&lhs, &rhs))| { - *last |= if $op(lhs, rhs) { 1 << i } else { 0 }; - }); - }; - let data = ArrayData::new( - DataType::Boolean, - $left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(values)], - vec![], - ); - Ok(BooleanArray::from(data)) - }}; -} - -macro_rules! compare_op_scalar { - ($left: expr, $right:expr, $op:expr) => {{ - let null_bit_buffer = $left.data().null_buffer().cloned(); - - let comparison = (0..$left.len()).map(|i| $op($left.value(i), $right)); - // same as $left.len() - let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; - - let data = ArrayData::new( - DataType::Boolean, - $left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], - ); - Ok(BooleanArray::from(data)) - }}; -} - -macro_rules! 
compare_op_scalar_primitive { - ($left: expr, $right:expr, $op:expr) => {{ - let null_bit_buffer = $left.data().null_buffer().cloned(); - - let mut values = MutableBuffer::from_len_zeroed(($left.len() + 7) / 8); - let lhs_chunks_iter = $left.values().chunks_exact(8); - let lhs_remainder = lhs_chunks_iter.remainder(); - let chunks = $left.len() / 8; - - values[..chunks] - .iter_mut() - .zip(lhs_chunks_iter) - .for_each(|(byte, chunk)| { - chunk.iter().enumerate().for_each(|(i, &c_i)| { - *byte |= if $op(c_i, $right) { 1 << i } else { 0 }; - }); - }); - if !lhs_remainder.is_empty() { - let last = &mut values[chunks]; - lhs_remainder.iter().enumerate().for_each(|(i, &lhs)| { - *last |= if $op(lhs, $right) { 1 << i } else { 0 }; - }); - }; - - let data = ArrayData::new( - DataType::Boolean, - $left.len(), - None, - null_bit_buffer, - 0, - vec![Buffer::from(values)], - vec![], - ); - Ok(BooleanArray::from(data)) - }}; -} - -/// Evaluate `op(left, right)` for [`PrimitiveArray`]s using a specified -/// comparison function. -pub fn no_simd_compare_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op_primitive!(left, right, op) -} - -/// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using -/// a specified comparison function. -pub fn no_simd_compare_op_scalar( - left: &PrimitiveArray, - right: T::Native, - op: F, -) -> Result -where - T: ArrowNumericType, - F: Fn(T::Native, T::Native) -> bool, -{ - compare_op_scalar_primitive!(left, right, op) -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / [`LargeStringArray`]. -/// -/// There are two wildcards supported with the LIKE operator: -/// -/// 1. `%` - The percent sign represents zero, one, or multiple characters -/// 2. 
`_` - The underscore represents a single character -/// -/// For example: -/// ``` -/// use arrow::array::{StringArray, BooleanArray}; -/// use arrow::compute::like_utf8; -/// -/// let strings = StringArray::from(vec!["Arrow", "Arrow", "Arrow", "Ar"]); -/// let patterns = StringArray::from(vec!["A%", "B%", "A.", "A."]); -/// -/// let result = like_utf8(&strings, &patterns).unwrap(); -/// assert_eq!(result, BooleanArray::from(vec![true, false, false, true])); -/// ``` -pub fn like_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - let mut map = HashMap::new(); - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; - - let mut result = BooleanBufferBuilder::new(left.len()); - for i in 0..left.len() { - let haystack = left.value(i); - let pat = right.value(i); - let re = if let Some(ref regex) = map.get(pat) { - regex - } else { - let re_pattern = pat.replace("%", ".*").replace("_", "."); - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - map.insert(pat, re); - map.get(pat).unwrap() - }; - - result.append(re.is_match(haystack)); - } - - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -fn is_like_pattern(c: char) -> bool { - c == '%' || c == '_' -} - -/// Perform SQL `left LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn like_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - for i in 0..left.len() { - if left.value(i) == right { - bit_util::set_bit(bool_slice, i); - } - } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - for i in 0..left.len() { - if left.value(i).starts_with(starts_with) { - bit_util::set_bit(bool_slice, i); - } - } - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - for i in 0..left.len() { - if left.value(i).ends_with(ends_with) { - bit_util::set_bit(bool_slice, i); - } - } - } else { - let re_pattern = right.replace("%", ".*").replace("_", "."); - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - - for i in 0..left.len() { - let haystack = left.value(i); - if re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; - - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`]. -/// -/// See the documentation on [`like_utf8`] for more details. 
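The LIKE kernels above reduce SQL wildcard matching to an anchored regular expression: `%` becomes `.*`, `_` becomes `.`, and the whole pattern is wrapped in `^...$`. A minimal standalone sketch of that translation, assuming only the `regex` crate (the helper name is illustrative); note that, like the kernels shown here, it does not escape other regex metacharacters in the pattern:

```rust
use regex::Regex;

/// Translate a SQL LIKE pattern into an anchored regex, mirroring the kernels above:
/// `%` matches any run of characters, `_` matches exactly one character.
fn like_pattern_to_regex(pattern: &str) -> Result<Regex, regex::Error> {
    let re_pattern = pattern.replace("%", ".*").replace("_", ".");
    Regex::new(&format!("^{}$", re_pattern))
}

fn main() {
    let re = like_pattern_to_regex("arrow%").unwrap();
    assert!(re.is_match("arrows"));   // `%` matches zero or more characters
    assert!(!re.is_match("parrow"));  // the pattern is anchored at the start

    let re = like_pattern_to_regex("arrow_").unwrap();
    assert!(re.is_match("arrows"));   // `_` matches exactly one character
    assert!(!re.is_match("arrow"));
}
```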
-pub fn nlike_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - let mut map = HashMap::new(); - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let null_bit_buffer = - combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?; - - let mut result = BooleanBufferBuilder::new(left.len()); - for i in 0..left.len() { - let haystack = left.value(i); - let pat = right.value(i); - let re = if let Some(ref regex) = map.get(pat) { - regex - } else { - let re_pattern = pat.replace("%", ".*").replace("_", "."); - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - map.insert(pat, re); - map.get(pat).unwrap() - }; - - result.append(!re.is_match(haystack)); - } - - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / -/// [`LargeStringArray`] and a scalar. -/// -/// See the documentation on [`like_utf8`] for more details. -pub fn nlike_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let mut result = BooleanBufferBuilder::new(left.len()); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - for i in 0..left.len() { - result.append(left.value(i) != right); - } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use ends_with - for i in 0..left.len() { - result.append(!left.value(i).starts_with(&right[..right.len() - 1])); - } - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use starts_with - for i in 0..left.len() { - result.append(!left.value(i).ends_with(&right[1..])); - } - } else { - let re_pattern = right.replace("%", ".*").replace("_", "."); - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - for i in 0..left.len() { - let haystack = left.value(i); - result.append(!re.is_match(haystack)); - } - } - - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![result.finish()], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -/// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`]. -pub fn eq_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - compare_op!(left, right, |a, b| a == b) -} - -/// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. -pub fn eq_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - compare_op_scalar!(left, right, |a, b| a == b) -} - -/// Perform `left != right` operation on [`StringArray`] / [`LargeStringArray`]. -pub fn neq_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - compare_op!(left, right, |a, b| a != b) -} - -/// Perform `left != right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. 
-pub fn neq_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - compare_op_scalar!(left, right, |a, b| a != b) -} - -/// Perform `left < right` operation on [`StringArray`] / [`LargeStringArray`]. -pub fn lt_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - compare_op!(left, right, |a, b| a < b) -} - -/// Perform `left < right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. -pub fn lt_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - compare_op_scalar!(left, right, |a, b| a < b) -} - -/// Perform `left <= right` operation on [`StringArray`] / [`LargeStringArray`]. -pub fn lt_eq_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - compare_op!(left, right, |a, b| a <= b) -} - -/// Perform `left <= right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. -pub fn lt_eq_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - compare_op_scalar!(left, right, |a, b| a <= b) -} - -/// Perform `left > right` operation on [`StringArray`] / [`LargeStringArray`]. -pub fn gt_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - compare_op!(left, right, |a, b| a > b) -} - -/// Perform `left > right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. -pub fn gt_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - compare_op_scalar!(left, right, |a, b| a > b) -} - -/// Perform `left >= right` operation on [`StringArray`] / [`LargeStringArray`]. -pub fn gt_eq_utf8( - left: &GenericStringArray, - right: &GenericStringArray, -) -> Result { - compare_op!(left, right, |a, b| a >= b) -} - -/// Perform `left >= right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. -pub fn gt_eq_utf8_scalar( - left: &GenericStringArray, - right: &str, -) -> Result { - compare_op_scalar!(left, right, |a, b| a >= b) -} - -/// Helper function to perform boolean lambda function on values from two arrays using -/// SIMD. 
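Taken together, the UTF-8 kernels above compare values with Rust's lexicographic `&str` ordering and carry nulls through the combined validity bitmap. A short usage sketch, assuming the pre-removal `arrow` crate (these kernels now live in the apache/arrow-rs repository):

```rust
use arrow::array::{BooleanArray, StringArray};
use arrow::compute::kernels::comparison::{eq_utf8, lt_utf8_scalar};
use arrow::error::Result;

fn main() -> Result<()> {
    let left = StringArray::from(vec![Some("arrow"), None, Some("parquet")]);
    let right = StringArray::from(vec![Some("arrow"), Some("flight"), Some("flight")]);

    // Element-wise equality; a null on either side yields a null result slot.
    let eq = eq_utf8(&left, &right)?;
    assert_eq!(eq, BooleanArray::from(vec![Some(true), None, Some(false)]));

    // Scalar comparison uses Rust's byte-wise &str ordering.
    let lt = lt_utf8_scalar(&left, "flight")?;
    assert_eq!(lt, BooleanArray::from(vec![Some(true), None, Some(false)]));

    Ok(())
}
```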
-#[cfg(simd)] -fn simd_compare_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - simd_op: SIMD_OP, - scalar_op: SCALAR_OP, -) -> Result -where - T: ArrowNumericType, - SIMD_OP: Fn(T::Simd, T::Simd) -> T::SimdMask, - SCALAR_OP: Fn(T::Native, T::Native) -> bool, -{ - use std::borrow::BorrowMut; - - let len = left.len(); - if len != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let null_bit_buffer = combine_option_bitmap(left.data_ref(), right.data_ref(), len)?; - - let lanes = T::lanes(); - let buffer_size = bit_util::ceil(len, 8); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - // this is currently the case for all our datatypes and allows us to always append full bytes - assert!( - lanes % 8 == 0, - "Number of vector lanes must be multiple of 8" - ); - let mut left_chunks = left.values().chunks_exact(lanes); - let mut right_chunks = right.values().chunks_exact(lanes); - - let result_remainder = left_chunks - .borrow_mut() - .zip(right_chunks.borrow_mut()) - .fold( - result.typed_data_mut(), - |result_slice, (left_slice, right_slice)| { - let simd_left = T::load(left_slice); - let simd_right = T::load(right_slice); - let simd_result = simd_op(simd_left, simd_right); - - let bitmask = T::mask_to_u64(&simd_result); - let bytes = bitmask.to_le_bytes(); - &result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 8]); - - &mut result_slice[lanes / 8..] - }, - ); - - let left_remainder = left_chunks.remainder(); - let right_remainder = right_chunks.remainder(); - - assert_eq!(left_remainder.len(), right_remainder.len()); - - let remainder_bitmask = left_remainder - .iter() - .zip(right_remainder.iter()) - .enumerate() - .fold(0_u64, |mut mask, (i, (scalar_left, scalar_right))| { - let bit = if scalar_op(*scalar_left, *scalar_right) { - 1_u64 - } else { - 0_u64 - }; - mask |= bit << i; - mask - }); - let remainder_mask_as_bytes = - &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)]; - result_remainder.copy_from_slice(remainder_mask_as_bytes); - - let data = ArrayData::new( - DataType::Boolean, - len, - None, - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -/// Helper function to perform boolean lambda function on values from an array and a scalar value using -/// SIMD. 
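The SIMD helper above works in fixed-width chunks: it compares `lanes()` values at a time, converts each vector mask to a little-endian `u64` bitmask, copies whole bytes into the result buffer, and packs whatever remains into one final partial bitmask. A simplified, scalar-only sketch of that chunk-plus-remainder pattern (no SIMD intrinsics; the function name and const parameter are illustrative):

```rust
/// Scalar sketch of the chunk-plus-remainder packing used by the SIMD helpers:
/// process LANES values per step, pack the comparison results into a little-endian
/// bitmask, and copy whole bytes into the output; the tail is packed separately.
fn pack_compare_bits<const LANES: usize>(
    left: &[i32],
    right: &[i32],
    op: impl Fn(i32, i32) -> bool,
) -> Vec<u8> {
    assert!(LANES <= 64 && LANES % 8 == 0, "lanes must fit a u64 and emit whole bytes");
    let mut out = vec![0u8; (left.len() + 7) / 8];
    let mut offset = 0;

    let mut l_chunks = left.chunks_exact(LANES);
    let mut r_chunks = right.chunks_exact(LANES);
    for (l, r) in (&mut l_chunks).zip(&mut r_chunks) {
        // In the real kernel this is one SIMD compare followed by mask_to_u64.
        let mut mask = 0u64;
        for (i, (a, b)) in l.iter().zip(r.iter()).enumerate() {
            mask |= (op(*a, *b) as u64) << i;
        }
        out[offset..offset + LANES / 8].copy_from_slice(&mask.to_le_bytes()[..LANES / 8]);
        offset += LANES / 8;
    }

    // Remainder: fewer than LANES values left, packed into a partial bitmask.
    let mut mask = 0u64;
    for (i, (a, b)) in l_chunks.remainder().iter().zip(r_chunks.remainder()).enumerate() {
        mask |= (op(*a, *b) as u64) << i;
    }
    let rem = l_chunks.remainder().len();
    out[offset..offset + (rem + 7) / 8].copy_from_slice(&mask.to_le_bytes()[..(rem + 7) / 8]);
    out
}

fn main() {
    let left: Vec<i32> = (0..10).collect();
    let right = vec![5i32; 10];
    let bits = pack_compare_bits::<8>(&left, &right, |a, b| a < b);
    // 0..=4 are < 5, so the low five bits are set in the first byte.
    assert_eq!(bits, vec![0b0001_1111, 0b0000_0000]);
}
```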
-#[cfg(simd)] -fn simd_compare_op_scalar( - left: &PrimitiveArray, - right: T::Native, - simd_op: SIMD_OP, - scalar_op: SCALAR_OP, -) -> Result -where - T: ArrowNumericType, - SIMD_OP: Fn(T::Simd, T::Simd) -> T::SimdMask, - SCALAR_OP: Fn(T::Native, T::Native) -> bool, -{ - use std::borrow::BorrowMut; - - let len = left.len(); - - let lanes = T::lanes(); - let buffer_size = bit_util::ceil(len, 8); - let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false); - - // this is currently the case for all our datatypes and allows us to always append full bytes - assert!( - lanes % 8 == 0, - "Number of vector lanes must be multiple of 8" - ); - let mut left_chunks = left.values().chunks_exact(lanes); - let simd_right = T::init(right); - - let result_remainder = left_chunks.borrow_mut().fold( - result.typed_data_mut(), - |result_slice, left_slice| { - let simd_left = T::load(left_slice); - let simd_result = simd_op(simd_left, simd_right); - - let bitmask = T::mask_to_u64(&simd_result); - let bytes = bitmask.to_le_bytes(); - &result_slice[0..lanes / 8].copy_from_slice(&bytes[0..lanes / 8]); - - &mut result_slice[lanes / 8..] - }, - ); - - let left_remainder = left_chunks.remainder(); - - let remainder_bitmask = - left_remainder - .iter() - .enumerate() - .fold(0_u64, |mut mask, (i, scalar_left)| { - let bit = if scalar_op(*scalar_left, right) { - 1_u64 - } else { - 0_u64 - }; - mask |= bit << i; - mask - }); - let remainder_mask_as_bytes = - &remainder_bitmask.to_le_bytes()[0..bit_util::ceil(left_remainder.len(), 8)]; - result_remainder.copy_from_slice(remainder_mask_as_bytes); - - let null_bit_buffer = left - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(left.offset(), left.len())); - - // null count is the same as in the input since the right side of the scalar comparison cannot be null - let null_count = left.null_count(); - - let data = ArrayData::new( - DataType::Boolean, - len, - Some(null_count), - null_bit_buffer, - 0, - vec![result.into()], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -/// Perform `left == right` operation on two arrays. -pub fn eq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op(left, right, T::eq, |a, b| a == b); - #[cfg(not(simd))] - return compare_op!(left, right, |a, b| a == b); -} - -/// Perform `left == right` operation on an array and a scalar value. -pub fn eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op_scalar(left, right, T::eq, |a, b| a == b); - #[cfg(not(simd))] - return compare_op_scalar!(left, right, |a, b| a == b); -} - -/// Perform `left != right` operation on two arrays. -pub fn neq(left: &PrimitiveArray, right: &PrimitiveArray) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op(left, right, T::ne, |a, b| a != b); - #[cfg(not(simd))] - return compare_op!(left, right, |a, b| a != b); -} - -/// Perform `left != right` operation on an array and a scalar value. -pub fn neq_scalar(left: &PrimitiveArray, right: T::Native) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op_scalar(left, right, T::ne, |a, b| a != b); - #[cfg(not(simd))] - return compare_op_scalar!(left, right, |a, b| a != b); -} - -/// Perform `left < right` operation on two arrays. Null values are less than non-null -/// values. 
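Whichever branch the `simd` feature selects, the public entry points above return the same `BooleanArray`, with input nulls surfacing as null result slots. A brief usage sketch against the pre-removal `arrow` crate:

```rust
use arrow::array::{BooleanArray, Int32Array};
use arrow::compute::kernels::comparison::{eq_scalar, gt};
use arrow::error::Result;

fn main() -> Result<()> {
    let a = Int32Array::from(vec![Some(1), None, Some(8), Some(3)]);
    let b = Int32Array::from(vec![Some(1), Some(2), Some(5), None]);

    // Array-vs-array comparison: a null on either side produces a null slot.
    let mask = gt(&a, &b)?;
    assert_eq!(mask, BooleanArray::from(vec![Some(false), None, Some(true), None]));

    // Array-vs-scalar comparison keeps the input's null layout.
    let mask = eq_scalar(&a, 8)?;
    assert_eq!(mask, BooleanArray::from(vec![Some(false), None, Some(true), Some(false)]));

    Ok(())
}
```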
-pub fn lt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op(left, right, T::lt, |a, b| a < b); - #[cfg(not(simd))] - return compare_op!(left, right, |a, b| a < b); -} - -/// Perform `left < right` operation on an array and a scalar value. -/// Null values are less than non-null values. -pub fn lt_scalar(left: &PrimitiveArray, right: T::Native) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op_scalar(left, right, T::lt, |a, b| a < b); - #[cfg(not(simd))] - return compare_op_scalar!(left, right, |a, b| a < b); -} - -/// Perform `left <= right` operation on two arrays. Null values are less than non-null -/// values. -pub fn lt_eq( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op(left, right, T::le, |a, b| a <= b); - #[cfg(not(simd))] - return compare_op!(left, right, |a, b| a <= b); -} - -/// Perform `left <= right` operation on an array and a scalar value. -/// Null values are less than non-null values. -pub fn lt_eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op_scalar(left, right, T::le, |a, b| a <= b); - #[cfg(not(simd))] - return compare_op_scalar!(left, right, |a, b| a <= b); -} - -/// Perform `left > right` operation on two arrays. Non-null values are greater than null -/// values. -pub fn gt(left: &PrimitiveArray, right: &PrimitiveArray) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op(left, right, T::gt, |a, b| a > b); - #[cfg(not(simd))] - return compare_op!(left, right, |a, b| a > b); -} - -/// Perform `left > right` operation on an array and a scalar value. -/// Non-null values are greater than null values. -pub fn gt_scalar(left: &PrimitiveArray, right: T::Native) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op_scalar(left, right, T::gt, |a, b| a > b); - #[cfg(not(simd))] - return compare_op_scalar!(left, right, |a, b| a > b); -} - -/// Perform `left >= right` operation on two arrays. Non-null values are greater than null -/// values. -pub fn gt_eq( - left: &PrimitiveArray, - right: &PrimitiveArray, -) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op(left, right, T::ge, |a, b| a >= b); - #[cfg(not(simd))] - return compare_op!(left, right, |a, b| a >= b); -} - -/// Perform `left >= right` operation on an array and a scalar value. -/// Non-null values are greater than null values. -pub fn gt_eq_scalar(left: &PrimitiveArray, right: T::Native) -> Result -where - T: ArrowNumericType, -{ - #[cfg(simd)] - return simd_compare_op_scalar(left, right, T::ge, |a, b| a >= b); - #[cfg(not(simd))] - return compare_op_scalar!(left, right, |a, b| a >= b); -} - -/// Checks if a [`GenericListArray`] contains a value in the [`PrimitiveArray`] -pub fn contains( - left: &PrimitiveArray, - right: &GenericListArray, -) -> Result -where - T: ArrowNumericType, - OffsetSize: OffsetSizeTrait, -{ - let left_len = left.len(); - if left_len != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let num_bytes = bit_util::ceil(left_len, 8); - - let not_both_null_bit_buffer = - match combine_option_bitmap(left.data_ref(), right.data_ref(), left_len)? 
{ - Some(buff) => buff, - None => new_all_set_buffer(num_bytes), - }; - let not_both_null_bitmap = not_both_null_bit_buffer.as_slice(); - - let mut bool_buf = MutableBuffer::from_len_zeroed(num_bytes); - let bool_slice = bool_buf.as_slice_mut(); - - // if both array slots are valid, check if list contains primitive - for i in 0..left_len { - if bit_util::get_bit(not_both_null_bitmap, i) { - let list = right.value(i); - let list = list.as_any().downcast_ref::>().unwrap(); - - for j in 0..list.len() { - if list.is_valid(j) && (left.value(i) == list.value(j)) { - bit_util::set_bit(bool_slice, i); - continue; - } - } - } - } - - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - None, - 0, - vec![bool_buf.into()], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -/// Checks if a [`GenericListArray`] contains a value in the [`GenericStringArray`] -pub fn contains_utf8( - left: &GenericStringArray, - right: &ListArray, -) -> Result -where - OffsetSize: StringOffsetSizeTrait, -{ - let left_len = left.len(); - if left_len != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let num_bytes = bit_util::ceil(left_len, 8); - - let not_both_null_bit_buffer = - match combine_option_bitmap(left.data_ref(), right.data_ref(), left_len)? { - Some(buff) => buff, - None => new_all_set_buffer(num_bytes), - }; - let not_both_null_bitmap = not_both_null_bit_buffer.as_slice(); - - let mut bool_buf = MutableBuffer::from_len_zeroed(num_bytes); - let bool_slice = &mut bool_buf; - - for i in 0..left_len { - // contains(null, null) = false - if bit_util::get_bit(not_both_null_bitmap, i) { - let list = right.value(i); - let list = list - .as_any() - .downcast_ref::>() - .unwrap(); - - for j in 0..list.len() { - if list.is_valid(j) && (left.value(i) == list.value(j)) { - bit_util::set_bit(bool_slice, i); - continue; - } - } - } - } - - let data = ArrayData::new( - DataType::Boolean, - left.len(), - None, - None, - 0, - vec![bool_buf.into()], - vec![], - ); - Ok(BooleanArray::from(data)) -} - -// create a buffer and fill it with valid bits -#[inline] -fn new_all_set_buffer(len: usize) -> Buffer { - let buffer = MutableBuffer::new(len); - let buffer = buffer.with_bitset(len, true); - - buffer.into() -} - -// disable wrapping inside literal vectors used for test data and assertions -#[rustfmt::skip::macros(vec)] -#[cfg(test)] -mod tests { - use super::*; - use crate::datatypes::Int8Type; - use crate::{array::Int32Array, array::Int64Array, datatypes::Field}; - - /// Evaluate `KERNEL` with two vectors as inputs and assert against the expected output. - /// `A_VEC` and `B_VEC` can be of type `Vec` or `Vec>`. - /// `EXPECTED` can be either `Vec` or `Vec>`. - /// The main reason for this macro is that inputs and outputs align nicely after `cargo fmt`. - macro_rules! cmp_i64 { - ($KERNEL:ident, $A_VEC:expr, $B_VEC:expr, $EXPECTED:expr) => { - let a = Int64Array::from($A_VEC); - let b = Int64Array::from($B_VEC); - let c = $KERNEL(&a, &b).unwrap(); - assert_eq!(BooleanArray::from($EXPECTED), c); - }; - } - - /// Evaluate `KERNEL` with one vectors and one scalar as inputs and assert against the expected output. - /// `A_VEC` can be of type `Vec` or `Vec>`. - /// `EXPECTED` can be either `Vec` or `Vec>`. - /// The main reason for this macro is that inputs and outputs align nicely after `cargo fmt`. - macro_rules! 
cmp_i64_scalar { - ($KERNEL:ident, $A_VEC:expr, $B:literal, $EXPECTED:expr) => { - let a = Int64Array::from($A_VEC); - let c = $KERNEL(&a, $B).unwrap(); - assert_eq!(BooleanArray::from($EXPECTED), c); - }; - } - - #[test] - fn test_primitive_array_eq() { - cmp_i64!( - eq, - vec![8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - vec![false, false, true, false, false, false, false, true, false, false] - ); - } - - #[test] - fn test_primitive_array_eq_scalar() { - cmp_i64_scalar!( - eq_scalar, - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - 8, - vec![false, false, true, false, false, false, false, true, false, false] - ); - } - - #[test] - fn test_primitive_array_eq_with_slice() { - let a = Int32Array::from(vec![6, 7, 8, 8, 10]); - let b = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); - let b_slice = b.slice(5, 5); - let c = b_slice.as_any().downcast_ref().unwrap(); - let d = eq(&c, &a).unwrap(); - assert_eq!(true, d.value(0)); - assert_eq!(true, d.value(1)); - assert_eq!(true, d.value(2)); - assert_eq!(false, d.value(3)); - assert_eq!(true, d.value(4)); - } - - #[test] - fn test_primitive_array_neq() { - cmp_i64!( - neq, - vec![8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - vec![true, true, false, true, true, true, true, false, true, true] - ); - } - - #[test] - fn test_primitive_array_neq_scalar() { - cmp_i64_scalar!( - neq_scalar, - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - 8, - vec![true, true, false, true, true, true, true, false, true, true] - ); - } - - #[test] - fn test_primitive_array_lt() { - cmp_i64!( - lt, - vec![8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - vec![false, false, false, true, true, false, false, false, true, true] - ); - } - - #[test] - fn test_primitive_array_lt_scalar() { - cmp_i64_scalar!( - lt_scalar, - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - 8, - vec![true, true, false, false, false, true, true, false, false, false] - ); - } - - #[test] - fn test_primitive_array_lt_nulls() { - cmp_i64!( - lt, - vec![None, None, Some(1), Some(1), None, None, Some(2), Some(2),], - vec![None, Some(1), None, Some(1), None, Some(3), None, Some(3),], - vec![None, None, None, Some(false), None, None, None, Some(true)] - ); - } - - #[test] - fn test_primitive_array_lt_scalar_nulls() { - cmp_i64_scalar!( - lt_scalar, - vec![None, Some(1), Some(2), Some(3), None, Some(1), Some(2), Some(3), Some(2), None], - 2, - vec![None, Some(true), Some(false), Some(false), None, Some(true), Some(false), Some(false), Some(false), None] - ); - } - - #[test] - fn test_primitive_array_lt_eq() { - cmp_i64!( - lt_eq, - vec![8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - vec![false, false, true, true, true, false, false, true, true, true] - ); - } - - #[test] - fn test_primitive_array_lt_eq_scalar() { - cmp_i64_scalar!( - lt_eq_scalar, - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - 8, - vec![true, true, true, false, false, true, true, true, false, false] - ); - } - - #[test] - fn test_primitive_array_lt_eq_nulls() { - cmp_i64!( - lt_eq, - vec![None, None, Some(1), None, None, Some(1), None, None, Some(1)], - vec![None, Some(1), Some(0), None, Some(1), Some(2), None, None, Some(3)], - vec![None, None, Some(false), None, None, Some(true), None, None, Some(true)] - ); - } - - #[test] - fn test_primitive_array_lt_eq_scalar_nulls() { - cmp_i64_scalar!( - lt_eq_scalar, - vec![None, Some(1), Some(2), None, Some(1), Some(2), None, Some(1), Some(2)], - 1, - vec![None, Some(true), Some(false), None, Some(true), 
Some(false), None, Some(true), Some(false)] - ); - } - - #[test] - fn test_primitive_array_gt() { - cmp_i64!( - gt, - vec![8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - vec![true, true, false, false, false, true, true, false, false, false] - ); - } - - #[test] - fn test_primitive_array_gt_scalar() { - cmp_i64_scalar!( - gt_scalar, - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - 8, - vec![false, false, false, true, true, false, false, false, true, true] - ); - } - - #[test] - fn test_primitive_array_gt_nulls() { - cmp_i64!( - gt, - vec![None, None, Some(1), None, None, Some(2), None, None, Some(3)], - vec![None, Some(1), Some(1), None, Some(1), Some(1), None, Some(1), Some(1)], - vec![None, None, Some(false), None, None, Some(true), None, None, Some(true)] - ); - } - - #[test] - fn test_primitive_array_gt_scalar_nulls() { - cmp_i64_scalar!( - gt_scalar, - vec![None, Some(1), Some(2), None, Some(1), Some(2), None, Some(1), Some(2)], - 1, - vec![None, Some(false), Some(true), None, Some(false), Some(true), None, Some(false), Some(true)] - ); - } - - #[test] - fn test_primitive_array_gt_eq() { - cmp_i64!( - gt_eq, - vec![8, 8, 8, 8, 8, 8, 8, 8, 8, 8], - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - vec![true, true, true, false, false, true, true, true, false, false] - ); - } - - #[test] - fn test_primitive_array_gt_eq_scalar() { - cmp_i64_scalar!( - gt_eq_scalar, - vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10], - 8, - vec![false, false, true, true, true, false, false, true, true, true] - ); - } - - #[test] - fn test_primitive_array_gt_eq_nulls() { - cmp_i64!( - gt_eq, - vec![None, None, Some(1), None, Some(1), Some(2), None, None, Some(1)], - vec![None, Some(1), None, None, Some(1), Some(1), None, Some(2), Some(2)], - vec![None, None, None, None, Some(true), Some(true), None, None, Some(false)] - ); - } - - #[test] - fn test_primitive_array_gt_eq_scalar_nulls() { - cmp_i64_scalar!( - gt_eq_scalar, - vec![None, Some(1), Some(2), None, Some(2), Some(3), None, Some(3), Some(4)], - 2, - vec![None, Some(false), Some(true), None, Some(true), Some(true), None, Some(true), Some(true)] - ); - } - - #[test] - fn test_primitive_array_compare_slice() { - let a: Int32Array = (0..100).map(Some).collect(); - let a = a.slice(50, 50); - let a = a.as_any().downcast_ref::().unwrap(); - let b: Int32Array = (100..200).map(Some).collect(); - let b = b.slice(50, 50); - let b = b.as_any().downcast_ref::().unwrap(); - let actual = lt(&a, &b).unwrap(); - let expected: BooleanArray = (0..50).map(|_| Some(true)).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn test_primitive_array_compare_scalar_slice() { - let a: Int32Array = (0..100).map(Some).collect(); - let a = a.slice(50, 50); - let a = a.as_any().downcast_ref::().unwrap(); - let actual = lt_scalar(&a, 200).unwrap(); - let expected: BooleanArray = (0..50).map(|_| Some(true)).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn test_length_of_result_buffer() { - // `item_count` is chosen to not be a multiple of the number of SIMD lanes for this - // type (`Int8Type`), 64. 
- let item_count = 130; - - let select_mask: BooleanArray = vec![true; item_count].into(); - - let array_a: PrimitiveArray = vec![1; item_count].into(); - let array_b: PrimitiveArray = vec![2; item_count].into(); - let result_mask = gt_eq(&array_a, &array_b).unwrap(); - - assert_eq!( - result_mask.data().buffers()[0].len(), - select_mask.data().buffers()[0].len() - ); - } - - // Expected behaviour: - // contains(1, [1, 2, null]) = true - // contains(3, [1, 2, null]) = false - // contains(null, [1, 2, null]) = false - // contains(null, null) = false - #[test] - fn test_contains() { - let value_data = Int32Array::from(vec![ - Some(0), - Some(1), - Some(2), - Some(3), - Some(4), - Some(5), - Some(6), - None, - Some(7), - ]) - .data() - .clone(); - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 6, 9]); - let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); - let list_data = ArrayData::builder(list_data_type) - .len(4) - .add_buffer(value_offsets) - .add_child_data(value_data) - .null_bit_buffer(Buffer::from([0b00001011])) - .build(); - - // [[0, 1, 2], [3, 4, 5], null, [6, null, 7]] - let list_array = LargeListArray::from(list_data); - - let nulls = Int32Array::from(vec![None, None, None, None]); - let nulls_result = contains(&nulls, &list_array).unwrap(); - assert_eq!( - nulls_result - .as_any() - .downcast_ref::() - .unwrap(), - &BooleanArray::from(vec![false, false, false, false]), - ); - - let values = Int32Array::from(vec![Some(0), Some(0), Some(0), Some(0)]); - let values_result = contains(&values, &list_array).unwrap(); - assert_eq!( - values_result - .as_any() - .downcast_ref::() - .unwrap(), - &BooleanArray::from(vec![true, false, false, false]), - ); - } - - // Expected behaviour: - // contains("ab", ["ab", "cd", null]) = true - // contains("ef", ["ab", "cd", null]) = false - // contains(null, ["ab", "cd", null]) = false - // contains(null, null) = false - #[test] - fn test_contains_utf8() { - let values_builder = StringBuilder::new(10); - let mut builder = ListBuilder::new(values_builder); - - builder.values().append_value("Lorem").unwrap(); - builder.values().append_value("ipsum").unwrap(); - builder.values().append_null().unwrap(); - builder.append(true).unwrap(); - builder.values().append_value("sit").unwrap(); - builder.values().append_value("amet").unwrap(); - builder.values().append_value("Lorem").unwrap(); - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.values().append_value("ipsum").unwrap(); - builder.append(true).unwrap(); - - // [["Lorem", "ipsum", null], ["sit", "amet", "Lorem"], null, ["ipsum"]] - // value_offsets = [0, 3, 6, 6] - let list_array = builder.finish(); - - let nulls = StringArray::from(vec![None, None, None, None]); - let nulls_result = contains_utf8(&nulls, &list_array).unwrap(); - assert_eq!( - nulls_result - .as_any() - .downcast_ref::() - .unwrap(), - &BooleanArray::from(vec![false, false, false, false]), - ); - - let values = StringArray::from(vec![ - Some("Lorem"), - Some("Lorem"), - Some("Lorem"), - Some("Lorem"), - ]); - let values_result = contains_utf8(&values, &list_array).unwrap(); - assert_eq!( - values_result - .as_any() - .downcast_ref::() - .unwrap(), - &BooleanArray::from(vec![true, true, false, false]), - ); - } - - macro_rules! 
test_utf8 { - ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { - #[test] - fn $test_name() { - let left = StringArray::from($left); - let right = StringArray::from($right); - let res = $op(&left, &right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!(v, expected[i]); - } - } - }; - } - - macro_rules! test_utf8_scalar { - ($test_name:ident, $left:expr, $right:expr, $op:expr, $expected:expr) => { - #[test] - fn $test_name() { - let left = StringArray::from($left); - let res = $op(&left, $right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!( - v, - expected[i], - "unexpected result when comparing {} at position {} to {} ", - left.value(i), - i, - $right - ); - } - - let left = LargeStringArray::from($left); - let res = $op(&left, $right).unwrap(); - let expected = $expected; - assert_eq!(expected.len(), res.len()); - for i in 0..res.len() { - let v = res.value(i); - assert_eq!( - v, - expected[i], - "unexpected result when comparing {} at position {} to {} ", - left.value(i), - i, - $right - ); - } - } - }; - } - - test_utf8!( - test_utf8_array_like, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - like_utf8, - vec![true, true, true, false, false, true, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - like_utf8_scalar, - vec![true, true, false, false] - ); - test_utf8_scalar!( - test_utf8_array_like_scalar_start, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow%", - like_utf8_scalar, - vec![true, false, true, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_end, - vec!["arrow", "parrow", "arrows", "arr"], - "%arrow", - like_utf8_scalar, - vec![true, true, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow", - like_utf8_scalar, - vec![true, false, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_like_scalar_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - like_utf8_scalar, - vec![false, true, false, false] - ); - - test_utf8!( - test_utf8_array_nlike, - vec!["arrow", "arrow", "arrow", "arrow", "arrow", "arrows", "arrow"], - vec!["arrow", "ar%", "%ro%", "foo", "arr", "arrow_", "arrow_"], - nlike_utf8, - vec![false, false, false, true, true, false, true] - ); - test_utf8_scalar!( - test_utf8_array_nlike_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - nlike_utf8_scalar, - vec![false, false, true, true] - ); - - test_utf8!( - test_utf8_array_eq, - vec!["arrow", "arrow", "arrow", "arrow"], - vec!["arrow", "parquet", "datafusion", "flight"], - eq_utf8, - vec![true, false, false, false] - ); - test_utf8_scalar!( - test_utf8_array_eq_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "arrow", - eq_utf8_scalar, - vec![true, false, false, false] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_start, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow%", - nlike_utf8_scalar, - vec![false, true, false, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_end, - vec!["arrow", "parrow", "arrows", "arr"], - "%arrow", - nlike_utf8_scalar, - vec![false, false, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_equals, - 
vec!["arrow", "parrow", "arrows", "arr"], - "arrow", - nlike_utf8_scalar, - vec![false, true, true, true] - ); - - test_utf8_scalar!( - test_utf8_array_nlike_scalar_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - nlike_utf8_scalar, - vec![true, false, true, true] - ); - - test_utf8!( - test_utf8_array_neq, - vec!["arrow", "arrow", "arrow", "arrow"], - vec!["arrow", "parquet", "datafusion", "flight"], - neq_utf8, - vec![false, true, true, true] - ); - test_utf8_scalar!( - test_utf8_array_neq_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "arrow", - neq_utf8_scalar, - vec![false, true, true, true] - ); - - test_utf8!( - test_utf8_array_lt, - vec!["arrow", "datafusion", "flight", "parquet"], - vec!["flight", "flight", "flight", "flight"], - lt_utf8, - vec![true, true, false, false] - ); - test_utf8_scalar!( - test_utf8_array_lt_scalar, - vec!["arrow", "datafusion", "flight", "parquet"], - "flight", - lt_utf8_scalar, - vec![true, true, false, false] - ); - - test_utf8!( - test_utf8_array_lt_eq, - vec!["arrow", "datafusion", "flight", "parquet"], - vec!["flight", "flight", "flight", "flight"], - lt_eq_utf8, - vec![true, true, true, false] - ); - test_utf8_scalar!( - test_utf8_array_lt_eq_scalar, - vec!["arrow", "datafusion", "flight", "parquet"], - "flight", - lt_eq_utf8_scalar, - vec![true, true, true, false] - ); - - test_utf8!( - test_utf8_array_gt, - vec!["arrow", "datafusion", "flight", "parquet"], - vec!["flight", "flight", "flight", "flight"], - gt_utf8, - vec![false, false, false, true] - ); - test_utf8_scalar!( - test_utf8_array_gt_scalar, - vec!["arrow", "datafusion", "flight", "parquet"], - "flight", - gt_utf8_scalar, - vec![false, false, false, true] - ); - - test_utf8!( - test_utf8_array_gt_eq, - vec!["arrow", "datafusion", "flight", "parquet"], - vec!["flight", "flight", "flight", "flight"], - gt_eq_utf8, - vec![false, false, true, true] - ); - test_utf8_scalar!( - test_utf8_array_gt_eq_scalar, - vec!["arrow", "datafusion", "flight", "parquet"], - "flight", - gt_eq_utf8_scalar, - vec![false, false, true, true] - ); -} diff --git a/rust/arrow/src/compute/kernels/concat.rs b/rust/arrow/src/compute/kernels/concat.rs deleted file mode 100644 index 32880286a72..00000000000 --- a/rust/arrow/src/compute/kernels/concat.rs +++ /dev/null @@ -1,387 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines concat kernel for `ArrayRef` -//! -//! Example: -//! -//! ``` -//! use arrow::array::{ArrayRef, StringArray}; -//! use arrow::compute::concat; -//! -//! let arr = concat(&[ -//! &StringArray::from(vec!["hello", "world"]), -//! &StringArray::from(vec!["!"]), -//! ]).unwrap(); -//! assert_eq!(arr.len(), 3); -//! 
``` - -use crate::array::*; -use crate::error::{ArrowError, Result}; - -/// Concatenate multiple [Array] of the same type into a single [ArrayRef]. -pub fn concat(arrays: &[&Array]) -> Result { - if arrays.is_empty() { - return Err(ArrowError::ComputeError( - "concat requires input of at least one array".to_string(), - )); - } - - if arrays - .iter() - .any(|array| array.data_type() != arrays[0].data_type()) - { - return Err(ArrowError::InvalidArgumentError( - "It is not possible to concatenate arrays of different data types." - .to_string(), - )); - } - - let lengths = arrays.iter().map(|array| array.len()).collect::>(); - let capacity = lengths.iter().sum(); - - let arrays = arrays.iter().map(|a| a.data()).collect::>(); - - let mut mutable = MutableArrayData::new(arrays, false, capacity); - - for (i, len) in lengths.iter().enumerate() { - mutable.extend(i, 0, *len) - } - - Ok(make_array(mutable.freeze())) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::datatypes::*; - use std::sync::Arc; - - #[test] - fn test_concat_empty_vec() { - let re = concat(&[]); - assert!(re.is_err()); - } - - #[test] - fn test_concat_incompatible_datatypes() { - let re = concat(&[ - &PrimitiveArray::::from(vec![Some(-1), Some(2), None]), - &StringArray::from(vec![Some("hello"), Some("bar"), Some("world")]), - ]); - assert!(re.is_err()); - } - - #[test] - fn test_concat_string_arrays() -> Result<()> { - let arr = concat(&[ - &StringArray::from(vec!["hello", "world"]), - &StringArray::from(vec!["2", "3", "4"]), - &StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]), - ])?; - - let expected_output = Arc::new(StringArray::from(vec![ - Some("hello"), - Some("world"), - Some("2"), - Some("3"), - Some("4"), - Some("foo"), - Some("bar"), - None, - Some("baz"), - ])) as ArrayRef; - - assert_eq!(&arr, &expected_output); - - Ok(()) - } - - #[test] - fn test_concat_primitive_arrays() -> Result<()> { - let arr = concat(&[ - &PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ]), - &PrimitiveArray::::from(vec![ - Some(101), - Some(102), - Some(103), - None, - ]), - &PrimitiveArray::::from(vec![Some(256), Some(512), Some(1024)]), - ])?; - - let expected_output = Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - Some(101), - Some(102), - Some(103), - None, - Some(256), - Some(512), - Some(1024), - ])) as ArrayRef; - - assert_eq!(&arr, &expected_output); - - Ok(()) - } - - #[test] - fn test_concat_primitive_array_slices() -> Result<()> { - let input_1 = PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ]) - .slice(1, 3); - - let input_2 = PrimitiveArray::::from(vec![ - Some(101), - Some(102), - Some(103), - None, - ]) - .slice(1, 3); - let arr = concat(&[input_1.as_ref(), input_2.as_ref()])?; - - let expected_output = Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(2), - None, - Some(102), - Some(103), - None, - ])) as ArrayRef; - - assert_eq!(&arr, &expected_output); - - Ok(()) - } - - #[test] - fn test_concat_boolean_primitive_arrays() -> Result<()> { - let arr = concat(&[ - &BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - None, - None, - Some(false), - ]), - &BooleanArray::from(vec![None, Some(false), Some(true), Some(false)]), - ])?; - - let expected_output = Arc::new(BooleanArray::from(vec![ - Some(true), - Some(true), - Some(false), - None, - None, - Some(false), - None, - Some(false), - Some(true), - Some(false), - ])) as ArrayRef; - - assert_eq!(&arr, 
&expected_output); - - Ok(()) - } - - #[test] - fn test_concat_primitive_list_arrays() -> Result<()> { - let list1 = vec![ - Some(vec![Some(-1), Some(-1), Some(2), None, None]), - Some(vec![]), - None, - Some(vec![Some(10)]), - ]; - let list1_array = - ListArray::from_iter_primitive::(list1.clone()); - - let list2 = vec![ - None, - Some(vec![Some(100), None, Some(101)]), - Some(vec![Some(102)]), - ]; - let list2_array = - ListArray::from_iter_primitive::(list2.clone()); - - let list3 = vec![Some(vec![Some(1000), Some(1001)])]; - let list3_array = - ListArray::from_iter_primitive::(list3.clone()); - - let array_result = concat(&[&list1_array, &list2_array, &list3_array])?; - - let expected = list1 - .into_iter() - .chain(list2.into_iter()) - .chain(list3.into_iter()); - let array_expected = ListArray::from_iter_primitive::(expected); - - assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); - - Ok(()) - } - - #[test] - fn test_concat_struct_arrays() -> Result<()> { - let field = Field::new("field", DataType::Int64, true); - let input_primitive_1: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ])); - let input_struct_1 = StructArray::from(vec![(field.clone(), input_primitive_1)]); - - let input_primitive_2: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(101), - Some(102), - Some(103), - None, - ])); - let input_struct_2 = StructArray::from(vec![(field.clone(), input_primitive_2)]); - - let input_primitive_3: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(256), - Some(512), - Some(1024), - ])); - let input_struct_3 = StructArray::from(vec![(field, input_primitive_3)]); - - let arr = concat(&[&input_struct_1, &input_struct_2, &input_struct_3])?; - - let expected_primitive_output = Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - Some(101), - Some(102), - Some(103), - None, - Some(256), - Some(512), - Some(1024), - ])) as ArrayRef; - - let actual_primitive = arr - .as_any() - .downcast_ref::() - .unwrap() - .column(0); - assert_eq!(actual_primitive, &expected_primitive_output); - - Ok(()) - } - - #[test] - fn test_concat_struct_array_slices() -> Result<()> { - let field = Field::new("field", DataType::Int64, true); - let input_primitive_1: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ])); - let input_struct_1 = StructArray::from(vec![(field.clone(), input_primitive_1)]); - - let input_primitive_2: ArrayRef = - Arc::new(PrimitiveArray::::from(vec![ - Some(101), - Some(102), - Some(103), - None, - ])); - let input_struct_2 = StructArray::from(vec![(field, input_primitive_2)]); - - let arr = concat(&[ - input_struct_1.slice(1, 3).as_ref(), - input_struct_2.slice(1, 2).as_ref(), - ])?; - - let expected_primitive_output = Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(2), - None, - Some(102), - Some(103), - ])) as ArrayRef; - - let actual_primitive = arr - .as_any() - .downcast_ref::() - .unwrap() - .column(0); - assert_eq!(actual_primitive, &expected_primitive_output); - - Ok(()) - } - - #[test] - fn test_string_array_slices() -> Result<()> { - let input_1 = StringArray::from(vec!["hello", "A", "B", "C"]); - let input_2 = StringArray::from(vec!["world", "D", "E", "Z"]); - - let arr = concat(&[input_1.slice(1, 3).as_ref(), input_2.slice(1, 2).as_ref()])?; - - let expected_output = StringArray::from(vec!["A", "B", "C", "D", "E"]); - - let actual_output = arr.as_any().downcast_ref::().unwrap(); - 
assert_eq!(actual_output, &expected_output); - - Ok(()) - } - - #[test] - fn test_string_array_with_null_slices() -> Result<()> { - let input_1 = StringArray::from(vec![Some("hello"), None, Some("A"), Some("C")]); - let input_2 = StringArray::from(vec![None, Some("world"), Some("D"), None]); - - let arr = concat(&[input_1.slice(1, 3).as_ref(), input_2.slice(1, 2).as_ref()])?; - - let expected_output = - StringArray::from(vec![None, Some("A"), Some("C"), Some("world"), Some("D")]); - - let actual_output = arr.as_any().downcast_ref::().unwrap(); - assert_eq!(actual_output, &expected_output); - - Ok(()) - } -} diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs deleted file mode 100644 index 68feb0a546e..00000000000 --- a/rust/arrow/src/compute/kernels/filter.rs +++ /dev/null @@ -1,584 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines miscellaneous array kernels. - -use crate::error::Result; -use crate::record_batch::RecordBatch; -use crate::{array::*, util::bit_chunk_iterator::BitChunkIterator}; -use std::iter::Enumerate; - -/// Function that can filter arbitrary arrays -pub type Filter<'a> = Box ArrayData + 'a>; - -/// Internal state of [SlicesIterator] -#[derive(Debug, PartialEq)] -enum State { - // it is iterating over bits of a mask (`u64`, steps of size of 1 slot) - Bits(u64), - // it is iterating over chunks (steps of size of 64 slots) - Chunks, - // it is iterating over the remainding bits (steps of size of 1 slot) - Remainder, - // nothing more to iterate. - Finish, -} - -/// An iterator of `(usize, usize)` each representing an interval `[start,end[` whose -/// slots of a [BooleanArray] are true. Each interval corresponds to a contiguous region of memory to be -/// "taken" from an array to be filtered. 
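In other words, the iterator turns each contiguous run of set bits in the filter mask into a `[start, end)` interval so the kernel can copy whole memory regions rather than individual values. Stripped of the per-`u64` chunk fast paths, the run detection reduces to roughly the following standalone sketch over a plain `&[bool]` (the helper name is illustrative):

```rust
/// Collect the half-open [start, end) runs of `true` values in a boolean mask.
/// This mirrors what the slices iterator yields, without the chunked fast paths.
fn true_runs(mask: &[bool]) -> Vec<(usize, usize)> {
    let mut runs = Vec::new();
    let mut start = None;
    for (i, &keep) in mask.iter().enumerate() {
        match (keep, start) {
            (true, None) => start = Some(i),  // a new run begins
            (false, Some(s)) => {             // the current run ends just before i
                runs.push((s, i));
                start = None;
            }
            _ => {}                           // run continues, or still outside a run
        }
    }
    if let Some(s) = start {
        runs.push((s, mask.len()));           // close a run that reaches the end
    }
    runs
}

fn main() {
    // filter([5, 6, 7, 8, 9], [true, false, false, true, true]) copies two regions:
    // the single value at [0, 1) and the contiguous pair at [3, 5).
    assert_eq!(
        true_runs(&[true, false, false, true, true]),
        vec![(0, 1), (3, 5)]
    );
}
```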
-#[derive(Debug)] -pub(crate) struct SlicesIterator<'a> { - iter: Enumerate>, - state: State, - filter_count: usize, - remainder_mask: u64, - remainder_len: usize, - chunk_len: usize, - len: usize, - start: usize, - on_region: bool, - current_chunk: usize, - current_bit: usize, -} - -impl<'a> SlicesIterator<'a> { - pub(crate) fn new(filter: &'a BooleanArray) -> Self { - let values = &filter.data_ref().buffers()[0]; - - // this operation is performed before iteration - // because it is fast and allows reserving all the needed memory - let filter_count = values.count_set_bits_offset(filter.offset(), filter.len()); - - let chunks = values.bit_chunks(filter.offset(), filter.len()); - - Self { - iter: chunks.iter().enumerate(), - state: State::Chunks, - filter_count, - remainder_len: chunks.remainder_len(), - chunk_len: chunks.chunk_len(), - remainder_mask: chunks.remainder_bits(), - len: 0, - start: 0, - on_region: false, - current_chunk: 0, - current_bit: 0, - } - } - - #[inline] - fn current_start(&self) -> usize { - self.current_chunk * 64 + self.current_bit - } - - #[inline] - fn iterate_bits(&mut self, mask: u64, max: usize) -> Option<(usize, usize)> { - while self.current_bit < max { - if (mask & (1 << self.current_bit)) != 0 { - if !self.on_region { - self.start = self.current_start(); - self.on_region = true; - } - self.len += 1; - } else if self.on_region { - let result = (self.start, self.start + self.len); - self.len = 0; - self.on_region = false; - self.current_bit += 1; - return Some(result); - } - self.current_bit += 1; - } - self.current_bit = 0; - None - } - - /// iterates over chunks. - #[inline] - fn iterate_chunks(&mut self) -> Option<(usize, usize)> { - while let Some((i, mask)) = self.iter.next() { - self.current_chunk = i; - if mask == 0 { - if self.on_region { - let result = (self.start, self.start + self.len); - self.len = 0; - self.on_region = false; - return Some(result); - } - } else if mask == 18446744073709551615u64 { - // = !0u64 - if !self.on_region { - self.start = self.current_start(); - self.on_region = true; - } - self.len += 64; - } else { - // there is a chunk that has a non-trivial mask => iterate over bits. - self.state = State::Bits(mask); - return None; - } - } - // no more chunks => start iterating over the remainder - self.current_chunk = self.chunk_len; - self.state = State::Remainder; - None - } -} - -impl<'a> Iterator for SlicesIterator<'a> { - type Item = (usize, usize); - - fn next(&mut self) -> Option { - match self.state { - State::Chunks => { - match self.iterate_chunks() { - None => { - // iterating over chunks does not yield any new slice => continue to the next - self.current_bit = 0; - self.next() - } - other => other, - } - } - State::Bits(mask) => { - match self.iterate_bits(mask, 64) { - None => { - // iterating over bits does not yield any new slice => change back - // to chunks and continue to the next - self.state = State::Chunks; - self.next() - } - other => other, - } - } - State::Remainder => { - match self.iterate_bits(self.remainder_mask, self.remainder_len) { - None => { - self.state = State::Finish; - if self.on_region { - Some((self.start, self.start + self.len)) - } else { - None - } - } - other => other, - } - } - State::Finish => None, - } - } -} - -/// Returns a prepared function optimized to filter multiple arrays. -/// Creating this function requires time, but using it is faster than [filter] when the -/// same filter needs to be applied to multiple arrays (e.g. a multi-column `RecordBatch`). 
-/// WARNING: the nulls of `filter` are ignored and the value on its slot is considered. -/// Therefore, it is considered undefined behavior to pass `filter` with null values. -pub fn build_filter(filter: &BooleanArray) -> Result { - let iter = SlicesIterator::new(filter); - let filter_count = iter.filter_count; - let chunks = iter.collect::>(); - - Ok(Box::new(move |array: &ArrayData| { - let mut mutable = MutableArrayData::new(vec![array], false, filter_count); - chunks - .iter() - .for_each(|(start, end)| mutable.extend(0, *start, *end)); - mutable.freeze() - })) -} - -/// Filters an [Array], returning elements matching the filter (i.e. where the values are true). -/// WARNING: the nulls of `filter` are ignored and the value on its slot is considered. -/// Therefore, it is considered undefined behavior to pass `filter` with null values. -/// # Example -/// ```rust -/// # use arrow::array::{Int32Array, BooleanArray}; -/// # use arrow::error::Result; -/// # use arrow::compute::kernels::filter::filter; -/// # fn main() -> Result<()> { -/// let array = Int32Array::from(vec![5, 6, 7, 8, 9]); -/// let filter_array = BooleanArray::from(vec![true, false, false, true, false]); -/// let c = filter(&array, &filter_array)?; -/// let c = c.as_any().downcast_ref::().unwrap(); -/// assert_eq!(c, &Int32Array::from(vec![5, 8])); -/// # Ok(()) -/// # } -/// ``` -pub fn filter(array: &Array, filter: &BooleanArray) -> Result { - let iter = SlicesIterator::new(filter); - - let mut mutable = - MutableArrayData::new(vec![array.data_ref()], false, iter.filter_count); - iter.for_each(|(start, end)| mutable.extend(0, start, end)); - let data = mutable.freeze(); - Ok(make_array(data)) -} - -/// Returns a new [RecordBatch] with arrays containing only values matching the filter. -/// WARNING: the nulls of `filter` are ignored and the value on its slot is considered. -/// Therefore, it is considered undefined behavior to pass `filter` with null values. -pub fn filter_record_batch( - record_batch: &RecordBatch, - filter: &BooleanArray, -) -> Result { - let filter = build_filter(filter)?; - let filtered_arrays = record_batch - .columns() - .iter() - .map(|a| make_array(filter(&a.data()))) - .collect(); - RecordBatch::try_new(record_batch.schema(), filtered_arrays) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{ - buffer::Buffer, - datatypes::{DataType, Field}, - }; - - macro_rules! 
def_temporal_test { - ($test:ident, $array_type: ident, $data: expr) => { - #[test] - fn $test() { - let a = $data; - let b = BooleanArray::from(vec![true, false, true, false]); - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::<$array_type>().unwrap(); - assert_eq!(2, d.len()); - assert_eq!(1, d.value(0)); - assert_eq!(3, d.value(1)); - } - }; - } - - def_temporal_test!( - test_filter_date32, - Date32Array, - Date32Array::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_date64, - Date64Array, - Date64Array::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_time32_second, - Time32SecondArray, - Time32SecondArray::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_time32_millisecond, - Time32MillisecondArray, - Time32MillisecondArray::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_time64_microsecond, - Time64MicrosecondArray, - Time64MicrosecondArray::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_time64_nanosecond, - Time64NanosecondArray, - Time64NanosecondArray::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_duration_second, - DurationSecondArray, - DurationSecondArray::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_duration_millisecond, - DurationMillisecondArray, - DurationMillisecondArray::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_duration_microsecond, - DurationMicrosecondArray, - DurationMicrosecondArray::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_duration_nanosecond, - DurationNanosecondArray, - DurationNanosecondArray::from(vec![1, 2, 3, 4]) - ); - def_temporal_test!( - test_filter_timestamp_second, - TimestampSecondArray, - TimestampSecondArray::from_vec(vec![1, 2, 3, 4], None) - ); - def_temporal_test!( - test_filter_timestamp_millisecond, - TimestampMillisecondArray, - TimestampMillisecondArray::from_vec(vec![1, 2, 3, 4], None) - ); - def_temporal_test!( - test_filter_timestamp_microsecond, - TimestampMicrosecondArray, - TimestampMicrosecondArray::from_vec(vec![1, 2, 3, 4], None) - ); - def_temporal_test!( - test_filter_timestamp_nanosecond, - TimestampNanosecondArray, - TimestampNanosecondArray::from_vec(vec![1, 2, 3, 4], None) - ); - - #[test] - fn test_filter_array_slice() { - let a_slice = Int32Array::from(vec![5, 6, 7, 8, 9]).slice(1, 4); - let a = a_slice.as_ref(); - let b = BooleanArray::from(vec![true, false, false, true]); - // filtering with sliced filter array is not currently supported - // let b_slice = BooleanArray::from(vec![true, false, false, true, false]).slice(1, 4); - // let b = b_slice.as_any().downcast_ref().unwrap(); - let c = filter(a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, d.len()); - assert_eq!(6, d.value(0)); - assert_eq!(9, d.value(1)); - } - - #[test] - fn test_filter_array_low_density() { - // this test exercises the all 0's branch of the filter algorithm - let mut data_values = (1..=65).collect::>(); - let mut filter_values = - (1..=65).map(|i| matches!(i % 65, 0)).collect::>(); - // set up two more values after the batch - data_values.extend_from_slice(&[66, 67]); - filter_values.extend_from_slice(&[false, true]); - let a = Int32Array::from(data_values); - let b = BooleanArray::from(filter_values); - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, d.len()); - assert_eq!(65, d.value(0)); - assert_eq!(67, d.value(1)); - } - - #[test] - fn 
test_filter_array_high_density() { - // this test exercises the all 1's branch of the filter algorithm - let mut data_values = (1..=65).map(Some).collect::>(); - let mut filter_values = (1..=65) - .map(|i| !matches!(i % 65, 0)) - .collect::>(); - // set second data value to null - data_values[1] = None; - // set up two more values after the batch - data_values.extend_from_slice(&[Some(66), None, Some(67), None]); - filter_values.extend_from_slice(&[false, true, true, true]); - let a = Int32Array::from(data_values); - let b = BooleanArray::from(filter_values); - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(67, d.len()); - assert_eq!(3, d.null_count()); - assert_eq!(1, d.value(0)); - assert_eq!(true, d.is_null(1)); - assert_eq!(64, d.value(63)); - assert_eq!(true, d.is_null(64)); - assert_eq!(67, d.value(65)); - } - - #[test] - fn test_filter_string_array_simple() { - let a = StringArray::from(vec!["hello", " ", "world", "!"]); - let b = BooleanArray::from(vec![true, false, true, false]); - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, d.len()); - assert_eq!("hello", d.value(0)); - assert_eq!("world", d.value(1)); - } - - #[test] - fn test_filter_primative_array_with_null() { - let a = Int32Array::from(vec![Some(5), None]); - let b = BooleanArray::from(vec![false, true]); - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(1, d.len()); - assert_eq!(true, d.is_null(0)); - } - - #[test] - fn test_filter_string_array_with_null() { - let a = StringArray::from(vec![Some("hello"), None, Some("world"), None]); - let b = BooleanArray::from(vec![true, false, false, true]); - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, d.len()); - assert_eq!("hello", d.value(0)); - assert_eq!(false, d.is_null(0)); - assert_eq!(true, d.is_null(1)); - } - - #[test] - fn test_filter_binary_array_with_null() { - let data: Vec> = vec![Some(b"hello"), None, Some(b"world"), None]; - let a = BinaryArray::from(data); - let b = BooleanArray::from(vec![true, false, false, true]); - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, d.len()); - assert_eq!(b"hello", d.value(0)); - assert_eq!(false, d.is_null(0)); - assert_eq!(true, d.is_null(1)); - } - - #[test] - fn test_filter_array_slice_with_null() { - let a_slice = - Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]).slice(1, 4); - let a = a_slice.as_ref(); - let b = BooleanArray::from(vec![true, false, false, true]); - // filtering with sliced filter array is not currently supported - // let b_slice = BooleanArray::from(vec![true, false, false, true, false]).slice(1, 4); - // let b = b_slice.as_any().downcast_ref().unwrap(); - let c = filter(a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, d.len()); - assert_eq!(true, d.is_null(0)); - assert_eq!(false, d.is_null(1)); - assert_eq!(9, d.value(1)); - } - - #[test] - fn test_filter_dictionary_array() { - let values = vec![Some("hello"), None, Some("world"), Some("!")]; - let a: Int8DictionaryArray = values.iter().copied().collect(); - let b = BooleanArray::from(vec![false, true, true, false]); - let c = filter(&a, &b).unwrap(); - let d = c - .as_ref() - .as_any() - .downcast_ref::() - .unwrap(); - let value_array = d.values(); - let values = value_array.as_any().downcast_ref::().unwrap(); - 
// values are cloned in the filtered dictionary array - assert_eq!(3, values.len()); - // but keys are filtered - assert_eq!(2, d.len()); - assert_eq!(true, d.is_null(0)); - assert_eq!("world", values.value(d.keys().value(1) as usize)); - } - - #[test] - fn test_filter_string_array_with_negated_boolean_array() { - let a = StringArray::from(vec!["hello", " ", "world", "!"]); - let mut bb = BooleanBuilder::new(2); - bb.append_value(false).unwrap(); - bb.append_value(true).unwrap(); - bb.append_value(false).unwrap(); - bb.append_value(true).unwrap(); - let b = bb.finish(); - let b = crate::compute::not(&b).unwrap(); - - let c = filter(&a, &b).unwrap(); - let d = c.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, d.len()); - assert_eq!("hello", d.value(0)); - assert_eq!("world", d.value(1)); - } - - #[test] - fn test_filter_list_array() { - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 6, 8, 8]); - - let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(4) - .add_buffer(value_offsets) - .add_child_data(value_data) - .null_bit_buffer(Buffer::from([0b00000111])) - .build(); - - // a = [[0, 1, 2], [3, 4, 5], [6, 7], null] - let a = LargeListArray::from(list_data); - let b = BooleanArray::from(vec![false, true, false, true]); - let result = filter(&a, &b).unwrap(); - - // expected: [[3, 4, 5], null] - let value_data = ArrayData::builder(DataType::Int32) - .len(3) - .add_buffer(Buffer::from_slice_ref(&[3, 4, 5])) - .build(); - - let value_offsets = Buffer::from_slice_ref(&[0i64, 3, 3]); - - let list_data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))); - let expected = ArrayData::builder(list_data_type) - .len(2) - .add_buffer(value_offsets) - .add_child_data(value_data) - .null_bit_buffer(Buffer::from([0b00000001])) - .build(); - - assert_eq!(&make_array(expected), &result); - } - - #[test] - fn test_slice_iterator_bits() { - let filter_values = (0..64).map(|i| i == 1).collect::>(); - let filter = BooleanArray::from(filter_values); - - let iter = SlicesIterator::new(&filter); - let filter_count = iter.filter_count; - let chunks = iter.collect::>(); - - assert_eq!(chunks, vec![(1, 2)]); - assert_eq!(filter_count, 1); - } - - #[test] - fn test_slice_iterator_bits1() { - let filter_values = (0..64).map(|i| i != 1).collect::>(); - let filter = BooleanArray::from(filter_values); - - let iter = SlicesIterator::new(&filter); - let filter_count = iter.filter_count; - let chunks = iter.collect::>(); - - assert_eq!(chunks, vec![(0, 1), (2, 64)]); - assert_eq!(filter_count, 64 - 1); - } - - #[test] - fn test_slice_iterator_chunk_and_bits() { - let filter_values = (0..130).map(|i| i % 62 != 0).collect::>(); - let filter = BooleanArray::from(filter_values); - - let iter = SlicesIterator::new(&filter); - let filter_count = iter.filter_count; - let chunks = iter.collect::>(); - - assert_eq!(chunks, vec![(1, 62), (63, 124), (125, 130)]); - assert_eq!(filter_count, 61 + 61 + 5); - } -} diff --git a/rust/arrow/src/compute/kernels/length.rs b/rust/arrow/src/compute/kernels/length.rs deleted file mode 100644 index 4d704d27078..00000000000 --- a/rust/arrow/src/compute/kernels/length.rs +++ /dev/null @@ -1,385 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license 
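For reference, a minimal usage sketch of the `filter` and `filter_record_batch` kernels removed above (the Rust implementation continues in the separate apache/arrow-rs repository). This is a sketch only: it assumes the pre-move `arrow::compute::kernels::filter` module path shown in the deleted file, and the schema, column names, and values are made up for illustration.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, BooleanArray, Int32Array, StringArray};
use arrow::compute::kernels::filter::{filter, filter_record_batch};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::Result;
use arrow::record_batch::RecordBatch;

fn main() -> Result<()> {
    // Filtering a single array: keep the slots whose predicate value is true.
    // Note the warning above: the predicate must not contain nulls.
    let values = Int32Array::from(vec![5, 6, 7, 8, 9]);
    let predicate = BooleanArray::from(vec![true, false, false, true, false]);
    let filtered = filter(&values, &predicate)?;
    let filtered = filtered.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(filtered, &Int32Array::from(vec![5, 8]));

    // Filtering a whole RecordBatch applies the same predicate to every column.
    // Internally this goes through `build_filter`, so the boolean array is
    // analysed into slices only once and then replayed per column.
    let schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int32, false),
        Field::new("name", DataType::Utf8, false),
    ]));
    let ids: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let names: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
    let batch = RecordBatch::try_new(schema, vec![ids, names])?;

    let keep = BooleanArray::from(vec![true, false, true]);
    let filtered_batch = filter_record_batch(&batch, &keep)?;
    assert_eq!(filtered_batch.num_rows(), 2);

    Ok(())
}
```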
agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines kernel for length of a string array - -use crate::{ - array::*, - buffer::Buffer, - datatypes::{ArrowNativeType, ArrowPrimitiveType}, -}; -use crate::{ - datatypes::{DataType, Int32Type, Int64Type}, - error::{ArrowError, Result}, -}; - -fn unary_offsets_string( - array: &GenericStringArray, - data_type: DataType, - op: F, -) -> ArrayRef -where - O: StringOffsetSizeTrait + ArrowNativeType, - F: Fn(O) -> O, -{ - // note: offsets are stored as u8, but they can be interpreted as OffsetSize - let offsets = &array.data_ref().buffers()[0]; - // this is a 30% improvement over iterating over u8s and building OffsetSize, which - // justifies the usage of `unsafe`. - let slice: &[O] = &unsafe { offsets.typed_data::() }[array.offset()..]; - - let lengths = slice.windows(2).map(|offset| op(offset[1] - offset[0])); - - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` is an iterator with a known size. - let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) }; - - let null_bit_buffer = array - .data_ref() - .null_bitmap() - .as_ref() - .map(|b| b.bits.clone()); - - let data = ArrayData::new( - data_type, - array.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ); - make_array(data) -} - -fn octet_length( - array: &dyn Array, -) -> ArrayRef -where - T::Native: StringOffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - unary_offsets_string::(array, T::DATA_TYPE, |x| x) -} - -fn bit_length_impl( - array: &dyn Array, -) -> ArrayRef -where - T::Native: StringOffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let bits_in_bytes = O::from_usize(8).unwrap(); - unary_offsets_string::(array, T::DATA_TYPE, |x| x * bits_in_bytes) -} - -/// Returns an array of Int32/Int64 denoting the number of bytes in each string in the array. -/// -/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8 -/// * length of null is null. -/// * length is in number of bytes -pub fn length(array: &Array) -> Result { - match array.data_type() { - DataType::Utf8 => Ok(octet_length::(array)), - DataType::LargeUtf8 => Ok(octet_length::(array)), - _ => Err(ArrowError::ComputeError(format!( - "length not supported for {:?}", - array.data_type() - ))), - } -} - -/// Returns an array of Int32/Int64 denoting the number of bits in each string in the array. -/// -/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8 -/// * bit_length of null is null. 
-/// * bit_length is in number of bits -pub fn bit_length(array: &Array) -> Result { - match array.data_type() { - DataType::Utf8 => Ok(bit_length_impl::(array)), - DataType::LargeUtf8 => Ok(bit_length_impl::(array)), - _ => Err(ArrowError::ComputeError(format!( - "bit_length not supported for {:?}", - array.data_type() - ))), - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { - fn double_vec(v: Vec) -> Vec { - [&v[..], &v[..]].concat() - } - - // a large array - let mut values = vec!["one", "on", "o", ""]; - let mut expected = vec![3, 2, 1, 0]; - for _ in 0..10 { - values = double_vec(values); - expected = double_vec(expected); - } - - vec![ - (vec!["hello", " ", "world"], 3, vec![5, 1, 5]), - (vec!["hello", " ", "world", "!"], 4, vec![5, 1, 5, 1]), - (vec!["💖"], 1, vec![4]), - (values, 4096, expected), - ] - } - - #[test] - fn length_test_string() -> Result<()> { - length_cases() - .into_iter() - .try_for_each(|(input, len, expected)| { - let array = StringArray::from(input); - let result = length(&array)?; - assert_eq!(len, result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - expected.iter().enumerate().for_each(|(i, value)| { - assert_eq!(*value, result.value(i)); - }); - Ok(()) - }) - } - - #[test] - fn length_test_large_string() -> Result<()> { - length_cases() - .into_iter() - .try_for_each(|(input, len, expected)| { - let array = LargeStringArray::from(input); - let result = length(&array)?; - assert_eq!(len, result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - expected.iter().enumerate().for_each(|(i, value)| { - assert_eq!(*value as i64, result.value(i)); - }); - Ok(()) - }) - } - - fn length_null_cases() -> Vec<(Vec>, usize, Vec>)> { - vec![( - vec![Some("one"), None, Some("three"), Some("four")], - 4, - vec![Some(3), None, Some(5), Some(4)], - )] - } - - #[test] - fn length_null_string() -> Result<()> { - length_null_cases() - .into_iter() - .try_for_each(|(input, len, expected)| { - let array = StringArray::from(input); - let result = length(&array)?; - assert_eq!(len, result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - - let expected: Int32Array = expected.into(); - assert_eq!(expected.data(), result.data()); - Ok(()) - }) - } - - #[test] - fn length_null_large_string() -> Result<()> { - length_null_cases() - .into_iter() - .try_for_each(|(input, len, expected)| { - let array = LargeStringArray::from(input); - let result = length(&array)?; - assert_eq!(len, result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - - // convert to i64 - let expected: Int64Array = expected - .iter() - .map(|e| e.map(|e| e as i64)) - .collect::>() - .into(); - assert_eq!(expected.data(), result.data()); - Ok(()) - }) - } - - /// Tests that length is not valid for u64. 
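A short usage sketch of the `length` and `bit_length` kernels defined above, again assuming the pre-move `arrow::compute::kernels::length` module path; the sample strings are made up for illustration.

```rust
use arrow::array::{Int32Array, StringArray};
use arrow::compute::kernels::length::{bit_length, length};
use arrow::error::Result;

fn main() -> Result<()> {
    let array = StringArray::from(vec![Some("hello"), None, Some("josé")]);

    // `length` is measured in bytes, not characters: "josé" is 5 bytes in UTF-8.
    // Nulls stay null.
    let byte_len = length(&array)?;
    let byte_len = byte_len.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(byte_len, &Int32Array::from(vec![Some(5), None, Some(5)]));

    // `bit_length` is the byte length multiplied by 8.
    let bit_len = bit_length(&array)?;
    let bit_len = bit_len.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(bit_len, &Int32Array::from(vec![Some(40), None, Some(40)]));

    Ok(())
}
```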
- #[test] - fn length_wrong_type() { - let array: UInt64Array = vec![1u64].into(); - - assert!(length(&array).is_err()); - } - - /// Tests with an offset - #[test] - fn length_offsets() -> Result<()> { - let a = StringArray::from(vec!["hello", " ", "world"]); - let b = make_array( - ArrayData::builder(DataType::Utf8) - .len(2) - .offset(1) - .buffers(a.data_ref().buffers().to_vec()) - .build(), - ); - let result = length(b.as_ref())?; - - let expected = Int32Array::from(vec![1, 5]); - assert_eq!(expected.data(), result.data()); - - Ok(()) - } - - fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { - fn double_vec(v: Vec) -> Vec { - [&v[..], &v[..]].concat() - } - - // a large array - let mut values = vec!["one", "on", "o", ""]; - let mut expected = vec![24, 16, 8, 0]; - for _ in 0..10 { - values = double_vec(values); - expected = double_vec(expected); - } - - vec![ - (vec!["hello", " ", "world", "!"], 4, vec![40, 8, 40, 8]), - (vec!["💖"], 1, vec![32]), - (vec!["josé"], 1, vec![40]), - (values, 4096, expected), - ] - } - - #[test] - fn bit_length_test_string() -> Result<()> { - bit_length_cases() - .into_iter() - .try_for_each(|(input, len, expected)| { - let array = StringArray::from(input); - let result = bit_length(&array)?; - assert_eq!(len, result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - expected.iter().enumerate().for_each(|(i, value)| { - assert_eq!(*value, result.value(i)); - }); - Ok(()) - }) - } - - #[test] - fn bit_length_test_large_string() -> Result<()> { - bit_length_cases() - .into_iter() - .try_for_each(|(input, len, expected)| { - let array = LargeStringArray::from(input); - let result = bit_length(&array)?; - assert_eq!(len, result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - expected.iter().enumerate().for_each(|(i, value)| { - assert_eq!(*value as i64, result.value(i)); - }); - Ok(()) - }) - } - - fn bit_length_null_cases() -> Vec<(Vec>, usize, Vec>)> - { - vec![( - vec![Some("one"), None, Some("three"), Some("four")], - 4, - vec![Some(24), None, Some(40), Some(32)], - )] - } - - #[test] - fn bit_length_null_string() -> Result<()> { - bit_length_null_cases() - .into_iter() - .try_for_each(|(input, len, expected)| { - let array = StringArray::from(input); - let result = bit_length(&array)?; - assert_eq!(len, result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - - let expected: Int32Array = expected.into(); - assert_eq!(expected.data(), result.data()); - Ok(()) - }) - } - - #[test] - fn bit_length_null_large_string() -> Result<()> { - bit_length_null_cases() - .into_iter() - .try_for_each(|(input, len, expected)| { - let array = LargeStringArray::from(input); - let result = bit_length(&array)?; - assert_eq!(len, result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - - // convert to i64 - let expected: Int64Array = expected - .iter() - .map(|e| e.map(|e| e as i64)) - .collect::>() - .into(); - assert_eq!(expected.data(), result.data()); - Ok(()) - }) - } - - /// Tests that bit_length is not valid for u64. 
- #[test] - fn bit_length_wrong_type() { - let array: UInt64Array = vec![1u64].into(); - - assert!(bit_length(&array).is_err()); - } - - /// Tests with an offset - #[test] - fn bit_length_offsets() -> Result<()> { - let a = StringArray::from(vec!["hello", " ", "world"]); - let b = make_array( - ArrayData::builder(DataType::Utf8) - .len(2) - .offset(1) - .buffers(a.data_ref().buffers().to_vec()) - .build(), - ); - let result = bit_length(b.as_ref())?; - - let expected = Int32Array::from(vec![8, 40]); - assert_eq!(expected.data(), result.data()); - - Ok(()) - } -} diff --git a/rust/arrow/src/compute/kernels/limit.rs b/rust/arrow/src/compute/kernels/limit.rs deleted file mode 100644 index 4b4b08572a2..00000000000 --- a/rust/arrow/src/compute/kernels/limit.rs +++ /dev/null @@ -1,200 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines miscellaneous array kernels. - -use crate::array::ArrayRef; - -/// Returns the array, taking only the number of elements specified -/// -/// Limit performs a zero-copy slice of the array, and is a convenience method on slice -/// where: -/// * it performs a bounds-check on the array -/// * it slices from offset 0 -pub fn limit(array: &ArrayRef, num_elements: usize) -> ArrayRef { - let lim = num_elements.min(array.len()); - array.slice(0, lim) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::*; - use crate::buffer::Buffer; - use crate::datatypes::{DataType, Field}; - use crate::util::bit_util; - - use std::sync::Arc; - - #[test] - fn test_limit_array() { - let a: ArrayRef = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9])); - let b = limit(&a, 3); - let c = b.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(3, c.len()); - assert_eq!(5, c.value(0)); - assert_eq!(6, c.value(1)); - assert_eq!(7, c.value(2)); - } - - #[test] - fn test_limit_string_array() { - let a: ArrayRef = Arc::new(StringArray::from(vec!["hello", " ", "world", "!"])); - let b = limit(&a, 2); - let c = b.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(2, c.len()); - assert_eq!("hello", c.value(0)); - assert_eq!(" ", c.value(1)); - } - - #[test] - fn test_limit_array_with_null() { - let a: ArrayRef = Arc::new(Int32Array::from(vec![None, Some(5)])); - let b = limit(&a, 1); - let c = b.as_ref().as_any().downcast_ref::().unwrap(); - assert_eq!(1, c.len()); - assert_eq!(true, c.is_null(0)); - } - - #[test] - fn test_limit_array_with_limit_too_large() { - let a = Int32Array::from(vec![5, 6, 7, 8, 9]); - let a_ref: ArrayRef = Arc::new(a); - let b = limit(&a_ref, 6); - let c = b.as_ref().as_any().downcast_ref::().unwrap(); - - assert_eq!(5, c.len()); - assert_eq!(5, c.value(0)); - assert_eq!(6, c.value(1)); - assert_eq!(7, c.value(2)); - assert_eq!(8, c.value(3)); - assert_eq!(9, c.value(4)); - } - - 
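A usage sketch for the zero-copy `limit` kernel defined above, assuming the pre-move `arrow::compute::kernels::limit` module path; the input values are illustrative only.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array};
use arrow::compute::kernels::limit::limit;

fn main() {
    let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9]));

    // `limit` is a bounds-checked convenience over `Array::slice(0, n)`:
    // no data is copied, only the offset/length of the returned ArrayRef change.
    let first_three = limit(&array, 3);
    assert_eq!(first_three.len(), 3);

    // Asking for more elements than exist simply returns the whole array.
    let all = limit(&array, 100);
    assert_eq!(all.len(), 5);
}
```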
#[test] - fn test_list_array_limit() { - // adapted from crate::array::test::test_list_array_slice - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1], null, [2, 3], null, [4, 5], null, [6, 7, 8], null, [9]] - let value_offsets = Buffer::from_slice_ref(&[0, 2, 2, 4, 4, 6, 6, 9, 9, 10]); - // 01010101 00000001 - let mut null_bits: [u8; 2] = [0; 2]; - bit_util::set_bit(&mut null_bits, 0); - bit_util::set_bit(&mut null_bits, 2); - bit_util::set_bit(&mut null_bits, 4); - bit_util::set_bit(&mut null_bits, 6); - bit_util::set_bit(&mut null_bits, 8); - - // Construct a list array from the above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(9) - .add_buffer(value_offsets) - .add_child_data(value_data) - .null_bit_buffer(Buffer::from(null_bits)) - .build(); - let list_array: ArrayRef = Arc::new(ListArray::from(list_data)); - - let limit_array = limit(&list_array, 6); - assert_eq!(6, limit_array.len()); - assert_eq!(0, limit_array.offset()); - assert_eq!(3, limit_array.null_count()); - - // Check offset and length for each non-null value. - let limit_array: &ListArray = - limit_array.as_any().downcast_ref::().unwrap(); - - for i in 0..limit_array.len() { - let offset = limit_array.value_offsets()[i]; - let length = limit_array.value_length(i); - if i % 2 == 0 { - assert_eq!(2, length); - assert_eq!(i as i32, offset); - } else { - assert_eq!(0, length); - } - } - } - - #[test] - fn test_struct_array_limit() { - // adapted from crate::array::test::test_struct_array_slice - let boolean_data = ArrayData::builder(DataType::Boolean) - .len(5) - .add_buffer(Buffer::from([0b00010000])) - .null_bit_buffer(Buffer::from([0b00010001])) - .build(); - let int_data = ArrayData::builder(DataType::Int32) - .len(5) - .add_buffer(Buffer::from_slice_ref(&[0, 28, 42, 0, 0])) - .null_bit_buffer(Buffer::from([0b00000110])) - .build(); - - let mut field_types = vec![]; - field_types.push(Field::new("a", DataType::Boolean, false)); - field_types.push(Field::new("b", DataType::Int32, false)); - let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) - .len(5) - .add_child_data(boolean_data.clone()) - .add_child_data(int_data.clone()) - .null_bit_buffer(Buffer::from([0b00010111])) - .build(); - let struct_array = StructArray::from(struct_array_data); - - assert_eq!(5, struct_array.len()); - assert_eq!(1, struct_array.null_count()); - assert_eq!(&boolean_data, struct_array.column(0).data()); - assert_eq!(&int_data, struct_array.column(1).data()); - - let array: ArrayRef = Arc::new(struct_array); - - let sliced_array = limit(&array, 3); - let sliced_array = sliced_array.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_array.len()); - assert_eq!(0, sliced_array.offset()); - assert_eq!(0, sliced_array.null_count()); - assert!(sliced_array.is_valid(0)); - assert!(sliced_array.is_valid(1)); - assert!(sliced_array.is_valid(2)); - - let sliced_c0 = sliced_array.column(0); - let sliced_c0 = sliced_c0.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_c0.len()); - assert_eq!(0, sliced_c0.offset()); - assert_eq!(2, sliced_c0.null_count()); - assert!(sliced_c0.is_valid(0)); - assert!(sliced_c0.is_null(1)); - assert!(sliced_c0.is_null(2)); - assert_eq!(false, sliced_c0.value(0)); - - let 
sliced_c1 = sliced_array.column(1); - let sliced_c1 = sliced_c1.as_any().downcast_ref::().unwrap(); - assert_eq!(3, sliced_c1.len()); - assert_eq!(0, sliced_c1.offset()); - assert_eq!(1, sliced_c1.null_count()); - assert!(sliced_c1.is_null(0)); - assert_eq!(28, sliced_c1.value(1)); - assert_eq!(42, sliced_c1.value(2)); - } -} diff --git a/rust/arrow/src/compute/kernels/mod.rs b/rust/arrow/src/compute/kernels/mod.rs deleted file mode 100644 index 862f55fe2f2..00000000000 --- a/rust/arrow/src/compute/kernels/mod.rs +++ /dev/null @@ -1,37 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Computation kernels on Arrow Arrays - -pub mod aggregate; -pub mod arithmetic; -pub mod arity; -pub mod boolean; -pub mod cast; -pub mod cast_utils; -pub mod comparison; -pub mod concat; -pub mod filter; -pub mod length; -pub mod limit; -pub mod regexp; -pub mod sort; -pub mod substring; -pub mod take; -pub mod temporal; -pub mod window; -pub mod zip; diff --git a/rust/arrow/src/compute/kernels/regexp.rs b/rust/arrow/src/compute/kernels/regexp.rs deleted file mode 100644 index 446d71d9f4a..00000000000 --- a/rust/arrow/src/compute/kernels/regexp.rs +++ /dev/null @@ -1,160 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines kernel to extract substrings based on a regular -//! expression of a \[Large\]StringArray - -use crate::array::{ - ArrayRef, GenericStringArray, GenericStringBuilder, ListBuilder, - StringOffsetSizeTrait, -}; -use crate::error::{ArrowError, Result}; -use std::collections::HashMap; - -use std::sync::Arc; - -use regex::Regex; - -/// Extract all groups matched by a regular expression for a given String array. 
-pub fn regexp_match( - array: &GenericStringArray, - regex_array: &GenericStringArray, - flags_array: Option<&GenericStringArray>, -) -> Result { - let mut patterns: HashMap = HashMap::new(); - let builder: GenericStringBuilder = GenericStringBuilder::new(0); - let mut list_builder = ListBuilder::new(builder); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(value) => format!("(?{}){}", value, pattern), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - (Some(_), Some(pattern)) if pattern == *"" => { - list_builder.values().append_value("")?; - list_builder.append(true)?; - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re.clone(), - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {:?}", - e - )) - })?; - patterns.insert(pattern, re.clone()); - re - } - }; - match re.captures(value) { - Some(caps) => { - for m in caps.iter().skip(1) { - if let Some(v) = m { - list_builder.values().append_value(v.as_str())?; - } - } - list_builder.append(true)? - } - None => list_builder.append(false)?, - } - } - _ => list_builder.append(false)?, - } - Ok(()) - }) - .collect::>>()?; - Ok(Arc::new(list_builder.finish())) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::{ListArray, StringArray}; - - #[test] - fn match_single_group() -> Result<()> { - let values = vec![ - Some("abc-005-def"), - Some("X-7-5"), - Some("X545"), - None, - Some("foobarbequebaz"), - Some("foobarbequebaz"), - ]; - let array = StringArray::from(values); - let mut pattern_values = vec![r".*-(\d*)-.*"; 4]; - pattern_values.push(r"(bar)(bequ1e)"); - pattern_values.push(""); - let pattern = StringArray::from(pattern_values); - let actual = regexp_match(&array, &pattern, None)?; - let elem_builder: GenericStringBuilder = GenericStringBuilder::new(0); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.values().append_value("005")?; - expected_builder.append(true)?; - expected_builder.values().append_value("7")?; - expected_builder.append(true)?; - expected_builder.append(false)?; - expected_builder.append(false)?; - expected_builder.append(false)?; - expected_builder.values().append_value("")?; - expected_builder.append(true)?; - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); - Ok(()) - } - - #[test] - fn match_single_group_with_flags() -> Result<()> { - let values = vec![Some("abc-005-def"), Some("X-7-5"), Some("X545"), None]; - let array = StringArray::from(values); - let pattern = StringArray::from(vec![r"x.*-(\d*)-.*"; 4]); - let flags = StringArray::from(vec!["i"; 4]); - let actual = regexp_match(&array, &pattern, Some(&flags))?; - let elem_builder: GenericStringBuilder = GenericStringBuilder::new(0); - let mut expected_builder = ListBuilder::new(elem_builder); - expected_builder.append(false)?; - expected_builder.values().append_value("7")?; - expected_builder.append(true)?; - 
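A usage sketch for the `regexp_match` kernel shown above, assuming the pre-move `arrow::compute::kernels::regexp` module path. Each input row is matched against the regex in the same row of the pattern array; captured groups come back as a `ListArray` of strings, and rows that do not match (or have null input) become null entries. The sample values and pattern are made up for illustration.

```rust
use arrow::array::{ListArray, StringArray};
use arrow::compute::kernels::regexp::regexp_match;
use arrow::error::Result;

fn main() -> Result<()> {
    let values = StringArray::from(vec![Some("abc-005-def"), Some("X545"), None]);
    let patterns = StringArray::from(vec![r".*-(\d*)-.*"; 3]);

    let matched = regexp_match(&values, &patterns, None)?;
    let matched = matched.as_any().downcast_ref::<ListArray>().unwrap();

    // Row 0 captured "005", row 1 did not match, row 2 was null input.
    assert_eq!(matched.len(), 3);
    assert!(matched.is_valid(0));
    assert!(matched.is_null(1));
    assert!(matched.is_null(2));

    Ok(())
}
```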
expected_builder.append(false)?; - expected_builder.append(false)?; - let expected = expected_builder.finish(); - let result = actual.as_any().downcast_ref::().unwrap(); - assert_eq!(&expected, result); - Ok(()) - } -} diff --git a/rust/arrow/src/compute/kernels/sort.rs b/rust/arrow/src/compute/kernels/sort.rs deleted file mode 100644 index bf8eda353e6..00000000000 --- a/rust/arrow/src/compute/kernels/sort.rs +++ /dev/null @@ -1,2246 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines sort kernel for `ArrayRef` - -use std::cmp::Ordering; - -use crate::array::*; -use crate::buffer::MutableBuffer; -use crate::compute::take; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; - -use TimeUnit::*; - -/// Sort the `ArrayRef` using `SortOptions`. -/// -/// Performs a stable sort on values and indices. Nulls are ordered according to the `nulls_first` flag in `options`. -/// Floats are sorted using IEEE 754 totalOrder -/// -/// Returns an `ArrowError::ComputeError(String)` if the array type is either unsupported by `sort_to_indices` or `take`. -/// -/// # Example -/// ```rust -/// # use std::sync::Arc; -/// # use arrow::array::{Int32Array, ArrayRef}; -/// # use arrow::error::Result; -/// # use arrow::compute::kernels::sort::sort; -/// # fn main() -> Result<()> { -/// let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 4, 3, 2, 1])); -/// let sorted_array = sort(&array, None).unwrap(); -/// let sorted_array = sorted_array.as_any().downcast_ref::().unwrap(); -/// assert_eq!(sorted_array, &Int32Array::from(vec![1, 2, 3, 4, 5])); -/// # Ok(()) -/// # } -/// ``` -pub fn sort(values: &ArrayRef, options: Option) -> Result { - let indices = sort_to_indices(values, options, None)?; - take(values.as_ref(), &indices, None) -} - -/// Sort the `ArrayRef` partially. -/// -/// If `limit` is specified, the resulting array will contain only -/// first `limit` in the sort order. Any data data after the limit -/// will be discarded. -/// -/// Note: this is an unstable_sort, meaning it may not preserve the -/// order of equal elements. 
-/// -/// # Example -/// ```rust -/// # use std::sync::Arc; -/// # use arrow::array::{Int32Array, ArrayRef}; -/// # use arrow::error::Result; -/// # use arrow::compute::kernels::sort::{sort_limit, SortOptions}; -/// # fn main() -> Result<()> { -/// let array: ArrayRef = Arc::new(Int32Array::from(vec![5, 4, 3, 2, 1])); -/// -/// // Find the the top 2 items -/// let sorted_array = sort_limit(&array, None, Some(2)).unwrap(); -/// let sorted_array = sorted_array.as_any().downcast_ref::().unwrap(); -/// assert_eq!(sorted_array, &Int32Array::from(vec![1, 2])); -/// -/// // Find the bottom top 2 items -/// let options = Some(SortOptions { -/// descending: true, -/// ..Default::default() -/// }); -/// let sorted_array = sort_limit(&array, options, Some(2)).unwrap(); -/// let sorted_array = sorted_array.as_any().downcast_ref::().unwrap(); -/// assert_eq!(sorted_array, &Int32Array::from(vec![5, 4])); -/// # Ok(()) -/// # } -/// ``` -pub fn sort_limit( - values: &ArrayRef, - options: Option, - limit: Option, -) -> Result { - let indices = sort_to_indices(values, options, limit)?; - take(values.as_ref(), &indices, None) -} - -#[inline] -fn sort_by(array: &mut [T], limit: usize, cmp: F) -where - F: FnMut(&T, &T) -> Ordering, -{ - if array.len() == limit { - array.sort_by(cmp); - } else { - partial_sort(array, limit, cmp); - } -} - -// implements comparison using IEEE 754 total ordering for f32 -// Original implementation from https://doc.rust-lang.org/std/primitive.f64.html#method.total_cmp -// TODO to change to use std when it becomes stable -fn total_cmp_32(l: f32, r: f32) -> std::cmp::Ordering { - let mut left = l.to_bits() as i32; - let mut right = r.to_bits() as i32; - - left ^= (((left >> 31) as u32) >> 1) as i32; - right ^= (((right >> 31) as u32) >> 1) as i32; - - left.cmp(&right) -} - -// implements comparison using IEEE 754 total ordering for f64 -// Original implementation from https://doc.rust-lang.org/std/primitive.f64.html#method.total_cmp -// TODO to change to use std when it becomes stable -fn total_cmp_64(l: f64, r: f64) -> std::cmp::Ordering { - let mut left = l.to_bits() as i64; - let mut right = r.to_bits() as i64; - - left ^= (((left >> 63) as u64) >> 1) as i64; - right ^= (((right >> 63) as u64) >> 1) as i64; - - left.cmp(&right) -} - -fn cmp(l: T, r: T) -> std::cmp::Ordering -where - T: Ord, -{ - l.cmp(&r) -} - -// partition indices into valid and null indices -fn partition_validity(array: &ArrayRef) -> (Vec, Vec) { - match array.null_count() { - // faster path - 0 => ((0..(array.len() as u32)).collect(), vec![]), - _ => { - let indices = 0..(array.len() as u32); - indices.partition(|index| array.is_valid(*index as usize)) - } - } -} - -/// Sort elements from `ArrayRef` into an unsigned integer (`UInt32Array`) of indices. 
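A sketch combining the `sort`, `sort_limit`, and `sort_to_indices` entry points shown above, assuming the pre-move `arrow::compute::kernels::sort` module path; the input values are illustrative only.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, UInt32Array};
use arrow::compute::kernels::sort::{sort, sort_limit, sort_to_indices, SortOptions};
use arrow::error::Result;

fn main() -> Result<()> {
    let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(3), None, Some(1), Some(2)]));

    // Default options: ascending, nulls first (matching Spark's behaviour).
    let sorted = sort(&array, None)?;
    let sorted = sorted.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(sorted, &Int32Array::from(vec![None, Some(1), Some(2), Some(3)]));

    // Descending with nulls last, keeping only the two largest values.
    let options = Some(SortOptions { descending: true, nulls_first: false });
    let top2 = sort_limit(&array, options, Some(2))?;
    assert_eq!(top2.len(), 2);

    // `sort_to_indices` returns the permutation instead of the reordered data;
    // `sort` itself is this permutation fed into the `take` kernel.
    let indices: UInt32Array = sort_to_indices(&array, None, None)?;
    assert_eq!(indices.value(0), 1); // the null at position 1 sorts first

    Ok(())
}
```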
-/// For floating point arrays any NaN values are considered to be greater than any other non-null value -/// limit is an option for partial_sort -pub fn sort_to_indices( - values: &ArrayRef, - options: Option, - limit: Option, -) -> Result { - let options = options.unwrap_or_default(); - - let (v, n) = partition_validity(values); - - match values.data_type() { - DataType::Boolean => sort_boolean(values, v, n, &options, limit), - DataType::Int8 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Int16 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Int32 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Int64 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::UInt8 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::UInt16 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::UInt32 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::UInt64 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Float32 => { - sort_primitive::(values, v, n, total_cmp_32, &options, limit) - } - DataType::Float64 => { - sort_primitive::(values, v, n, total_cmp_64, &options, limit) - } - DataType::Date32 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Date64 => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Time32(Second) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Time32(Millisecond) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Time64(Microsecond) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Time64(Nanosecond) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Timestamp(Second, _) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Timestamp(Millisecond, _) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Timestamp(Microsecond, _) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Timestamp(Nanosecond, _) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Interval(IntervalUnit::YearMonth) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Interval(IntervalUnit::DayTime) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Duration(TimeUnit::Second) => { - sort_primitive::(values, v, n, cmp, &options, limit) - } - DataType::Duration(TimeUnit::Millisecond) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Duration(TimeUnit::Microsecond) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Duration(TimeUnit::Nanosecond) => { - sort_primitive::( - values, v, n, cmp, &options, limit, - ) - } - DataType::Utf8 => sort_string(values, v, n, &options, limit), - DataType::List(field) => match field.data_type() { - DataType::Int8 => sort_list::(values, v, n, &options, limit), - DataType::Int16 => sort_list::(values, v, n, &options, limit), - DataType::Int32 => sort_list::(values, v, n, &options, limit), - DataType::Int64 => sort_list::(values, v, n, &options, limit), - DataType::UInt8 => sort_list::(values, v, n, &options, limit), - DataType::UInt16 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt32 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt64 => { - sort_list::(values, v, n, 
&options, limit) - } - t => Err(ArrowError::ComputeError(format!( - "Sort not supported for list type {:?}", - t - ))), - }, - DataType::LargeList(field) => match field.data_type() { - DataType::Int8 => sort_list::(values, v, n, &options, limit), - DataType::Int16 => sort_list::(values, v, n, &options, limit), - DataType::Int32 => sort_list::(values, v, n, &options, limit), - DataType::Int64 => sort_list::(values, v, n, &options, limit), - DataType::UInt8 => sort_list::(values, v, n, &options, limit), - DataType::UInt16 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt32 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt64 => { - sort_list::(values, v, n, &options, limit) - } - t => Err(ArrowError::ComputeError(format!( - "Sort not supported for list type {:?}", - t - ))), - }, - DataType::FixedSizeList(field, _) => match field.data_type() { - DataType::Int8 => sort_list::(values, v, n, &options, limit), - DataType::Int16 => sort_list::(values, v, n, &options, limit), - DataType::Int32 => sort_list::(values, v, n, &options, limit), - DataType::Int64 => sort_list::(values, v, n, &options, limit), - DataType::UInt8 => sort_list::(values, v, n, &options, limit), - DataType::UInt16 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt32 => { - sort_list::(values, v, n, &options, limit) - } - DataType::UInt64 => { - sort_list::(values, v, n, &options, limit) - } - t => Err(ArrowError::ComputeError(format!( - "Sort not supported for list type {:?}", - t - ))), - }, - DataType::Dictionary(key_type, value_type) - if *value_type.as_ref() == DataType::Utf8 => - { - match key_type.as_ref() { - DataType::Int8 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::Int16 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::Int32 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::Int64 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::UInt8 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::UInt16 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::UInt32 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - DataType::UInt64 => { - sort_string_dictionary::(values, v, n, &options, limit) - } - t => Err(ArrowError::ComputeError(format!( - "Sort not supported for dictionary key type {:?}", - t - ))), - } - } - t => Err(ArrowError::ComputeError(format!( - "Sort not supported for data type {:?}", - t - ))), - } -} - -/// Options that define how sort kernels should behave -#[derive(Clone, Copy, Debug)] -pub struct SortOptions { - /// Whether to sort in descending order - pub descending: bool, - /// Whether to sort nulls first - pub nulls_first: bool, -} - -impl Default for SortOptions { - fn default() -> Self { - Self { - descending: false, - // default to nulls first to match spark's behavior - nulls_first: true, - } - } -} - -/// Sort primitive values -#[allow(clippy::unnecessary_wraps)] -fn sort_boolean( - values: &ArrayRef, - value_indices: Vec, - null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> Result { - let values = values - .as_any() - .downcast_ref::() - .expect("Unable to downcast to boolean array"); - let descending = options.descending; - - // create tuples that are used for sorting - let mut valids = value_indices - .into_iter() - .map(|index| (index, values.value(index as usize))) - .collect::>(); - - let mut nulls = null_indices; - - let 
valids_len = valids.len(); - let nulls_len = nulls.len(); - - let mut len = values.len(); - if let Some(limit) = limit { - len = limit.min(len); - } - if !descending { - sort_by(&mut valids, len - nulls_len, |a, b| cmp(a.1, b.1)); - } else { - sort_by(&mut valids, len - nulls_len, |a, b| cmp(a.1, b.1).reverse()); - // reverse to keep a stable ordering - nulls.reverse(); - } - - // collect results directly into a buffer instead of a vec to avoid another aligned allocation - let mut result = MutableBuffer::new(values.len() * std::mem::size_of::()); - // sets len to capacity so we can access the whole buffer as a typed slice - result.resize(values.len() * std::mem::size_of::(), 0); - let result_slice: &mut [u32] = result.typed_data_mut(); - - debug_assert_eq!(result_slice.len(), nulls_len + valids_len); - - if options.nulls_first { - let size = nulls_len.min(len); - result_slice[0..nulls_len.min(len)].copy_from_slice(&nulls); - if nulls_len < len { - insert_valid_values(result_slice, nulls_len, &valids[0..len - size]); - } - } else { - // nulls last - let size = valids.len().min(len); - insert_valid_values(result_slice, 0, &valids[0..size]); - if len > size { - result_slice[valids_len..].copy_from_slice(&nulls[0..(len - valids_len)]); - } - } - - let result_data = ArrayData::new( - DataType::UInt32, - len, - Some(0), - None, - 0, - vec![result.into()], - vec![], - ); - - Ok(UInt32Array::from(result_data)) -} - -/// Sort primitive values -#[allow(clippy::unnecessary_wraps)] -fn sort_primitive( - values: &ArrayRef, - value_indices: Vec, - null_indices: Vec, - cmp: F, - options: &SortOptions, - limit: Option, -) -> Result -where - T: ArrowPrimitiveType, - T::Native: std::cmp::PartialOrd, - F: Fn(T::Native, T::Native) -> std::cmp::Ordering, -{ - let values = as_primitive_array::(values); - let descending = options.descending; - - // create tuples that are used for sorting - let mut valids = value_indices - .into_iter() - .map(|index| (index, values.value(index as usize))) - .collect::>(); - - let mut nulls = null_indices; - - let valids_len = valids.len(); - let nulls_len = nulls.len(); - let mut len = values.len(); - - if let Some(limit) = limit { - len = limit.min(len); - } - if !descending { - sort_by(&mut valids, len - nulls_len, |a, b| cmp(a.1, b.1)); - } else { - sort_by(&mut valids, len - nulls_len, |a, b| cmp(a.1, b.1).reverse()); - // reverse to keep a stable ordering - nulls.reverse(); - } - - // collect results directly into a buffer instead of a vec to avoid another aligned allocation - let mut result = MutableBuffer::new(values.len() * std::mem::size_of::()); - // sets len to capacity so we can access the whole buffer as a typed slice - result.resize(values.len() * std::mem::size_of::(), 0); - let result_slice: &mut [u32] = result.typed_data_mut(); - - debug_assert_eq!(result_slice.len(), nulls_len + valids_len); - - if options.nulls_first { - let size = nulls_len.min(len); - result_slice[0..nulls_len.min(len)].copy_from_slice(&nulls); - if nulls_len < len { - insert_valid_values(result_slice, nulls_len, &valids[0..len - size]); - } - } else { - // nulls last - let size = valids.len().min(len); - insert_valid_values(result_slice, 0, &valids[0..size]); - if len > size { - result_slice[valids_len..].copy_from_slice(&nulls[0..(len - valids_len)]); - } - } - - let result_data = ArrayData::new( - DataType::UInt32, - len, - Some(0), - None, - 0, - vec![result.into()], - vec![], - ); - - Ok(UInt32Array::from(result_data)) -} - -// insert valid and nan values in the correct order depending 
on the descending flag -fn insert_valid_values(result_slice: &mut [u32], offset: usize, valids: &[(u32, T)]) { - let valids_len = valids.len(); - // helper to append the index part of the valid tuples - let append_valids = move |dst_slice: &mut [u32]| { - debug_assert_eq!(dst_slice.len(), valids_len); - dst_slice - .iter_mut() - .zip(valids.iter()) - .for_each(|(dst, src)| *dst = src.0) - }; - - append_valids(&mut result_slice[offset..offset + valids.len()]); -} - -/// Sort strings -fn sort_string( - values: &ArrayRef, - value_indices: Vec, - null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> Result { - let values = as_string_array(values); - - sort_string_helper( - values, - value_indices, - null_indices, - options, - limit, - |array, idx| array.value(idx as usize), - ) -} - -/// Sort dictionary encoded strings -fn sort_string_dictionary( - values: &ArrayRef, - value_indices: Vec, - null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> Result { - let values: &DictionaryArray = as_dictionary_array::(values); - - let keys: &PrimitiveArray = &values.keys_array(); - - let dict = values.values(); - let dict: &StringArray = as_string_array(&dict); - - sort_string_helper( - keys, - value_indices, - null_indices, - options, - limit, - |array: &PrimitiveArray, idx| -> &str { - let key: T::Native = array.value(idx as usize); - dict.value(key.to_usize().unwrap()) - }, - ) -} - -/// shared implementation between dictionary encoded and plain string arrays -#[inline] -#[allow(clippy::unnecessary_wraps)] -fn sort_string_helper<'a, A: Array, F>( - values: &'a A, - value_indices: Vec, - null_indices: Vec, - options: &SortOptions, - limit: Option, - value_fn: F, -) -> Result -where - F: Fn(&'a A, u32) -> &str, -{ - let mut valids = value_indices - .into_iter() - .map(|index| (index, value_fn(&values, index))) - .collect::>(); - let mut nulls = null_indices; - let descending = options.descending; - let mut len = values.len(); - let nulls_len = nulls.len(); - - if let Some(limit) = limit { - len = limit.min(len); - } - if !descending { - sort_by(&mut valids, len - nulls_len, |a, b| cmp(a.1, b.1)); - } else { - sort_by(&mut valids, len - nulls_len, |a, b| cmp(a.1, b.1).reverse()); - // reverse to keep a stable ordering - nulls.reverse(); - } - // collect the order of valid tuplies - let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); - - if options.nulls_first { - nulls.append(&mut valid_indices); - nulls.truncate(len); - return Ok(UInt32Array::from(nulls)); - } - - // no need to sort nulls as they are in the correct order already - valid_indices.append(&mut nulls); - valid_indices.truncate(len); - Ok(UInt32Array::from(valid_indices)) -} - -#[allow(clippy::unnecessary_wraps)] -fn sort_list( - values: &ArrayRef, - value_indices: Vec, - mut null_indices: Vec, - options: &SortOptions, - limit: Option, -) -> Result -where - S: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: std::cmp::PartialOrd, -{ - let mut valids: Vec<(u32, ArrayRef)> = values - .as_any() - .downcast_ref::() - .map_or_else( - || { - let values = as_generic_list_array::(values); - value_indices - .iter() - .copied() - .map(|index| (index, values.value(index as usize))) - .collect() - }, - |values| { - value_indices - .iter() - .copied() - .map(|index| (index, values.value(index as usize))) - .collect() - }, - ); - - let mut len = values.len(); - let nulls_len = null_indices.len(); - let descending = options.descending; - - if let Some(limit) = limit { - len = limit.min(len); - } - if 
!descending { - sort_by(&mut valids, len - nulls_len, |a, b| { - cmp_array(a.1.as_ref(), b.1.as_ref()) - }); - } else { - sort_by(&mut valids, len - nulls_len, |a, b| { - cmp_array(a.1.as_ref(), b.1.as_ref()).reverse() - }); - // reverse to keep a stable ordering - null_indices.reverse(); - } - - let mut valid_indices: Vec = valids.iter().map(|tuple| tuple.0).collect(); - if options.nulls_first { - null_indices.append(&mut valid_indices); - null_indices.truncate(len); - return Ok(UInt32Array::from(null_indices)); - } - - valid_indices.append(&mut null_indices); - valid_indices.truncate(len); - Ok(UInt32Array::from(valid_indices)) -} - -/// Compare two `Array`s based on the ordering defined in [ord](crate::array::ord). -fn cmp_array(a: &Array, b: &Array) -> Ordering { - let cmp_op = build_compare(a, b).unwrap(); - let length = a.len().max(b.len()); - - for i in 0..length { - let result = cmp_op(i, i); - if result != Ordering::Equal { - return result; - } - } - Ordering::Equal -} - -/// One column to be used in lexicographical sort -#[derive(Clone, Debug)] -pub struct SortColumn { - pub values: ArrayRef, - pub options: Option, -} - -/// Sort a list of `ArrayRef` using `SortOptions` provided for each array. -/// -/// Performs a stable lexicographical sort on values and indices. -/// -/// Returns an `ArrowError::ComputeError(String)` if any of the array type is either unsupported by -/// `lexsort_to_indices` or `take`. -/// -/// Example: -/// -/// ``` -/// use std::convert::From; -/// use std::sync::Arc; -/// use arrow::array::{ArrayRef, StringArray, PrimitiveArray, as_primitive_array}; -/// use arrow::compute::kernels::sort::{SortColumn, SortOptions, lexsort}; -/// use arrow::datatypes::Int64Type; -/// -/// let sorted_columns = lexsort(&vec![ -/// SortColumn { -/// values: Arc::new(PrimitiveArray::::from(vec![ -/// None, -/// Some(-2), -/// Some(89), -/// Some(-64), -/// Some(101), -/// ])) as ArrayRef, -/// options: None, -/// }, -/// SortColumn { -/// values: Arc::new(StringArray::from(vec![ -/// Some("hello"), -/// Some("world"), -/// Some(","), -/// Some("foobar"), -/// Some("!"), -/// ])) as ArrayRef, -/// options: Some(SortOptions { -/// descending: true, -/// nulls_first: false, -/// }), -/// }, -/// ], None).unwrap(); -/// -/// assert_eq!(as_primitive_array::(&sorted_columns[0]).value(1), -64); -/// assert!(sorted_columns[0].is_null(0)); -/// ``` -pub fn lexsort(columns: &[SortColumn], limit: Option) -> Result> { - let indices = lexsort_to_indices(columns, limit)?; - columns - .iter() - .map(|c| take(c.values.as_ref(), &indices, None)) - .collect() -} - -/// Sort elements lexicographically from a list of `ArrayRef` into an unsigned integer -/// (`UInt32Array`) of indices. 
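A sketch of the lexicographical sort entry points shown above (`SortColumn` and `lexsort_to_indices`), assuming the pre-move `arrow::compute::kernels::sort` module path; the column names and values are made up for illustration.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, StringArray};
use arrow::compute::kernels::sort::{lexsort_to_indices, SortColumn, SortOptions};
use arrow::error::Result;

fn main() -> Result<()> {
    // Sort by city ascending, then by population descending.
    let city: ArrayRef = Arc::new(StringArray::from(vec!["b", "a", "b", "a"]));
    let population: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30, 40]));

    let columns = vec![
        SortColumn { values: city, options: None },
        SortColumn {
            values: population,
            options: Some(SortOptions { descending: true, nulls_first: false }),
        },
    ];

    // The result is a permutation of row indices that can be applied to every
    // column of a table with the `take` kernel (which is what `lexsort` does).
    let indices = lexsort_to_indices(&columns, None)?;
    let order: Vec<u32> = (0..indices.len()).map(|i| indices.value(i)).collect();
    assert_eq!(order, vec![3, 1, 2, 0]); // ("a", 40), ("a", 20), ("b", 30), ("b", 10)

    Ok(())
}
```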
-pub fn lexsort_to_indices( - columns: &[SortColumn], - limit: Option, -) -> Result { - if columns.is_empty() { - return Err(ArrowError::InvalidArgumentError( - "Sort requires at least one column".to_string(), - )); - } - if columns.len() == 1 { - // fallback to non-lexical sort - let column = &columns[0]; - return sort_to_indices(&column.values, column.options, limit); - } - - let row_count = columns[0].values.len(); - if columns.iter().any(|item| item.values.len() != row_count) { - return Err(ArrowError::ComputeError( - "lexical sort columns have different row counts".to_string(), - )); - }; - - // map to data and DynComparator - let flat_columns = columns - .iter() - .map( - |column| -> Result<(&ArrayData, DynComparator, SortOptions)> { - // flatten and convert build comparators - // use ArrayData for is_valid checks later to avoid dynamic call - let values = column.values.as_ref(); - let data = values.data_ref(); - Ok(( - data, - build_compare(values, values)?, - column.options.unwrap_or_default(), - )) - }, - ) - .collect::>>()?; - - let lex_comparator = |a_idx: &usize, b_idx: &usize| -> Ordering { - for (data, comparator, sort_option) in flat_columns.iter() { - match (data.is_valid(*a_idx), data.is_valid(*b_idx)) { - (true, true) => { - match (comparator)(*a_idx, *b_idx) { - // equal, move on to next column - Ordering::Equal => continue, - order => { - if sort_option.descending { - return order.reverse(); - } else { - return order; - } - } - } - } - (false, true) => { - return if sort_option.nulls_first { - Ordering::Less - } else { - Ordering::Greater - }; - } - (true, false) => { - return if sort_option.nulls_first { - Ordering::Greater - } else { - Ordering::Less - }; - } - // equal, move on to next column - (false, false) => continue, - } - } - - Ordering::Equal - }; - - let mut value_indices = (0..row_count).collect::>(); - let mut len = value_indices.len(); - - if let Some(limit) = limit { - len = limit.min(len); - } - sort_by(&mut value_indices, len, lex_comparator); - - Ok(UInt32Array::from( - (&value_indices)[0..len] - .iter() - .map(|i| *i as u32) - .collect::>(), - )) -} - -/// It's unstable_sort, may not preserve the order of equal elements -pub fn partial_sort(v: &mut [T], limit: usize, mut is_less: F) -where - F: FnMut(&T, &T) -> Ordering, -{ - let (before, _mid, _after) = v.select_nth_unstable_by(limit, &mut is_less); - before.sort_unstable_by(is_less); -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::compute::util::tests::{ - build_fixed_size_list_nullable, build_generic_list_nullable, - }; - use rand::rngs::StdRng; - use rand::{Rng, RngCore, SeedableRng}; - use std::convert::TryFrom; - use std::iter::FromIterator; - use std::sync::Arc; - - fn test_sort_to_indices_boolean_arrays( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec, - ) { - let output = BooleanArray::from(data); - let expected = UInt32Array::from(expected_data); - let output = - sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); - assert_eq!(output, expected) - } - - fn test_sort_to_indices_primitive_arrays( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec, - ) where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let output = PrimitiveArray::::from(data); - let expected = UInt32Array::from(expected_data); - let output = - sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); - assert_eq!(output, expected) - } - - fn test_sort_primitive_arrays( - data: Vec>, - options: Option, - limit: Option, - 
expected_data: Vec>, - ) where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let output = PrimitiveArray::::from(data); - let expected = Arc::new(PrimitiveArray::::from(expected_data)) as ArrayRef; - let output = match limit { - Some(_) => { - sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() - } - _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), - }; - assert_eq!(&output, &expected) - } - - fn test_sort_to_indices_string_arrays( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec, - ) { - let output = StringArray::from(data); - let expected = UInt32Array::from(expected_data); - let output = - sort_to_indices(&(Arc::new(output) as ArrayRef), options, limit).unwrap(); - assert_eq!(output, expected) - } - - fn test_sort_string_arrays( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec>, - ) { - let output = StringArray::from(data); - let expected = Arc::new(StringArray::from(expected_data)) as ArrayRef; - let output = match limit { - Some(_) => { - sort_limit(&(Arc::new(output) as ArrayRef), options, limit).unwrap() - } - _ => sort(&(Arc::new(output) as ArrayRef), options).unwrap(), - }; - assert_eq!(&output, &expected) - } - - fn test_sort_string_dict_arrays( - data: Vec>, - options: Option, - limit: Option, - expected_data: Vec>, - ) { - let array = DictionaryArray::::from_iter(data.into_iter()); - let array_values = array.values(); - let dict = array_values - .as_any() - .downcast_ref::() - .expect("Unable to get dictionary values"); - - let sorted = match limit { - Some(_) => { - sort_limit(&(Arc::new(array) as ArrayRef), options, limit).unwrap() - } - _ => sort(&(Arc::new(array) as ArrayRef), options).unwrap(), - }; - let sorted = sorted - .as_any() - .downcast_ref::>() - .unwrap(); - let sorted_values = sorted.values(); - let sorted_dict = sorted_values - .as_any() - .downcast_ref::() - .expect("Unable to get dictionary values"); - let sorted_keys = sorted.keys_array(); - - assert_eq!(sorted_dict, dict); - - let sorted_strings = StringArray::try_from( - (0..sorted.len()) - .map(|i| { - if sorted.is_valid(i) { - Some(sorted_dict.value(sorted_keys.value(i).to_usize().unwrap())) - } else { - None - } - }) - .collect::>>(), - ) - .expect("Unable to create string array from dictionary"); - let expected = - StringArray::try_from(expected_data).expect("Unable to create string array"); - - assert_eq!(sorted_strings, expected) - } - - fn test_sort_list_arrays( - data: Vec>>>, - options: Option, - limit: Option, - expected_data: Vec>>>, - fixed_length: Option, - ) where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - // for FixedSizedList - if let Some(length) = fixed_length { - let input = Arc::new(build_fixed_size_list_nullable(data.clone(), length)); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - _ => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = Arc::new(build_fixed_size_list_nullable( - expected_data.clone(), - length, - )) as ArrayRef; - - assert_eq!(&sorted, &expected); - } - - // for List - let input = Arc::new(build_generic_list_nullable::(data.clone())); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - _ => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = - Arc::new(build_generic_list_nullable::(expected_data.clone())) - as ArrayRef; - - assert_eq!(&sorted, &expected); - - // for LargeList - let input = 
Arc::new(build_generic_list_nullable::(data)); - let sorted = match limit { - Some(_) => sort_limit(&(input as ArrayRef), options, limit).unwrap(), - _ => sort(&(input as ArrayRef), options).unwrap(), - }; - let expected = - Arc::new(build_generic_list_nullable::(expected_data)) as ArrayRef; - - assert_eq!(&sorted, &expected); - } - - fn test_lex_sort_arrays( - input: Vec, - expected_output: Vec, - limit: Option, - ) { - let sorted = lexsort(&input, limit).unwrap(); - - for (result, expected) in sorted.iter().zip(expected_output.iter()) { - assert_eq!(result, expected); - } - } - - #[test] - fn test_sort_to_indices_primitives() { - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - None, - None, - vec![0, 5, 3, 1, 4, 2], - ); - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - None, - None, - vec![0, 5, 3, 1, 4, 2], - ); - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - None, - None, - vec![0, 5, 3, 1, 4, 2], - ); - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - None, - None, - vec![0, 5, 3, 1, 4, 2], - ); - test_sort_to_indices_primitive_arrays::( - vec![ - None, - Some(-0.05), - Some(2.225), - Some(-1.01), - Some(-0.05), - None, - ], - None, - None, - vec![0, 5, 3, 1, 4, 2], - ); - test_sort_to_indices_primitive_arrays::( - vec![ - None, - Some(-0.05), - Some(2.225), - Some(-1.01), - Some(-0.05), - None, - ], - None, - None, - vec![0, 5, 3, 1, 4, 2], - ); - - // descending - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![2, 1, 4, 3, 5, 0], // [2, 4, 1, 3, 5, 0] - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![2, 1, 4, 3, 5, 0], - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![2, 1, 4, 3, 5, 0], - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![2, 1, 4, 3, 5, 0], - ); - - test_sort_to_indices_primitive_arrays::( - vec![ - None, - Some(0.005), - Some(20.22), - Some(-10.3), - Some(0.005), - None, - ], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![2, 1, 4, 3, 5, 0], - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![2, 1, 4, 3, 5, 0], - ); - - // descending, nulls first - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![5, 0, 2, 1, 4, 3], // [5, 0, 2, 4, 1, 3] - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![5, 0, 2, 1, 4, 3], // [5, 0, 2, 4, 1, 3] - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: 
true, - nulls_first: true, - }), - None, - vec![5, 0, 2, 1, 4, 3], - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![5, 0, 2, 1, 4, 3], - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(0.1), Some(0.2), Some(-1.3), Some(0.01), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![5, 0, 2, 1, 4, 3], - ); - - test_sort_to_indices_primitive_arrays::( - vec![None, Some(10.1), Some(100.2), Some(-1.3), Some(10.01), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![5, 0, 2, 1, 4, 3], - ); - } - - #[test] - fn test_sort_boolean() { - // boolean - test_sort_to_indices_boolean_arrays( - vec![None, Some(false), Some(true), Some(true), Some(false), None], - None, - None, - vec![0, 5, 1, 4, 2, 3], - ); - - // boolean, descending - test_sort_to_indices_boolean_arrays( - vec![None, Some(false), Some(true), Some(true), Some(false), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![2, 3, 1, 4, 5, 0], - ); - - // boolean, descending, nulls first - test_sort_to_indices_boolean_arrays( - vec![None, Some(false), Some(true), Some(true), Some(false), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![5, 0, 2, 3, 1, 4], - ); - - // boolean, descending, nulls first, limit - test_sort_to_indices_boolean_arrays( - vec![None, Some(false), Some(true), Some(true), Some(false), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - Some(3), - vec![5, 0, 2], - ); - } - - #[test] - fn test_sort_primitives() { - // default case - test_sort_primitive_arrays::( - vec![None, Some(3), Some(5), Some(2), Some(3), None], - None, - None, - vec![None, None, Some(2), Some(3), Some(3), Some(5)], - ); - test_sort_primitive_arrays::( - vec![None, Some(3), Some(5), Some(2), Some(3), None], - None, - None, - vec![None, None, Some(2), Some(3), Some(3), Some(5)], - ); - test_sort_primitive_arrays::( - vec![None, Some(3), Some(5), Some(2), Some(3), None], - None, - None, - vec![None, None, Some(2), Some(3), Some(3), Some(5)], - ); - test_sort_primitive_arrays::( - vec![None, Some(3), Some(5), Some(2), Some(3), None], - None, - None, - vec![None, None, Some(2), Some(3), Some(3), Some(5)], - ); - - // descending - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![Some(2), Some(0), Some(0), Some(-1), None, None], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![Some(2), Some(0), Some(0), Some(-1), None, None], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![Some(2), Some(0), Some(0), Some(-1), None, None], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![Some(2), Some(0), Some(0), Some(-1), None, None], - ); - - // descending, nulls first - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![None, 
None, Some(2), Some(0), Some(0), Some(-1)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![None, None, Some(2), Some(0), Some(0), Some(-1)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![None, None, Some(2), Some(0), Some(0), Some(-1)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![None, None, Some(2), Some(0), Some(0), Some(-1)], - ); - - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - Some(3), - vec![None, None, Some(2)], - ); - - test_sort_primitive_arrays::( - vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![None, None, Some(2.0), Some(0.0), Some(0.0), Some(-1.0)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(f64::NAN), None], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![None, None, Some(f64::NAN), Some(2.0), Some(0.0), Some(-1.0)], - ); - test_sort_primitive_arrays::( - vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], - ); - - // int8 nulls first - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![None, None, Some(-1), Some(0), Some(0), Some(2)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![None, None, Some(-1), Some(0), Some(0), Some(2)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![None, None, Some(-1), Some(0), Some(0), Some(2)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0), Some(2), Some(-1), Some(0), None], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![None, None, Some(-1), Some(0), Some(0), Some(2)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(0.0), None], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![None, None, Some(-1.0), Some(0.0), Some(0.0), Some(2.0)], - ); - test_sort_primitive_arrays::( - vec![None, Some(0.0), Some(2.0), Some(-1.0), Some(f64::NAN), None], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![None, None, Some(-1.0), Some(0.0), Some(2.0), Some(f64::NAN)], - ); - test_sort_primitive_arrays::( - vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![Some(1.0), Some(f64::NAN), Some(f64::NAN), Some(f64::NAN)], - ); - - // limit - test_sort_primitive_arrays::( - vec![Some(f64::NAN), Some(f64::NAN), Some(f64::NAN), Some(1.0)], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - Some(2), - 
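// The float cases above fix the NaN ordering: NaN compares greater than every non-null
// value, so it lands at the end of an ascending sort and at the front of a descending one,
// independently of `nulls_first`. A small restatement (sketch; module path assumed):
fn sort_nan_sketch() -> arrow::error::Result<()> {
    use std::sync::Arc;
    use arrow::array::{ArrayRef, Float64Array};
    use arrow::compute::kernels::sort::sort;

    let input: ArrayRef = Arc::new(Float64Array::from(vec![Some(f64::NAN), Some(1.0), Some(-2.0)]));
    let sorted = sort(&input, None)?; // default options: ascending
    let sorted = sorted.as_any().downcast_ref::<Float64Array>().unwrap();

    assert_eq!(sorted.value(0), -2.0);
    assert_eq!(sorted.value(1), 1.0);
    assert!(sorted.value(2).is_nan()); // NaN last when ascending
    Ok(())
}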
vec![Some(1.0), Some(f64::NAN)], - ); - - // limit with actual value - test_sort_primitive_arrays::( - vec![Some(2.0), Some(4.0), Some(3.0), Some(1.0)], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - Some(3), - vec![Some(1.0), Some(2.0), Some(3.0)], - ); - } - - #[test] - fn test_sort_to_indices_strings() { - test_sort_to_indices_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - None, - None, - vec![0, 3, 5, 1, 4, 2], - ); - - test_sort_to_indices_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![2, 4, 1, 5, 3, 0], - ); - - test_sort_to_indices_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![0, 3, 5, 1, 4, 2], - ); - - test_sort_to_indices_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![3, 0, 2, 4, 1, 5], - ); - - test_sort_to_indices_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - Some(3), - vec![3, 0, 2], - ); - } - - #[test] - fn test_sort_strings() { - test_sort_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - None, - None, - vec![ - None, - None, - Some("-ad"), - Some("bad"), - Some("glad"), - Some("sad"), - ], - ); - - test_sort_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![ - Some("sad"), - Some("glad"), - Some("bad"), - Some("-ad"), - None, - None, - ], - ); - - test_sort_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![ - None, - None, - Some("-ad"), - Some("bad"), - Some("glad"), - Some("sad"), - ], - ); - - test_sort_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![ - None, - None, - Some("sad"), - Some("glad"), - Some("bad"), - Some("-ad"), - ], - ); - - test_sort_string_arrays( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - Some(3), - vec![None, None, Some("sad")], - ); - } - - #[test] - fn test_sort_string_dicts() { - test_sort_string_dict_arrays::( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - None, - None, - vec![ - None, - None, - Some("-ad"), - Some("bad"), - Some("glad"), - Some("sad"), - ], - ); - - test_sort_string_dict_arrays::( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: false, - }), - None, - vec![ - Some("sad"), - Some("glad"), - Some("bad"), - Some("-ad"), - None, - None, - ], - ); - - test_sort_string_dict_arrays::( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - 
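// `sort_to_indices` pairs naturally with the `take` kernel (removed further down in this
// same diff) to reorder a payload column by the sort order of a key column. Sketch only;
// the sort path is assumed, while `arrow::compute::take` matches the doc example in take.rs below.
fn sort_by_key_sketch() -> arrow::error::Result<()> {
    use std::sync::Arc;
    use arrow::array::{ArrayRef, Int32Array, StringArray};
    use arrow::compute::kernels::sort::sort_to_indices;
    use arrow::compute::take;

    let keys: ArrayRef = Arc::new(StringArray::from(vec![Some("sad"), Some("bad"), Some("glad")]));
    let payload: ArrayRef = Arc::new(Int32Array::from(vec![Some(10), Some(20), Some(30)]));

    // Indices that would sort `keys` ascending ...
    let indices = sort_to_indices(&keys, None, None)?;
    // ... applied to the payload so the two columns stay aligned.
    let reordered = take(payload.as_ref(), &indices, None)?;
    assert_eq!(reordered.len(), 3);
    Ok(())
}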
Some(SortOptions { - descending: false, - nulls_first: true, - }), - None, - vec![ - None, - None, - Some("-ad"), - Some("bad"), - Some("glad"), - Some("sad"), - ], - ); - - test_sort_string_dict_arrays::( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - None, - vec![ - None, - None, - Some("sad"), - Some("glad"), - Some("bad"), - Some("-ad"), - ], - ); - - test_sort_string_dict_arrays::( - vec![ - None, - Some("bad"), - Some("sad"), - None, - Some("glad"), - Some("-ad"), - ], - Some(SortOptions { - descending: true, - nulls_first: true, - }), - Some(3), - vec![None, None, Some("sad")], - ); - } - - #[test] - fn test_sort_list() { - test_sort_list_arrays::( - vec![ - Some(vec![Some(1)]), - Some(vec![Some(4)]), - Some(vec![Some(2)]), - Some(vec![Some(3)]), - ], - Some(SortOptions { - descending: false, - nulls_first: false, - }), - None, - vec![ - Some(vec![Some(1)]), - Some(vec![Some(2)]), - Some(vec![Some(3)]), - Some(vec![Some(4)]), - ], - Some(1), - ); - - test_sort_list_arrays::( - vec![ - Some(vec![Some(1), Some(0)]), - Some(vec![Some(4), Some(3), Some(2), Some(1)]), - Some(vec![Some(2), Some(3), Some(4)]), - Some(vec![Some(3), Some(3), Some(3), Some(3)]), - Some(vec![Some(1), Some(1)]), - ], - Some(SortOptions { - descending: false, - nulls_first: false, - }), - None, - vec![ - Some(vec![Some(1), Some(0)]), - Some(vec![Some(1), Some(1)]), - Some(vec![Some(2), Some(3), Some(4)]), - Some(vec![Some(3), Some(3), Some(3), Some(3)]), - Some(vec![Some(4), Some(3), Some(2), Some(1)]), - ], - None, - ); - - test_sort_list_arrays::( - vec![ - None, - Some(vec![Some(4), None, Some(2)]), - Some(vec![Some(2), Some(3), Some(4)]), - None, - Some(vec![Some(3), Some(3), None]), - ], - Some(SortOptions { - descending: false, - nulls_first: false, - }), - None, - vec![ - Some(vec![Some(2), Some(3), Some(4)]), - Some(vec![Some(3), Some(3), None]), - Some(vec![Some(4), None, Some(2)]), - None, - None, - ], - Some(3), - ); - - test_sort_list_arrays::( - vec![ - Some(vec![Some(1), Some(0)]), - Some(vec![Some(4), Some(3), Some(2), Some(1)]), - Some(vec![Some(2), Some(3), Some(4)]), - Some(vec![Some(3), Some(3), Some(3), Some(3)]), - Some(vec![Some(1), Some(1)]), - ], - Some(SortOptions { - descending: false, - nulls_first: false, - }), - Some(2), - vec![Some(vec![Some(1), Some(0)]), Some(vec![Some(1), Some(1)])], - None, - ); - } - - #[test] - fn test_lex_sort_single_column() { - let input = vec![SortColumn { - values: Arc::new(PrimitiveArray::::from(vec![ - Some(17), - Some(2), - Some(-1), - Some(0), - ])) as ArrayRef, - options: None, - }]; - let expected = vec![Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(0), - Some(2), - Some(17), - ])) as ArrayRef]; - test_lex_sort_arrays(input.clone(), expected, None); - - let expected = vec![Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(0), - Some(2), - ])) as ArrayRef]; - test_lex_sort_arrays(input, expected, Some(3)); - } - - #[test] - fn test_lex_sort_unaligned_rows() { - let input = vec![ - SortColumn { - values: Arc::new(PrimitiveArray::::from(vec![None, Some(-1)])) - as ArrayRef, - options: None, - }, - SortColumn { - values: Arc::new(StringArray::from(vec![Some("foo")])) as ArrayRef, - options: None, - }, - ]; - assert!( - lexsort(&input, None).is_err(), - "lexsort should reject columns with different row counts" - ); - } - - #[test] - fn test_lex_sort_mixed_types() { - let input = vec![ - SortColumn { - values: 
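// As the dictionary tests here show, sorting a `DictionaryArray` only reorders the keys;
// the dictionary values buffer is left untouched. A sketch with an assumed Int32 key type:
fn sort_dictionary_sketch() -> arrow::error::Result<()> {
    use std::sync::Arc;
    use arrow::array::{ArrayRef, DictionaryArray};
    use arrow::compute::kernels::sort::sort;
    use arrow::datatypes::Int32Type;

    let dict: DictionaryArray<Int32Type> =
        vec![Some("b"), None, Some("a")].into_iter().collect();
    let sorted = sort(&(Arc::new(dict) as ArrayRef), None)?;
    let sorted = sorted
        .as_any()
        .downcast_ref::<DictionaryArray<Int32Type>>()
        .unwrap();

    // Default options sort nulls first (see the primitive tests above).
    assert!(sorted.is_null(0));
    Ok(())
}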
Arc::new(PrimitiveArray::::from(vec![ - Some(0), - Some(2), - Some(-1), - Some(0), - ])) as ArrayRef, - options: None, - }, - SortColumn { - values: Arc::new(PrimitiveArray::::from(vec![ - Some(101), - Some(8), - Some(7), - Some(102), - ])) as ArrayRef, - options: None, - }, - SortColumn { - values: Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(-2), - Some(-3), - Some(-4), - ])) as ArrayRef, - options: None, - }, - ]; - let expected = vec![ - Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(0), - Some(0), - Some(2), - ])) as ArrayRef, - Arc::new(PrimitiveArray::::from(vec![ - Some(7), - Some(101), - Some(102), - Some(8), - ])) as ArrayRef, - Arc::new(PrimitiveArray::::from(vec![ - Some(-3), - Some(-1), - Some(-4), - Some(-2), - ])) as ArrayRef, - ]; - test_lex_sort_arrays(input, expected, None); - - // test mix of string and in64 with option - let input = vec![ - SortColumn { - values: Arc::new(PrimitiveArray::::from(vec![ - Some(0), - Some(2), - Some(-1), - Some(0), - ])) as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - SortColumn { - values: Arc::new(StringArray::from(vec![ - Some("foo"), - Some("9"), - Some("7"), - Some("bar"), - ])) as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - ]; - let expected = vec![ - Arc::new(PrimitiveArray::::from(vec![ - Some(2), - Some(0), - Some(0), - Some(-1), - ])) as ArrayRef, - Arc::new(StringArray::from(vec![ - Some("9"), - Some("foo"), - Some("bar"), - Some("7"), - ])) as ArrayRef, - ]; - test_lex_sort_arrays(input, expected, None); - - // test sort with nulls first - let input = vec![ - SortColumn { - values: Arc::new(PrimitiveArray::::from(vec![ - None, - Some(-1), - Some(2), - None, - ])) as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - SortColumn { - values: Arc::new(StringArray::from(vec![ - Some("foo"), - Some("world"), - Some("hello"), - None, - ])) as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - ]; - let expected = vec![ - Arc::new(PrimitiveArray::::from(vec![ - None, - None, - Some(2), - Some(-1), - ])) as ArrayRef, - Arc::new(StringArray::from(vec![ - None, - Some("foo"), - Some("hello"), - Some("world"), - ])) as ArrayRef, - ]; - test_lex_sort_arrays(input, expected, None); - - // test sort with nulls last - let input = vec![ - SortColumn { - values: Arc::new(PrimitiveArray::::from(vec![ - None, - Some(-1), - Some(2), - None, - ])) as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: false, - }), - }, - SortColumn { - values: Arc::new(StringArray::from(vec![ - Some("foo"), - Some("world"), - Some("hello"), - None, - ])) as ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: false, - }), - }, - ]; - let expected = vec![ - Arc::new(PrimitiveArray::::from(vec![ - Some(2), - Some(-1), - None, - None, - ])) as ArrayRef, - Arc::new(StringArray::from(vec![ - Some("hello"), - Some("world"), - Some("foo"), - None, - ])) as ArrayRef, - ]; - test_lex_sort_arrays(input, expected, None); - - // test sort with opposite options - let input = vec![ - SortColumn { - values: Arc::new(PrimitiveArray::::from(vec![ - None, - Some(-1), - Some(2), - Some(-1), - None, - ])) as ArrayRef, - options: Some(SortOptions { - descending: false, - nulls_first: false, - }), - }, - SortColumn { - values: Arc::new(StringArray::from(vec![ - Some("foo"), - Some("bar"), - Some("world"), - Some("hello"), - None, - ])) as 
ArrayRef, - options: Some(SortOptions { - descending: true, - nulls_first: true, - }), - }, - ]; - let expected = vec![ - Arc::new(PrimitiveArray::::from(vec![ - Some(-1), - Some(-1), - Some(2), - None, - None, - ])) as ArrayRef, - Arc::new(StringArray::from(vec![ - Some("hello"), - Some("bar"), - Some("world"), - None, - Some("foo"), - ])) as ArrayRef, - ]; - test_lex_sort_arrays(input, expected, None); - } - - #[test] - fn test_partial_sort() { - let mut before: Vec<&str> = vec![ - "a", "cat", "mat", "on", "sat", "the", "xxx", "xxxx", "fdadfdsf", - ]; - let mut d = before.clone(); - d.sort_unstable(); - - for last in 0..before.len() { - partial_sort(&mut before, last, |a, b| a.cmp(b)); - assert_eq!(&d[0..last], &before.as_slice()[0..last]); - } - } - - #[test] - fn test_partial_rand_sort() { - let size = 1000u32; - let mut rng = StdRng::seed_from_u64(42); - let mut before: Vec = (0..size).map(|_| rng.gen::()).collect(); - let mut d = before.clone(); - let last = (rng.next_u32() % size) as usize; - d.sort_unstable(); - - partial_sort(&mut before, last, |a, b| a.cmp(b)); - assert_eq!(&d[0..last], &before[0..last]); - } -} diff --git a/rust/arrow/src/compute/kernels/substring.rs b/rust/arrow/src/compute/kernels/substring.rs deleted file mode 100644 index d9956b89687..00000000000 --- a/rust/arrow/src/compute/kernels/substring.rs +++ /dev/null @@ -1,269 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines kernel to extract a substring of a \[Large\]StringArray - -use crate::{array::*, buffer::Buffer}; -use crate::{ - datatypes::DataType, - error::{ArrowError, Result}, -}; - -#[allow(clippy::unnecessary_wraps)] -fn generic_substring( - array: &GenericStringArray, - start: OffsetSize, - length: &Option, -) -> Result { - // compute current offsets - let offsets = array.data_ref().clone().buffers()[0].clone(); - let offsets: &[OffsetSize] = unsafe { offsets.typed_data::() }; - - // compute null bitmap (copy) - let null_bit_buffer = array.data_ref().null_buffer().cloned(); - - // compute values - let values = &array.data_ref().buffers()[1]; - let data = values.as_slice(); - - let mut new_values = Vec::new(); // we have no way to estimate how much this will be. 
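// A minimal use of the public `substring` entry point defined just below; the `u64` length
// parameter and the module path `arrow::compute::kernels::substring` are assumptions.
fn substring_sketch() -> arrow::error::Result<()> {
    use arrow::array::StringArray;
    use arrow::compute::kernels::substring::substring;

    let input = StringArray::from(vec![Some("hello"), None, Some("word")]);

    // A negative `start` counts from the end of each string; a `None` length means "to the end".
    let result = substring(&input, -3, &None)?;
    let result = result.as_any().downcast_ref::<StringArray>().unwrap();

    assert_eq!(result.value(0), "llo");
    assert!(result.is_null(1)); // null slots stay null
    assert_eq!(result.value(2), "ord");
    Ok(())
}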
- let mut new_offsets: Vec = Vec::with_capacity(array.len() + 1); - - let mut length_so_far = OffsetSize::zero(); - new_offsets.push(length_so_far); - (0..array.len()).for_each(|i| { - // the length of this entry - let length_i: OffsetSize = offsets[i + 1] - offsets[i]; - // compute where we should start slicing this entry - let start = offsets[i] - + if start >= OffsetSize::zero() { - start - } else { - length_i + start - }; - - let start = start.max(offsets[i]).min(offsets[i + 1]); - // compute the length of the slice - let length: OffsetSize = length - .unwrap_or(length_i) - // .max(0) is not needed as it is guaranteed - .min(offsets[i + 1] - start); // so we do not go beyond this entry - - length_so_far += length; - - new_offsets.push(length_so_far); - - // we need usize for ranges - let start = start.to_usize().unwrap(); - let length = length.to_usize().unwrap(); - - new_values.extend_from_slice(&data[start..start + length]); - }); - - let data = ArrayData::new( - ::DATA_TYPE, - array.len(), - None, - null_bit_buffer, - 0, - vec![ - Buffer::from_slice_ref(&new_offsets), - Buffer::from_slice_ref(&new_values), - ], - vec![], - ); - Ok(make_array(data)) -} - -/// Returns an ArrayRef with a substring starting from `start` and with optional length `length` of each of the elements in `array`. -/// `start` can be negative, in which case the start counts from the end of the string. -/// this function errors when the passed array is not a \[Large\]String array. -pub fn substring(array: &Array, start: i64, length: &Option) -> Result { - match array.data_type() { - DataType::LargeUtf8 => generic_substring( - array - .as_any() - .downcast_ref::() - .expect("A large string is expected"), - start, - &length.map(|e| e as i64), - ), - DataType::Utf8 => generic_substring( - array - .as_any() - .downcast_ref::() - .expect("A string is expected"), - start as i32, - &length.map(|e| e as i32), - ), - _ => Err(ArrowError::ComputeError(format!( - "substring does not support type {:?}", - array.data_type() - ))), - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn with_nulls>>>( - ) -> Result<()> { - let cases = vec![ - // identity - ( - vec![Some("hello"), None, Some("word")], - 0, - None, - vec![Some("hello"), None, Some("word")], - ), - // 0 length -> Nothing - ( - vec![Some("hello"), None, Some("word")], - 0, - Some(0), - vec![Some(""), None, Some("")], - ), - // high start -> Nothing - ( - vec![Some("hello"), None, Some("word")], - 1000, - Some(0), - vec![Some(""), None, Some("")], - ), - // high negative start -> identity - ( - vec![Some("hello"), None, Some("word")], - -1000, - None, - vec![Some("hello"), None, Some("word")], - ), - // high length -> identity - ( - vec![Some("hello"), None, Some("word")], - 0, - Some(1000), - vec![Some("hello"), None, Some("word")], - ), - ]; - - cases.into_iter().try_for_each::<_, Result<()>>( - |(array, start, length, expected)| { - let array = T::from(array); - let result: ArrayRef = substring(&array, start, &length)?; - assert_eq!(array.len(), result.len()); - - let result = result.as_any().downcast_ref::().unwrap(); - let expected = T::from(expected); - assert_eq!(&expected, result); - Ok(()) - }, - )?; - - Ok(()) - } - - #[test] - fn with_nulls_string() -> Result<()> { - with_nulls::() - } - - #[test] - fn with_nulls_large_string() -> Result<()> { - with_nulls::() - } - - fn without_nulls>>>( - ) -> Result<()> { - let cases = vec![ - // increase start - ( - vec!["hello", "", "word"], - 0, - None, - vec!["hello", "", "word"], - ), - (vec!["hello", "", 
"word"], 1, None, vec!["ello", "", "ord"]), - (vec!["hello", "", "word"], 2, None, vec!["llo", "", "rd"]), - (vec!["hello", "", "word"], 3, None, vec!["lo", "", "d"]), - (vec!["hello", "", "word"], 10, None, vec!["", "", ""]), - // increase start negatively - (vec!["hello", "", "word"], -1, None, vec!["o", "", "d"]), - (vec!["hello", "", "word"], -2, None, vec!["lo", "", "rd"]), - (vec!["hello", "", "word"], -3, None, vec!["llo", "", "ord"]), - ( - vec!["hello", "", "word"], - -10, - None, - vec!["hello", "", "word"], - ), - // increase length - (vec!["hello", "", "word"], 1, Some(1), vec!["e", "", "o"]), - (vec!["hello", "", "word"], 1, Some(2), vec!["el", "", "or"]), - ( - vec!["hello", "", "word"], - 1, - Some(3), - vec!["ell", "", "ord"], - ), - ( - vec!["hello", "", "word"], - 1, - Some(4), - vec!["ello", "", "ord"], - ), - (vec!["hello", "", "word"], -3, Some(1), vec!["l", "", "o"]), - (vec!["hello", "", "word"], -3, Some(2), vec!["ll", "", "or"]), - ( - vec!["hello", "", "word"], - -3, - Some(3), - vec!["llo", "", "ord"], - ), - ( - vec!["hello", "", "word"], - -3, - Some(4), - vec!["llo", "", "ord"], - ), - ]; - - cases.into_iter().try_for_each::<_, Result<()>>( - |(array, start, length, expected)| { - let array = StringArray::from(array); - let result = substring(&array, start, &length)?; - assert_eq!(array.len(), result.len()); - let result = result.as_any().downcast_ref::().unwrap(); - let expected = StringArray::from(expected); - assert_eq!(&expected, result,); - Ok(()) - }, - )?; - - Ok(()) - } - - #[test] - fn without_nulls_string() -> Result<()> { - without_nulls::() - } - - #[test] - fn without_nulls_large_string() -> Result<()> { - without_nulls::() - } -} diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs deleted file mode 100644 index 0217573dc5d..00000000000 --- a/rust/arrow/src/compute/kernels/take.rs +++ /dev/null @@ -1,1621 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines take kernel for [Array] - -use std::{ops::AddAssign, sync::Arc}; - -use crate::buffer::{Buffer, MutableBuffer}; -use crate::compute::util::{ - take_value_indices_from_fixed_size_list, take_value_indices_from_list, -}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; -use crate::{array::*, buffer::buffer_bin_and}; - -use num::{ToPrimitive, Zero}; -use TimeUnit::*; - -macro_rules! downcast_take { - ($type: ty, $values: expr, $indices: expr) => {{ - let values = $values - .as_any() - .downcast_ref::>() - .expect("Unable to downcast to a primitive array"); - Ok(Arc::new(take_primitive::<$type, _>(&values, $indices)?)) - }}; -} - -macro_rules! 
downcast_dict_take { - ($type: ty, $values: expr, $indices: expr) => {{ - let values = $values - .as_any() - .downcast_ref::>() - .expect("Unable to downcast to a dictionary array"); - Ok(Arc::new(take_dict::<$type, _>(values, $indices)?)) - }}; -} - -/// Take elements by index from [Array], creating a new [Array] from those indexes. -/// -/// # Errors -/// This function errors whenever: -/// * An index cannot be casted to `usize` (typically 32 bit architectures) -/// * An index is out of bounds and `options` is set to check bounds. -/// # Safety -/// When `options` is not set to check bounds (default), taking indexes after `len` is undefined behavior. -/// # Examples -/// ``` -/// use arrow::array::{StringArray, UInt32Array}; -/// use arrow::error::Result; -/// use arrow::compute::take; -/// # fn main() -> Result<()> { -/// let values = StringArray::from(vec!["zero", "one", "two"]); -/// -/// // Take items at index 2, and 1: -/// let indices = UInt32Array::from(vec![2, 1]); -/// let taken = take(&values, &indices, None)?; -/// let taken = taken.as_any().downcast_ref::().unwrap(); -/// -/// assert_eq!(*taken, StringArray::from(vec!["two", "one"])); -/// # Ok(()) -/// # } -/// ``` -pub fn take( - values: &Array, - indices: &PrimitiveArray, - options: Option, -) -> Result -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, -{ - take_impl(values, indices, options) -} - -fn take_impl( - values: &Array, - indices: &PrimitiveArray, - options: Option, -) -> Result -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, -{ - let options = options.unwrap_or_default(); - if options.check_bounds { - let len = values.len(); - for i in 0..indices.len() { - if indices.is_valid(i) { - let ix = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - if ix >= len { - return Err(ArrowError::ComputeError( - format!("Array index out of bounds, cannot get item at index {} from {} entries", ix, len)) - ); - } - } - } - } - match values.data_type() { - DataType::Boolean => { - let values = values.as_any().downcast_ref::().unwrap(); - Ok(Arc::new(take_boolean(values, indices)?)) - } - DataType::Int8 => downcast_take!(Int8Type, values, indices), - DataType::Int16 => downcast_take!(Int16Type, values, indices), - DataType::Int32 => downcast_take!(Int32Type, values, indices), - DataType::Int64 => downcast_take!(Int64Type, values, indices), - DataType::UInt8 => downcast_take!(UInt8Type, values, indices), - DataType::UInt16 => downcast_take!(UInt16Type, values, indices), - DataType::UInt32 => downcast_take!(UInt32Type, values, indices), - DataType::UInt64 => downcast_take!(UInt64Type, values, indices), - DataType::Float32 => downcast_take!(Float32Type, values, indices), - DataType::Float64 => downcast_take!(Float64Type, values, indices), - DataType::Date32 => downcast_take!(Date32Type, values, indices), - DataType::Date64 => downcast_take!(Date64Type, values, indices), - DataType::Time32(Second) => downcast_take!(Time32SecondType, values, indices), - DataType::Time32(Millisecond) => { - downcast_take!(Time32MillisecondType, values, indices) - } - DataType::Time64(Microsecond) => { - downcast_take!(Time64MicrosecondType, values, indices) - } - DataType::Time64(Nanosecond) => { - downcast_take!(Time64NanosecondType, values, indices) - } - DataType::Timestamp(Second, _) => { - downcast_take!(TimestampSecondType, values, indices) - } - DataType::Timestamp(Millisecond, _) => { - 
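// The doc example above covers the basic call; this sketch shows the bounds-checked
// variant via `TakeOptions`. Paths are taken from this file's module; names are ad hoc.
fn take_checked_sketch() {
    use arrow::array::{Int32Array, UInt32Array};
    use arrow::compute::kernels::take::{take, TakeOptions};

    let values = Int32Array::from(vec![10, 20, 30]);
    let indices = UInt32Array::from(vec![0, 9]); // index 9 is out of bounds

    // With `check_bounds: true` the kernel reports an error instead of panicking.
    let result = take(&values, &indices, Some(TakeOptions { check_bounds: true }));
    assert!(result.is_err());
}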
downcast_take!(TimestampMillisecondType, values, indices) - } - DataType::Timestamp(Microsecond, _) => { - downcast_take!(TimestampMicrosecondType, values, indices) - } - DataType::Timestamp(Nanosecond, _) => { - downcast_take!(TimestampNanosecondType, values, indices) - } - DataType::Interval(IntervalUnit::YearMonth) => { - downcast_take!(IntervalYearMonthType, values, indices) - } - DataType::Interval(IntervalUnit::DayTime) => { - downcast_take!(IntervalDayTimeType, values, indices) - } - DataType::Duration(TimeUnit::Second) => { - downcast_take!(DurationSecondType, values, indices) - } - DataType::Duration(TimeUnit::Millisecond) => { - downcast_take!(DurationMillisecondType, values, indices) - } - DataType::Duration(TimeUnit::Microsecond) => { - downcast_take!(DurationMicrosecondType, values, indices) - } - DataType::Duration(TimeUnit::Nanosecond) => { - downcast_take!(DurationNanosecondType, values, indices) - } - DataType::Utf8 => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_string::(values, indices)?)) - } - DataType::LargeUtf8 => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_string::(values, indices)?)) - } - DataType::List(_) => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_list::<_, Int32Type>(values, indices)?)) - } - DataType::LargeList(_) => { - let values = values - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(Arc::new(take_list::<_, Int64Type>(values, indices)?)) - } - DataType::FixedSizeList(_, length) => { - let values = values - .as_any() - .downcast_ref::() - .unwrap(); - Ok(Arc::new(take_fixed_size_list( - values, - indices, - *length as u32, - )?)) - } - DataType::Struct(fields) => { - let struct_: &StructArray = - values.as_any().downcast_ref::().unwrap(); - let arrays: Result> = struct_ - .columns() - .iter() - .map(|a| take_impl(a.as_ref(), indices, Some(options.clone()))) - .collect(); - let arrays = arrays?; - let pairs: Vec<(Field, ArrayRef)> = - fields.clone().into_iter().zip(arrays).collect(); - Ok(Arc::new(StructArray::from(pairs)) as ArrayRef) - } - DataType::Dictionary(key_type, _) => match key_type.as_ref() { - DataType::Int8 => downcast_dict_take!(Int8Type, values, indices), - DataType::Int16 => downcast_dict_take!(Int16Type, values, indices), - DataType::Int32 => downcast_dict_take!(Int32Type, values, indices), - DataType::Int64 => downcast_dict_take!(Int64Type, values, indices), - DataType::UInt8 => downcast_dict_take!(UInt8Type, values, indices), - DataType::UInt16 => downcast_dict_take!(UInt16Type, values, indices), - DataType::UInt32 => downcast_dict_take!(UInt32Type, values, indices), - DataType::UInt64 => downcast_dict_take!(UInt64Type, values, indices), - t => unimplemented!("Take not supported for dictionary key type {:?}", t), - }, - t => unimplemented!("Take not supported for data type {:?}", t), - } -} - -/// Options that define how `take` should behave -#[derive(Clone, Debug)] -pub struct TakeOptions { - /// Perform bounds check before taking indices from values. - /// If enabled, an `ArrowError` is returned if the indices are out of bounds. - /// If not enabled, and indices exceed bounds, the kernel will panic. 
- pub check_bounds: bool, -} - -impl Default for TakeOptions { - fn default() -> Self { - Self { - check_bounds: false, - } - } -} - -#[inline(always)] -fn maybe_usize(index: I::Native) -> Result { - index - .to_usize() - .ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string())) -} - -// take implementation when neither values nor indices contain nulls -fn take_no_nulls( - values: &[T::Native], - indices: &[I::Native], -) -> Result<(Buffer, Option)> -where - T: ArrowPrimitiveType, - I: ArrowNumericType, -{ - let values = indices - .iter() - .map(|index| Result::Ok(values[maybe_usize::(*index)?])); - // Soundness: `slice.map` is `TrustedLen`. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - - Ok((buffer, None)) -} - -// take implementation when only values contain nulls -fn take_values_nulls( - values: &PrimitiveArray, - indices: &[I::Native], -) -> Result<(Buffer, Option)> -where - T: ArrowPrimitiveType, - I: ArrowNumericType, - I::Native: ToPrimitive, -{ - let num_bytes = bit_util::ceil(indices.len(), 8); - let mut nulls = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let null_slice = nulls.as_slice_mut(); - let mut null_count = 0; - - let values_values = values.values(); - - let values = indices.iter().enumerate().map(|(i, index)| { - let index = maybe_usize::(*index)?; - if values.is_null(index) { - null_count += 1; - bit_util::unset_bit(null_slice, i); - } - Result::Ok(values_values[index]) - }); - // Soundness: `slice.map` is `TrustedLen`. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - - let nulls = if null_count == 0 { - // if only non-null values were taken - None - } else { - Some(nulls.into()) - }; - - Ok((buffer, nulls)) -} - -// take implementation when only indices contain nulls -fn take_indices_nulls( - values: &[T::Native], - indices: &PrimitiveArray, -) -> Result<(Buffer, Option)> -where - T: ArrowPrimitiveType, - I: ArrowNumericType, - I::Native: ToPrimitive, -{ - let values = indices.values().iter().map(|index| { - let index = maybe_usize::(*index)?; - Result::Ok(match values.get(index) { - Some(value) => *value, - None => { - if indices.is_null(index) { - T::Native::default() - } else { - panic!("Out-of-bounds index {}", index) - } - } - }) - }); - - // Soundness: `slice.map` is `TrustedLen`. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; - - Ok((buffer, indices.data_ref().null_buffer().cloned())) -} - -// take implementation when both values and indices contain nulls -fn take_values_indices_nulls( - values: &PrimitiveArray, - indices: &PrimitiveArray, -) -> Result<(Buffer, Option)> -where - T: ArrowPrimitiveType, - I: ArrowNumericType, - I::Native: ToPrimitive, -{ - let num_bytes = bit_util::ceil(indices.len(), 8); - let mut nulls = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let null_slice = nulls.as_slice_mut(); - let mut null_count = 0; - - let values_values = values.values(); - let values = indices.iter().enumerate().map(|(i, index)| match index { - Some(index) => { - let index = maybe_usize::(index)?; - if values.is_null(index) { - null_count += 1; - bit_util::unset_bit(null_slice, i); - } - Result::Ok(values_values[index]) - } - None => { - null_count += 1; - bit_util::unset_bit(null_slice, i); - Ok(T::Native::default()) - } - }); - // Soundness: `slice.map` is `TrustedLen`. - let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? 
}; - - let nulls = if null_count == 0 { - // if only non-null values were taken - None - } else { - Some(nulls.into()) - }; - - Ok((buffer, nulls)) -} - -/// `take` implementation for all primitive arrays -/// -/// This checks if an `indices` slot is populated, and gets the value from `values` -/// as the populated index. -/// If the `indices` slot is null, a null value is returned. -/// For example, given: -/// values: [1, 2, 3, null, 5] -/// indices: [0, null, 4, 3] -/// The result is: [1 (slot 0), null (null slot), 5 (slot 4), null (slot 3)] -fn take_primitive( - values: &PrimitiveArray, - indices: &PrimitiveArray, -) -> Result> -where - T: ArrowPrimitiveType, - I: ArrowNumericType, - I::Native: ToPrimitive, -{ - let indices_has_nulls = indices.null_count() > 0; - let values_has_nulls = values.null_count() > 0; - // note: this function should only panic when "an index is not null and out of bounds". - // if the index is null, its value is undefined and therefore we should not read from it. - - let (buffer, nulls) = match (values_has_nulls, indices_has_nulls) { - (false, false) => { - // * no nulls - // * all `indices.values()` are valid - take_no_nulls::(values.values(), indices.values())? - } - (true, false) => { - // * nulls come from `values` alone - // * all `indices.values()` are valid - take_values_nulls::(values, indices.values())? - } - (false, true) => { - // in this branch it is unsound to read and use `index.values()`, - // as doing so is UB when they come from a null slot. - take_indices_nulls::(values.values(), indices)? - } - (true, true) => { - // in this branch it is unsound to read and use `index.values()`, - // as doing so is UB when they come from a null slot. - take_values_indices_nulls::(values, indices)? - } - }; - - let data = ArrayData::new( - T::DATA_TYPE, - indices.len(), - None, - nulls, - 0, - vec![buffer], - vec![], - ); - Ok(PrimitiveArray::::from(data)) -} - -/// `take` implementation for boolean arrays -fn take_boolean( - values: &BooleanArray, - indices: &PrimitiveArray, -) -> Result -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, -{ - let data_len = indices.len(); - - let num_byte = bit_util::ceil(data_len, 8); - let mut val_buf = MutableBuffer::from_len_zeroed(num_byte); - - let val_slice = val_buf.as_slice_mut(); - - let null_count = values.null_count(); - - let nulls; - if null_count == 0 { - (0..data_len).try_for_each::<_, Result<()>>(|i| { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - if values.value(index) { - bit_util::set_bit(val_slice, i); - } - - Ok(()) - })?; - - nulls = indices.data_ref().null_buffer().cloned(); - } else { - let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, true); - let null_slice = null_buf.as_slice_mut(); - - (0..data_len).try_for_each::<_, Result<()>>(|i| { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - if values.is_null(index) { - bit_util::unset_bit(null_slice, i); - } else if values.value(index) { - bit_util::set_bit(val_slice, i); - } - - Ok(()) - })?; - - nulls = match indices.data_ref().null_buffer() { - Some(buffer) => Some(buffer_bin_and( - buffer, - 0, - &null_buf.into(), - 0, - indices.len(), - )), - None => Some(null_buf.into()), - }; - } - - let data = ArrayData::new( - DataType::Boolean, - indices.len(), - None, - nulls, - 0, - vec![val_buf.into()], - vec![], - ); - 
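// The null-handling branches of `take_primitive` above exist so that a null slot in
// `indices` becomes a null output slot without reading the undefined index value.
// Restated against the public `take` entry point (sketch; module path assumed):
fn take_null_indices_sketch() -> arrow::error::Result<()> {
    use arrow::array::{Int32Array, UInt32Array};
    use arrow::compute::kernels::take::take;

    let values = Int32Array::from(vec![10, 20, 30, 40]);
    let indices = UInt32Array::from(vec![Some(3), None, Some(1)]);

    let taken = take(&values, &indices, None)?;
    let taken = taken.as_any().downcast_ref::<Int32Array>().unwrap();

    assert_eq!(taken.value(0), 40);
    assert!(taken.is_null(1)); // null index -> null output
    assert_eq!(taken.value(2), 20);
    Ok(())
}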
Ok(BooleanArray::from(data)) -} - -/// `take` implementation for string arrays -fn take_string( - array: &GenericStringArray, - indices: &PrimitiveArray, -) -> Result> -where - OffsetSize: Zero + AddAssign + StringOffsetSizeTrait, - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, -{ - let data_len = indices.len(); - - let bytes_offset = (data_len + 1) * std::mem::size_of::(); - let mut offsets_buffer = MutableBuffer::from_len_zeroed(bytes_offset); - - let offsets = offsets_buffer.typed_data_mut(); - let mut values = MutableBuffer::new(0); - let mut length_so_far = OffsetSize::zero(); - offsets[0] = length_so_far; - - let nulls; - if array.null_count() == 0 && indices.null_count() == 0 { - for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - let s = array.value(index); - - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); - *offset = length_so_far; - } - nulls = None - } else if indices.null_count() == 0 { - let num_bytes = bit_util::ceil(data_len, 8); - - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let null_slice = null_buf.as_slice_mut(); - - for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - if array.is_valid(index) { - let s = array.value(index); - - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); - } else { - bit_util::unset_bit(null_slice, i); - } - *offset = length_so_far; - } - nulls = Some(null_buf.into()); - } else if array.null_count() == 0 { - for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - if indices.is_valid(i) { - let index = - ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - let s = array.value(index); - - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); - } - *offset = length_so_far; - } - nulls = indices.data_ref().null_buffer().cloned(); - } else { - let num_bytes = bit_util::ceil(data_len, 8); - - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let null_slice = null_buf.as_slice_mut(); - - for (i, offset) in offsets.iter_mut().skip(1).enumerate() { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - - if array.is_valid(index) && indices.is_valid(i) { - let s = array.value(index); - - length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); - } else { - // set null bit - bit_util::unset_bit(null_slice, i); - } - *offset = length_so_far; - } - - nulls = match indices.data_ref().null_buffer() { - Some(buffer) => { - Some(buffer_bin_and(buffer, 0, &null_buf.into(), 0, data_len)) - } - None => Some(null_buf.into()), - }; - } - - let mut data = ArrayData::builder(::DATA_TYPE) - .len(data_len) - .add_buffer(offsets_buffer.into()) - .add_buffer(values.into()); - if let Some(null_buffer) = nulls { - data = data.null_bit_buffer(null_buffer); - } - Ok(GenericStringArray::::from(data.build())) -} - -/// `take` implementation for list arrays -/// -/// Calculates the index and indexed offset for the inner array, -/// 
applying `take` on the inner array, then reconstructing a list array -/// with the indexed offsets -fn take_list( - values: &GenericListArray, - indices: &PrimitiveArray, -) -> Result> -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, - OffsetType: ArrowNumericType, - OffsetType::Native: ToPrimitive + OffsetSizeTrait, - PrimitiveArray: From>>, -{ - // TODO: Some optimizations can be done here such as if it is - // taking the whole list or a contiguous sublist - let (list_indices, offsets) = - take_value_indices_from_list::(values, indices)?; - - let taken = take_impl::(values.values().as_ref(), &list_indices, None)?; - // determine null count and null buffer, which are a function of `values` and `indices` - let mut null_count = 0; - let num_bytes = bit_util::ceil(indices.len(), 8); - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - { - let null_slice = null_buf.as_slice_mut(); - offsets[..].windows(2).enumerate().for_each( - |(i, window): (usize, &[OffsetType::Native])| { - if window[0] == window[1] { - // offsets are equal, slot is null - bit_util::unset_bit(null_slice, i); - null_count += 1; - } - }, - ); - } - let value_offsets = Buffer::from_slice_ref(&offsets); - // create a new list with taken data and computed null information - let list_data = ArrayDataBuilder::new(values.data_type().clone()) - .len(indices.len()) - .null_bit_buffer(null_buf.into()) - .offset(0) - .add_child_data(taken.data().clone()) - .add_buffer(value_offsets) - .build(); - Ok(GenericListArray::::from(list_data)) -} - -/// `take` implementation for `FixedSizeListArray` -/// -/// Calculates the index and indexed offset for the inner array, -/// applying `take` on the inner array, then reconstructing a list array -/// with the indexed offsets -fn take_fixed_size_list( - values: &FixedSizeListArray, - indices: &PrimitiveArray, - length: ::Native, -) -> Result -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, -{ - let list_indices = take_value_indices_from_fixed_size_list(values, indices, length)?; - let taken = take_impl::(values.values().as_ref(), &list_indices, None)?; - - // determine null count and null buffer, which are a function of `values` and `indices` - let num_bytes = bit_util::ceil(indices.len(), 8); - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - let null_slice = null_buf.as_slice_mut(); - - for i in 0..indices.len() { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - if !indices.is_valid(i) || values.is_null(index) { - bit_util::unset_bit(null_slice, i); - } - } - - let list_data = ArrayDataBuilder::new(values.data_type().clone()) - .len(indices.len()) - .null_bit_buffer(null_buf.into()) - .offset(0) - .add_child_data(taken.data().clone()) - .build(); - - Ok(FixedSizeListArray::from(list_data)) -} - -/// `take` implementation for dictionary arrays -/// -/// applies `take` to the keys of the dictionary array and returns a new dictionary array -/// with the same dictionary values and reordered keys -fn take_dict( - values: &DictionaryArray, - indices: &PrimitiveArray, -) -> Result> -where - T: ArrowPrimitiveType, - T::Native: num::Num, - I: ArrowNumericType, - I::Native: ToPrimitive, -{ - let new_keys = take_primitive::(&values.keys_array(), indices)?; - let new_keys_data = new_keys.data_ref(); - - let data = ArrayData::new( - values.data_type().clone(), - new_keys.len(), - 
Some(new_keys_data.null_count()), - new_keys_data.null_buffer().cloned(), - 0, - new_keys_data.buffers().to_vec(), - values.data().child_data().to_vec(), - ); - - Ok(DictionaryArray::::from(data)) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::compute::util::tests::build_fixed_size_list_nullable; - - fn test_take_boolean_arrays( - data: Vec>, - index: &UInt32Array, - options: Option, - expected_data: Vec>, - ) { - let output = BooleanArray::from(data); - let expected = Arc::new(BooleanArray::from(expected_data)) as ArrayRef; - let output = take(&output, index, options).unwrap(); - assert_eq!(&output, &expected) - } - - fn test_take_primitive_arrays( - data: Vec>, - index: &UInt32Array, - options: Option, - expected_data: Vec>, - ) -> Result<()> - where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let output = PrimitiveArray::::from(data); - let expected = Arc::new(PrimitiveArray::::from(expected_data)) as ArrayRef; - let output = take(&output, index, options)?; - assert_eq!(&output, &expected); - Ok(()) - } - - fn test_take_impl_primitive_arrays( - data: Vec>, - index: &PrimitiveArray, - options: Option, - expected_data: Vec>, - ) where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - I: ArrowNumericType, - I::Native: ToPrimitive, - { - let output = PrimitiveArray::::from(data); - let expected = PrimitiveArray::::from(expected_data); - let output = take_impl(&output, index, options).unwrap(); - let output = output.as_any().downcast_ref::>().unwrap(); - assert_eq!(output, &expected) - } - - // create a simple struct for testing purposes - fn create_test_struct() -> StructArray { - let boolean_data = BooleanArray::from(vec![true, false, false, true]) - .data() - .clone(); - let int_data = Int32Array::from(vec![42, 28, 19, 31]).data().clone(); - let mut field_types = vec![]; - field_types.push(Field::new("a", DataType::Boolean, true)); - field_types.push(Field::new("b", DataType::Int32, true)); - let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) - .len(4) - .add_child_data(boolean_data) - .add_child_data(int_data) - .build(); - StructArray::from(struct_array_data) - } - - #[test] - fn test_take_primitive_non_null_indices() { - let index = UInt32Array::from(vec![0, 5, 3, 1, 4, 2]); - test_take_primitive_arrays::( - vec![None, Some(3), Some(5), Some(2), Some(3), None], - &index, - None, - vec![None, None, Some(2), Some(3), Some(3), Some(5)], - ) - .unwrap(); - } - - #[test] - fn test_take_primitive_non_null_values() { - let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); - test_take_primitive_arrays::( - vec![Some(0), Some(1), Some(2), Some(3), Some(4)], - &index, - None, - vec![Some(3), None, Some(1), Some(3), Some(2)], - ) - .unwrap(); - } - - #[test] - fn test_take_primitive_non_null() { - let index = UInt32Array::from(vec![0, 5, 3, 1, 4, 2]); - test_take_primitive_arrays::( - vec![Some(0), Some(3), Some(5), Some(2), Some(3), Some(1)], - &index, - None, - vec![Some(0), Some(1), Some(2), Some(3), Some(3), Some(5)], - ) - .unwrap(); - } - - #[test] - fn test_take_primitive() { - let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); - - // int8 - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ) - .unwrap(); - - // int16 - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ) - .unwrap(); - - // int32 - 
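// Per `take_dict` above, taking from a `DictionaryArray` only gathers the keys; the
// dictionary values are carried over unchanged. Sketch with an assumed Int8 key type:
fn take_dictionary_sketch() -> arrow::error::Result<()> {
    use arrow::array::{DictionaryArray, UInt32Array};
    use arrow::compute::kernels::take::take;
    use arrow::datatypes::Int8Type;

    let dict: DictionaryArray<Int8Type> =
        vec![Some("a"), Some("b"), Some("a")].into_iter().collect();
    let indices = UInt32Array::from(vec![2, 0]);

    let taken = take(&dict, &indices, None)?;
    let taken = taken
        .as_any()
        .downcast_ref::<DictionaryArray<Int8Type>>()
        .unwrap();

    // Two reordered keys, same dictionary values as the input.
    assert_eq!(taken.len(), 2);
    assert_eq!(taken.values().len(), dict.values().len());
    Ok(())
}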
test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ) - .unwrap(); - - // int64 - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ) - .unwrap(); - - // uint8 - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ) - .unwrap(); - - // uint16 - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ) - .unwrap(); - - // uint32 - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ) - .unwrap(); - - // int64 - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ) - .unwrap(); - - // interval_year_month - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ) - .unwrap(); - - // interval_day_time - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ) - .unwrap(); - - // duration_second - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ) - .unwrap(); - - // duration_millisecond - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ) - .unwrap(); - - // duration_microsecond - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ) - .unwrap(); - - // duration_nanosecond - test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ) - .unwrap(); - - // float32 - test_take_primitive_arrays::( - vec![Some(0.0), None, Some(2.21), Some(-3.1), None], - &index, - None, - vec![Some(-3.1), None, None, Some(-3.1), Some(2.21)], - ) - .unwrap(); - - // float64 - test_take_primitive_arrays::( - vec![Some(0.0), None, Some(2.21), Some(-3.1), None], - &index, - None, - vec![Some(-3.1), None, None, Some(-3.1), Some(2.21)], - ) - .unwrap(); - } - - #[test] - fn test_take_impl_primitive_with_int64_indices() { - let index = Int64Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); - - // int16 - test_take_impl_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ); - - // int64 - test_take_impl_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ); - - // uint64 - test_take_impl_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ); - - // duration_millisecond - test_take_impl_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ); - - // float32 - test_take_impl_primitive_arrays::( - vec![Some(0.0), None, Some(2.21), Some(-3.1), None], - &index, - None, - 
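// Like the `take_impl` tests in this block, the public `take` is generic over the index
// type, so narrower (or wider) index arrays work without first casting to UInt32.
// Sketch; module path assumed.
fn take_u8_indices_sketch() -> arrow::error::Result<()> {
    use arrow::array::{Int16Array, UInt8Array};
    use arrow::compute::kernels::take::take;

    let values = Int16Array::from(vec![Some(0), None, Some(2), Some(3)]);
    let indices = UInt8Array::from(vec![3, 0, 2]); // u8 indices

    let taken = take(&values, &indices, None)?;
    assert_eq!(taken.len(), 3);
    Ok(())
}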
vec![Some(-3.1), None, None, Some(-3.1), Some(2.21)], - ); - } - - #[test] - fn test_take_impl_primitive_with_uint8_indices() { - let index = UInt8Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); - - // int16 - test_take_impl_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - None, - vec![Some(3), None, None, Some(3), Some(2)], - ); - - // duration_millisecond - test_take_impl_primitive_arrays::( - vec![Some(0), None, Some(2), Some(-15), None], - &index, - None, - vec![Some(-15), None, None, Some(-15), Some(2)], - ); - - // float32 - test_take_impl_primitive_arrays::( - vec![Some(0.0), None, Some(2.21), Some(-3.1), None], - &index, - None, - vec![Some(-3.1), None, None, Some(-3.1), Some(2.21)], - ); - } - - #[test] - fn test_take_primitive_bool() { - let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(2)]); - // boolean - test_take_boolean_arrays( - vec![Some(false), None, Some(true), Some(false), None], - &index, - None, - vec![Some(false), None, None, Some(false), Some(true)], - ); - } - - fn _test_take_string<'a, K: 'static>() - where - K: Array + PartialEq + From>>, - { - let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(4)]); - - let array = K::from(vec![ - Some("one"), - None, - Some("three"), - Some("four"), - Some("five"), - ]); - let actual = take(&array, &index, None).unwrap(); - assert_eq!(actual.len(), index.len()); - - let actual = actual.as_any().downcast_ref::().unwrap(); - - let expected = - K::from(vec![Some("four"), None, None, Some("four"), Some("five")]); - - assert_eq!(actual, &expected); - } - - #[test] - fn test_take_string() { - _test_take_string::() - } - - #[test] - fn test_take_large_string() { - _test_take_string::() - } - - macro_rules! test_take_list { - ($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{ - // Construct a value array, [[0,0,0], [-1,-2,-1], [2,3]] - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]) - .data() - .clone(); - // Construct offsets - let value_offsets: [$offset_type; 4] = [0, 3, 6, 8]; - let value_offsets = Buffer::from_slice_ref(&value_offsets); - // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( - "item", - DataType::Int32, - false, - ))); - let list_data = ArrayData::builder(list_data_type.clone()) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - let list_array = $list_array_type::from(list_data); - - // index returns: [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]] - let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(2), Some(0)]); - - let a = take(&list_array, &index, None).unwrap(); - let a: &$list_array_type = - a.as_any().downcast_ref::<$list_array_type>().unwrap(); - - // construct a value array with expected results: - // [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]] - let expected_data = Int32Array::from(vec![ - Some(2), - Some(3), - Some(-1), - Some(-2), - Some(-1), - Some(2), - Some(3), - Some(0), - Some(0), - Some(0), - ]) - .data() - .clone(); - // construct offsets - let expected_offsets: [$offset_type; 6] = [0, 2, 2, 5, 7, 10]; - let expected_offsets = Buffer::from_slice_ref(&expected_offsets); - // construct list array from the two - let expected_list_data = ArrayData::builder(list_data_type) - .len(5) - // null buffer remains the same as only the indices have nulls - .null_bit_buffer( - index.data().null_bitmap().as_ref().unwrap().bits.clone(), - ) - .add_buffer(expected_offsets) - 
.add_child_data(expected_data) - .build(); - let expected_list_array = $list_array_type::from(expected_list_data); - - assert_eq!(a, &expected_list_array); - }}; - } - - macro_rules! test_take_list_with_value_nulls { - ($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{ - // Construct a value array, [[0,null,0], [-1,-2,3], [null], [5,null]] - let value_data = Int32Array::from(vec![ - Some(0), - None, - Some(0), - Some(-1), - Some(-2), - Some(3), - None, - Some(5), - None, - ]) - .data() - .clone(); - // Construct offsets - let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9]; - let value_offsets = Buffer::from_slice_ref(&value_offsets); - // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( - "item", - DataType::Int32, - false, - ))); - let list_data = ArrayData::builder(list_data_type.clone()) - .len(4) - .add_buffer(value_offsets) - .null_bit_buffer(Buffer::from([0b10111101, 0b00000000])) - .add_child_data(value_data) - .build(); - let list_array = $list_array_type::from(list_data); - - // index returns: [[null], null, [-1,-2,3], [2,null], [0,null,0]] - let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); - - let a = take(&list_array, &index, None).unwrap(); - let a: &$list_array_type = - a.as_any().downcast_ref::<$list_array_type>().unwrap(); - - // construct a value array with expected results: - // [[null], null, [-1,-2,3], [5,null], [0,null,0]] - let expected_data = Int32Array::from(vec![ - None, - Some(-1), - Some(-2), - Some(3), - Some(5), - None, - Some(0), - None, - Some(0), - ]) - .data() - .clone(); - // construct offsets - let expected_offsets: [$offset_type; 6] = [0, 1, 1, 4, 6, 9]; - let expected_offsets = Buffer::from_slice_ref(&expected_offsets); - // construct list array from the two - let expected_list_data = ArrayData::builder(list_data_type) - .len(5) - // null buffer remains the same as only the indices have nulls - .null_bit_buffer( - index.data().null_bitmap().as_ref().unwrap().bits.clone(), - ) - .add_buffer(expected_offsets) - .add_child_data(expected_data) - .build(); - let expected_list_array = $list_array_type::from(expected_list_data); - - assert_eq!(a, &expected_list_array); - }}; - } - - macro_rules! 
test_take_list_with_nulls { - ($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{ - // Construct a value array, [[0,null,0], [-1,-2,3], null, [5,null]] - let value_data = Int32Array::from(vec![ - Some(0), - None, - Some(0), - Some(-1), - Some(-2), - Some(3), - Some(5), - None, - ]) - .data() - .clone(); - // Construct offsets - let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8]; - let value_offsets = Buffer::from_slice_ref(&value_offsets); - // Construct a list array from the above two - let list_data_type = DataType::$list_data_type(Box::new(Field::new( - "item", - DataType::Int32, - false, - ))); - let list_data = ArrayData::builder(list_data_type.clone()) - .len(4) - .add_buffer(value_offsets) - .null_bit_buffer(Buffer::from([0b01111101])) - .add_child_data(value_data) - .build(); - let list_array = $list_array_type::from(list_data); - - // index returns: [null, null, [-1,-2,3], [5,null], [0,null,0]] - let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(3), Some(0)]); - - let a = take(&list_array, &index, None).unwrap(); - let a: &$list_array_type = - a.as_any().downcast_ref::<$list_array_type>().unwrap(); - - // construct a value array with expected results: - // [null, null, [-1,-2,3], [5,null], [0,null,0]] - let expected_data = Int32Array::from(vec![ - Some(-1), - Some(-2), - Some(3), - Some(5), - None, - Some(0), - None, - Some(0), - ]) - .data() - .clone(); - // construct offsets - let expected_offsets: [$offset_type; 6] = [0, 0, 0, 3, 5, 8]; - let expected_offsets = Buffer::from_slice_ref(&expected_offsets); - // construct list array from the two - let mut null_bits: [u8; 1] = [0; 1]; - bit_util::set_bit(&mut null_bits, 2); - bit_util::set_bit(&mut null_bits, 3); - bit_util::set_bit(&mut null_bits, 4); - let expected_list_data = ArrayData::builder(list_data_type) - .len(5) - // null buffer must be recalculated as both values and indices have nulls - .null_bit_buffer(Buffer::from(null_bits)) - .add_buffer(expected_offsets) - .add_child_data(expected_data) - .build(); - let expected_list_array = $list_array_type::from(expected_list_data); - - assert_eq!(a, &expected_list_array); - }}; - } - - fn do_take_fixed_size_list_test( - length: ::Native, - input_data: Vec>>>, - indices: Vec<::Native>, - expected_data: Vec>>>, - ) where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let indices = UInt32Array::from(indices); - - let input_array = build_fixed_size_list_nullable::(input_data, length); - - let output = take_fixed_size_list(&input_array, &indices, length as u32).unwrap(); - - let expected = build_fixed_size_list_nullable::(expected_data, length); - - assert_eq!(&output, &expected) - } - - #[test] - fn test_take_list() { - test_take_list!(i32, List, ListArray); - } - - #[test] - fn test_take_large_list() { - test_take_list!(i64, LargeList, LargeListArray); - } - - #[test] - fn test_take_list_with_value_nulls() { - test_take_list_with_value_nulls!(i32, List, ListArray); - } - - #[test] - fn test_take_large_list_with_value_nulls() { - test_take_list_with_value_nulls!(i64, LargeList, LargeListArray); - } - - #[test] - fn test_test_take_list_with_nulls() { - test_take_list_with_nulls!(i32, List, ListArray); - } - - #[test] - fn test_test_take_large_list_with_nulls() { - test_take_list_with_nulls!(i64, LargeList, LargeListArray); - } - - #[test] - fn test_take_fixed_size_list() { - do_take_fixed_size_list_test::( - 3, - vec![ - Some(vec![None, Some(1), Some(2)]), - Some(vec![Some(3), Some(4), None]), - Some(vec![Some(6), Some(7), Some(8)]), - 
], - vec![2, 1, 0], - vec![ - Some(vec![Some(6), Some(7), Some(8)]), - Some(vec![Some(3), Some(4), None]), - Some(vec![None, Some(1), Some(2)]), - ], - ); - - do_take_fixed_size_list_test::( - 1, - vec![ - Some(vec![Some(1)]), - Some(vec![Some(2)]), - Some(vec![Some(3)]), - Some(vec![Some(4)]), - Some(vec![Some(5)]), - Some(vec![Some(6)]), - Some(vec![Some(7)]), - Some(vec![Some(8)]), - ], - vec![2, 7, 0], - vec![ - Some(vec![Some(3)]), - Some(vec![Some(8)]), - Some(vec![Some(1)]), - ], - ); - - do_take_fixed_size_list_test::( - 3, - vec![ - Some(vec![Some(10), Some(11), Some(12)]), - Some(vec![Some(13), Some(14), Some(15)]), - None, - Some(vec![Some(16), Some(17), Some(18)]), - ], - vec![3, 2, 1, 2, 0], - vec![ - Some(vec![Some(16), Some(17), Some(18)]), - None, - Some(vec![Some(13), Some(14), Some(15)]), - None, - Some(vec![Some(10), Some(11), Some(12)]), - ], - ); - } - - #[test] - #[should_panic(expected = "index out of bounds: the len is 4 but the index is 1000")] - fn test_take_list_out_of_bounds() { - // Construct a value array, [[0,0,0], [-1,-2,-1], [2,3]] - let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3]) - .data() - .clone(); - // Construct offsets - let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]); - // Construct a list array from the above two - let list_data_type = - DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - let list_array = ListArray::from(list_data); - - let index = UInt32Array::from(vec![1000]); - - // A panic is expected here since we have not supplied the check_bounds - // option. - take(&list_array, &index, None).unwrap(); - } - - #[test] - fn test_take_struct() { - let array = create_test_struct(); - - let index = UInt32Array::from(vec![0, 3, 1, 0, 2]); - let a = take(&array, &index, None).unwrap(); - let a: &StructArray = a.as_any().downcast_ref::().unwrap(); - assert_eq!(index.len(), a.len()); - assert_eq!(0, a.null_count()); - - let expected_bool_data = BooleanArray::from(vec![true, true, false, true, false]) - .data() - .clone(); - let expected_int_data = Int32Array::from(vec![42, 31, 28, 42, 19]).data().clone(); - let mut field_types = vec![]; - field_types.push(Field::new("a", DataType::Boolean, true)); - field_types.push(Field::new("b", DataType::Int32, true)); - let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) - .len(5) - .add_child_data(expected_bool_data) - .add_child_data(expected_int_data) - .build(); - let struct_array = StructArray::from(struct_array_data); - - assert_eq!(a, &struct_array); - } - - #[test] - fn test_take_struct_with_nulls() { - let array = create_test_struct(); - - let index = UInt32Array::from(vec![None, Some(3), Some(1), None, Some(0)]); - let a = take(&array, &index, None).unwrap(); - let a: &StructArray = a.as_any().downcast_ref::().unwrap(); - assert_eq!(index.len(), a.len()); - assert_eq!(0, a.null_count()); - - let expected_bool_data = - BooleanArray::from(vec![None, Some(true), Some(false), None, Some(true)]) - .data() - .clone(); - let expected_int_data = - Int32Array::from(vec![None, Some(31), Some(28), None, Some(42)]) - .data() - .clone(); - - let mut field_types = vec![]; - field_types.push(Field::new("a", DataType::Boolean, true)); - field_types.push(Field::new("b", DataType::Int32, true)); - let struct_array_data = ArrayData::builder(DataType::Struct(field_types)) - .len(5) - // TODO: see 
https://issues.apache.org/jira/browse/ARROW-5408 for why count != 2 - .add_child_data(expected_bool_data) - .add_child_data(expected_int_data) - .build(); - let struct_array = StructArray::from(struct_array_data); - assert_eq!(a, &struct_array); - } - - #[test] - fn test_take_out_of_bounds() { - let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(3), Some(6)]); - let take_opt = TakeOptions { check_bounds: true }; - - // int64 - let result = test_take_primitive_arrays::( - vec![Some(0), None, Some(2), Some(3), None], - &index, - Some(take_opt), - vec![None], - ); - assert!(result.is_err()); - } - - #[test] - #[should_panic(expected = "index out of bounds: the len is 4 but the index is 1000")] - fn test_take_out_of_bounds_panic() { - let index = UInt32Array::from(vec![Some(1000)]); - - test_take_primitive_arrays::( - vec![Some(0), Some(1), Some(2), Some(3)], - &index, - None, - vec![None], - ) - .unwrap(); - } - - #[test] - fn test_take_dict() { - let keys_builder = Int16Builder::new(8); - let values_builder = StringBuilder::new(4); - - let mut dict_builder = StringDictionaryBuilder::new(keys_builder, values_builder); - - dict_builder.append("foo").unwrap(); - dict_builder.append("bar").unwrap(); - dict_builder.append("").unwrap(); - dict_builder.append_null().unwrap(); - dict_builder.append("foo").unwrap(); - dict_builder.append("bar").unwrap(); - dict_builder.append("bar").unwrap(); - dict_builder.append("foo").unwrap(); - - let array = dict_builder.finish(); - let dict_values = array.values().clone(); - let dict_values = dict_values.as_any().downcast_ref::().unwrap(); - - let indices = UInt32Array::from(vec![ - Some(0), // first "foo" - Some(7), // last "foo" - None, // null index should return null - Some(5), // second "bar" - Some(6), // another "bar" - Some(2), // empty string - Some(3), // input is null at this index - ]); - - let result = take(&array, &indices, None).unwrap(); - let result = result - .as_any() - .downcast_ref::>() - .unwrap(); - - let result_values: StringArray = result.values().data().clone().into(); - - // dictionary values should stay the same - let expected_values = StringArray::from(vec!["foo", "bar", ""]); - assert_eq!(&expected_values, dict_values); - assert_eq!(&expected_values, &result_values); - - let expected_keys = Int16Array::from(vec![ - Some(0), - Some(0), - None, - Some(1), - Some(1), - Some(2), - None, - ]); - assert_eq!(result.keys(), &expected_keys); - } -} diff --git a/rust/arrow/src/compute/kernels/temporal.rs b/rust/arrow/src/compute/kernels/temporal.rs deleted file mode 100644 index 63e412990fd..00000000000 --- a/rust/arrow/src/compute/kernels/temporal.rs +++ /dev/null @@ -1,187 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Defines temporal kernels for time and date related functions. - -use chrono::{Datelike, Timelike}; - -use crate::array::*; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -/// Extracts the hours of a given temporal array as an array of integers -pub fn hour(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, -{ - let mut b = Int32Builder::new(array.len()); - match array.data_type() { - &DataType::Time32(_) | &DataType::Time64(_) => { - for i in 0..array.len() { - if array.is_null(i) { - b.append_null()?; - } else { - match array.value_as_time(i) { - Some(time) => b.append_value(time.hour() as i32)?, - None => b.append_null()?, - }; - } - } - } - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, _) => { - for i in 0..array.len() { - if array.is_null(i) { - b.append_null()?; - } else { - match array.value_as_datetime(i) { - Some(dt) => b.append_value(dt.hour() as i32)?, - None => b.append_null()?, - } - } - } - } - dt => { - return { - Err(ArrowError::ComputeError(format!( - "hour does not support type {:?}", - dt - ))) - } - } - } - - Ok(b.finish()) -} - -/// Extracts the years of a given temporal array as an array of integers -pub fn year(array: &PrimitiveArray) -> Result -where - T: ArrowTemporalType + ArrowNumericType, - i64: std::convert::From, -{ - let mut b = Int32Builder::new(array.len()); - match array.data_type() { - &DataType::Date32 | &DataType::Date64 | &DataType::Timestamp(_, _) => { - for i in 0..array.len() { - if array.is_null(i) { - b.append_null()?; - } else { - match array.value_as_datetime(i) { - Some(dt) => b.append_value(dt.year() as i32)?, - None => b.append_null()?, - } - } - } - } - dt => { - return { - Err(ArrowError::ComputeError(format!( - "year does not support type {:?}", - dt - ))) - } - } - } - - Ok(b.finish()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_temporal_array_date64_hour() { - let a: PrimitiveArray = - vec![Some(1514764800000), None, Some(1550636625000)].into(); - - let b = hour(&a).unwrap(); - assert_eq!(0, b.value(0)); - assert_eq!(false, b.is_valid(1)); - assert_eq!(4, b.value(2)); - } - - #[test] - fn test_temporal_array_date32_hour() { - let a: PrimitiveArray = vec![Some(15147), None, Some(15148)].into(); - - let b = hour(&a).unwrap(); - assert_eq!(0, b.value(0)); - assert_eq!(false, b.is_valid(1)); - assert_eq!(0, b.value(2)); - } - - #[test] - fn test_temporal_array_time32_second_hour() { - let a: PrimitiveArray = vec![37800, 86339].into(); - - let b = hour(&a).unwrap(); - assert_eq!(10, b.value(0)); - assert_eq!(23, b.value(1)); - } - - #[test] - fn test_temporal_array_time64_micro_hour() { - let a: PrimitiveArray = - vec![37800000000, 86339000000].into(); - - let b = hour(&a).unwrap(); - assert_eq!(10, b.value(0)); - assert_eq!(23, b.value(1)); - } - - #[test] - fn test_temporal_array_timestamp_micro_hour() { - let a: TimestampMicrosecondArray = vec![37800000000, 86339000000].into(); - - let b = hour(&a).unwrap(); - assert_eq!(10, b.value(0)); - assert_eq!(23, b.value(1)); - } - - #[test] - fn test_temporal_array_date64_year() { - let a: PrimitiveArray = - vec![Some(1514764800000), None, Some(1550636625000)].into(); - - let b = year(&a).unwrap(); - assert_eq!(2018, b.value(0)); - assert_eq!(false, b.is_valid(1)); - assert_eq!(2019, b.value(2)); - } - - #[test] - fn test_temporal_array_date32_year() { - let a: PrimitiveArray = vec![Some(15147), None, Some(15448)].into(); - - let b = year(&a).unwrap(); - assert_eq!(2011, 
b.value(0)); - assert_eq!(false, b.is_valid(1)); - assert_eq!(2012, b.value(2)); - } - - #[test] - fn test_temporal_array_timestamp_micro_year() { - let a: TimestampMicrosecondArray = - vec![Some(1612025847000000), None, Some(1722015847000000)].into(); - - let b = year(&a).unwrap(); - assert_eq!(2021, b.value(0)); - assert_eq!(false, b.is_valid(1)); - assert_eq!(2024, b.value(2)); - } -} diff --git a/rust/arrow/src/compute/kernels/window.rs b/rust/arrow/src/compute/kernels/window.rs deleted file mode 100644 index 82e712c3079..00000000000 --- a/rust/arrow/src/compute/kernels/window.rs +++ /dev/null @@ -1,109 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines windowing functions, like `shift`ing - -use crate::compute::concat; -use num::{abs, clamp}; - -use crate::{ - array::{make_array, ArrayData, PrimitiveArray}, - datatypes::ArrowPrimitiveType, - error::Result, -}; -use crate::{ - array::{Array, ArrayRef}, - buffer::MutableBuffer, -}; - -/// Shifts array by defined number of items (to left or right) -/// A positive value for `offset` shifts the array to the right -/// a negative value shifts the array to the left. 
-/// # Examples -/// ``` -/// use arrow::array::Int32Array; -/// use arrow::error::Result; -/// use arrow::compute::shift; -/// -/// let a: Int32Array = vec![Some(1), None, Some(4)].into(); -/// // shift array 1 element to the right -/// let res = shift(&a, 1).unwrap(); -/// let expected: Int32Array = vec![None, Some(1), None].into(); -/// assert_eq!(res.as_ref(), &expected) -/// ``` -pub fn shift(values: &PrimitiveArray, offset: i64) -> Result -where - T: ArrowPrimitiveType, -{ - // Compute slice - let slice_offset = clamp(-offset, 0, values.len() as i64) as usize; - let length = values.len() - abs(offset) as usize; - let slice = values.slice(slice_offset, length); - - // Generate array with remaining `null` items - let nulls = abs(offset as i64) as usize; - - let mut null_array = MutableBuffer::new(nulls); - let mut null_data = MutableBuffer::new(nulls * T::get_byte_width()); - null_array.extend_zeros(nulls); - null_data.extend_zeros(nulls * T::get_byte_width()); - - let null_data = ArrayData::new( - T::DATA_TYPE, - nulls as usize, - Some(nulls), - Some(null_array.into()), - 0, - vec![null_data.into()], - vec![], - ); - - // Concatenate both arrays, add nulls after if shift > 0 else before - let null_arr = make_array(null_data); - if offset > 0 { - concat(&[null_arr.as_ref(), slice.as_ref()]) - } else { - concat(&[slice.as_ref(), null_arr.as_ref()]) - } -} - -#[cfg(test)] -mod tests { - use crate::array::Int32Array; - - use super::*; - - #[test] - fn test_shift_neg() { - let a: Int32Array = vec![Some(1), None, Some(4)].into(); - let res = shift(&a, -1).unwrap(); - - let expected: Int32Array = vec![None, Some(4), None].into(); - - assert_eq!(res.as_ref(), &expected); - } - - #[test] - fn test_shift_pos() { - let a: Int32Array = vec![Some(1), None, Some(4)].into(); - let res = shift(&a, 1).unwrap(); - - let expected: Int32Array = vec![None, Some(1), None].into(); - - assert_eq!(res.as_ref(), &expected); - } -} diff --git a/rust/arrow/src/compute/kernels/zip.rs b/rust/arrow/src/compute/kernels/zip.rs deleted file mode 100644 index 0ee8e47bede..00000000000 --- a/rust/arrow/src/compute/kernels/zip.rs +++ /dev/null @@ -1,87 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::array::*; -use crate::compute::SlicesIterator; -use crate::error::{ArrowError, Result}; - -/// Zip two arrays by some boolean mask. Where the mask evaluates `true` values of `truthy` -/// are taken, where the mask evaluates `false` values of `falsy` are taken. -/// -/// # Arguments -/// * `mask` - Boolean values used to determine from which array to take the values. 
-/// * `truthy` - Values of this array are taken if mask evaluates `true` -/// * `falsy` - Values of this array are taken if mask evaluates `false` -pub fn zip( - mask: &BooleanArray, - truthy: &dyn Array, - falsy: &dyn Array, -) -> Result { - if truthy.data_type() != falsy.data_type() { - return Err(ArrowError::InvalidArgumentError( - "arguments need to have the same data type".into(), - )); - } - if truthy.len() != falsy.len() || falsy.len() != mask.len() { - return Err(ArrowError::InvalidArgumentError( - "all arrays should have the same length".into(), - )); - } - let falsy = falsy.data(); - let truthy = truthy.data(); - - let mut mutable = MutableArrayData::new(vec![&*truthy, &*falsy], false, truthy.len()); - - // the SlicesIterator slices only the true values. So the gaps left by this iterator we need to - // fill with falsy values - - // keep track of how much is filled - let mut filled = 0; - - SlicesIterator::new(mask).for_each(|(start, end)| { - // the gap needs to be filled with falsy values - if start > filled { - mutable.extend(1, filled, start); - } - // fill with truthy values - mutable.extend(0, start, end); - filled = end; - }); - // the remaining part is falsy - if filled < truthy.len() { - mutable.extend(1, filled, truthy.len()); - } - - let data = mutable.freeze(); - Ok(make_array(data)) -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_zip_kernel() { - let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]); - let b = Int32Array::from(vec![None, Some(3), Some(6), Some(7), Some(3)]); - let mask = BooleanArray::from(vec![true, true, false, false, true]); - let out = zip(&mask, &a, &b).unwrap(); - let actual = out.as_any().downcast_ref::().unwrap(); - let expected = Int32Array::from(vec![Some(5), None, Some(6), Some(7), Some(1)]); - assert_eq!(actual, &expected); - } -} diff --git a/rust/arrow/src/compute/mod.rs b/rust/arrow/src/compute/mod.rs deleted file mode 100644 index be1aa277ca4..00000000000 --- a/rust/arrow/src/compute/mod.rs +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Computation kernels on Arrow Arrays - -pub mod kernels; - -mod util; - -pub use self::kernels::aggregate::*; -pub use self::kernels::arithmetic::*; -pub use self::kernels::boolean::*; -pub use self::kernels::cast::*; -pub use self::kernels::comparison::*; -pub use self::kernels::concat::*; -pub use self::kernels::filter::*; -pub use self::kernels::limit::*; -pub use self::kernels::regexp::*; -pub use self::kernels::sort::*; -pub use self::kernels::take::*; -pub use self::kernels::temporal::*; -pub use self::kernels::window::*; diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs deleted file mode 100644 index 56de5948301..00000000000 --- a/rust/arrow/src/compute/util.rs +++ /dev/null @@ -1,463 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Common utilities for computation kernels. - -use crate::array::*; -use crate::buffer::{buffer_bin_and, buffer_bin_or, Buffer}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use num::{One, ToPrimitive, Zero}; -use std::ops::Add; - -/// Combines the null bitmaps of two arrays using a bitwise `and` operation. -/// -/// This function is useful when implementing operations on higher level arrays. -#[allow(clippy::unnecessary_wraps)] -pub(super) fn combine_option_bitmap( - left_data: &ArrayData, - right_data: &ArrayData, - len_in_bits: usize, -) -> Result> { - let left_offset_in_bits = left_data.offset(); - let right_offset_in_bits = right_data.offset(); - - let left = left_data.null_buffer(); - let right = right_data.null_buffer(); - - match left { - None => match right { - None => Ok(None), - Some(r) => Ok(Some(r.bit_slice(right_offset_in_bits, len_in_bits))), - }, - Some(l) => match right { - None => Ok(Some(l.bit_slice(left_offset_in_bits, len_in_bits))), - - Some(r) => Ok(Some(buffer_bin_and( - &l, - left_offset_in_bits, - &r, - right_offset_in_bits, - len_in_bits, - ))), - }, - } -} - -/// Compares the null bitmaps of two arrays using a bitwise `or` operation. -/// -/// This function is useful when implementing operations on higher level arrays. 
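// A minimal standalone sketch of the null-bitmap combination performed by
// `combine_option_bitmap` above (and, with a byte-wise OR instead of AND, by
// `compare_option_bitmap` below). It works on plain byte slices rather than the
// crate's `Buffer`/`ArrayData` types so it runs on its own; the function name
// and LSB-first bit layout are illustrative assumptions, not the crate's API.
fn combine_validity(left: Option<&[u8]>, right: Option<&[u8]>) -> Option<Vec<u8>> {
    match (left, right) {
        // neither input tracks nulls, so the result does not either
        (None, None) => None,
        // only one side tracks nulls: reuse its bitmap unchanged
        (Some(l), None) => Some(l.to_vec()),
        (None, Some(r)) => Some(r.to_vec()),
        // both sides track nulls: a slot stays valid only if it is valid in
        // both inputs, which is a byte-wise AND of the two bitmaps
        (Some(l), Some(r)) => Some(l.iter().zip(r).map(|(a, b)| a & b).collect()),
    }
}

fn main() {
    // the same bit patterns as the tests further down: disjoint bitmaps AND to zero
    let some: &[u8] = &[0b0100_1010];
    let inverse: &[u8] = &[0b1011_0101];
    assert_eq!(combine_validity(Some(some), Some(inverse)), Some(vec![0b0000_0000]));
    assert_eq!(combine_validity(Some(some), None), Some(some.to_vec()));
    assert_eq!(combine_validity(None, None), None);
}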
-#[allow(clippy::unnecessary_wraps)] -pub(super) fn compare_option_bitmap( - left_data: &ArrayData, - right_data: &ArrayData, - len_in_bits: usize, -) -> Result> { - let left_offset_in_bits = left_data.offset(); - let right_offset_in_bits = right_data.offset(); - - let left = left_data.null_buffer(); - let right = right_data.null_buffer(); - - match left { - None => match right { - None => Ok(None), - Some(r) => Ok(Some(r.bit_slice(right_offset_in_bits, len_in_bits))), - }, - Some(l) => match right { - None => Ok(Some(l.bit_slice(left_offset_in_bits, len_in_bits))), - - Some(r) => Ok(Some(buffer_bin_or( - &l, - left_offset_in_bits, - &r, - right_offset_in_bits, - len_in_bits, - ))), - }, - } -} - -/// Takes/filters a list array's inner data using the offsets of the list array. -/// -/// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns -/// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2 -/// elements) -pub(super) fn take_value_indices_from_list( - list: &GenericListArray, - indices: &PrimitiveArray, -) -> Result<(PrimitiveArray, Vec)> -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, - OffsetType: ArrowNumericType, - OffsetType::Native: OffsetSizeTrait + Add + Zero + One, - PrimitiveArray: From>>, -{ - // TODO: benchmark this function, there might be a faster unsafe alternative - let offsets: &[OffsetType::Native] = list.value_offsets(); - - let mut new_offsets = Vec::with_capacity(indices.len()); - let mut values = Vec::new(); - let mut current_offset = OffsetType::Native::zero(); - // add first offset - new_offsets.push(OffsetType::Native::zero()); - // compute the value indices, and set offsets accordingly - for i in 0..indices.len() { - if indices.is_valid(i) { - let ix = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - let start = offsets[ix]; - let end = offsets[ix + 1]; - current_offset += end - start; - new_offsets.push(current_offset); - - let mut curr = start; - - // if start == end, this slot is empty - while curr < end { - values.push(Some(curr)); - curr += OffsetType::Native::one(); - } - } else { - new_offsets.push(current_offset); - } - } - - Ok((PrimitiveArray::::from(values), new_offsets)) -} - -/// Takes/filters a fixed size list array's inner data using the offsets of the list array. 
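// A standalone sketch of the index expansion that `take_value_indices_from_list`
// above performs, using plain i32 offsets instead of the generic offset types
// and skipping the null-index handling for brevity. Given the list offsets and
// the take indices, it returns the child-value indices to gather plus the
// offsets of the resulting list. Names here are illustrative only.
fn expand_list_take(offsets: &[i32], indices: &[usize]) -> (Vec<i32>, Vec<i32>) {
    let mut value_indices = Vec::new();
    let mut new_offsets = vec![0];
    let mut current = 0;
    for &ix in indices {
        let (start, end) = (offsets[ix], offsets[ix + 1]);
        current += end - start;            // length of the selected list slot
        new_offsets.push(current);
        value_indices.extend(start..end);  // child rows to copy for this slot
    }
    (value_indices, new_offsets)
}

fn main() {
    // the example from the doc comment above: offsets [0,2,5,10], take indices [2,0]
    let (values, offsets) = expand_list_take(&[0, 2, 5, 10], &[2, 0]);
    assert_eq!(values, vec![5, 6, 7, 8, 9, 0, 1]);
    assert_eq!(offsets, vec![0, 5, 7]);
}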
-pub(super) fn take_value_indices_from_fixed_size_list( - list: &FixedSizeListArray, - indices: &PrimitiveArray, - length: ::Native, -) -> Result> -where - IndexType: ArrowNumericType, - IndexType::Native: ToPrimitive, -{ - let mut values = vec![]; - - for i in 0..indices.len() { - if indices.is_valid(i) { - let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| { - ArrowError::ComputeError("Cast to usize failed".to_string()) - })?; - let start = - list.value_offset(index) as ::Native; - - values.extend(start..start + length); - } - } - - Ok(PrimitiveArray::::from(values)) -} - -#[cfg(test)] -pub(super) mod tests { - use super::*; - - use std::sync::Arc; - - use crate::datatypes::DataType; - use crate::util::bit_util; - use crate::{array::ArrayData, buffer::MutableBuffer}; - - fn make_data_with_null_bit_buffer( - len: usize, - offset: usize, - null_bit_buffer: Option, - ) -> Arc { - // empty vec for buffers and children is not really correct, but for these tests we only care about the null bitmap - Arc::new(ArrayData::new( - DataType::UInt8, - len, - None, - null_bit_buffer, - offset, - vec![], - vec![], - )) - } - - #[test] - fn test_combine_option_bitmap() { - let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); - let some_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010]))); - let inverse_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101]))); - assert_eq!( - None, - combine_option_bitmap(&none_bitmap, &none_bitmap, 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&some_bitmap, &none_bitmap, 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&none_bitmap, &some_bitmap, 8,).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - combine_option_bitmap(&some_bitmap, &some_bitmap, 8,).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b0])), - combine_option_bitmap(&some_bitmap, &inverse_bitmap, 8,).unwrap() - ); - } - - #[test] - fn test_compare_option_bitmap() { - let none_bitmap = make_data_with_null_bit_buffer(8, 0, None); - let some_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010]))); - let inverse_bitmap = - make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101]))); - assert_eq!( - None, - compare_option_bitmap(&none_bitmap, &none_bitmap, 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - compare_option_bitmap(&some_bitmap, &none_bitmap, 8).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - compare_option_bitmap(&none_bitmap, &some_bitmap, 8,).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b01001010])), - compare_option_bitmap(&some_bitmap, &some_bitmap, 8,).unwrap() - ); - assert_eq!( - Some(Buffer::from([0b11111111])), - compare_option_bitmap(&some_bitmap, &inverse_bitmap, 8,).unwrap() - ); - } - - pub(crate) fn build_generic_list( - data: Vec>>, - ) -> GenericListArray - where - S: OffsetSizeTrait + 'static, - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let data = data - .into_iter() - .map(|subarray| { - subarray.map(|item| { - item.into_iter() - .map(Some) - .collect::>>() - }) - }) - .collect(); - build_generic_list_nullable(data) - } - - pub(crate) fn build_generic_list_nullable( - data: Vec>>>, - ) -> GenericListArray - where - S: OffsetSizeTrait + 'static, - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - use std::any::TypeId; - - let mut offset = vec![0]; - let mut values = vec![]; - - let list_len = data.len(); - 
let num_bytes = bit_util::ceil(list_len, 8); - let mut list_null_count = 0; - let mut list_bitmap = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - for (idx, array) in data.into_iter().enumerate() { - if let Some(mut array) = array { - values.append(&mut array); - } else { - list_null_count += 1; - bit_util::unset_bit(&mut list_bitmap.as_slice_mut(), idx); - } - offset.push(values.len() as i64); - } - - let value_data = PrimitiveArray::::from(values).data().clone(); - let (list_data_type, value_offsets) = if TypeId::of::() == TypeId::of::() - { - ( - DataType::List(Box::new(Field::new( - "item", - T::DATA_TYPE, - list_null_count == 0, - ))), - Buffer::from_slice_ref( - &offset.into_iter().map(|x| x as i32).collect::>(), - ), - ) - } else if TypeId::of::() == TypeId::of::() { - ( - DataType::LargeList(Box::new(Field::new( - "item", - T::DATA_TYPE, - list_null_count == 0, - ))), - Buffer::from_slice_ref(&offset), - ) - } else { - unreachable!() - }; - - let list_data = ArrayData::builder(list_data_type) - .len(list_len) - .null_bit_buffer(list_bitmap.into()) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - - GenericListArray::::from(list_data) - } - - pub(crate) fn build_fixed_size_list( - data: Vec>>, - length: ::Native, - ) -> FixedSizeListArray - where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let data = data - .into_iter() - .map(|subarray| { - subarray.map(|item| { - item.into_iter() - .map(Some) - .collect::>>() - }) - }) - .collect(); - build_fixed_size_list_nullable(data, length) - } - - pub(crate) fn build_fixed_size_list_nullable( - list_values: Vec>>>, - length: ::Native, - ) -> FixedSizeListArray - where - T: ArrowPrimitiveType, - PrimitiveArray: From>>, - { - let mut values = vec![]; - let mut list_null_count = 0; - let list_len = list_values.len(); - - let num_bytes = bit_util::ceil(list_len, 8); - let mut list_bitmap = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true); - for (idx, list_element) in list_values.into_iter().enumerate() { - if let Some(items) = list_element { - // every sub-array should have the same length - debug_assert_eq!(length as usize, items.len()); - - values.extend(items.into_iter()); - } else { - list_null_count += 1; - bit_util::unset_bit(&mut list_bitmap.as_slice_mut(), idx); - values.extend(vec![None; length as usize].into_iter()); - } - } - - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", T::DATA_TYPE, list_null_count == 0)), - length, - ); - - let child_data = PrimitiveArray::::from(values).data().clone(); - - let list_data = ArrayData::builder(list_data_type) - .len(list_len) - .null_bit_buffer(list_bitmap.into()) - .add_child_data(child_data) - .build(); - - FixedSizeListArray::from(list_data) - } - - #[test] - fn test_take_value_index_from_list() { - let list = build_generic_list::(vec![ - Some(vec![0, 1]), - Some(vec![2, 3, 4]), - Some(vec![5, 6, 7, 8, 9]), - ]); - let indices = UInt32Array::from(vec![2, 0]); - - let (indexed, offsets) = take_value_indices_from_list(&list, &indices).unwrap(); - - assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1])); - assert_eq!(offsets, vec![0, 5, 7]); - } - - #[test] - fn test_take_value_index_from_large_list() { - let list = build_generic_list::(vec![ - Some(vec![0, 1]), - Some(vec![2, 3, 4]), - Some(vec![5, 6, 7, 8, 9]), - ]); - let indices = UInt32Array::from(vec![2, 0]); - - let (indexed, offsets) = - take_value_indices_from_list::<_, Int64Type>(&list, &indices).unwrap(); - - assert_eq!(indexed, 
Int64Array::from(vec![5, 6, 7, 8, 9, 0, 1])); - assert_eq!(offsets, vec![0, 5, 7]); - } - - #[test] - fn test_take_value_index_from_fixed_list() { - let list = build_fixed_size_list_nullable::( - vec![ - Some(vec![Some(1), Some(2), None]), - Some(vec![Some(4), None, Some(6)]), - None, - Some(vec![None, Some(8), Some(9)]), - ], - 3, - ); - - let indices = UInt32Array::from(vec![2, 1, 0]); - let indexed = - take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); - - assert_eq!(indexed, UInt32Array::from(vec![6, 7, 8, 3, 4, 5, 0, 1, 2])); - - let indices = UInt32Array::from(vec![3, 2, 1, 2, 0]); - let indexed = - take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap(); - - assert_eq!( - indexed, - UInt32Array::from(vec![9, 10, 11, 6, 7, 8, 3, 4, 5, 6, 7, 8, 0, 1, 2]) - ); - } -} diff --git a/rust/arrow/src/csv/mod.rs b/rust/arrow/src/csv/mod.rs deleted file mode 100644 index ffe82f33580..00000000000 --- a/rust/arrow/src/csv/mod.rs +++ /dev/null @@ -1,27 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Transfer data between the Arrow memory format and CSV (comma-separated values). - -pub mod reader; -pub mod writer; - -pub use self::reader::infer_schema_from_files; -pub use self::reader::Reader; -pub use self::reader::ReaderBuilder; -pub use self::writer::Writer; -pub use self::writer::WriterBuilder; diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs deleted file mode 100644 index 985c88b4978..00000000000 --- a/rust/arrow/src/csv/reader.rs +++ /dev/null @@ -1,1291 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! CSV Reader -//! -//! This CSV reader allows CSV files to be read into the Arrow memory model. Records are -//! loaded in batches and are then converted from row-based data to columnar data. -//! -//! Example: -//! -//! ``` -//! use arrow::csv; -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use std::fs::File; -//! use std::sync::Arc; -//! -//! let schema = Schema::new(vec![ -//! 
Field::new("city", DataType::Utf8, false), -//! Field::new("lat", DataType::Float64, false), -//! Field::new("lng", DataType::Float64, false), -//! ]); -//! -//! let file = File::open("test/data/uk_cities.csv").unwrap(); -//! -//! let mut csv = csv::Reader::new(file, Arc::new(schema), false, None, 1024, None, None); -//! let batch = csv.next().unwrap().unwrap(); -//! ``` - -use core::cmp::min; -use lazy_static::lazy_static; -use regex::{Regex, RegexBuilder}; -use std::collections::HashSet; -use std::fmt; -use std::fs::File; -use std::io::{Read, Seek, SeekFrom}; -use std::sync::Arc; - -use csv as csv_crate; - -use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringArray}; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::record_batch::RecordBatch; - -use self::csv_crate::{ByteRecord, StringRecord}; - -lazy_static! { - static ref DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.\d+)$").unwrap(); - static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d+)$").unwrap(); - static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$") - .case_insensitive(true) - .build() - .unwrap(); - static ref DATE_RE: Regex = Regex::new(r"^\d{4}-\d\d-\d\d$").unwrap(); - static ref DATETIME_RE: Regex = - Regex::new(r"^\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d$").unwrap(); -} - -/// Infer the data type of a record -fn infer_field_schema(string: &str) -> DataType { - // when quoting is enabled in the reader, these quotes aren't escaped, we default to - // Utf8 for them - if string.starts_with('"') { - return DataType::Utf8; - } - // match regex in a particular order - if BOOLEAN_RE.is_match(string) { - DataType::Boolean - } else if DECIMAL_RE.is_match(string) { - DataType::Float64 - } else if INTEGER_RE.is_match(string) { - DataType::Int64 - } else if DATETIME_RE.is_match(string) { - DataType::Date64 - } else if DATE_RE.is_match(string) { - DataType::Date32 - } else { - DataType::Utf8 - } -} - -/// Infer the schema of a CSV file by reading through the first n records of the file, -/// with `max_read_records` controlling the maximum number of records to read. -/// -/// If `max_read_records` is not set, the whole file is read to infer its schema. -/// -/// Return infered schema and number of records used for inference. This function does not change -/// reader cursor offset. -pub fn infer_file_schema( - reader: &mut R, - delimiter: u8, - max_read_records: Option, - has_header: bool, -) -> Result<(Schema, usize)> { - let saved_offset = reader.seek(SeekFrom::Current(0))?; - - let (schema, records_count) = - infer_reader_schema(reader, delimiter, max_read_records, has_header)?; - - // return the reader seek back to the start - reader.seek(SeekFrom::Start(saved_offset))?; - - Ok((schema, records_count)) -} - -/// Infer schema of CSV records provided by struct that implements `Read` trait. -/// -/// `max_read_records` controlling the maximum number of records to read. If `max_read_records` is -/// not set, all records are read to infer the schema. -/// -/// Return infered schema and number of records used for inference. 
-pub fn infer_reader_schema( - reader: &mut R, - delimiter: u8, - max_read_records: Option, - has_header: bool, -) -> Result<(Schema, usize)> { - let mut csv_reader = csv_crate::ReaderBuilder::new() - .delimiter(delimiter) - .from_reader(reader); - - // get or create header names - // when has_header is false, creates default column names with column_ prefix - let headers: Vec = if has_header { - let headers = &csv_reader.headers()?.clone(); - headers.iter().map(|s| s.to_string()).collect() - } else { - let first_record_count = &csv_reader.headers()?.len(); - (0..*first_record_count) - .map(|i| format!("column_{}", i + 1)) - .collect() - }; - - let header_length = headers.len(); - // keep track of inferred field types - let mut column_types: Vec> = vec![HashSet::new(); header_length]; - // keep track of columns with nulls - let mut nulls: Vec = vec![false; header_length]; - - let mut records_count = 0; - let mut fields = vec![]; - - let mut record = StringRecord::new(); - let max_records = max_read_records.unwrap_or(usize::MAX); - while records_count < max_records { - if !csv_reader.read_record(&mut record)? { - break; - } - records_count += 1; - - for i in 0..header_length { - if let Some(string) = record.get(i) { - if string.is_empty() { - nulls[i] = true; - } else { - column_types[i].insert(infer_field_schema(string)); - } - } - } - } - - // build schema from inference results - for i in 0..header_length { - let possibilities = &column_types[i]; - let has_nulls = nulls[i]; - let field_name = &headers[i]; - - // determine data type based on possible types - // if there are incompatible types, use DataType::Utf8 - match possibilities.len() { - 1 => { - for dtype in possibilities.iter() { - fields.push(Field::new(&field_name, dtype.clone(), has_nulls)); - } - } - 2 => { - if possibilities.contains(&DataType::Int64) - && possibilities.contains(&DataType::Float64) - { - // we have an integer and double, fall down to double - fields.push(Field::new(&field_name, DataType::Float64, has_nulls)); - } else { - // default to Utf8 for conflicting datatypes (e.g bool and int) - fields.push(Field::new(&field_name, DataType::Utf8, has_nulls)); - } - } - _ => fields.push(Field::new(&field_name, DataType::Utf8, has_nulls)), - } - } - - Ok((Schema::new(fields), records_count)) -} - -/// Infer schema from a list of CSV files by reading through first n records -/// with `max_read_records` controlling the maximum number of records to read. -/// -/// Files will be read in the given order untill n records have been reached. -/// -/// If `max_read_records` is not set, all files will be read fully to infer the schema. -pub fn infer_schema_from_files( - files: &[String], - delimiter: u8, - max_read_records: Option, - has_header: bool, -) -> Result { - let mut schemas = vec![]; - let mut records_to_read = max_read_records.unwrap_or(std::usize::MAX); - - for fname in files.iter() { - let (schema, records_read) = infer_file_schema( - &mut File::open(fname)?, - delimiter, - Some(records_to_read), - has_header, - )?; - if records_read == 0 { - continue; - } - schemas.push(schema.clone()); - records_to_read -= records_read; - if records_to_read == 0 { - break; - } - } - - Schema::try_merge(schemas) -} - -// optional bounds of the reader, of the form (min line, max line). 
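// A standalone sketch of the per-column type-unification rule used by the
// schema inference above: a single candidate type is kept as-is, an
// Int64/Float64 mix widens to Float64, and any other combination falls back to
// Utf8. `ColType` stands in for the crate's `DataType`; names are illustrative.
use std::collections::HashSet;

#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
enum ColType {
    Boolean,
    Int64,
    Float64,
    Date32,
    Date64,
    Utf8,
}

fn unify(candidates: &HashSet<ColType>) -> ColType {
    match candidates.len() {
        // exactly one inferred type: keep it
        1 => *candidates.iter().next().unwrap(),
        // integers mixed with doubles widen to doubles
        2 if candidates.contains(&ColType::Int64)
            && candidates.contains(&ColType::Float64) =>
        {
            ColType::Float64
        }
        // anything else (including conflicting types) falls back to Utf8
        _ => ColType::Utf8,
    }
}

fn main() {
    let widened: HashSet<_> = vec![ColType::Int64, ColType::Float64].into_iter().collect();
    assert_eq!(unify(&widened), ColType::Float64);

    let conflicting: HashSet<_> = vec![ColType::Boolean, ColType::Int64].into_iter().collect();
    assert_eq!(unify(&conflicting), ColType::Utf8);
}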
-type Bounds = Option<(usize, usize)>; - -/// CSV file reader -pub struct Reader { - /// Explicit schema for the CSV file - schema: SchemaRef, - /// Optional projection for which columns to load (zero-based column indices) - projection: Option>, - /// File reader - reader: csv_crate::Reader, - /// Current line number - line_number: usize, - /// Maximum number of rows to read - end: usize, - /// Number of records per batch - batch_size: usize, - /// Vector that can hold the `StringRecord`s of the batches - batch_records: Vec, -} - -impl fmt::Debug for Reader -where - R: Read, -{ - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Reader") - .field("schema", &self.schema) - .field("projection", &self.projection) - .field("line_number", &self.line_number) - .finish() - } -} - -impl Reader { - /// Create a new CsvReader from any value that implements the `Read` trait. - /// - /// If reading a `File` or an input that supports `std::io::Read` and `std::io::Seek`; - /// you can customise the Reader, such as to enable schema inference, use - /// `ReaderBuilder`. - pub fn new( - reader: R, - schema: SchemaRef, - has_header: bool, - delimiter: Option, - batch_size: usize, - bounds: Bounds, - projection: Option>, - ) -> Self { - Self::from_reader( - reader, schema, has_header, delimiter, batch_size, bounds, projection, - ) - } - - /// Returns the schema of the reader, useful for getting the schema without reading - /// record batches - pub fn schema(&self) -> SchemaRef { - match &self.projection { - Some(projection) => { - let fields = self.schema.fields(); - let projected_fields: Vec = - projection.iter().map(|i| fields[*i].clone()).collect(); - - Arc::new(Schema::new(projected_fields)) - } - None => self.schema.clone(), - } - } - - /// Create a new CsvReader from a Reader - /// - /// This constructor allows you more flexibility in what records are processed by the - /// csv reader. - pub fn from_reader( - reader: R, - schema: SchemaRef, - has_header: bool, - delimiter: Option, - batch_size: usize, - bounds: Bounds, - projection: Option>, - ) -> Self { - let mut reader_builder = csv_crate::ReaderBuilder::new(); - reader_builder.has_headers(has_header); - - if let Some(c) = delimiter { - reader_builder.delimiter(c); - } - - let mut csv_reader = reader_builder.from_reader(reader); - - let (start, end) = match bounds { - None => (0, usize::MAX), - Some((start, end)) => (start, end), - }; - - // First we will skip `start` rows - // note that this skips by iteration. This is because in general it is not possible - // to seek in CSV. 
However, skiping still saves the burden of creating arrow arrays, - // which is a slow operation that scales with the number of columns - - let mut record = ByteRecord::new(); - // Skip first start items - for _ in 0..start { - let res = csv_reader.read_byte_record(&mut record); - if !res.unwrap_or(false) { - break; - } - } - - // Initialize batch_records with StringRecords so they - // can be reused accross batches - let mut batch_records = Vec::with_capacity(batch_size); - batch_records.resize_with(batch_size, Default::default); - - Self { - schema, - projection, - reader: csv_reader, - line_number: if has_header { start + 1 } else { start }, - batch_size, - end, - batch_records, - } - } -} - -impl Iterator for Reader { - type Item = Result; - - fn next(&mut self) -> Option { - let remaining = self.end - self.line_number; - - let mut read_records = 0; - for i in 0..min(self.batch_size, remaining) { - match self.reader.read_record(&mut self.batch_records[i]) { - Ok(true) => { - read_records += 1; - } - Ok(false) => break, - Err(e) => { - return Some(Err(ArrowError::ParseError(format!( - "Error parsing line {}: {:?}", - self.line_number + i, - e - )))) - } - } - } - - // return early if no data was loaded - if read_records == 0 { - return None; - } - - // parse the batches into a RecordBatch - let result = parse( - &self.batch_records[..read_records], - &self.schema.fields(), - Some(self.schema.metadata.clone()), - &self.projection, - self.line_number, - ); - - self.line_number += read_records; - - Some(result) - } -} - -/// parses a slice of [csv_crate::StringRecord] into a [array::record_batch::RecordBatch]. -fn parse( - rows: &[StringRecord], - fields: &[Field], - metadata: Option>, - projection: &Option>, - line_number: usize, -) -> Result { - let projection: Vec = match projection { - Some(ref v) => v.clone(), - None => fields.iter().enumerate().map(|(i, _)| i).collect(), - }; - - let arrays: Result> = projection - .iter() - .map(|i| { - let i = *i; - let field = &fields[i]; - match field.data_type() { - &DataType::Boolean => build_boolean_array(line_number, rows, i), - &DataType::Int8 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Int16 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Int32 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Int64 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::UInt8 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::UInt16 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::UInt32 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::UInt64 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Float32 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Float64 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Date32 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Date64 => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Timestamp(TimeUnit::Microsecond, _) => { - build_primitive_array::( - line_number, - rows, - i, - ) - } - &DataType::Timestamp(TimeUnit::Nanosecond, _) => { - build_primitive_array::(line_number, rows, i) - } - &DataType::Utf8 => Ok(Arc::new( - rows.iter().map(|row| row.get(i)).collect::(), - ) as ArrayRef), - other => Err(ArrowError::ParseError(format!( - "Unsupported data type {:?}", - other - ))), - } - }) - .collect(); - - let projected_fields: Vec = - projection.iter().map(|i| 
fields[*i].clone()).collect(); - - let projected_schema = Arc::new(match metadata { - None => Schema::new(projected_fields), - Some(metadata) => Schema::new_with_metadata(projected_fields, metadata), - }); - - arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr)) -} - -/// Specialized parsing implementations -trait Parser: ArrowPrimitiveType { - fn parse(string: &str) -> Option { - string.parse::().ok() - } -} - -impl Parser for Float32Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} -impl Parser for Float64Type { - fn parse(string: &str) -> Option { - lexical_core::parse(string.as_bytes()).ok() - } -} - -impl Parser for UInt64Type {} - -impl Parser for UInt32Type {} - -impl Parser for UInt16Type {} - -impl Parser for UInt8Type {} - -impl Parser for Int64Type {} - -impl Parser for Int32Type {} - -impl Parser for Int16Type {} - -impl Parser for Int8Type {} - -/// Number of days between 0001-01-01 and 1970-01-01 -const EPOCH_DAYS_FROM_CE: i32 = 719_163; - -impl Parser for Date32Type { - fn parse(string: &str) -> Option { - use chrono::Datelike; - - match Self::DATA_TYPE { - DataType::Date32 => { - let date = string.parse::().ok()?; - Self::Native::from_i32(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } - _ => None, - } - } -} - -impl Parser for Date64Type { - fn parse(string: &str) -> Option { - match Self::DATA_TYPE { - DataType::Date64 => { - let date_time = string.parse::().ok()?; - Self::Native::from_i64(date_time.timestamp_millis()) - } - _ => None, - } - } -} - -impl Parser for TimestampNanosecondType { - fn parse(string: &str) -> Option { - match Self::DATA_TYPE { - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - let date_time = string.parse::().ok()?; - Self::Native::from_i64(date_time.timestamp_nanos()) - } - _ => None, - } - } -} - -impl Parser for TimestampMicrosecondType { - fn parse(string: &str) -> Option { - match Self::DATA_TYPE { - DataType::Timestamp(TimeUnit::Microsecond, None) => { - let date_time = string.parse::().ok()?; - Self::Native::from_i64(date_time.timestamp_nanos() / 1000) - } - _ => None, - } - } -} - -fn parse_item(string: &str) -> Option { - T::parse(string) -} - -fn parse_bool(string: &str) -> Option { - if string.eq_ignore_ascii_case("false") { - Some(false) - } else if string.eq_ignore_ascii_case("true") { - Some(true) - } else { - None - } -} - -// parses a specific column (col_idx) into an Arrow Array. -fn build_primitive_array( - line_number: usize, - rows: &[StringRecord], - col_idx: usize, -) -> Result { - rows.iter() - .enumerate() - .map(|(row_index, row)| { - match row.get(col_idx) { - Some(s) => { - if s.is_empty() { - return Ok(None); - } - - let parsed = parse_item::(s); - match parsed { - Some(e) => Ok(Some(e)), - None => Err(ArrowError::ParseError(format!( - // TODO: we should surface the underlying error here. - "Error while parsing value {} for column {} at line {}", - s, - col_idx, - line_number + row_index - ))), - } - } - None => Ok(None), - } - }) - .collect::>>() - .map(|e| Arc::new(e) as ArrayRef) -} - -// parses a specific column (col_idx) into an Arrow Array. 
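// A standalone sketch of the per-column parsing loop that
// `build_primitive_array` above and `build_boolean_array` below implement:
// empty cells become nulls, and a parse failure reports the offending value,
// column and line. It works on plain rows of string slices instead of
// `StringRecord`s; the function name and error type are illustrative only.
use std::str::FromStr;

fn parse_column<T: FromStr>(
    rows: &[Vec<&str>],
    col_idx: usize,
    first_line: usize,
) -> Result<Vec<Option<T>>, String> {
    rows.iter()
        .enumerate()
        .map(|(row_idx, row)| match row.get(col_idx) {
            // a missing or empty cell becomes a null slot
            None => Ok(None),
            Some(s) if s.is_empty() => Ok(None),
            Some(s) => s.parse::<T>().map(Some).map_err(|_| {
                format!(
                    "Error while parsing value {} for column {} at line {}",
                    s,
                    col_idx,
                    first_line + row_idx
                )
            }),
        })
        .collect()
}

fn main() {
    let rows = vec![vec!["1", "x"], vec!["", "y"], vec!["3", "z"]];

    let ints: Vec<Option<i64>> = parse_column(&rows, 0, 1).unwrap();
    assert_eq!(ints, vec![Some(1), None, Some(3)]);

    // a non-numeric cell surfaces the column and line number in the error
    assert!(parse_column::<i64>(&rows, 1, 1).is_err());
}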
-fn build_boolean_array( - line_number: usize, - rows: &[StringRecord], - col_idx: usize, -) -> Result { - rows.iter() - .enumerate() - .map(|(row_index, row)| { - match row.get(col_idx) { - Some(s) => { - if s.is_empty() { - return Ok(None); - } - - let parsed = parse_bool(s); - match parsed { - Some(e) => Ok(Some(e)), - None => Err(ArrowError::ParseError(format!( - // TODO: we should surface the underlying error here. - "Error while parsing value {} for column {} at line {}", - s, - col_idx, - line_number + row_index - ))), - } - } - None => Ok(None), - } - }) - .collect::>() - .map(|e| Arc::new(e) as ArrayRef) -} - -/// CSV file reader builder -#[derive(Debug)] -pub struct ReaderBuilder { - /// Optional schema for the CSV file - /// - /// If the schema is not supplied, the reader will try to infer the schema - /// based on the CSV structure. - schema: Option, - /// Whether the file has headers or not - /// - /// If schema inference is run on a file with no headers, default column names - /// are created. - has_header: bool, - /// An optional column delimiter. Defaults to `b','` - delimiter: Option, - /// Optional maximum number of records to read during schema inference - /// - /// If a number is not provided, all the records are read. - max_records: Option, - /// Batch size (number of records to load each time) - /// - /// The default batch size when using the `ReaderBuilder` is 1024 records - batch_size: usize, - /// The bounds over which to scan the reader. `None` starts from 0 and runs until EOF. - bounds: Bounds, - /// Optional projection for which columns to load (zero-based column indices) - projection: Option>, -} - -impl Default for ReaderBuilder { - fn default() -> Self { - Self { - schema: None, - has_header: false, - delimiter: None, - max_records: None, - batch_size: 1024, - bounds: None, - projection: None, - } - } -} - -impl ReaderBuilder { - /// Create a new builder for configuring CSV parsing options. 
- /// - /// To convert a builder into a reader, call `ReaderBuilder::build` - /// - /// # Example - /// - /// ``` - /// extern crate arrow; - /// - /// use arrow::csv; - /// use std::fs::File; - /// - /// fn example() -> csv::Reader { - /// let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - /// - /// // create a builder, inferring the schema with the first 100 records - /// let builder = csv::ReaderBuilder::new().infer_schema(Some(100)); - /// - /// let reader = builder.build(file).unwrap(); - /// - /// reader - /// } - /// ``` - pub fn new() -> ReaderBuilder { - ReaderBuilder::default() - } - - /// Set the CSV file's schema - pub fn with_schema(mut self, schema: SchemaRef) -> Self { - self.schema = Some(schema); - self - } - - /// Set whether the CSV file has headers - pub fn has_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; - self - } - - /// Set the CSV file's column delimiter as a byte character - pub fn with_delimiter(mut self, delimiter: u8) -> Self { - self.delimiter = Some(delimiter); - self - } - - /// Set the CSV reader to infer the schema of the file - pub fn infer_schema(mut self, max_records: Option) -> Self { - // remove any schema that is set - self.schema = None; - self.max_records = max_records; - self - } - - /// Set the batch size (number of records to load at one time) - pub fn with_batch_size(mut self, batch_size: usize) -> Self { - self.batch_size = batch_size; - self - } - - /// Set the reader's column projection - pub fn with_projection(mut self, projection: Vec) -> Self { - self.projection = Some(projection); - self - } - - /// Create a new `Reader` from the `ReaderBuilder` - pub fn build(self, mut reader: R) -> Result> { - // check if schema should be inferred - let delimiter = self.delimiter.unwrap_or(b','); - let schema = match self.schema { - Some(schema) => schema, - None => { - let (inferred_schema, _) = infer_file_schema( - &mut reader, - delimiter, - self.max_records, - self.has_header, - )?; - - Arc::new(inferred_schema) - } - }; - Ok(Reader::from_reader( - reader, - schema, - self.has_header, - self.delimiter, - self.batch_size, - None, - self.projection.clone(), - )) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use std::fs::File; - use std::io::{Cursor, Write}; - use tempfile::NamedTempFile; - - use crate::array::*; - use crate::datatypes::Field; - - #[test] - fn test_csv() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(57.653484 - lat.value(0) < f64::EPSILON); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - } - - #[test] - fn test_csv_schema_metadata() { - let mut metadata = std::collections::HashMap::new(); - metadata.insert("foo".to_owned(), "bar".to_owned()); - let schema = Schema::new_with_metadata( - vec![ - Field::new("city", DataType::Utf8, false), - 
Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ], - metadata.clone(), - ); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - assert_eq!(&metadata, batch.schema().metadata()); - } - - #[test] - fn test_csv_from_buf_reader() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file_with_headers = - File::open("test/data/uk_cities_with_headers.csv").unwrap(); - let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); - let both_files = file_with_headers - .chain(Cursor::new("\n".to_string())) - .chain(file_without_headers); - let mut csv = Reader::from_reader( - both_files, - Arc::new(schema), - true, - None, - 1024, - None, - None, - ); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(74, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - } - - #[test] - fn test_csv_with_schema_inference() { - let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - - let builder = ReaderBuilder::new().has_header(true).infer_schema(None); - - let mut csv = builder.build(file).unwrap(); - let expected_schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - assert_eq!(Arc::new(expected_schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(57.653484 - lat.value(0) < f64::EPSILON); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - } - - #[test] - fn test_csv_with_schema_inference_no_headers() { - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let builder = ReaderBuilder::new().infer_schema(None); - - let mut csv = builder.build(file).unwrap(); - - // csv field names should be 'column_{number}' - let schema = csv.schema(); - assert_eq!("column_1", schema.field(0).name()); - assert_eq!("column_2", schema.field(1).name()); - assert_eq!("column_3", schema.field(2).name()); - let batch = csv.next().unwrap().unwrap(); - let batch_schema = batch.schema(); - - assert_eq!(schema, batch_schema); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert!(57.653484 - lat.value(0) < f64::EPSILON); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - } - - #[test] - fn test_csv_with_projection() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut 
csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - ); - let projected_schema = Arc::new(Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - ])); - assert_eq!(projected_schema, csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(projected_schema, batch.schema()); - assert_eq!(37, batch.num_rows()); - assert_eq!(2, batch.num_columns()); - } - - #[test] - fn test_nulls() { - let schema = Schema::new(vec![ - Field::new("c_int", DataType::UInt64, false), - Field::new("c_float", DataType::Float32, false), - Field::new("c_string", DataType::Utf8, false), - ]); - - let file = File::open("test/data/null_test.csv").unwrap(); - - let mut csv = Reader::new(file, Arc::new(schema), true, None, 1024, None, None); - let batch = csv.next().unwrap().unwrap(); - - assert_eq!(false, batch.column(1).is_null(0)); - assert_eq!(false, batch.column(1).is_null(1)); - assert_eq!(true, batch.column(1).is_null(2)); - assert_eq!(false, batch.column(1).is_null(3)); - assert_eq!(false, batch.column(1).is_null(4)); - } - - #[test] - fn test_nulls_with_inference() { - let file = File::open("test/data/various_types.csv").unwrap(); - - let builder = ReaderBuilder::new() - .infer_schema(None) - .has_header(true) - .with_delimiter(b'|') - .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3, 4, 5]); - - let mut csv = builder.build(file).unwrap(); - let batch = csv.next().unwrap().unwrap(); - - assert_eq!(5, batch.num_rows()); - assert_eq!(6, batch.num_columns()); - - let schema = batch.schema(); - - assert_eq!(&DataType::Int64, schema.field(0).data_type()); - assert_eq!(&DataType::Float64, schema.field(1).data_type()); - assert_eq!(&DataType::Float64, schema.field(2).data_type()); - assert_eq!(&DataType::Boolean, schema.field(3).data_type()); - assert_eq!(&DataType::Date32, schema.field(4).data_type()); - assert_eq!(&DataType::Date64, schema.field(5).data_type()); - - let names: Vec<&str> = - schema.fields().iter().map(|x| x.name().as_str()).collect(); - assert_eq!( - names, - vec![ - "c_int", - "c_float", - "c_string", - "c_bool", - "c_date", - "c_datetime" - ] - ); - - assert_eq!(false, schema.field(0).is_nullable()); - assert_eq!(true, schema.field(1).is_nullable()); - assert_eq!(true, schema.field(2).is_nullable()); - assert_eq!(false, schema.field(3).is_nullable()); - assert_eq!(true, schema.field(4).is_nullable()); - assert_eq!(true, schema.field(5).is_nullable()); - - assert_eq!(false, batch.column(1).is_null(0)); - assert_eq!(false, batch.column(1).is_null(1)); - assert_eq!(true, batch.column(1).is_null(2)); - assert_eq!(false, batch.column(1).is_null(3)); - assert_eq!(false, batch.column(1).is_null(4)); - } - - #[test] - fn test_parse_invalid_csv() { - let file = File::open("test/data/various_types_invalid.csv").unwrap(); - - let schema = Schema::new(vec![ - Field::new("c_int", DataType::UInt64, false), - Field::new("c_float", DataType::Float32, false), - Field::new("c_string", DataType::Utf8, false), - Field::new("c_bool", DataType::Boolean, false), - ]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .has_header(true) - .with_delimiter(b'|') - .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3]); - - let mut csv = builder.build(file).unwrap(); - match csv.next() { - Some(e) => match e { - Err(e) => assert_eq!( - "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")", - format!("{:?}", e) - ), - Ok(_) => panic!("should have failed"), - 
}, - None => panic!("should have failed"), - } - } - - #[test] - fn test_infer_field_schema() { - assert_eq!(infer_field_schema("A"), DataType::Utf8); - assert_eq!(infer_field_schema("\"123\""), DataType::Utf8); - assert_eq!(infer_field_schema("10"), DataType::Int64); - assert_eq!(infer_field_schema("10.2"), DataType::Float64); - assert_eq!(infer_field_schema("true"), DataType::Boolean); - assert_eq!(infer_field_schema("false"), DataType::Boolean); - assert_eq!(infer_field_schema("2020-11-08"), DataType::Date32); - assert_eq!(infer_field_schema("2020-11-08T14:20:01"), DataType::Date64); - } - - #[test] - fn parse_date32() { - assert_eq!(parse_item::("1970-01-01").unwrap(), 0); - assert_eq!(parse_item::("2020-03-15").unwrap(), 18336); - assert_eq!(parse_item::("1945-05-08").unwrap(), -9004); - } - - #[test] - fn parse_date64() { - assert_eq!(parse_item::("1970-01-01T00:00:00").unwrap(), 0); - assert_eq!( - parse_item::("2018-11-13T17:11:10").unwrap(), - 1542129070000 - ); - assert_eq!( - parse_item::("2018-11-13T17:11:10.011").unwrap(), - 1542129070011 - ); - assert_eq!( - parse_item::("1900-02-28T12:34:56").unwrap(), - -2203932304000 - ); - } - - #[test] - fn test_infer_schema_from_multiple_files() -> Result<()> { - let mut csv1 = NamedTempFile::new()?; - let mut csv2 = NamedTempFile::new()?; - let csv3 = NamedTempFile::new()?; // empty csv file should be skipped - let mut csv4 = NamedTempFile::new()?; - writeln!(csv1, "c1,c2,c3")?; - writeln!(csv1, "1,\"foo\",0.5")?; - writeln!(csv1, "3,\"bar\",1")?; - // reading csv2 will set c2 to optional - writeln!(csv2, "c1,c2,c3,c4")?; - writeln!(csv2, "10,,3.14,true")?; - // reading csv4 will set c3 to optional - writeln!(csv4, "c1,c2,c3")?; - writeln!(csv4, "10,\"foo\",")?; - - let schema = infer_schema_from_files( - &[ - csv3.path().to_str().unwrap().to_string(), - csv1.path().to_str().unwrap().to_string(), - csv2.path().to_str().unwrap().to_string(), - csv4.path().to_str().unwrap().to_string(), - ], - b',', - Some(3), // only csv1 and csv2 should be read - true, - )?; - - assert_eq!(schema.fields().len(), 4); - assert_eq!(false, schema.field(0).is_nullable()); - assert_eq!(true, schema.field(1).is_nullable()); - assert_eq!(false, schema.field(2).is_nullable()); - assert_eq!(false, schema.field(3).is_nullable()); - - assert_eq!(&DataType::Int64, schema.field(0).data_type()); - assert_eq!(&DataType::Utf8, schema.field(1).data_type()); - assert_eq!(&DataType::Float64, schema.field(2).data_type()); - assert_eq!(&DataType::Boolean, schema.field(3).data_type()); - - Ok(()) - } - - #[test] - fn test_bounded() { - let schema = Schema::new(vec![Field::new("int", DataType::UInt32, false)]); - let data = vec![ - vec!["0"], - vec!["1"], - vec!["2"], - vec!["3"], - vec!["4"], - vec!["5"], - vec!["6"], - ]; - - let data = data - .iter() - .map(|x| x.join(",")) - .collect::>() - .join("\n"); - let data = data.as_bytes(); - - let reader = std::io::Cursor::new(data); - - let mut csv = Reader::new( - reader, - Arc::new(schema), - false, - None, - 2, - // starting at row 2 and up to row 6. 
- Some((2, 6)), - Some(vec![0]), - ); - - let batch = csv.next().unwrap().unwrap(); - let a = batch.column(0); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(a, &UInt32Array::from(vec![2, 3])); - - let batch = csv.next().unwrap().unwrap(); - let a = batch.column(0); - let a = a.as_any().downcast_ref::().unwrap(); - assert_eq!(a, &UInt32Array::from(vec![4, 5])); - - assert!(csv.next().is_none()); - } - - #[test] - fn test_parsing_bool() { - // Encode the expected behavior of boolean parsing - assert_eq!(Some(true), parse_bool("true")); - assert_eq!(Some(true), parse_bool("tRUe")); - assert_eq!(Some(true), parse_bool("True")); - assert_eq!(Some(true), parse_bool("TRUE")); - assert_eq!(None, parse_bool("t")); - assert_eq!(None, parse_bool("T")); - assert_eq!(None, parse_bool("")); - - assert_eq!(Some(false), parse_bool("false")); - assert_eq!(Some(false), parse_bool("fALse")); - assert_eq!(Some(false), parse_bool("False")); - assert_eq!(Some(false), parse_bool("FALSE")); - assert_eq!(None, parse_bool("f")); - assert_eq!(None, parse_bool("F")); - assert_eq!(None, parse_bool("")); - } - - #[test] - fn test_parsing_float() { - assert_eq!(Some(12.34), parse_item::("12.34")); - assert_eq!(Some(-12.34), parse_item::("-12.34")); - assert_eq!(Some(12.0), parse_item::("12")); - assert_eq!(Some(0.0), parse_item::("0")); - assert!(parse_item::("nan").unwrap().is_nan()); - assert!(parse_item::("NaN").unwrap().is_nan()); - assert!(parse_item::("inf").unwrap().is_infinite()); - assert!(parse_item::("inf").unwrap().is_sign_positive()); - assert!(parse_item::("-inf").unwrap().is_infinite()); - assert!(parse_item::("-inf") - .unwrap() - .is_sign_negative()); - assert_eq!(None, parse_item::("")); - assert_eq!(None, parse_item::("dd")); - assert_eq!(None, parse_item::("12.34.56")); - } -} diff --git a/rust/arrow/src/csv/writer.rs b/rust/arrow/src/csv/writer.rs deleted file mode 100644 index e9d8565b2a5..00000000000 --- a/rust/arrow/src/csv/writer.rs +++ /dev/null @@ -1,651 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! CSV Writer -//! -//! This CSV writer allows Arrow data (in record batches) to be written as CSV files. -//! The writer does not support writing `ListArray` and `StructArray`. -//! -//! Example: -//! -//! ``` -//! use arrow::array::*; -//! use arrow::csv; -//! use arrow::datatypes::*; -//! use arrow::record_batch::RecordBatch; -//! use arrow::util::test_util::get_temp_file; -//! use std::fs::File; -//! use std::sync::Arc; -//! -//! let schema = Schema::new(vec![ -//! Field::new("c1", DataType::Utf8, false), -//! Field::new("c2", DataType::Float64, true), -//! Field::new("c3", DataType::UInt32, false), -//! Field::new("c3", DataType::Boolean, true), -//! ]); -//! let c1 = StringArray::from(vec![ -//! 
"Lorem ipsum dolor sit amet", -//! "consectetur adipiscing elit", -//! "sed do eiusmod tempor", -//! ]); -//! let c2 = PrimitiveArray::::from(vec![ -//! Some(123.564532), -//! None, -//! Some(-556132.25), -//! ]); -//! let c3 = PrimitiveArray::::from(vec![3, 2, 1]); -//! let c4 = BooleanArray::from(vec![Some(true), Some(false), None]); -//! -//! let batch = RecordBatch::try_new( -//! Arc::new(schema), -//! vec![Arc::new(c1), Arc::new(c2), Arc::new(c3), Arc::new(c4)], -//! ) -//! .unwrap(); -//! -//! let file = get_temp_file("out.csv", &[]); -//! -//! let mut writer = csv::Writer::new(file); -//! let batches = vec![&batch, &batch]; -//! for batch in batches { -//! writer.write(batch).unwrap(); -//! } -//! ``` - -use csv as csv_crate; - -use std::io::Write; - -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::record_batch::RecordBatch; -use crate::{array::*, util::serialization::lexical_to_string}; -const DEFAULT_DATE_FORMAT: &str = "%F"; -const DEFAULT_TIME_FORMAT: &str = "%T"; -const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f"; - -fn write_primitive_value(array: &ArrayRef, i: usize) -> String -where - T: ArrowNumericType, - T::Native: lexical_core::ToLexical, -{ - let c = array.as_any().downcast_ref::>().unwrap(); - lexical_to_string(c.value(i)) -} - -/// A CSV writer -#[derive(Debug)] -pub struct Writer { - /// The object to write to - writer: csv_crate::Writer, - /// Column delimiter. Defaults to `b','` - delimiter: u8, - /// Whether file should be written with headers. Defaults to `true` - has_headers: bool, - /// The date format for date arrays - date_format: String, - /// The datetime format for datetime arrays - datetime_format: String, - /// The timestamp format for timestamp arrays - timestamp_format: String, - /// The time format for time arrays - time_format: String, - /// Is the beginning-of-writer - beginning: bool, -} - -impl Writer { - /// Create a new CsvWriter from a writable object, with default options - pub fn new(writer: W) -> Self { - let delimiter = b','; - let mut builder = csv_crate::WriterBuilder::new(); - let writer = builder.delimiter(delimiter).from_writer(writer); - Writer { - writer, - delimiter, - has_headers: true, - date_format: DEFAULT_DATE_FORMAT.to_string(), - datetime_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), - time_format: DEFAULT_TIME_FORMAT.to_string(), - timestamp_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), - beginning: true, - } - } - - /// Convert a record to a string vector - fn convert( - &self, - batch: &RecordBatch, - row_index: usize, - buffer: &mut [String], - ) -> Result<()> { - // TODO: it'd be more efficient if we could create `record: Vec<&[u8]> - for (col_index, item) in buffer.iter_mut().enumerate() { - let col = batch.column(col_index); - if col.is_null(row_index) { - // write an empty value - *item = "".to_string(); - continue; - } - let string = match col.data_type() { - DataType::Float64 => write_primitive_value::(col, row_index), - DataType::Float32 => write_primitive_value::(col, row_index), - DataType::Int8 => write_primitive_value::(col, row_index), - DataType::Int16 => write_primitive_value::(col, row_index), - DataType::Int32 => write_primitive_value::(col, row_index), - DataType::Int64 => write_primitive_value::(col, row_index), - DataType::UInt8 => write_primitive_value::(col, row_index), - DataType::UInt16 => write_primitive_value::(col, row_index), - DataType::UInt32 => write_primitive_value::(col, row_index), - DataType::UInt64 => write_primitive_value::(col, row_index), - 
DataType::Boolean => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_string() - } - DataType::Utf8 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_owned() - } - DataType::LargeUtf8 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_owned() - } - DataType::Date32 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value_as_date(row_index) - .unwrap() - .format(&self.date_format) - .to_string() - } - DataType::Date64 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value_as_datetime(row_index) - .unwrap() - .format(&self.datetime_format) - .to_string() - } - DataType::Time32(TimeUnit::Second) => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value_as_time(row_index) - .unwrap() - .format(&self.time_format) - .to_string() - } - DataType::Time32(TimeUnit::Millisecond) => { - let c = col - .as_any() - .downcast_ref::() - .unwrap(); - c.value_as_time(row_index) - .unwrap() - .format(&self.time_format) - .to_string() - } - DataType::Time64(TimeUnit::Microsecond) => { - let c = col - .as_any() - .downcast_ref::() - .unwrap(); - c.value_as_time(row_index) - .unwrap() - .format(&self.time_format) - .to_string() - } - DataType::Time64(TimeUnit::Nanosecond) => { - let c = col - .as_any() - .downcast_ref::() - .unwrap(); - c.value_as_time(row_index) - .unwrap() - .format(&self.time_format) - .to_string() - } - DataType::Timestamp(time_unit, _) => { - use TimeUnit::*; - let datetime = match time_unit { - Second => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Millisecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Microsecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Nanosecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - }; - format!("{}", datetime.format(&self.timestamp_format)) - } - t => { - // List and Struct arrays not supported by the writer, any - // other type needs to be implemented - return Err(ArrowError::CsvError(format!( - "CSV Writer does not support {:?} data type", - t - ))); - } - }; - *item = string; - } - Ok(()) - } - - /// Write a vector of record batches to a writable object - pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { - let num_columns = batch.num_columns(); - if self.beginning { - if self.has_headers { - let mut headers: Vec = Vec::with_capacity(num_columns); - batch - .schema() - .fields() - .iter() - .for_each(|field| headers.push(field.name().to_string())); - self.writer.write_record(&headers[..])?; - } - self.beginning = false; - } - - let mut buffer = vec!["".to_string(); batch.num_columns()]; - - for row_index in 0..batch.num_rows() { - self.convert(batch, row_index, &mut buffer)?; - self.writer.write_record(&buffer)?; - } - self.writer.flush()?; - - Ok(()) - } -} - -/// A CSV writer builder -#[derive(Debug)] -pub struct WriterBuilder { - /// Optional column delimiter. Defaults to `b','` - delimiter: Option, - /// Whether to write column names as file headers. 
Defaults to `true` - has_headers: bool, - /// Optional date format for date arrays - date_format: Option, - /// Optional datetime format for datetime arrays - datetime_format: Option, - /// Optional timestamp format for timestamp arrays - timestamp_format: Option, - /// Optional time format for time arrays - time_format: Option, -} - -impl Default for WriterBuilder { - fn default() -> Self { - Self { - has_headers: true, - delimiter: None, - date_format: Some(DEFAULT_DATE_FORMAT.to_string()), - datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: Some(DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - } - } -} - -impl WriterBuilder { - /// Create a new builder for configuring CSV writing options. - /// - /// To convert a builder into a writer, call `WriterBuilder::build` - /// - /// # Example - /// - /// ``` - /// extern crate arrow; - /// - /// use arrow::csv; - /// use std::fs::File; - /// - /// fn example() -> csv::Writer { - /// let file = File::create("target/out.csv").unwrap(); - /// - /// // create a builder that doesn't write headers - /// let builder = csv::WriterBuilder::new().has_headers(false); - /// let writer = builder.build(file); - /// - /// writer - /// } - /// ``` - pub fn new() -> Self { - Self::default() - } - - /// Set whether to write headers - pub fn has_headers(mut self, has_headers: bool) -> Self { - self.has_headers = has_headers; - self - } - - /// Set the CSV file's column delimiter as a byte character - pub fn with_delimiter(mut self, delimiter: u8) -> Self { - self.delimiter = Some(delimiter); - self - } - - /// Set the CSV file's date format - pub fn with_date_format(mut self, format: String) -> Self { - self.date_format = Some(format); - self - } - - /// Set the CSV file's time format - pub fn with_time_format(mut self, format: String) -> Self { - self.time_format = Some(format); - self - } - - /// Set the CSV file's timestamp format - pub fn with_timestamp_format(mut self, format: String) -> Self { - self.timestamp_format = Some(format); - self - } - - /// Create a new `Writer` - pub fn build(self, writer: W) -> Writer { - let delimiter = self.delimiter.unwrap_or(b','); - let mut builder = csv_crate::WriterBuilder::new(); - let writer = builder.delimiter(delimiter).from_writer(writer); - Writer { - writer, - delimiter, - has_headers: self.has_headers, - date_format: self - .date_format - .unwrap_or_else(|| DEFAULT_DATE_FORMAT.to_string()), - datetime_format: self - .datetime_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: self - .time_format - .unwrap_or_else(|| DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: self - .timestamp_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), - beginning: true, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::csv::Reader; - use crate::datatypes::{Field, Schema}; - use crate::util::string_writer::StringWriter; - use crate::util::test_util::get_temp_file; - use std::fs::File; - use std::io::{Cursor, Read}; - use std::sync::Arc; - - #[test] - fn test_write_csv() { - let schema = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::UInt32, false), - Field::new("c4", DataType::Boolean, true), - Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond, None), true), - Field::new("c6", DataType::Time32(TimeUnit::Second), false), - ]); - - let c1 = StringArray::from(vec![ - "Lorem ipsum dolor sit 
amet", - "consectetur adipiscing elit", - "sed do eiusmod tempor", - ]); - let c2 = PrimitiveArray::::from(vec![ - Some(123.564532), - None, - Some(-556132.25), - ]); - let c3 = PrimitiveArray::::from(vec![3, 2, 1]); - let c4 = BooleanArray::from(vec![Some(true), Some(false), None]); - let c5 = TimestampMillisecondArray::from_opt_vec( - vec![None, Some(1555584887378), Some(1555555555555)], - None, - ); - let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]); - - let batch = RecordBatch::try_new( - Arc::new(schema), - vec![ - Arc::new(c1), - Arc::new(c2), - Arc::new(c3), - Arc::new(c4), - Arc::new(c5), - Arc::new(c6), - ], - ) - .unwrap(); - - let file = get_temp_file("columns.csv", &[]); - - let mut writer = Writer::new(file); - let batches = vec![&batch, &batch]; - for batch in batches { - writer.write(batch).unwrap(); - } - // check that file was written successfully - let mut file = File::open("target/debug/testdata/columns.csv").unwrap(); - let mut buffer: Vec = vec![]; - file.read_to_end(&mut buffer).unwrap(); - - assert_eq!( - r#"c1,c2,c3,c4,c5,c6 -Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34 -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20 -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03 -Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34 -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20 -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03 -"# - .to_string(), - String::from_utf8(buffer).unwrap() - ); - } - - #[test] - fn test_write_csv_custom_options() { - let schema = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::UInt32, false), - Field::new("c4", DataType::Boolean, true), - Field::new("c6", DataType::Time32(TimeUnit::Second), false), - ]); - - let c1 = StringArray::from(vec![ - "Lorem ipsum dolor sit amet", - "consectetur adipiscing elit", - "sed do eiusmod tempor", - ]); - let c2 = PrimitiveArray::::from(vec![ - Some(123.564532), - None, - Some(-556132.25), - ]); - let c3 = PrimitiveArray::::from(vec![3, 2, 1]); - let c4 = BooleanArray::from(vec![Some(true), Some(false), None]); - let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]); - - let batch = RecordBatch::try_new( - Arc::new(schema), - vec![ - Arc::new(c1), - Arc::new(c2), - Arc::new(c3), - Arc::new(c4), - Arc::new(c6), - ], - ) - .unwrap(); - - let file = get_temp_file("custom_options.csv", &[]); - - let builder = WriterBuilder::new() - .has_headers(false) - .with_delimiter(b'|') - .with_time_format("%r".to_string()); - let mut writer = builder.build(file); - let batches = vec![&batch]; - for batch in batches { - writer.write(batch).unwrap(); - } - - // check that file was written successfully - let mut file = File::open("target/debug/testdata/custom_options.csv").unwrap(); - let mut buffer: Vec = vec![]; - file.read_to_end(&mut buffer).unwrap(); - - assert_eq!( - "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit||2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1||11:46:03 PM\n" - .to_string(), - String::from_utf8(buffer).unwrap() - ); - } - - #[test] - fn test_export_csv_string() { - let schema = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::UInt32, false), - Field::new("c4", DataType::Boolean, true), - Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond, None), true), 
- Field::new("c6", DataType::Time32(TimeUnit::Second), false), - ]); - - let c1 = StringArray::from(vec![ - "Lorem ipsum dolor sit amet", - "consectetur adipiscing elit", - "sed do eiusmod tempor", - ]); - let c2 = PrimitiveArray::::from(vec![ - Some(123.564532), - None, - Some(-556132.25), - ]); - let c3 = PrimitiveArray::::from(vec![3, 2, 1]); - let c4 = BooleanArray::from(vec![Some(true), Some(false), None]); - let c5 = TimestampMillisecondArray::from_opt_vec( - vec![None, Some(1555584887378), Some(1555555555555)], - None, - ); - let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]); - - let batch = RecordBatch::try_new( - Arc::new(schema), - vec![ - Arc::new(c1), - Arc::new(c2), - Arc::new(c3), - Arc::new(c4), - Arc::new(c5), - Arc::new(c6), - ], - ) - .unwrap(); - - let sw = StringWriter::new(); - let mut writer = Writer::new(sw); - let batches = vec![&batch, &batch]; - for batch in batches { - writer.write(batch).unwrap(); - } - - let left = "c1,c2,c3,c4,c5,c6 -Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34 -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20 -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03 -Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34 -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20 -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03\n"; - let right = writer.writer.into_inner().map(|s| s.to_string()); - assert_eq!(Some(left.to_string()), right.ok()); - } - - #[test] - fn test_conversion_consistency() { - // test if we can serialize and deserialize whilst retaining the same type information/ precision - - let schema = Schema::new(vec![ - Field::new("c1", DataType::Date32, false), - Field::new("c2", DataType::Date64, false), - ]); - - let c1 = Date32Array::from(vec![3, 2, 1]); - let c2 = Date64Array::from(vec![3, 2, 1]); - - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(c1), Arc::new(c2)], - ) - .unwrap(); - - let builder = WriterBuilder::new().has_headers(false); - - let mut buf: Cursor> = Default::default(); - // drop the writer early to release the borrow. - { - let mut writer = builder.build(&mut buf); - writer.write(&batch).unwrap(); - } - buf.set_position(0); - - let mut reader = Reader::new( - buf, - Arc::new(schema), - false, - None, - 3, - // starting at row 2 and up to row 6. - None, - None, - ); - let rb = reader.next().unwrap().unwrap(); - let c1 = rb.column(0).as_any().downcast_ref::().unwrap(); - let c2 = rb.column(1).as_any().downcast_ref::().unwrap(); - - let actual = c1.into_iter().collect::>(); - let expected = vec![Some(3), Some(2), Some(1)]; - assert_eq!(actual, expected); - let actual = c2.into_iter().collect::>(); - let expected = vec![Some(3), Some(2), Some(1)]; - assert_eq!(actual, expected); - } -} diff --git a/rust/arrow/src/datatypes/datatype.rs b/rust/arrow/src/datatypes/datatype.rs deleted file mode 100644 index 122cbdd5e47..00000000000 --- a/rust/arrow/src/datatypes/datatype.rs +++ /dev/null @@ -1,477 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::fmt; - -use serde_derive::{Deserialize, Serialize}; -use serde_json::{json, Value, Value::String as VString}; - -use crate::error::{ArrowError, Result}; - -use super::Field; - -/// The set of datatypes that are supported by this implementation of Apache Arrow. -/// -/// The Arrow specification on data types includes some more types. -/// See also [`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs) -/// for Arrow's specification. -/// -/// The variants of this enum include primitive fixed size types as well as parametric or -/// nested types. -/// Currently the Rust implementation supports the following nested types: -/// - `List` -/// - `Struct` -/// -/// Nested types can themselves be nested within other arrays. -/// For more information on these types please see -/// [the physical memory layout of Apache Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout). -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub enum DataType { - /// Null type - Null, - /// A boolean datatype representing the values `true` and `false`. - Boolean, - /// A signed 8-bit integer. - Int8, - /// A signed 16-bit integer. - Int16, - /// A signed 32-bit integer. - Int32, - /// A signed 64-bit integer. - Int64, - /// An unsigned 8-bit integer. - UInt8, - /// An unsigned 16-bit integer. - UInt16, - /// An unsigned 32-bit integer. - UInt32, - /// An unsigned 64-bit integer. - UInt64, - /// A 16-bit floating point number. - Float16, - /// A 32-bit floating point number. - Float32, - /// A 64-bit floating point number. - Float64, - /// A timestamp with an optional timezone. - /// - /// Time is measured as a Unix epoch, counting the seconds from - /// 00:00:00.000 on 1 January 1970, excluding leap seconds, - /// as a 64-bit integer. - /// - /// The time zone is a string indicating the name of a time zone, one of: - /// - /// * As used in the Olson time zone database (the "tz database" or - /// "tzdata"), such as "America/New_York" - /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 - Timestamp(TimeUnit, Option), - /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) - /// in days (32 bits). - Date32, - /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) - /// in milliseconds (64 bits). Values are evenly divisible by 86400000. - Date64, - /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. - Time32(TimeUnit), - /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. - Time64(TimeUnit), - /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. - Duration(TimeUnit), - /// A "calendar" interval which models types that don't necessarily - /// have a precise duration without the context of a base timestamp (e.g. - /// days can differ in length during day light savings time transitions). - Interval(IntervalUnit), - /// Opaque binary data of variable length. - Binary, - /// Opaque binary data of fixed size. 
- /// Enum parameter specifies the number of bytes per value. - FixedSizeBinary(i32), - /// Opaque binary data of variable length and 64-bit offsets. - LargeBinary, - /// A variable-length string in Unicode with UTF-8 encoding. - Utf8, - /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. - LargeUtf8, - /// A list of some logical data type with variable length. - List(Box), - /// A list of some logical data type with fixed length. - FixedSizeList(Box, i32), - /// A list of some logical data type with variable length and 64-bit offsets. - LargeList(Box), - /// A nested datatype that contains a number of sub-fields. - Struct(Vec), - /// A nested datatype that can represent slots of differing types. - Union(Vec), - /// A dictionary encoded array (`key_type`, `value_type`), where - /// each array element is an index of `key_type` into an - /// associated dictionary of `value_type`. - /// - /// Dictionary arrays are used to store columns of `value_type` - /// that contain many repeated values using less memory, but with - /// a higher CPU overhead for some operations. - /// - /// This type mostly used to represent low cardinality string - /// arrays or a limited set of primitive types as integers. - Dictionary(Box, Box), - /// Decimal value with precision and scale - Decimal(usize, usize), -} - -/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub enum TimeUnit { - /// Time in seconds. - Second, - /// Time in milliseconds. - Millisecond, - /// Time in microseconds. - Microsecond, - /// Time in nanoseconds. - Nanosecond, -} - -/// YEAR_MONTH or DAY_TIME interval in SQL style. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub enum IntervalUnit { - /// Indicates the number of elapsed whole months, stored as 4-byte integers. - YearMonth, - /// Indicates the number of elapsed days and milliseconds, - /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). - DayTime, -} - -impl fmt::Display for DataType { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{:?}", self) - } -} - -impl DataType { - /// Parse a data type from a JSON representation. 
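The enum above is largely self-describing: parametric variants carry their unit, child field, or precision inline. A short construction sketch follows, assuming `DataType`, `Field`, `TimeUnit`, and `IntervalUnit` are re-exported from `arrow::datatypes` as the doc examples elsewhere in this crate suggest; the concrete values are illustrative.

```rust
use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit};

fn main() {
    // A millisecond timestamp with a fixed-offset time zone string.
    let ts = DataType::Timestamp(TimeUnit::Millisecond, Some("+07:30".to_string()));
    // A calendar interval stored as (days, milliseconds) pairs.
    let interval = DataType::Interval(IntervalUnit::DayTime);
    // A list whose child field describes the element type and nullability.
    let list = DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
    // A decimal with precision 23 and scale 6 (illustrative values).
    let decimal = DataType::Decimal(23, 6);

    // `Display` for `DataType` simply forwards to `Debug`, per the impl above.
    for dt in &[ts, interval, list, decimal] {
        println!("{}", dt);
    }
}
```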
- pub(crate) fn from(json: &Value) -> Result { - let default_field = Field::new("", DataType::Boolean, true); - match *json { - Value::Object(ref map) => match map.get("name") { - Some(s) if s == "null" => Ok(DataType::Null), - Some(s) if s == "bool" => Ok(DataType::Boolean), - Some(s) if s == "binary" => Ok(DataType::Binary), - Some(s) if s == "largebinary" => Ok(DataType::LargeBinary), - Some(s) if s == "utf8" => Ok(DataType::Utf8), - Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8), - Some(s) if s == "fixedsizebinary" => { - // return a list with any type as its child isn't defined in the map - if let Some(Value::Number(size)) = map.get("byteWidth") { - Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32)) - } else { - Err(ArrowError::ParseError( - "Expecting a byteWidth for fixedsizebinary".to_string(), - )) - } - } - Some(s) if s == "decimal" => { - // return a list with any type as its child isn't defined in the map - let precision = match map.get("precision") { - Some(p) => Ok(p.as_u64().unwrap() as usize), - None => Err(ArrowError::ParseError( - "Expecting a precision for decimal".to_string(), - )), - }; - let scale = match map.get("scale") { - Some(s) => Ok(s.as_u64().unwrap() as usize), - _ => Err(ArrowError::ParseError( - "Expecting a scale for decimal".to_string(), - )), - }; - - Ok(DataType::Decimal(precision?, scale?)) - } - Some(s) if s == "floatingpoint" => match map.get("precision") { - Some(p) if p == "HALF" => Ok(DataType::Float16), - Some(p) if p == "SINGLE" => Ok(DataType::Float32), - Some(p) if p == "DOUBLE" => Ok(DataType::Float64), - _ => Err(ArrowError::ParseError( - "floatingpoint precision missing or invalid".to_string(), - )), - }, - Some(s) if s == "timestamp" => { - let unit = match map.get("unit") { - Some(p) if p == "SECOND" => Ok(TimeUnit::Second), - Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), - Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), - Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), - _ => Err(ArrowError::ParseError( - "timestamp unit missing or invalid".to_string(), - )), - }; - let tz = match map.get("timezone") { - None => Ok(None), - Some(VString(tz)) => Ok(Some(tz.clone())), - _ => Err(ArrowError::ParseError( - "timezone must be a string".to_string(), - )), - }; - Ok(DataType::Timestamp(unit?, tz?)) - } - Some(s) if s == "date" => match map.get("unit") { - Some(p) if p == "DAY" => Ok(DataType::Date32), - Some(p) if p == "MILLISECOND" => Ok(DataType::Date64), - _ => Err(ArrowError::ParseError( - "date unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "time" => { - let unit = match map.get("unit") { - Some(p) if p == "SECOND" => Ok(TimeUnit::Second), - Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), - Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), - Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), - _ => Err(ArrowError::ParseError( - "time unit missing or invalid".to_string(), - )), - }; - match map.get("bitWidth") { - Some(p) if p == 32 => Ok(DataType::Time32(unit?)), - Some(p) if p == 64 => Ok(DataType::Time64(unit?)), - _ => Err(ArrowError::ParseError( - "time bitWidth missing or invalid".to_string(), - )), - } - } - Some(s) if s == "duration" => match map.get("unit") { - Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), - Some(p) if p == "MILLISECOND" => { - Ok(DataType::Duration(TimeUnit::Millisecond)) - } - Some(p) if p == "MICROSECOND" => { - Ok(DataType::Duration(TimeUnit::Microsecond)) - } - Some(p) if p == 
"NANOSECOND" => { - Ok(DataType::Duration(TimeUnit::Nanosecond)) - } - _ => Err(ArrowError::ParseError( - "time unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "interval" => match map.get("unit") { - Some(p) if p == "DAY_TIME" => { - Ok(DataType::Interval(IntervalUnit::DayTime)) - } - Some(p) if p == "YEAR_MONTH" => { - Ok(DataType::Interval(IntervalUnit::YearMonth)) - } - _ => Err(ArrowError::ParseError( - "interval unit missing or invalid".to_string(), - )), - }, - Some(s) if s == "int" => match map.get("isSigned") { - Some(&Value::Bool(true)) => match map.get("bitWidth") { - Some(&Value::Number(ref n)) => match n.as_u64() { - Some(8) => Ok(DataType::Int8), - Some(16) => Ok(DataType::Int16), - Some(32) => Ok(DataType::Int32), - Some(64) => Ok(DataType::Int64), - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - Some(&Value::Bool(false)) => match map.get("bitWidth") { - Some(&Value::Number(ref n)) => match n.as_u64() { - Some(8) => Ok(DataType::UInt8), - Some(16) => Ok(DataType::UInt16), - Some(32) => Ok(DataType::UInt32), - Some(64) => Ok(DataType::UInt64), - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int bitWidth missing or invalid".to_string(), - )), - }, - _ => Err(ArrowError::ParseError( - "int signed missing or invalid".to_string(), - )), - }, - Some(s) if s == "list" => { - // return a list with any type as its child isn't defined in the map - Ok(DataType::List(Box::new(default_field))) - } - Some(s) if s == "largelist" => { - // return a largelist with any type as its child isn't defined in the map - Ok(DataType::LargeList(Box::new(default_field))) - } - Some(s) if s == "fixedsizelist" => { - // return a list with any type as its child isn't defined in the map - if let Some(Value::Number(size)) = map.get("listSize") { - Ok(DataType::FixedSizeList( - Box::new(default_field), - size.as_i64().unwrap() as i32, - )) - } else { - Err(ArrowError::ParseError( - "Expecting a listSize for fixedsizelist".to_string(), - )) - } - } - Some(s) if s == "struct" => { - // return an empty `struct` type as its children aren't defined in the map - Ok(DataType::Struct(vec![])) - } - Some(other) => Err(ArrowError::ParseError(format!( - "invalid or unsupported type name: {} in {:?}", - other, json - ))), - None => Err(ArrowError::ParseError("type name missing".to_string())), - }, - _ => Err(ArrowError::ParseError( - "invalid json value type".to_string(), - )), - } - } - - /// Generate a JSON representation of the data type. 
- pub fn to_json(&self) -> Value { - match self { - DataType::Null => json!({"name": "null"}), - DataType::Boolean => json!({"name": "bool"}), - DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), - DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), - DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), - DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), - DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), - DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), - DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), - DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}), - DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}), - DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}), - DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), - DataType::Utf8 => json!({"name": "utf8"}), - DataType::LargeUtf8 => json!({"name": "largeutf8"}), - DataType::Binary => json!({"name": "binary"}), - DataType::LargeBinary => json!({"name": "largebinary"}), - DataType::FixedSizeBinary(byte_width) => { - json!({"name": "fixedsizebinary", "byteWidth": byte_width}) - } - DataType::Struct(_) => json!({"name": "struct"}), - DataType::Union(_) => json!({"name": "union"}), - DataType::List(_) => json!({ "name": "list"}), - DataType::LargeList(_) => json!({ "name": "largelist"}), - DataType::FixedSizeList(_, length) => { - json!({"name":"fixedsizelist", "listSize": length}) - } - DataType::Time32(unit) => { - json!({"name": "time", "bitWidth": 32, "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Time64(unit) => { - json!({"name": "time", "bitWidth": 64, "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Date32 => { - json!({"name": "date", "unit": "DAY"}) - } - DataType::Date64 => { - json!({"name": "date", "unit": "MILLISECOND"}) - } - DataType::Timestamp(unit, None) => { - json!({"name": "timestamp", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}) - } - DataType::Timestamp(unit, Some(tz)) => { - json!({"name": "timestamp", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }, "timezone": tz}) - } - DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { - IntervalUnit::YearMonth => "YEAR_MONTH", - IntervalUnit::DayTime => "DAY_TIME", - }}), - DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { - TimeUnit::Second => "SECOND", - TimeUnit::Millisecond => "MILLISECOND", - TimeUnit::Microsecond => "MICROSECOND", - TimeUnit::Nanosecond => "NANOSECOND", - }}), - DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), - DataType::Decimal(precision, scale) => { - json!({"name": "decimal", "precision": precision, "scale": scale}) - } - } - } - - /// Returns true if this type is numeric: (UInt*, Unit*, or Float*). 
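`to_json` emits the JSON shape used by the Arrow integration-test files, and `is_numeric` (defined just below) is a simple membership test over the integer and float variants. A hedged usage sketch, assuming `serde_json::json!` on the caller's side:

```rust
use arrow::datatypes::DataType;
use serde_json::json;

fn main() {
    // Integer types carry their bit width and signedness.
    assert_eq!(
        DataType::Int32.to_json(),
        json!({"name": "int", "bitWidth": 32, "isSigned": true})
    );
    // Date32 is serialized as a DAY-unit date.
    assert_eq!(
        DataType::Date32.to_json(),
        json!({"name": "date", "unit": "DAY"})
    );
    // Signed and unsigned integers and floats all count as numeric; strings do not.
    assert!(DataType::is_numeric(&DataType::UInt16));
    assert!(DataType::is_numeric(&DataType::Float64));
    assert!(!DataType::is_numeric(&DataType::Utf8));
}
```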
- pub fn is_numeric(t: &DataType) -> bool { - use DataType::*; - matches!( - t, - UInt8 - | UInt16 - | UInt32 - | UInt64 - | Int8 - | Int16 - | Int32 - | Int64 - | Float32 - | Float64 - ) - } - - /// Compares the datatype with another, ignoring nested field names - /// and metadata. - pub(crate) fn equals_datatype(&self, other: &DataType) -> bool { - match (&self, other) { - (DataType::List(a), DataType::List(b)) - | (DataType::LargeList(a), DataType::LargeList(b)) => { - a.is_nullable() == b.is_nullable() - && a.data_type().equals_datatype(b.data_type()) - } - (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => { - a_size == b_size - && a.is_nullable() == b.is_nullable() - && a.data_type().equals_datatype(b.data_type()) - } - (DataType::Struct(a), DataType::Struct(b)) => { - a.len() == b.len() - && a.iter().zip(b).all(|(a, b)| { - a.is_nullable() == b.is_nullable() - && a.data_type().equals_datatype(b.data_type()) - }) - } - _ => self == other, - } - } -} diff --git a/rust/arrow/src/datatypes/field.rs b/rust/arrow/src/datatypes/field.rs deleted file mode 100644 index a471f12ef95..00000000000 --- a/rust/arrow/src/datatypes/field.rs +++ /dev/null @@ -1,541 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::collections::BTreeMap; - -use serde_derive::{Deserialize, Serialize}; -use serde_json::{json, Value}; - -use crate::error::{ArrowError, Result}; - -use super::DataType; - -/// Contains the meta-data for a single relative type. -/// -/// The `Schema` object is an ordered collection of `Field` objects. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct Field { - name: String, - data_type: DataType, - nullable: bool, - dict_id: i64, - dict_is_ordered: bool, - /// A map of key-value pairs containing additional custom meta data. - #[serde(skip_serializing_if = "Option::is_none")] - metadata: Option>, -} - -impl Field { - /// Creates a new field - pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self { - Field { - name: name.to_string(), - data_type, - nullable, - dict_id: 0, - dict_is_ordered: false, - metadata: None, - } - } - - /// Creates a new field - pub fn new_dict( - name: &str, - data_type: DataType, - nullable: bool, - dict_id: i64, - dict_is_ordered: bool, - ) -> Self { - Field { - name: name.to_string(), - data_type, - nullable, - dict_id, - dict_is_ordered, - metadata: None, - } - } - - /// Sets the `Field`'s optional custom metadata. - /// The metadata is set as `None` for empty map. - #[inline] - pub fn set_metadata(&mut self, metadata: Option>) { - // To make serde happy, convert Some(empty_map) to None. 
- self.metadata = None; - if let Some(v) = metadata { - if !v.is_empty() { - self.metadata = Some(v); - } - } - } - - /// Returns the immutable reference to the `Field`'s optional custom metadata. - #[inline] - pub const fn metadata(&self) -> &Option> { - &self.metadata - } - - /// Returns an immutable reference to the `Field`'s name. - #[inline] - pub const fn name(&self) -> &String { - &self.name - } - - /// Returns an immutable reference to the `Field`'s data-type. - #[inline] - pub const fn data_type(&self) -> &DataType { - &self.data_type - } - - /// Indicates whether this `Field` supports null values. - #[inline] - pub const fn is_nullable(&self) -> bool { - self.nullable - } - - /// Returns the dictionary ID, if this is a dictionary type. - #[inline] - pub const fn dict_id(&self) -> Option { - match self.data_type { - DataType::Dictionary(_, _) => Some(self.dict_id), - _ => None, - } - } - - /// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type. - #[inline] - pub const fn dict_is_ordered(&self) -> Option { - match self.data_type { - DataType::Dictionary(_, _) => Some(self.dict_is_ordered), - _ => None, - } - } - - /// Parse a `Field` definition from a JSON representation. - pub fn from(json: &Value) -> Result { - match *json { - Value::Object(ref map) => { - let name = match map.get("name") { - Some(&Value::String(ref name)) => name.to_string(), - _ => { - return Err(ArrowError::ParseError( - "Field missing 'name' attribute".to_string(), - )); - } - }; - let nullable = match map.get("nullable") { - Some(&Value::Bool(b)) => b, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'nullable' attribute".to_string(), - )); - } - }; - let data_type = match map.get("type") { - Some(t) => DataType::from(t)?, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'type' attribute".to_string(), - )); - } - }; - - // Referenced example file: testing/data/arrow-ipc-stream/integration/1.0.0-littleendian/generated_custom_metadata.json.gz - let metadata = match map.get("metadata") { - Some(&Value::Array(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); - for value in values { - match value.as_object() { - Some(map) => { - if map.len() != 2 { - return Err(ArrowError::ParseError( - "Field 'metadata' must have exact two entries for each key-value map".to_string(), - )); - } - if let (Some(k), Some(v)) = - (map.get("key"), map.get("value")) - { - if let (Some(k_str), Some(v_str)) = - (k.as_str(), v.as_str()) - { - res.insert( - k_str.to_string().clone(), - v_str.to_string().clone(), - ); - } else { - return Err(ArrowError::ParseError("Field 'metadata' must have map value of string type".to_string())); - } - } else { - return Err(ArrowError::ParseError("Field 'metadata' lacks map keys named \"key\" or \"value\"".to_string())); - } - } - _ => { - return Err(ArrowError::ParseError( - "Field 'metadata' contains non-object key-value pair".to_string(), - )); - } - } - } - Some(res) - } - // We also support map format, because Schema's metadata supports this. 
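As the comment above notes, `set_metadata` normalizes an empty map to `None` so serde skips the field during serialization. A short sketch of that behavior; the field name and metadata keys are illustrative only.

```rust
use std::collections::BTreeMap;

use arrow::datatypes::{DataType, Field};

fn main() {
    let mut field = Field::new("id", DataType::Int64, false);

    // An empty map is normalized away.
    field.set_metadata(Some(BTreeMap::new()));
    assert_eq!(field.metadata(), &None);

    // A non-empty map is stored as-is.
    let mut meta = BTreeMap::new();
    meta.insert("origin".to_string(), "csv".to_string());
    field.set_metadata(Some(meta.clone()));
    assert_eq!(field.metadata(), &Some(meta));
}
```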
- // See https://github.com/apache/arrow/pull/5907 - Some(&Value::Object(ref values)) => { - let mut res: BTreeMap = BTreeMap::new(); - for (k, v) in values { - if let Some(str_value) = v.as_str() { - res.insert(k.clone(), str_value.to_string().clone()); - } else { - return Err(ArrowError::ParseError( - format!("Field 'metadata' contains non-string value for key {}", k), - )); - } - } - Some(res) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field `metadata` is not json array".to_string(), - )); - } - _ => None, - }; - - // if data_type is a struct or list, get its children - let data_type = match data_type { - DataType::List(_) - | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) => match map.get("children") { - Some(Value::Array(values)) => { - if values.len() != 1 { - return Err(ArrowError::ParseError( - "Field 'children' must have one element for a list data type".to_string(), - )); - } - match data_type { - DataType::List(_) => { - DataType::List(Box::new(Self::from(&values[0])?)) - } - DataType::LargeList(_) => { - DataType::LargeList(Box::new(Self::from(&values[0])?)) - } - DataType::FixedSizeList(_, int) => DataType::FixedSizeList( - Box::new(Self::from(&values[0])?), - int, - ), - _ => unreachable!( - "Data type should be a list, largelist or fixedsizelist" - ), - } - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - DataType::Struct(mut fields) => match map.get("children") { - Some(Value::Array(values)) => { - let struct_fields: Result> = - values.iter().map(|v| Field::from(v)).collect(); - fields.append(&mut struct_fields?); - DataType::Struct(fields) - } - Some(_) => { - return Err(ArrowError::ParseError( - "Field 'children' must be an array".to_string(), - )) - } - None => { - return Err(ArrowError::ParseError( - "Field missing 'children' attribute".to_string(), - )); - } - }, - _ => data_type, - }; - - let mut dict_id = 0; - let mut dict_is_ordered = false; - - let data_type = match map.get("dictionary") { - Some(dictionary) => { - let index_type = match dictionary.get("indexType") { - Some(t) => DataType::from(t)?, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'indexType' attribute".to_string(), - )); - } - }; - dict_id = match dictionary.get("id") { - Some(Value::Number(n)) => n.as_i64().unwrap(), - _ => { - return Err(ArrowError::ParseError( - "Field missing 'id' attribute".to_string(), - )); - } - }; - dict_is_ordered = match dictionary.get("isOrdered") { - Some(&Value::Bool(n)) => n, - _ => { - return Err(ArrowError::ParseError( - "Field missing 'isOrdered' attribute".to_string(), - )); - } - }; - DataType::Dictionary(Box::new(index_type), Box::new(data_type)) - } - _ => data_type, - }; - Ok(Field { - name, - data_type, - nullable, - dict_id, - dict_is_ordered, - metadata, - }) - } - _ => Err(ArrowError::ParseError( - "Invalid json value type for field".to_string(), - )), - } - } - - /// Generate a JSON representation of the `Field`. 
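Dictionary-encoded fields are handled specially above: the value type stays under "type" while the index type, id and ordering live under "dictionary", and the `to_json` method defined next emits the same shape. A sketch of the round trip, with an invented field name, assuming the pre-removal crate:

```rust
use arrow::datatypes::{DataType, Field};
use serde_json::json;

fn main() {
    let value = json!({
        "name": "word",
        "nullable": true,
        "type": { "name": "utf8" },
        "children": [],
        "dictionary": {
            "id": 123,
            "indexType": { "name": "int", "bitWidth": 32, "isSigned": true },
            "isOrdered": true
        }
    });

    // Parsing recovers the dictionary id, ordering and index type...
    let field = Field::from(&value).unwrap();
    let expected = Field::new_dict(
        "word",
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
        true,
        123,
        true,
    );
    assert_eq!(field, expected);

    // ...and serializing produces the same shape again.
    assert_eq!(field.to_json(), value);
}
```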
- pub fn to_json(&self) -> Value { - let children: Vec = match self.data_type() { - DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(field) => vec![field.to_json()], - DataType::LargeList(field) => vec![field.to_json()], - DataType::FixedSizeList(field, _) => vec![field.to_json()], - _ => vec![], - }; - match self.data_type() { - DataType::Dictionary(ref index_type, ref value_type) => json!({ - "name": self.name, - "nullable": self.nullable, - "type": value_type.to_json(), - "children": children, - "dictionary": { - "id": self.dict_id, - "indexType": index_type.to_json(), - "isOrdered": self.dict_is_ordered - } - }), - _ => json!({ - "name": self.name, - "nullable": self.nullable, - "type": self.data_type.to_json(), - "children": children - }), - } - } - - /// Merge field into self if it is compatible. Struct will be merged recursively. - /// NOTE: `self` may be updated to unexpected state in case of merge failure. - /// - /// Example: - /// - /// ``` - /// use arrow::datatypes::*; - /// - /// let mut field = Field::new("c1", DataType::Int64, false); - /// assert!(field.try_merge(&Field::new("c1", DataType::Int64, true)).is_ok()); - /// assert!(field.is_nullable()); - /// ``` - pub fn try_merge(&mut self, from: &Field) -> Result<()> { - // merge metadata - match (self.metadata(), from.metadata()) { - (Some(self_metadata), Some(from_metadata)) => { - let mut merged = self_metadata.clone(); - for (key, from_value) in from_metadata { - if let Some(self_value) = self_metadata.get(key) { - if self_value != from_value { - return Err(ArrowError::SchemaError(format!( - "Fail to merge field due to conflicting metadata data value for key {}", key), - )); - } - } else { - merged.insert(key.clone(), from_value.clone()); - } - } - self.set_metadata(Some(merged)); - } - (None, Some(from_metadata)) => { - self.set_metadata(Some(from_metadata.clone())); - } - _ => {} - } - if from.dict_id != self.dict_id { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting dict_id".to_string(), - )); - } - if from.dict_is_ordered != self.dict_is_ordered { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting dict_is_ordered" - .to_string(), - )); - } - match &mut self.data_type { - DataType::Struct(nested_fields) => match &from.data_type { - DataType::Struct(from_nested_fields) => { - for from_field in from_nested_fields { - let mut is_new_field = true; - for self_field in nested_fields.iter_mut() { - if self_field.name != from_field.name { - continue; - } - is_new_field = false; - self_field.try_merge(&from_field)?; - } - if is_new_field { - nested_fields.push(from_field.clone()); - } - } - } - _ => { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting datatype" - .to_string(), - )); - } - }, - DataType::Union(nested_fields) => match &from.data_type { - DataType::Union(from_nested_fields) => { - for from_field in from_nested_fields { - let mut is_new_field = true; - for self_field in nested_fields.iter_mut() { - if from_field == self_field { - is_new_field = false; - break; - } - } - if is_new_field { - nested_fields.push(from_field.clone()); - } - } - } - _ => { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting datatype" - .to_string(), - )); - } - }, - DataType::Null - | DataType::Boolean - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | 
DataType::UInt64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Timestamp(_, _) - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Binary - | DataType::LargeBinary - | DataType::Interval(_) - | DataType::LargeList(_) - | DataType::List(_) - | DataType::Dictionary(_, _) - | DataType::FixedSizeList(_, _) - | DataType::FixedSizeBinary(_) - | DataType::Utf8 - | DataType::LargeUtf8 - | DataType::Decimal(_, _) => { - if self.data_type != from.data_type { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting datatype" - .to_string(), - )); - } - } - } - if from.nullable { - self.nullable = from.nullable; - } - - Ok(()) - } - - /// Check to see if `self` is a superset of `other` field. Superset is defined as: - /// - /// * if nullability doesn't match, self needs to be nullable - /// * self.metadata is a superset of other.metadata - /// * all other fields are equal - pub fn contains(&self, other: &Field) -> bool { - if self.name != other.name - || self.data_type != other.data_type - || self.dict_id != other.dict_id - || self.dict_is_ordered != other.dict_is_ordered - { - return false; - } - - if self.nullable != other.nullable && !self.nullable { - return false; - } - - // make sure self.metadata is a superset of other.metadata - match (&self.metadata, &other.metadata) { - (None, Some(_)) => { - return false; - } - (Some(self_meta), Some(other_meta)) => { - for (k, v) in other_meta.iter() { - match self_meta.get(k) { - Some(s) => { - if s != v { - return false; - } - } - None => { - return false; - } - } - } - } - _ => {} - } - - true - } -} - -// TODO: improve display with crate https://crates.io/crates/derive_more ? -impl std::fmt::Display for Field { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{:?}", self) - } -} diff --git a/rust/arrow/src/datatypes/mod.rs b/rust/arrow/src/datatypes/mod.rs deleted file mode 100644 index 175b50b0177..00000000000 --- a/rust/arrow/src/datatypes/mod.rs +++ /dev/null @@ -1,1241 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines the logical data types of Arrow arrays. -//! -//! The most important things you might be looking for are: -//! * [`Schema`](crate::datatypes::Schema) to describe a schema. -//! * [`Field`](crate::datatypes::Field) to describe one field within a schema. -//! * [`DataType`](crate::datatypes::DataType) to describe the type of a field. 
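As the module docs above note, `Schema`, `Field` and `DataType` are the main entry points, and `SchemaRef` is simply a reference-counted `Schema`. A small sketch tying them together, assuming the pre-removal crate; the column names are made up:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema, SchemaRef};

fn main() {
    let schema = Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("name", DataType::Utf8, true),
    ]);

    // SchemaRef is just Arc<Schema>, so the schema can be shared cheaply.
    let shared: SchemaRef = Arc::new(schema);
    assert_eq!(shared.fields().len(), 2);
    assert_eq!(shared.field_with_name("id").unwrap().data_type(), &DataType::Int64);
}
```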
- -use std::sync::Arc; - -mod native; -pub use native::*; -mod field; -pub use field::*; -mod schema; -pub use schema::*; -mod numeric; -pub use numeric::*; -mod types; -pub use types::*; -mod datatype; -pub use datatype::*; - -/// A reference-counted reference to a [`Schema`](crate::datatypes::Schema). -pub type SchemaRef = Arc; - -#[cfg(test)] -mod tests { - use super::*; - use crate::error::Result; - use serde_json::Value::{Bool, Number as VNumber}; - use serde_json::{Number, Value}; - use std::{ - collections::{BTreeMap, HashMap}, - f32::NAN, - }; - - #[test] - fn test_list_datatype_equality() { - // tests that list type equality is checked while ignoring list names - let list_a = DataType::List(Box::new(Field::new("item", DataType::Int32, true))); - let list_b = DataType::List(Box::new(Field::new("array", DataType::Int32, true))); - let list_c = DataType::List(Box::new(Field::new("item", DataType::Int32, false))); - let list_d = DataType::List(Box::new(Field::new("item", DataType::UInt32, true))); - assert!(list_a.equals_datatype(&list_b)); - assert!(!list_a.equals_datatype(&list_c)); - assert!(!list_b.equals_datatype(&list_c)); - assert!(!list_a.equals_datatype(&list_d)); - - let list_e = - DataType::FixedSizeList(Box::new(Field::new("item", list_a, false)), 3); - let list_f = - DataType::FixedSizeList(Box::new(Field::new("array", list_b, false)), 3); - let list_g = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::FixedSizeBinary(3), true)), - 3, - ); - assert!(list_e.equals_datatype(&list_f)); - assert!(!list_e.equals_datatype(&list_g)); - assert!(!list_f.equals_datatype(&list_g)); - - let list_h = DataType::Struct(vec![Field::new("f1", list_e, true)]); - let list_i = DataType::Struct(vec![Field::new("f1", list_f.clone(), true)]); - let list_j = DataType::Struct(vec![Field::new("f1", list_f.clone(), false)]); - let list_k = DataType::Struct(vec![ - Field::new("f1", list_f.clone(), false), - Field::new("f2", list_g.clone(), false), - Field::new("f3", DataType::Utf8, true), - ]); - let list_l = DataType::Struct(vec![ - Field::new("ff1", list_f.clone(), false), - Field::new("ff2", list_g.clone(), false), - Field::new("ff3", DataType::LargeUtf8, true), - ]); - let list_m = DataType::Struct(vec![ - Field::new("ff1", list_f, false), - Field::new("ff2", list_g, false), - Field::new("ff3", DataType::Utf8, true), - ]); - assert!(list_h.equals_datatype(&list_i)); - assert!(!list_h.equals_datatype(&list_j)); - assert!(!list_k.equals_datatype(&list_l)); - assert!(list_k.equals_datatype(&list_m)); - } - - #[test] - fn create_struct_type() { - let _person = DataType::Struct(vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, false), - Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ), - ]); - } - - #[test] - fn serde_struct_type() { - let kv_array = [("k".to_string(), "v".to_string())]; - let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); - - // Non-empty map: should be converted as JSON obj { ... } - let mut first_name = Field::new("first_name", DataType::Utf8, false); - first_name.set_metadata(Some(field_metadata)); - - // Empty map: should be omitted. 
- let mut last_name = Field::new("last_name", DataType::Utf8, false); - last_name.set_metadata(Some(BTreeMap::default())); - - let person = DataType::Struct(vec![ - first_name, - last_name, - Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ), - ]); - - let serialized = serde_json::to_string(&person).unwrap(); - - // NOTE that this is testing the default (derived) serialization format, not the - // JSON format specified in metadata.md - - assert_eq!( - "{\"Struct\":[\ - {\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false,\"metadata\":{\"k\":\"v\"}},\ - {\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"address\",\"data_type\":{\"Struct\":\ - [{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false},\ - {\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}\ - ]},\"nullable\":false,\"dict_id\":0,\"dict_is_ordered\":false}]}", - serialized - ); - - let deserialized = serde_json::from_str(&serialized).unwrap(); - - assert_eq!(person, deserialized); - } - - #[test] - fn struct_field_to_json() { - let f = Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ); - let value: Value = serde_json::from_str( - r#"{ - "name": "address", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "street", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "zip", - "nullable": false, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - - #[test] - fn primitive_field_to_json() { - let f = Field::new("first_name", DataType::Utf8, false); - let value: Value = serde_json::from_str( - r#"{ - "name": "first_name", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }"#, - ) - .unwrap(); - assert_eq!(value, f.to_json()); - } - #[test] - fn parse_struct_from_json() { - let json = r#" - { - "name": "address", - "type": { - "name": "struct" - }, - "nullable": false, - "children": [ - { - "name": "street", - "type": { - "name": "utf8" - }, - "nullable": false, - "children": [] - }, - { - "name": "zip", - "type": { - "name": "int", - "isSigned": false, - "bitWidth": 16 - }, - "nullable": false, - "children": [] - } - ] - } - "#; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = Field::from(&value).unwrap(); - - let expected = Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ); - - assert_eq!(expected, dt); - } - - #[test] - fn parse_utf8_from_json() { - let json = "{\"name\":\"utf8\"}"; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = DataType::from(&value).unwrap(); - assert_eq!(DataType::Utf8, dt); - } - - #[test] - fn parse_int32_from_json() { - let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; - let value: Value = serde_json::from_str(json).unwrap(); - let dt = DataType::from(&value).unwrap(); - assert_eq!(DataType::Int32, dt); - } - - #[test] - fn schema_json() { - // Add some custom metadata - let metadata: HashMap 
= - [("Key".to_string(), "Value".to_string())] - .iter() - .cloned() - .collect(); - - let schema = Schema::new_with_metadata( - vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Binary, false), - Field::new("c3", DataType::FixedSizeBinary(3), false), - Field::new("c4", DataType::Boolean, false), - Field::new("c5", DataType::Date32, false), - Field::new("c6", DataType::Date64, false), - Field::new("c7", DataType::Time32(TimeUnit::Second), false), - Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), - Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false), - Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), - Field::new("c11", DataType::Time64(TimeUnit::Second), false), - Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), - Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), - Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), - Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), - Field::new( - "c16", - DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), - false, - ), - Field::new( - "c17", - DataType::Timestamp( - TimeUnit::Microsecond, - Some("Africa/Johannesburg".to_string()), - ), - false, - ), - Field::new( - "c18", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - ), - Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), - Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), - Field::new( - "c21", - DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), - false, - ), - Field::new( - "c22", - DataType::FixedSizeList( - Box::new(Field::new("bools", DataType::Boolean, false)), - 5, - ), - false, - ), - Field::new( - "c23", - DataType::List(Box::new(Field::new( - "inner_list", - DataType::List(Box::new(Field::new( - "struct", - DataType::Struct(vec![]), - true, - ))), - false, - ))), - true, - ), - Field::new( - "c24", - DataType::Struct(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::UInt16, false), - ]), - false, - ), - Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true), - Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true), - Field::new("c27", DataType::Duration(TimeUnit::Second), false), - Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), - Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), - Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), - Field::new_dict( - "c31", - DataType::Dictionary( - Box::new(DataType::Int32), - Box::new(DataType::Utf8), - ), - true, - 123, - true, - ), - Field::new("c32", DataType::LargeBinary, true), - Field::new("c33", DataType::LargeUtf8, true), - Field::new( - "c34", - DataType::LargeList(Box::new(Field::new( - "inner_large_list", - DataType::LargeList(Box::new(Field::new( - "struct", - DataType::Struct(vec![]), - false, - ))), - true, - ))), - true, - ), - ], - metadata, - ); - - let expected = schema.to_json(); - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "c2", - "nullable": false, - "type": { - "name": "binary" - }, - "children": [] - }, - { - "name": "c3", - "nullable": false, - "type": { - "name": "fixedsizebinary", - "byteWidth": 3 - }, - "children": [] - }, - { - "name": "c4", - "nullable": false, - "type": { - "name": "bool" - }, - "children": [] - }, - { - "name": "c5", - "nullable": false, - "type": { - "name": "date", - 
"unit": "DAY" - }, - "children": [] - }, - { - "name": "c6", - "nullable": false, - "type": { - "name": "date", - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c7", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c8", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c9", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c10", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 32, - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c11", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c12", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c13", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c14", - "nullable": false, - "type": { - "name": "time", - "bitWidth": 64, - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c15", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c16", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "MILLISECOND", - "timezone": "UTC" - }, - "children": [] - }, - { - "name": "c17", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "MICROSECOND", - "timezone": "Africa/Johannesburg" - }, - "children": [] - }, - { - "name": "c18", - "nullable": false, - "type": { - "name": "timestamp", - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c19", - "nullable": false, - "type": { - "name": "interval", - "unit": "DAY_TIME" - }, - "children": [] - }, - { - "name": "c20", - "nullable": false, - "type": { - "name": "interval", - "unit": "YEAR_MONTH" - }, - "children": [] - }, - { - "name": "c21", - "nullable": false, - "type": { - "name": "list" - }, - "children": [ - { - "name": "item", - "nullable": true, - "type": { - "name": "bool" - }, - "children": [] - } - ] - }, - { - "name": "c22", - "nullable": false, - "type": { - "name": "fixedsizelist", - "listSize": 5 - }, - "children": [ - { - "name": "bools", - "nullable": false, - "type": { - "name": "bool" - }, - "children": [] - } - ] - }, - { - "name": "c23", - "nullable": true, - "type": { - "name": "list" - }, - "children": [ - { - "name": "inner_list", - "nullable": false, - "type": { - "name": "list" - }, - "children": [ - { - "name": "struct", - "nullable": true, - "type": { - "name": "struct" - }, - "children": [] - } - ] - } - ] - }, - { - "name": "c24", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [ - { - "name": "a", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - }, - { - "name": "b", - "nullable": false, - "type": { - "name": "int", - "bitWidth": 16, - "isSigned": false - }, - "children": [] - } - ] - }, - { - "name": "c25", - "nullable": true, - "type": { - "name": "interval", - "unit": "YEAR_MONTH" - }, - "children": [] - }, - { - "name": "c26", - "nullable": true, - "type": { - "name": "interval", - "unit": "DAY_TIME" - }, - "children": [] - }, - { - "name": "c27", - "nullable": false, - "type": { - "name": "duration", - "unit": "SECOND" - }, - "children": [] - }, - { - "name": "c28", - 
"nullable": false, - "type": { - "name": "duration", - "unit": "MILLISECOND" - }, - "children": [] - }, - { - "name": "c29", - "nullable": false, - "type": { - "name": "duration", - "unit": "MICROSECOND" - }, - "children": [] - }, - { - "name": "c30", - "nullable": false, - "type": { - "name": "duration", - "unit": "NANOSECOND" - }, - "children": [] - }, - { - "name": "c31", - "nullable": true, - "children": [], - "type": { - "name": "utf8" - }, - "dictionary": { - "id": 123, - "indexType": { - "name": "int", - "bitWidth": 32, - "isSigned": true - }, - "isOrdered": true - } - }, - { - "name": "c32", - "nullable": true, - "type": { - "name": "largebinary" - }, - "children": [] - }, - { - "name": "c33", - "nullable": true, - "type": { - "name": "largeutf8" - }, - "children": [] - }, - { - "name": "c34", - "nullable": true, - "type": { - "name": "largelist" - }, - "children": [ - { - "name": "inner_large_list", - "nullable": true, - "type": { - "name": "largelist" - }, - "children": [ - { - "name": "struct", - "nullable": false, - "type": { - "name": "struct" - }, - "children": [] - } - ] - } - ] - } - ], - "metadata" : { - "Key": "Value" - } - }"#; - let value: Value = serde_json::from_str(&json).unwrap(); - assert_eq!(expected, value); - - // convert back to a schema - let value: Value = serde_json::from_str(&json).unwrap(); - let schema2 = Schema::from(&value).unwrap(); - - assert_eq!(schema, schema2); - - // Check that empty metadata produces empty value in JSON and can be parsed - let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - } - ], - "metadata": {} - }"#; - let value: Value = serde_json::from_str(&json).unwrap(); - let schema = Schema::from(&value).unwrap(); - assert!(schema.metadata.is_empty()); - - // Check that metadata field is not required in the JSON. 
- let json = r#"{ - "fields": [ - { - "name": "c1", - "nullable": false, - "type": { - "name": "utf8" - }, - "children": [] - } - ] - }"#; - let value: Value = serde_json::from_str(&json).unwrap(); - let schema = Schema::from(&value).unwrap(); - assert!(schema.metadata.is_empty()); - } - - #[test] - fn create_schema_string() { - let schema = person_schema(); - assert_eq!(schema.to_string(), - "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: Some({\"k\": \"v\"}) }, \ - Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"address\", data_type: Struct([\ - Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }\ - ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: None }") - } - - #[test] - fn schema_field_accessors() { - let schema = person_schema(); - - // test schema accessors - assert_eq!(schema.fields().len(), 4); - - // test field accessors - let first_name = &schema.fields()[0]; - assert_eq!(first_name.name(), "first_name"); - assert_eq!(first_name.data_type(), &DataType::Utf8); - assert_eq!(first_name.is_nullable(), false); - assert_eq!(first_name.dict_id(), None); - assert_eq!(first_name.dict_is_ordered(), None); - - let metadata = first_name.metadata(); - assert!(metadata.is_some()); - let md = metadata.as_ref().unwrap(); - assert_eq!(md.len(), 1); - let key = md.get("k"); - assert!(key.is_some()); - assert_eq!(key.unwrap(), "v"); - - let interests = &schema.fields()[3]; - assert_eq!(interests.name(), "interests"); - assert_eq!( - interests.data_type(), - &DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) - ); - assert_eq!(interests.dict_id(), Some(123)); - assert_eq!(interests.dict_is_ordered(), Some(true)); - } - - #[test] - #[should_panic( - expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]" - )] - fn schema_index_of() { - let schema = person_schema(); - assert_eq!(schema.index_of("first_name").unwrap(), 0); - assert_eq!(schema.index_of("last_name").unwrap(), 1); - schema.index_of("nickname").unwrap(); - } - - #[test] - #[should_panic( - expected = "Unable to get field named \\\"nickname\\\". 
Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\", \\\"interests\\\"]" - )] - fn schema_field_with_name() { - let schema = person_schema(); - assert_eq!( - schema.field_with_name("first_name").unwrap().name(), - "first_name" - ); - assert_eq!( - schema.field_with_name("last_name").unwrap().name(), - "last_name" - ); - schema.field_with_name("nickname").unwrap(); - } - - #[test] - fn schema_field_with_dict_id() { - let schema = person_schema(); - - let fields_dict_123: Vec<_> = schema - .fields_with_dict_id(123) - .iter() - .map(|f| f.name()) - .collect(); - assert_eq!(fields_dict_123, vec!["interests"]); - - assert!(schema.fields_with_dict_id(456).is_empty()); - } - - #[test] - fn schema_equality() { - let schema1 = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::LargeBinary, true), - ]); - let schema2 = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::LargeBinary, true), - ]); - - assert_eq!(schema1, schema2); - - let schema3 = Schema::new(vec![ - Field::new("c1", DataType::Utf8, false), - Field::new("c2", DataType::Float32, true), - ]); - let schema4 = Schema::new(vec![ - Field::new("C1", DataType::Utf8, false), - Field::new("C2", DataType::Float64, true), - ]); - - assert!(schema1 != schema3); - assert!(schema1 != schema4); - assert!(schema2 != schema3); - assert!(schema2 != schema4); - assert!(schema3 != schema4); - - let mut f = Field::new("c1", DataType::Utf8, false); - f.set_metadata(Some( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - )); - let schema5 = Schema::new(vec![ - f, - Field::new("c2", DataType::Float64, true), - Field::new("c3", DataType::LargeBinary, true), - ]); - assert!(schema1 != schema5); - } - - #[test] - fn test_arrow_native_type_to_json() { - assert_eq!(Some(Bool(true)), true.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value()); - assert_eq!( - Some(VNumber(Number::from_f64(0.01f64).unwrap())), - 0.01.into_json_value() - ); - assert_eq!( - Some(VNumber(Number::from_f64(0.01f64).unwrap())), - 0.01f64.into_json_value() - ); - assert_eq!(None, NAN.into_json_value()); - } - - fn person_schema() -> Schema { - let kv_array = [("k".to_string(), "v".to_string())]; - let field_metadata: BTreeMap = kv_array.iter().cloned().collect(); - let mut first_name = Field::new("first_name", DataType::Utf8, false); - first_name.set_metadata(Some(field_metadata)); - - Schema::new(vec![ - first_name, - Field::new("last_name", DataType::Utf8, false), - Field::new( - "address", - DataType::Struct(vec![ - Field::new("street", DataType::Utf8, false), - Field::new("zip", DataType::UInt16, false), - ]), - false, - ), - Field::new_dict( - "interests", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - true, - 123, - true, - ), - ]) - } - - #[test] - fn test_try_merge_field_with_metadata() { - // 1. 
Different values for the same key should cause error. - let metadata1: BTreeMap = - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(); - let mut f1 = Field::new("first_name", DataType::Utf8, false); - f1.set_metadata(Some(metadata1)); - - let metadata2: BTreeMap = - [("foo".to_string(), "baz".to_string())] - .iter() - .cloned() - .collect(); - let mut f2 = Field::new("first_name", DataType::Utf8, false); - f2.set_metadata(Some(metadata2)); - - assert!( - Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])]) - .is_err() - ); - - // 2. None + Some - let mut f1 = Field::new("first_name", DataType::Utf8, false); - let metadata2: BTreeMap = - [("missing".to_string(), "value".to_string())] - .iter() - .cloned() - .collect(); - let mut f2 = Field::new("first_name", DataType::Utf8, false); - f2.set_metadata(Some(metadata2)); - - assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); - assert_eq!( - f1.metadata().as_ref().unwrap(), - f2.metadata().as_ref().unwrap() - ); - - // 3. Some + Some - let mut f1 = Field::new("first_name", DataType::Utf8, false); - f1.set_metadata(Some( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - )); - let mut f2 = Field::new("first_name", DataType::Utf8, false); - f2.set_metadata(Some( - [("foo2".to_string(), "bar2".to_string())] - .iter() - .cloned() - .collect(), - )); - - assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); - assert_eq!( - f1.metadata().clone().unwrap(), - [ - ("foo".to_string(), "bar".to_string()), - ("foo2".to_string(), "bar2".to_string()) - ] - .iter() - .cloned() - .collect() - ); - - // 4. Some + None. - let mut f1 = Field::new("first_name", DataType::Utf8, false); - f1.set_metadata(Some( - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect(), - )); - let f2 = Field::new("first_name", DataType::Utf8, false); - assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_some()); - assert_eq!( - f1.metadata().clone().unwrap(), - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect() - ); - - // 5. None + None. 
- let mut f1 = Field::new("first_name", DataType::Utf8, false); - let f2 = Field::new("first_name", DataType::Utf8, false); - assert!(f1.try_merge(&f2).is_ok()); - assert!(f1.metadata().is_none()); - } - - #[test] - fn test_schema_merge() -> Result<()> { - let merged = Schema::try_merge(vec![ - Schema::new(vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, false), - Field::new( - "address", - DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)]), - false, - ), - ]), - Schema::new_with_metadata( - vec![ - // nullable merge - Field::new("last_name", DataType::Utf8, true), - Field::new( - "address", - DataType::Struct(vec![ - // add new nested field - Field::new("street", DataType::Utf8, false), - // nullable merge on nested field - Field::new("zip", DataType::UInt16, true), - ]), - false, - ), - // new field - Field::new("number", DataType::Utf8, true), - ], - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect::>(), - ), - ])?; - - assert_eq!( - merged, - Schema::new_with_metadata( - vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, true), - Field::new( - "address", - DataType::Struct(vec![ - Field::new("zip", DataType::UInt16, true), - Field::new("street", DataType::Utf8, false), - ]), - false, - ), - Field::new("number", DataType::Utf8, true), - ], - [("foo".to_string(), "bar".to_string())] - .iter() - .cloned() - .collect::>() - ) - ); - - // support merge union fields - assert_eq!( - Schema::try_merge(vec![ - Schema::new(vec![Field::new( - "c1", - DataType::Union(vec![ - Field::new("c11", DataType::Utf8, true), - Field::new("c12", DataType::Utf8, true), - ]), - false - ),]), - Schema::new(vec![Field::new( - "c1", - DataType::Union(vec![ - Field::new("c12", DataType::Utf8, true), - Field::new("c13", DataType::Time64(TimeUnit::Second), true), - ]), - false - ),]) - ])?, - Schema::new(vec![Field::new( - "c1", - DataType::Union(vec![ - Field::new("c11", DataType::Utf8, true), - Field::new("c12", DataType::Utf8, true), - Field::new("c13", DataType::Time64(TimeUnit::Second), true), - ]), - false - ),]), - ); - - // incompatible field should throw error - assert!(Schema::try_merge(vec![ - Schema::new(vec![ - Field::new("first_name", DataType::Utf8, false), - Field::new("last_name", DataType::Utf8, false), - ]), - Schema::new(vec![Field::new("last_name", DataType::Int64, false),]) - ]) - .is_err()); - - // incompatible metadata should throw error - assert!(Schema::try_merge(vec![ - Schema::new_with_metadata( - vec![Field::new("first_name", DataType::Utf8, false)], - [("foo".to_string(), "bar".to_string()),] - .iter() - .cloned() - .collect::>() - ), - Schema::new_with_metadata( - vec![Field::new("last_name", DataType::Utf8, false)], - [("foo".to_string(), "baz".to_string()),] - .iter() - .cloned() - .collect::>() - ) - ]) - .is_err()); - - Ok(()) - } -} diff --git a/rust/arrow/src/datatypes/native.rs b/rust/arrow/src/datatypes/native.rs deleted file mode 100644 index 6e8cf892237..00000000000 --- a/rust/arrow/src/datatypes/native.rs +++ /dev/null @@ -1,333 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
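Stepping back to field.rs for a moment: besides `try_merge`, the removed `Field::contains` checks a superset relationship, where a nullable field may stand in for a required one but not the other way around. A sketch of that rule, with an invented column name, assuming the pre-removal crate:

```rust
use arrow::datatypes::{DataType, Field};

fn main() {
    let nullable = Field::new("c1", DataType::Int64, true);
    let required = Field::new("c1", DataType::Int64, false);

    // A nullable field can hold everything a required one can, but not vice versa.
    assert!(nullable.contains(&required));
    assert!(!required.contains(&nullable));
}
```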
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use serde_json::{Number, Value}; - -use super::DataType; - -/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.). -pub trait JsonSerializable: 'static { - fn into_json_value(self) -> Option; -} - -/// Trait expressing a Rust type that has the same in-memory representation -/// as Arrow. This includes `i16`, `f32`, but excludes `bool` (which in arrow is represented in bits). -/// In little endian machines, types that implement [`ArrowNativeType`] can be memcopied to arrow buffers -/// as is. -pub trait ArrowNativeType: - std::fmt::Debug - + Send - + Sync - + Copy - + PartialOrd - + std::str::FromStr - + Default - + JsonSerializable -{ - /// Convert native type from usize. - #[inline] - fn from_usize(_: usize) -> Option { - None - } - - /// Convert native type to usize. - #[inline] - fn to_usize(&self) -> Option { - None - } - - /// Convert native type to isize. - #[inline] - fn to_isize(&self) -> Option { - None - } - - /// Convert native type from i32. - #[inline] - fn from_i32(_: i32) -> Option { - None - } - - /// Convert native type from i64. - #[inline] - fn from_i64(_: i64) -> Option { - None - } -} - -/// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the -/// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`]. -pub trait ArrowPrimitiveType: 'static { - /// Corresponding Rust native type for the primitive type. - type Native: ArrowNativeType; - - /// the corresponding Arrow data type of this primitive type. - const DATA_TYPE: DataType; - - /// Returns the byte width of this primitive type. - fn get_byte_width() -> usize { - std::mem::size_of::() - } - - /// Returns a default value of this primitive type. - /// - /// This is useful for aggregate array ops like `sum()`, `mean()`. 
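`ArrowPrimitiveType` is what lets kernels stay generic over the concrete Arrow type while still recovering the runtime `DataType` and the native byte width. A sketch of a generic helper built only on that contract; the `describe` function is invented for illustration:

```rust
use arrow::datatypes::{ArrowPrimitiveType, DataType, Int32Type};

// Report the logical type and native byte width for any Arrow primitive type.
fn describe<T: ArrowPrimitiveType>() -> (DataType, usize) {
    (T::DATA_TYPE, T::get_byte_width())
}

fn main() {
    assert_eq!(describe::<Int32Type>(), (DataType::Int32, 4));
}
```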
- fn default_value() -> Self::Native { - Default::default() - } -} - -impl JsonSerializable for bool { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl JsonSerializable for i8 { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl ArrowNativeType for i8 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl JsonSerializable for i16 { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl ArrowNativeType for i16 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl JsonSerializable for i32 { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl ArrowNativeType for i32 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } - - /// Convert native type from i32. - #[inline] - fn from_i32(val: i32) -> Option { - Some(val) - } -} - -impl JsonSerializable for i64 { - fn into_json_value(self) -> Option { - Some(Value::Number(Number::from(self))) - } -} - -impl ArrowNativeType for i64 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } - - /// Convert native type from i64. 
- #[inline] - fn from_i64(val: i64) -> Option { - Some(val) - } -} - -impl JsonSerializable for u8 { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl ArrowNativeType for u8 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl JsonSerializable for u16 { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl ArrowNativeType for u16 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl JsonSerializable for u32 { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl ArrowNativeType for u32 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl JsonSerializable for u64 { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl ArrowNativeType for u64 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - -impl JsonSerializable for f32 { - fn into_json_value(self) -> Option { - Number::from_f64(f64::round(self as f64 * 1000.0) / 1000.0).map(Value::Number) - } -} - -impl JsonSerializable for f64 { - fn into_json_value(self) -> Option { - Number::from_f64(self).map(Value::Number) - } -} - -impl ArrowNativeType for f32 {} -impl ArrowNativeType for f64 {} - -/// Allows conversion from supported Arrow types to a byte slice. -pub trait ToByteSlice { - /// Converts this instance into a byte slice - fn to_byte_slice(&self) -> &[u8]; -} - -impl ToByteSlice for [T] { - #[inline] - fn to_byte_slice(&self) -> &[u8] { - let raw_ptr = self.as_ptr() as *const T as *const u8; - unsafe { - std::slice::from_raw_parts(raw_ptr, self.len() * std::mem::size_of::()) - } - } -} - -impl ToByteSlice for T { - #[inline] - fn to_byte_slice(&self) -> &[u8] { - let raw_ptr = self as *const T as *const u8; - unsafe { std::slice::from_raw_parts(raw_ptr, std::mem::size_of::()) } - } -} diff --git a/rust/arrow/src/datatypes/numeric.rs b/rust/arrow/src/datatypes/numeric.rs deleted file mode 100644 index 0046398122b..00000000000 --- a/rust/arrow/src/datatypes/numeric.rs +++ /dev/null @@ -1,534 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
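The `ToByteSlice` trait removed just above reinterprets native values, or slices of them, as raw bytes for buffer construction. A short sketch of both impls in use, assuming the pre-removal crate:

```rust
use arrow::datatypes::ToByteSlice;

fn main() {
    // A slice of native values exposes its backing memory as bytes...
    let values: &[i32] = &[1, 2, 3];
    assert_eq!(values.to_byte_slice().len(), 3 * std::mem::size_of::<i32>());

    // ...and so does a single value.
    let x = 1u64;
    assert_eq!(x.to_byte_slice().len(), std::mem::size_of::<u64>());
}
```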
See the License for the -// specific language governing permissions and limitations -// under the License. - -#[cfg(feature = "simd")] -use packed_simd::*; -#[cfg(feature = "simd")] -use std::ops::{Add, BitAnd, BitAndAssign, BitOr, BitOrAssign, Div, Mul, Neg, Not, Sub}; - -use super::*; - -/// A subtype of primitive type that represents numeric values. -/// -/// SIMD operations are defined in this trait if available on the target system. -#[cfg(simd)] -pub trait ArrowNumericType: ArrowPrimitiveType -where - Self::Simd: Add - + Sub - + Mul - + Div - + Copy, - Self::SimdMask: BitAnd - + BitOr - + BitAndAssign - + BitOrAssign - + Not - + Copy, -{ - /// Defines the SIMD type that should be used for this numeric type - type Simd; - - /// Defines the SIMD Mask type that should be used for this numeric type - type SimdMask; - - /// The number of SIMD lanes available - fn lanes() -> usize; - - /// Initializes a SIMD register to a constant value - fn init(value: Self::Native) -> Self::Simd; - - /// Loads a slice into a SIMD register - fn load(slice: &[Self::Native]) -> Self::Simd; - - /// Creates a new SIMD mask for this SIMD type filling it with `value` - fn mask_init(value: bool) -> Self::SimdMask; - - /// Creates a new SIMD mask for this SIMD type from the lower-most bits of the given `mask`. - /// The number of bits used corresponds to the number of lanes of this type - fn mask_from_u64(mask: u64) -> Self::SimdMask; - - /// Creates a bitmask from the given SIMD mask. - /// Each bit corresponds to one vector lane, starting with the least-significant bit. - fn mask_to_u64(mask: &Self::SimdMask) -> u64; - - /// Gets the value of a single lane in a SIMD mask - fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool; - - /// Sets the value of a single lane of a SIMD mask - fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask; - - /// Selects elements of `a` and `b` using `mask` - fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd; - - /// Returns `true` if any of the lanes in the mask are `true` - fn mask_any(mask: Self::SimdMask) -> bool; - - /// Performs a SIMD binary operation - fn bin_op Self::Simd>( - left: Self::Simd, - right: Self::Simd, - op: F, - ) -> Self::Simd; - - /// SIMD version of equal - fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of not equal - fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of less than - fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of less than or equal to - fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of greater than - fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// SIMD version of greater than or equal to - fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; - - /// Writes a SIMD result back to a slice - fn write(simd_result: Self::Simd, slice: &mut [Self::Native]); - - fn unary_op Self::Simd>(a: Self::Simd, op: F) -> Self::Simd; -} - -#[cfg(not(simd))] -pub trait ArrowNumericType: ArrowPrimitiveType {} - -macro_rules! 
make_numeric_type { - ($impl_ty:ty, $native_ty:ty, $simd_ty:ident, $simd_mask_ty:ident) => { - #[cfg(simd)] - impl ArrowNumericType for $impl_ty { - type Simd = $simd_ty; - - type SimdMask = $simd_mask_ty; - - #[inline] - fn lanes() -> usize { - Self::Simd::lanes() - } - - #[inline] - fn init(value: Self::Native) -> Self::Simd { - Self::Simd::splat(value) - } - - #[inline] - fn load(slice: &[Self::Native]) -> Self::Simd { - unsafe { Self::Simd::from_slice_unaligned_unchecked(slice) } - } - - #[inline] - fn mask_init(value: bool) -> Self::SimdMask { - Self::SimdMask::splat(value) - } - - #[inline] - fn mask_from_u64(mask: u64) -> Self::SimdMask { - // this match will get removed by the compiler since the number of lanes is known at - // compile-time for each concrete numeric type - match Self::lanes() { - 8 => { - // the bit position in each lane indicates the index of that lane - let vecidx = i64x8::new(1, 2, 4, 8, 16, 32, 64, 128); - - // broadcast the lowermost 8 bits of mask to each lane - let vecmask = i64x8::splat((mask & 0xFF) as i64); - // compute whether the bit corresponding to each lanes index is set - let vecmask = (vecidx & vecmask).eq(vecidx); - - // transmute is necessary because the different match arms return different - // mask types, at runtime only one of those expressions will exist per type, - // with the type being equal to `SimdMask`. - unsafe { std::mem::transmute(vecmask) } - } - 16 => { - // same general logic as for 8 lanes, extended to 16 bits - let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, - ); - - let vecmask = i32x16::splat((mask & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - unsafe { std::mem::transmute(vecmask) } - } - 32 => { - // compute two separate m32x16 vector masks from from the lower-most 32 bits of `mask` - // and then combine them into one m16x32 vector mask by writing and reading a temporary - let tmp = &mut [0_i16; 32]; - - let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, - ); - - let vecmask = i32x16::splat((mask & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i16x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[0..16]); - - let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i16x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[16..32]); - - unsafe { std::mem::transmute(i16x32::from_slice_unaligned(tmp)) } - } - 64 => { - // compute four m32x16 vector masks from from all 64 bits of `mask` - // and convert them into one m8x64 vector mask by writing and reading a temporary - let tmp = &mut [0_i8; 64]; - - let vecidx = i32x16::new( - 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, - 8192, 16384, 32768, - ); - - let vecmask = i32x16::splat((mask & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[0..16]); - - let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[16..32]); - - let vecmask = i32x16::splat(((mask >> 32) & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i8x16::from_cast(vecmask) - .write_to_slice_unaligned(&mut tmp[32..48]); - - let vecmask = i32x16::splat(((mask >> 48) & 0xFFFF) as i32); - let vecmask = (vecidx & vecmask).eq(vecidx); - - i8x16::from_cast(vecmask) - 
.write_to_slice_unaligned(&mut tmp[48..64]); - - unsafe { std::mem::transmute(i8x64::from_slice_unaligned(tmp)) } - } - _ => panic!("Invalid number of vector lanes"), - } - } - - #[inline] - fn mask_to_u64(mask: &Self::SimdMask) -> u64 { - mask.bitmask() as u64 - } - - #[inline] - fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool { - unsafe { mask.extract_unchecked(idx) } - } - - #[inline] - fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask { - unsafe { mask.replace_unchecked(idx, value) } - } - - /// Selects elements of `a` and `b` using `mask` - #[inline] - fn mask_select( - mask: Self::SimdMask, - a: Self::Simd, - b: Self::Simd, - ) -> Self::Simd { - mask.select(a, b) - } - - #[inline] - fn mask_any(mask: Self::SimdMask) -> bool { - mask.any() - } - - #[inline] - fn bin_op Self::Simd>( - left: Self::Simd, - right: Self::Simd, - op: F, - ) -> Self::Simd { - op(left, right) - } - - #[inline] - fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.eq(right) - } - - #[inline] - fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.ne(right) - } - - #[inline] - fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.lt(right) - } - - #[inline] - fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.le(right) - } - - #[inline] - fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.gt(right) - } - - #[inline] - fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { - left.ge(right) - } - - #[inline] - fn write(simd_result: Self::Simd, slice: &mut [Self::Native]) { - unsafe { simd_result.write_to_slice_unaligned_unchecked(slice) }; - } - - #[inline] - fn unary_op Self::Simd>( - a: Self::Simd, - op: F, - ) -> Self::Simd { - op(a) - } - } - - #[cfg(not(simd))] - impl ArrowNumericType for $impl_ty {} - }; -} - -make_numeric_type!(Int8Type, i8, i8x64, m8x64); -make_numeric_type!(Int16Type, i16, i16x32, m16x32); -make_numeric_type!(Int32Type, i32, i32x16, m32x16); -make_numeric_type!(Int64Type, i64, i64x8, m64x8); -make_numeric_type!(UInt8Type, u8, u8x64, m8x64); -make_numeric_type!(UInt16Type, u16, u16x32, m16x32); -make_numeric_type!(UInt32Type, u32, u32x16, m32x16); -make_numeric_type!(UInt64Type, u64, u64x8, m64x8); -make_numeric_type!(Float32Type, f32, f32x16, m32x16); -make_numeric_type!(Float64Type, f64, f64x8, m64x8); - -make_numeric_type!(TimestampSecondType, i64, i64x8, m64x8); -make_numeric_type!(TimestampMillisecondType, i64, i64x8, m64x8); -make_numeric_type!(TimestampMicrosecondType, i64, i64x8, m64x8); -make_numeric_type!(TimestampNanosecondType, i64, i64x8, m64x8); -make_numeric_type!(Date32Type, i32, i32x16, m32x16); -make_numeric_type!(Date64Type, i64, i64x8, m64x8); -make_numeric_type!(Time32SecondType, i32, i32x16, m32x16); -make_numeric_type!(Time32MillisecondType, i32, i32x16, m32x16); -make_numeric_type!(Time64MicrosecondType, i64, i64x8, m64x8); -make_numeric_type!(Time64NanosecondType, i64, i64x8, m64x8); -make_numeric_type!(IntervalYearMonthType, i32, i32x16, m32x16); -make_numeric_type!(IntervalDayTimeType, i64, i64x8, m64x8); -make_numeric_type!(DurationSecondType, i64, i64x8, m64x8); -make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8); -make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8); -make_numeric_type!(DurationNanosecondType, i64, i64x8, m64x8); - -/// A subtype of primitive type that represents signed numeric values. -/// -/// SIMD operations are defined in this trait if available on the target system. 
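Every arm of `mask_from_u64` above encodes the same contract: bit i of the u64 selects lane i, and `mask_to_u64` inverts it via `bitmask()`. A plain, non-SIMD model of that mapping; both helper names are invented:

```rust
// Scalar model of the SIMD mask encoding used above: bit i <=> lane i.
fn lanes_from_mask(mask: u64, lanes: usize) -> Vec<bool> {
    (0..lanes).map(|i| mask & (1 << i) != 0).collect()
}

fn mask_from_lanes(lanes: &[bool]) -> u64 {
    lanes
        .iter()
        .enumerate()
        .fold(0u64, |acc, (i, &set)| acc | ((set as u64) << i))
}

fn main() {
    let mask = 0b1010_1010u64;
    let lanes = lanes_from_mask(mask, 8); // an 8-lane type such as Int64Type
    assert_eq!(lanes, [false, true, false, true, false, true, false, true]);
    assert_eq!(mask_from_lanes(&lanes), mask);
}
```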
-#[cfg(simd)] -pub trait ArrowSignedNumericType: ArrowNumericType -where - Self::SignedSimd: Neg, -{ - /// Defines the SIMD type that should be used for this numeric type - type SignedSimd; - - /// Loads a slice of signed numeric type into a SIMD register - fn load_signed(slice: &[Self::Native]) -> Self::SignedSimd; - - /// Performs a SIMD unary operation on signed numeric type - fn signed_unary_op Self::SignedSimd>( - a: Self::SignedSimd, - op: F, - ) -> Self::SignedSimd; - - /// Writes a signed SIMD result back to a slice - fn write_signed(simd_result: Self::SignedSimd, slice: &mut [Self::Native]); -} - -#[cfg(not(simd))] -pub trait ArrowSignedNumericType: ArrowNumericType -where - Self::Native: std::ops::Neg, -{ -} - -macro_rules! make_signed_numeric_type { - ($impl_ty:ty, $simd_ty:ident) => { - #[cfg(simd)] - impl ArrowSignedNumericType for $impl_ty { - type SignedSimd = $simd_ty; - - #[inline] - fn load_signed(slice: &[Self::Native]) -> Self::SignedSimd { - unsafe { Self::SignedSimd::from_slice_unaligned_unchecked(slice) } - } - - #[inline] - fn signed_unary_op Self::SignedSimd>( - a: Self::SignedSimd, - op: F, - ) -> Self::SignedSimd { - op(a) - } - - #[inline] - fn write_signed(simd_result: Self::SignedSimd, slice: &mut [Self::Native]) { - unsafe { simd_result.write_to_slice_unaligned_unchecked(slice) }; - } - } - - #[cfg(not(simd))] - impl ArrowSignedNumericType for $impl_ty {} - }; -} - -make_signed_numeric_type!(Int8Type, i8x64); -make_signed_numeric_type!(Int16Type, i16x32); -make_signed_numeric_type!(Int32Type, i32x16); -make_signed_numeric_type!(Int64Type, i64x8); -make_signed_numeric_type!(Float32Type, f32x16); -make_signed_numeric_type!(Float64Type, f64x8); - -#[cfg(simd)] -pub trait ArrowFloatNumericType: ArrowNumericType { - fn pow(base: Self::Simd, raise: Self::Simd) -> Self::Simd; -} - -#[cfg(not(simd))] -pub trait ArrowFloatNumericType: ArrowNumericType {} - -macro_rules! make_float_numeric_type { - ($impl_ty:ty, $simd_ty:ident) => { - #[cfg(simd)] - impl ArrowFloatNumericType for $impl_ty { - #[inline] - fn pow(base: Self::Simd, raise: Self::Simd) -> Self::Simd { - base.powf(raise) - } - } - - #[cfg(not(simd))] - impl ArrowFloatNumericType for $impl_ty {} - }; -} - -make_float_numeric_type!(Float32Type, f32x16); -make_float_numeric_type!(Float64Type, f64x8); - -#[cfg(all(test, simd_x86))] -mod tests { - use crate::datatypes::{ - ArrowNumericType, Float32Type, Float64Type, Int32Type, Int64Type, Int8Type, - UInt16Type, - }; - use packed_simd::*; - use FromCast; - - /// calculate the expected mask by iterating over all bits - macro_rules! 
expected_mask { - ($T:ty, $MASK:expr) => {{ - let mask = $MASK; - // simd width of all types is currently 64 bytes -> 512 bits - let lanes = 64 / std::mem::size_of::<$T>(); - // translate each set bit into a value of all ones (-1) of the correct type - (0..lanes) - .map(|i| (if (mask & (1 << i)) != 0 { -1 } else { 0 })) - .collect::>() - }}; - } - - #[test] - fn test_mask_f64() { - let mask = 0b10101010; - let actual = Float64Type::mask_from_u64(mask); - let expected = expected_mask!(i64, mask); - let expected = m64x8::from_cast(i64x8::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_u64() { - let mask = 0b01010101; - let actual = Int64Type::mask_from_u64(mask); - let expected = expected_mask!(i64, mask); - let expected = m64x8::from_cast(i64x8::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_f32() { - let mask = 0b10101010_10101010; - let actual = Float32Type::mask_from_u64(mask); - let expected = expected_mask!(i32, mask); - let expected = - m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_i32() { - let mask = 0b01010101_01010101; - let actual = Int32Type::mask_from_u64(mask); - let expected = expected_mask!(i32, mask); - let expected = - m32x16::from_cast(i32x16::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_u16() { - let mask = 0b01010101_01010101_10101010_10101010; - let actual = UInt16Type::mask_from_u64(mask); - let expected = expected_mask!(i16, mask); - dbg!(&expected); - let expected = - m16x32::from_cast(i16x32::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } - - #[test] - fn test_mask_i8() { - let mask = - 0b01010101_01010101_10101010_10101010_01010101_01010101_10101010_10101010; - let actual = Int8Type::mask_from_u64(mask); - let expected = expected_mask!(i8, mask); - let expected = m8x64::from_cast(i8x64::from_slice_unaligned(expected.as_slice())); - - assert_eq!(expected, actual); - } -} diff --git a/rust/arrow/src/datatypes/schema.rs b/rust/arrow/src/datatypes/schema.rs deleted file mode 100644 index ad89b29cacd..00000000000 --- a/rust/arrow/src/datatypes/schema.rs +++ /dev/null @@ -1,337 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::collections::HashMap; -use std::default::Default; -use std::fmt; - -use serde_derive::{Deserialize, Serialize}; -use serde_json::{json, Value}; - -use crate::error::{ArrowError, Result}; - -use super::Field; - -/// Describes the meta-data of an ordered sequence of relative types. 
-/// -/// Note that this information is only part of the meta-data and not part of the physical -/// memory layout. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] -pub struct Schema { - pub(crate) fields: Vec, - /// A map of key-value pairs containing additional meta data. - #[serde(skip_serializing_if = "HashMap::is_empty")] - pub(crate) metadata: HashMap, -} - -impl Schema { - /// Creates an empty `Schema` - pub fn empty() -> Self { - Self { - fields: vec![], - metadata: HashMap::new(), - } - } - - /// Creates a new `Schema` from a sequence of `Field` values. - /// - /// # Example - /// - /// ``` - /// # extern crate arrow; - /// # use arrow::datatypes::{Field, DataType, Schema}; - /// let field_a = Field::new("a", DataType::Int64, false); - /// let field_b = Field::new("b", DataType::Boolean, false); - /// - /// let schema = Schema::new(vec![field_a, field_b]); - /// ``` - pub fn new(fields: Vec) -> Self { - Self::new_with_metadata(fields, HashMap::new()) - } - - /// Creates a new `Schema` from a sequence of `Field` values - /// and adds additional metadata in form of key value pairs. - /// - /// # Example - /// - /// ``` - /// # extern crate arrow; - /// # use arrow::datatypes::{Field, DataType, Schema}; - /// # use std::collections::HashMap; - /// let field_a = Field::new("a", DataType::Int64, false); - /// let field_b = Field::new("b", DataType::Boolean, false); - /// - /// let mut metadata: HashMap = HashMap::new(); - /// metadata.insert("row_count".to_string(), "100".to_string()); - /// - /// let schema = Schema::new_with_metadata(vec![field_a, field_b], metadata); - /// ``` - #[inline] - pub const fn new_with_metadata( - fields: Vec, - metadata: HashMap, - ) -> Self { - Self { fields, metadata } - } - - /// Merge schema into self if it is compatible. Struct fields will be merged recursively. - /// - /// Example: - /// - /// ``` - /// use arrow::datatypes::*; - /// - /// let merged = Schema::try_merge(vec![ - /// Schema::new(vec![ - /// Field::new("c1", DataType::Int64, false), - /// Field::new("c2", DataType::Utf8, false), - /// ]), - /// Schema::new(vec![ - /// Field::new("c1", DataType::Int64, true), - /// Field::new("c2", DataType::Utf8, false), - /// Field::new("c3", DataType::Utf8, false), - /// ]), - /// ]).unwrap(); - /// - /// assert_eq!( - /// merged, - /// Schema::new(vec![ - /// Field::new("c1", DataType::Int64, true), - /// Field::new("c2", DataType::Utf8, false), - /// Field::new("c3", DataType::Utf8, false), - /// ]), - /// ); - /// ``` - pub fn try_merge(schemas: impl IntoIterator) -> Result { - schemas - .into_iter() - .try_fold(Self::empty(), |mut merged, schema| { - let Schema { metadata, fields } = schema; - for (key, value) in metadata.into_iter() { - // merge metadata - if let Some(old_val) = merged.metadata.get(&key) { - if old_val != &value { - return Err(ArrowError::SchemaError( - "Fail to merge schema due to conflicting metadata." - .to_string(), - )); - } - } - merged.metadata.insert(key, value); - } - // merge fields - for field in fields.into_iter() { - let mut new_field = true; - for merged_field in &mut merged.fields { - if field.name() != merged_field.name() { - continue; - } - new_field = false; - merged_field.try_merge(&field)? - } - // found a new field, add to field list - if new_field { - merged.fields.push(field); - } - } - Ok(merged) - }) - } - - /// Returns an immutable reference of the vector of `Field` instances. 
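// Illustrative sketch (not part of the original file): typical use of the
// constructors above together with the accessors and JSON round trip defined
// just below. All APIs come from this module; `schema_demo` itself is made up.
//
// use std::collections::HashMap;
// use arrow::datatypes::{DataType, Field, Schema};
//
// fn schema_demo() -> arrow::error::Result<()> {
//     let mut metadata = HashMap::new();
//     metadata.insert("row_count".to_string(), "100".to_string());
//     let schema = Schema::new_with_metadata(
//         vec![
//             Field::new("id", DataType::Int64, false),
//             Field::new("name", DataType::Utf8, true),
//         ],
//         metadata,
//     );
//
//     // field lookup by index and by name
//     assert_eq!(schema.fields().len(), 2);
//     assert_eq!(schema.index_of("name")?, 1);
//     assert_eq!(schema.field_with_name("id")?.data_type(), &DataType::Int64);
//     assert!(schema.column_with_name("missing").is_none());
//
//     // JSON round trip via to_json / Schema::from
//     let json = schema.to_json();
//     assert_eq!(Schema::from(&json)?, schema);
//     Ok(())
// }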
- #[inline] - pub const fn fields(&self) -> &Vec { - &self.fields - } - - /// Returns an immutable reference of a specific `Field` instance selected using an - /// offset within the internal `fields` vector. - pub fn field(&self, i: usize) -> &Field { - &self.fields[i] - } - - /// Returns an immutable reference of a specific `Field` instance selected by name. - pub fn field_with_name(&self, name: &str) -> Result<&Field> { - Ok(&self.fields[self.index_of(name)?]) - } - - /// Returns a vector of immutable references to all `Field` instances selected by - /// the dictionary ID they use. - pub fn fields_with_dict_id(&self, dict_id: i64) -> Vec<&Field> { - self.fields - .iter() - .filter(|f| f.dict_id() == Some(dict_id)) - .collect() - } - - /// Find the index of the column with the given name. - pub fn index_of(&self, name: &str) -> Result { - for i in 0..self.fields.len() { - if self.fields[i].name() == name { - return Ok(i); - } - } - let valid_fields: Vec = - self.fields.iter().map(|f| f.name().clone()).collect(); - Err(ArrowError::InvalidArgumentError(format!( - "Unable to get field named \"{}\". Valid fields: {:?}", - name, valid_fields - ))) - } - - /// Returns an immutable reference to the Map of custom metadata key-value pairs. - #[inline] - pub const fn metadata(&self) -> &HashMap { - &self.metadata - } - - /// Look up a column by name and return a immutable reference to the column along with - /// its index. - pub fn column_with_name(&self, name: &str) -> Option<(usize, &Field)> { - self.fields - .iter() - .enumerate() - .find(|&(_, c)| c.name() == name) - } - - /// Generate a JSON representation of the `Schema`. - pub fn to_json(&self) -> Value { - json!({ - "fields": self.fields.iter().map(|field| field.to_json()).collect::>(), - "metadata": serde_json::to_value(&self.metadata).unwrap() - }) - } - - /// Parse a `Schema` definition from a JSON representation. - pub fn from(json: &Value) -> Result { - match *json { - Value::Object(ref schema) => { - let fields = if let Some(Value::Array(fields)) = schema.get("fields") { - fields - .iter() - .map(|f| Field::from(f)) - .collect::>()? - } else { - return Err(ArrowError::ParseError( - "Schema fields should be an array".to_string(), - )); - }; - - let metadata = if let Some(value) = schema.get("metadata") { - Self::from_metadata(value)? - } else { - HashMap::default() - }; - - Ok(Self { fields, metadata }) - } - _ => Err(ArrowError::ParseError( - "Invalid json value type for schema".to_string(), - )), - } - } - - /// Parse a `metadata` definition from a JSON representation. - /// The JSON can either be an Object or an Array of Objects. - fn from_metadata(json: &Value) -> Result> { - match json { - Value::Array(_) => { - let mut hashmap = HashMap::new(); - let values: Vec = serde_json::from_value(json.clone()) - .map_err(|_| { - ArrowError::JsonError( - "Unable to parse object into key-value pair".to_string(), - ) - })?; - for meta in values { - hashmap.insert(meta.key.clone(), meta.value); - } - Ok(hashmap) - } - Value::Object(md) => md - .iter() - .map(|(k, v)| { - if let Value::String(v) = v { - Ok((k.to_string(), v.to_string())) - } else { - Err(ArrowError::ParseError( - "metadata `value` field must be a string".to_string(), - )) - } - }) - .collect::>(), - _ => Err(ArrowError::ParseError( - "`metadata` field must be an object".to_string(), - )), - } - } - - /// Check to see if `self` is a superset of `other` schema. 
Here are the comparision rules: - /// - /// * `self` and `other` should contain the same number of fields - /// * for every field `f` in `other`, the field in `self` with corresponding index should be a - /// superset of `f`. - /// * self.metadata is a superset of other.metadata - /// - /// In other words, any record conforms to `other` should also conform to `self`. - pub fn contains(&self, other: &Schema) -> bool { - if self.fields.len() != other.fields.len() { - return false; - } - - for (i, field) in other.fields.iter().enumerate() { - if !self.fields[i].contains(field) { - return false; - } - } - - // make sure self.metadata is a superset of other.metadata - for (k, v) in &other.metadata { - match self.metadata.get(k) { - Some(s) => { - if s != v { - return false; - } - } - None => { - return false; - } - } - } - - true - } -} - -impl fmt::Display for Schema { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str( - &self - .fields - .iter() - .map(|c| c.to_string()) - .collect::>() - .join(", "), - ) - } -} - -#[derive(Deserialize)] -struct MetadataKeyValue { - key: String, - value: String, -} diff --git a/rust/arrow/src/datatypes/types.rs b/rust/arrow/src/datatypes/types.rs deleted file mode 100644 index 30c9aae8956..00000000000 --- a/rust/arrow/src/datatypes/types.rs +++ /dev/null @@ -1,185 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use super::{ArrowPrimitiveType, DataType, IntervalUnit, TimeUnit}; - -// BooleanType is special: its bit-width is not the size of the primitive type, and its `index` -// operation assumes bit-packing. -#[derive(Debug)] -pub struct BooleanType {} - -impl BooleanType { - pub const DATA_TYPE: DataType = DataType::Boolean; -} - -macro_rules! 
make_type { - ($name:ident, $native_ty:ty, $data_ty:expr) => { - #[derive(Debug)] - pub struct $name {} - - impl ArrowPrimitiveType for $name { - type Native = $native_ty; - const DATA_TYPE: DataType = $data_ty; - } - }; -} - -make_type!(Int8Type, i8, DataType::Int8); -make_type!(Int16Type, i16, DataType::Int16); -make_type!(Int32Type, i32, DataType::Int32); -make_type!(Int64Type, i64, DataType::Int64); -make_type!(UInt8Type, u8, DataType::UInt8); -make_type!(UInt16Type, u16, DataType::UInt16); -make_type!(UInt32Type, u32, DataType::UInt32); -make_type!(UInt64Type, u64, DataType::UInt64); -make_type!(Float32Type, f32, DataType::Float32); -make_type!(Float64Type, f64, DataType::Float64); -make_type!( - TimestampSecondType, - i64, - DataType::Timestamp(TimeUnit::Second, None) -); -make_type!( - TimestampMillisecondType, - i64, - DataType::Timestamp(TimeUnit::Millisecond, None) -); -make_type!( - TimestampMicrosecondType, - i64, - DataType::Timestamp(TimeUnit::Microsecond, None) -); -make_type!( - TimestampNanosecondType, - i64, - DataType::Timestamp(TimeUnit::Nanosecond, None) -); -make_type!(Date32Type, i32, DataType::Date32); -make_type!(Date64Type, i64, DataType::Date64); -make_type!(Time32SecondType, i32, DataType::Time32(TimeUnit::Second)); -make_type!( - Time32MillisecondType, - i32, - DataType::Time32(TimeUnit::Millisecond) -); -make_type!( - Time64MicrosecondType, - i64, - DataType::Time64(TimeUnit::Microsecond) -); -make_type!( - Time64NanosecondType, - i64, - DataType::Time64(TimeUnit::Nanosecond) -); -make_type!( - IntervalYearMonthType, - i32, - DataType::Interval(IntervalUnit::YearMonth) -); -make_type!( - IntervalDayTimeType, - i64, - DataType::Interval(IntervalUnit::DayTime) -); -make_type!( - DurationSecondType, - i64, - DataType::Duration(TimeUnit::Second) -); -make_type!( - DurationMillisecondType, - i64, - DataType::Duration(TimeUnit::Millisecond) -); -make_type!( - DurationMicrosecondType, - i64, - DataType::Duration(TimeUnit::Microsecond) -); -make_type!( - DurationNanosecondType, - i64, - DataType::Duration(TimeUnit::Nanosecond) -); - -/// A subtype of primitive type that represents legal dictionary keys. -/// See -pub trait ArrowDictionaryKeyType: ArrowPrimitiveType {} - -impl ArrowDictionaryKeyType for Int8Type {} - -impl ArrowDictionaryKeyType for Int16Type {} - -impl ArrowDictionaryKeyType for Int32Type {} - -impl ArrowDictionaryKeyType for Int64Type {} - -impl ArrowDictionaryKeyType for UInt8Type {} - -impl ArrowDictionaryKeyType for UInt16Type {} - -impl ArrowDictionaryKeyType for UInt32Type {} - -impl ArrowDictionaryKeyType for UInt64Type {} - -/// A subtype of primitive type that represents temporal values. 
-pub trait ArrowTemporalType: ArrowPrimitiveType {} - -impl ArrowTemporalType for TimestampSecondType {} -impl ArrowTemporalType for TimestampMillisecondType {} -impl ArrowTemporalType for TimestampMicrosecondType {} -impl ArrowTemporalType for TimestampNanosecondType {} -impl ArrowTemporalType for Date32Type {} -impl ArrowTemporalType for Date64Type {} -impl ArrowTemporalType for Time32SecondType {} -impl ArrowTemporalType for Time32MillisecondType {} -impl ArrowTemporalType for Time64MicrosecondType {} -impl ArrowTemporalType for Time64NanosecondType {} -// impl ArrowTemporalType for IntervalYearMonthType {} -// impl ArrowTemporalType for IntervalDayTimeType {} -impl ArrowTemporalType for DurationSecondType {} -impl ArrowTemporalType for DurationMillisecondType {} -impl ArrowTemporalType for DurationMicrosecondType {} -impl ArrowTemporalType for DurationNanosecondType {} - -/// A timestamp type allows us to create array builders that take a timestamp. -pub trait ArrowTimestampType: ArrowTemporalType { - /// Returns the `TimeUnit` of this timestamp. - fn get_time_unit() -> TimeUnit; -} - -impl ArrowTimestampType for TimestampSecondType { - fn get_time_unit() -> TimeUnit { - TimeUnit::Second - } -} -impl ArrowTimestampType for TimestampMillisecondType { - fn get_time_unit() -> TimeUnit { - TimeUnit::Millisecond - } -} -impl ArrowTimestampType for TimestampMicrosecondType { - fn get_time_unit() -> TimeUnit { - TimeUnit::Microsecond - } -} -impl ArrowTimestampType for TimestampNanosecondType { - fn get_time_unit() -> TimeUnit { - TimeUnit::Nanosecond - } -} diff --git a/rust/arrow/src/error.rs b/rust/arrow/src/error.rs deleted file mode 100644 index 6bfa077f4ab..00000000000 --- a/rust/arrow/src/error.rs +++ /dev/null @@ -1,134 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines `ArrowError` for representing failures in various Arrow operations. -use std::fmt::{Debug, Display, Formatter}; -use std::io::Write; - -use csv as csv_crate; -use std::error::Error; - -/// Many different operations in the `arrow` crate return this error type. -#[derive(Debug)] -pub enum ArrowError { - /// Returned when functionality is not yet available. - NotYetImplemented(String), - ExternalError(Box), - CastError(String), - MemoryError(String), - ParseError(String), - SchemaError(String), - ComputeError(String), - DivideByZero, - CsvError(String), - JsonError(String), - IoError(String), - InvalidArgumentError(String), - ParquetError(String), - /// Error during import or export to/from the C Data Interface - CDataInterface(String), - DictionaryKeyOverflowError, -} - -impl ArrowError { - /// Wraps an external error in an `ArrowError`. 
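// Illustrative sketch (not part of the original file): how downstream code
// typically constructs and propagates these variants through the crate's
// Result alias defined below; `parse_positive` is a made-up helper.
//
// use arrow::error::{ArrowError, Result};
//
// fn parse_positive(input: &str) -> Result<i64> {
//     let value: i64 = input
//         .parse()
//         .map_err(|_| ArrowError::ParseError(format!("not an integer: {}", input)))?;
//     if value <= 0 {
//         return Err(ArrowError::InvalidArgumentError(
//             "expected a positive value".to_string(),
//         ));
//     }
//     Ok(value)
// }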
- pub fn from_external_error( - error: Box, - ) -> Self { - Self::ExternalError(error) - } -} - -impl From<::std::io::Error> for ArrowError { - fn from(error: std::io::Error) -> Self { - ArrowError::IoError(error.to_string()) - } -} - -impl From for ArrowError { - fn from(error: csv_crate::Error) -> Self { - match error.kind() { - csv_crate::ErrorKind::Io(error) => ArrowError::CsvError(error.to_string()), - csv_crate::ErrorKind::Utf8 { pos: _, err } => ArrowError::CsvError(format!( - "Encountered UTF-8 error while reading CSV file: {}", - err.to_string() - )), - csv_crate::ErrorKind::UnequalLengths { - expected_len, len, .. - } => ArrowError::CsvError(format!( - "Encountered unequal lengths between records on CSV file. Expected {} \ - records, found {} records", - len, expected_len - )), - _ => ArrowError::CsvError("Error reading CSV file".to_string()), - } - } -} - -impl From<::std::string::FromUtf8Error> for ArrowError { - fn from(error: std::string::FromUtf8Error) -> Self { - ArrowError::ParseError(error.to_string()) - } -} - -impl From for ArrowError { - fn from(error: serde_json::Error) -> Self { - ArrowError::JsonError(error.to_string()) - } -} - -impl From<::std::io::IntoInnerError> for ArrowError { - fn from(error: std::io::IntoInnerError) -> Self { - ArrowError::IoError(error.to_string()) - } -} - -impl Display for ArrowError { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - ArrowError::NotYetImplemented(source) => { - write!(f, "Not yet implemented: {}", &source) - } - ArrowError::ExternalError(source) => write!(f, "External error: {}", &source), - ArrowError::CastError(desc) => write!(f, "Cast error: {}", desc), - ArrowError::MemoryError(desc) => write!(f, "Memory error: {}", desc), - ArrowError::ParseError(desc) => write!(f, "Parser error: {}", desc), - ArrowError::SchemaError(desc) => write!(f, "Schema error: {}", desc), - ArrowError::ComputeError(desc) => write!(f, "Compute error: {}", desc), - ArrowError::DivideByZero => write!(f, "Divide by zero error"), - ArrowError::CsvError(desc) => write!(f, "Csv error: {}", desc), - ArrowError::JsonError(desc) => write!(f, "Json error: {}", desc), - ArrowError::IoError(desc) => write!(f, "Io error: {}", desc), - ArrowError::InvalidArgumentError(desc) => { - write!(f, "Invalid argument error: {}", desc) - } - ArrowError::ParquetError(desc) => { - write!(f, "Parquet argument error: {}", desc) - } - ArrowError::CDataInterface(desc) => { - write!(f, "C Data interface error: {}", desc) - } - ArrowError::DictionaryKeyOverflowError => { - write!(f, "Dictionary key bigger than the key type") - } - } - } -} - -impl Error for ArrowError {} - -pub type Result = std::result::Result; diff --git a/rust/arrow/src/ffi.rs b/rust/arrow/src/ffi.rs deleted file mode 100644 index 3a6d031ebd8..00000000000 --- a/rust/arrow/src/ffi.rs +++ /dev/null @@ -1,997 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html). -//! -//! Generally, this module is divided in two main interfaces: -//! One interface maps C ABI to native Rust types, i.e. convert c-pointers, c_char, to native rust. -//! This is handled by [FFI_ArrowSchema] and [FFI_ArrowArray]. -//! -//! The second interface maps native Rust types to the Rust-specific implementation of Arrow such as `format` to `Datatype`, -//! `Buffer`, etc. This is handled by `ArrowArray`. -//! -//! ```rust -//! # use std::sync::Arc; -//! # use arrow::array::{Int32Array, Array, ArrayData, make_array_from_raw}; -//! # use arrow::error::{Result, ArrowError}; -//! # use arrow::compute::kernels::arithmetic; -//! # use std::convert::TryFrom; -//! # fn main() -> Result<()> { -//! // create an array natively -//! let array = Int32Array::from(vec![Some(1), None, Some(3)]); -//! -//! // export it -//! let (array_ptr, schema_ptr) = array.to_raw()?; -//! -//! // consumed and used by something else... -//! -//! // import it -//! let array = unsafe { make_array_from_raw(array_ptr, schema_ptr)? }; -//! -//! // perform some operation -//! let array = array.as_any().downcast_ref::().ok_or( -//! ArrowError::ParseError("Expects an int32".to_string()), -//! )?; -//! let array = arithmetic::add(&array, &array)?; -//! -//! // verify -//! assert_eq!(array, Int32Array::from(vec![Some(2), None, Some(6)])); -//! -//! // (drop/release) -//! Ok(()) -//! } -//! ``` - -/* -# Design: - -Main assumptions: -* A memory region is deallocated according it its own release mechanism. -* Rust shares memory regions between arrays. -* A memory region should be deallocated when no-one is using it. - -The design of this module is as follows: - -`ArrowArray` contains two `Arc`s, one per ABI-compatible `struct`, each containing data -according to the C Data Interface. These Arcs are used for ref counting of the structs -within Rust and lifetime management. - -Each ABI-compatible `struct` knowns how to `drop` itself, calling `release`. - -To import an array, unsafely create an `ArrowArray` from two pointers using [ArrowArray::try_from_raw]. -To export an array, create an `ArrowArray` using [ArrowArray::try_new]. 
-*/ - -use std::{ - convert::TryFrom, - ffi::CStr, - ffi::CString, - iter, - mem::{size_of, ManuallyDrop}, - os::raw::c_char, - ptr::{self, NonNull}, - sync::Arc, -}; - -use crate::array::ArrayData; -use crate::buffer::Buffer; -use crate::datatypes::{DataType, Field, TimeUnit}; -use crate::error::{ArrowError, Result}; -use crate::util::bit_util; - -/// ABI-compatible struct for `ArrowSchema` from C Data Interface -/// See -/// This was created by bindgen -#[repr(C)] -#[derive(Debug)] -pub struct FFI_ArrowSchema { - format: *const ::std::os::raw::c_char, - name: *const ::std::os::raw::c_char, - metadata: *const ::std::os::raw::c_char, - flags: i64, - n_children: i64, - children: *mut *mut FFI_ArrowSchema, - dictionary: *mut FFI_ArrowSchema, - release: ::std::option::Option, - private_data: *mut ::std::os::raw::c_void, -} - -// callback used to drop [FFI_ArrowSchema] when it is exported. -unsafe extern "C" fn release_schema(schema: *mut FFI_ArrowSchema) { - let schema = &mut *schema; - - // take ownership back to release it. - CString::from_raw(schema.format as *mut std::os::raw::c_char); - - schema.release = None; -} - -struct SchemaPrivateData { - children: Box<[*mut FFI_ArrowSchema]>, -} - -impl FFI_ArrowSchema { - /// create a new [FFI_ArrowSchema] from a format. - fn new( - format: &str, - children: Vec<*mut FFI_ArrowSchema>, - nullable: bool, - ) -> FFI_ArrowSchema { - let children = children.into_boxed_slice(); - let n_children = children.len() as i64; - let children_ptr = children.as_ptr() as *mut *mut FFI_ArrowSchema; - - let flags = if nullable { 2 } else { 0 }; - - let private_data = Box::new(SchemaPrivateData { children }); - // - FFI_ArrowSchema { - format: CString::new(format).unwrap().into_raw(), - // For child data a non null string is expected and is called item - name: CString::new("item").unwrap().into_raw(), - metadata: std::ptr::null_mut(), - flags, - n_children, - children: children_ptr, - dictionary: std::ptr::null_mut(), - release: Some(release_schema), - private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void, - } - } - - /// create an empty [FFI_ArrowSchema] - fn empty() -> Self { - Self { - format: std::ptr::null_mut(), - name: std::ptr::null_mut(), - metadata: std::ptr::null_mut(), - flags: 0, - n_children: 0, - children: ptr::null_mut(), - dictionary: std::ptr::null_mut(), - release: None, - private_data: std::ptr::null_mut(), - } - } - - /// returns the format of this schema. - pub fn format(&self) -> &str { - unsafe { CStr::from_ptr(self.format) } - .to_str() - .expect("The external API has a non-utf8 as format") - } -} - -impl Drop for FFI_ArrowSchema { - fn drop(&mut self) { - match self.release { - None => (), - Some(release) => unsafe { release(self) }, - }; - } -} - -/// maps a DataType `format` to a [DataType](arrow::datatypes::DataType). 
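// Editorial note (not part of the original source): the `flags: i64` field
// carries the C Data Interface flag bits, where the value 2 is
// ARROW_FLAG_NULLABLE. That is why FFI_ArrowSchema::new above sets
// `flags = if nullable { 2 } else { 0 }` and why the list branches in
// to_datatype below test `schema.flags == 2` to recover the child field's
// nullability.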
-/// See https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings -fn to_datatype( - format: &str, - child_type: Option, - schema: &FFI_ArrowSchema, -) -> Result { - Ok(match format { - "n" => DataType::Null, - "b" => DataType::Boolean, - "c" => DataType::Int8, - "C" => DataType::UInt8, - "s" => DataType::Int16, - "S" => DataType::UInt16, - "i" => DataType::Int32, - "I" => DataType::UInt32, - "l" => DataType::Int64, - "L" => DataType::UInt64, - "e" => DataType::Float16, - "f" => DataType::Float32, - "g" => DataType::Float64, - "z" => DataType::Binary, - "Z" => DataType::LargeBinary, - "u" => DataType::Utf8, - "U" => DataType::LargeUtf8, - "tdD" => DataType::Date32, - "tdm" => DataType::Date64, - "tts" => DataType::Time32(TimeUnit::Second), - "ttm" => DataType::Time32(TimeUnit::Millisecond), - "ttu" => DataType::Time64(TimeUnit::Microsecond), - "ttn" => DataType::Time64(TimeUnit::Nanosecond), - - // Note: The datatype null will only be created when called from ArrowArray::buffer_len - // at that point the child data is not yet known, but it is also not required to determine - // the buffer length of the list arrays. - "+l" => { - let nullable = schema.flags == 2; - // Safety - // Should be set as this is expected from the C FFI definition - debug_assert!(!schema.name.is_null()); - let name = unsafe { CString::from_raw(schema.name as *mut c_char) } - .into_string() - .unwrap(); - // prevent a double free - let name = ManuallyDrop::new(name); - DataType::List(Box::new(Field::new( - &name, - child_type.unwrap_or(DataType::Null), - nullable, - ))) - } - "+L" => { - let nullable = schema.flags == 2; - // Safety - // Should be set as this is expected from the C FFI definition - debug_assert!(!schema.name.is_null()); - let name = unsafe { CString::from_raw(schema.name as *mut c_char) } - .into_string() - .unwrap(); - // prevent a double free - let name = ManuallyDrop::new(name); - DataType::LargeList(Box::new(Field::new( - &name, - child_type.unwrap_or(DataType::Null), - nullable, - ))) - } - dt => { - return Err(ArrowError::CDataInterface(format!( - "The datatype \"{}\" is not supported in the Rust implementation", - dt - ))) - } - }) -} - -/// the inverse of [to_datatype] -fn from_datatype(datatype: &DataType) -> Result { - Ok(match datatype { - DataType::Null => "n", - DataType::Boolean => "b", - DataType::Int8 => "c", - DataType::UInt8 => "C", - DataType::Int16 => "s", - DataType::UInt16 => "S", - DataType::Int32 => "i", - DataType::UInt32 => "I", - DataType::Int64 => "l", - DataType::UInt64 => "L", - DataType::Float16 => "e", - DataType::Float32 => "f", - DataType::Float64 => "g", - DataType::Binary => "z", - DataType::LargeBinary => "Z", - DataType::Utf8 => "u", - DataType::LargeUtf8 => "U", - DataType::Date32 => "tdD", - DataType::Date64 => "tdm", - DataType::Time32(TimeUnit::Second) => "tts", - DataType::Time32(TimeUnit::Millisecond) => "ttm", - DataType::Time64(TimeUnit::Microsecond) => "ttu", - DataType::Time64(TimeUnit::Nanosecond) => "ttn", - DataType::List(_) => "+l", - DataType::LargeList(_) => "+L", - z => { - return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" is still not supported in Rust implementation", - z - ))) - } - } - .to_string()) -} - -// returns the number of bits that buffer `i` (in the C data interface) is expected to have. 
-// This is set by the Arrow specification -fn bit_width(data_type: &DataType, i: usize) -> Result { - Ok(match (data_type, i) { - // the null buffer is bit sized - (_, 0) => 1, - // primitive types first buffer's size is given by the native types - (DataType::Boolean, 1) => 1, - (DataType::UInt8, 1) => size_of::() * 8, - (DataType::UInt16, 1) => size_of::() * 8, - (DataType::UInt32, 1) => size_of::() * 8, - (DataType::UInt64, 1) => size_of::() * 8, - (DataType::Int8, 1) => size_of::() * 8, - (DataType::Int16, 1) => size_of::() * 8, - (DataType::Int32, 1) | (DataType::Date32, 1) | (DataType::Time32(_), 1) => size_of::() * 8, - (DataType::Int64, 1) | (DataType::Date64, 1) | (DataType::Time64(_), 1) => size_of::() * 8, - (DataType::Float32, 1) => size_of::() * 8, - (DataType::Float64, 1) => size_of::() * 8, - // primitive types have a single buffer - (DataType::Boolean, _) | - (DataType::UInt8, _) | - (DataType::UInt16, _) | - (DataType::UInt32, _) | - (DataType::UInt64, _) | - (DataType::Int8, _) | - (DataType::Int16, _) | - (DataType::Int32, _) | (DataType::Date32, _) | (DataType::Time32(_), _) | - (DataType::Int64, _) | (DataType::Date64, _) | (DataType::Time64(_), _) | - (DataType::Float32, _) | - (DataType::Float64, _) => { - return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 2 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i - ))) - } - // Variable-sized binaries: have two buffers. - // "small": first buffer is i32, second is in bytes - (DataType::Utf8, 1) | (DataType::Binary, 1) | (DataType::List(_), 1) => size_of::() * 8, - (DataType::Utf8, 2) | (DataType::Binary, 2) | (DataType::List(_), 2) => size_of::() * 8, - (DataType::Utf8, _) | (DataType::Binary, _) | (DataType::List(_), _)=> { - return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i - ))) - } - // Variable-sized binaries: have two buffers. - // LargeUtf8: first buffer is i64, second is in bytes - (DataType::LargeUtf8, 1) | (DataType::LargeBinary, 1) | (DataType::LargeList(_), 1) => size_of::() * 8, - (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) | (DataType::LargeList(_), 2)=> size_of::() * 8, - (DataType::LargeUtf8, _) | (DataType::LargeBinary, _) | (DataType::LargeList(_), _)=> { - return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" expects 3 buffers, but requested {}. Please verify that the C data interface is correctly implemented.", - data_type, i - ))) - } - _ => { - return Err(ArrowError::CDataInterface(format!( - "The datatype \"{:?}\" is still not supported in Rust implementation", - data_type - ))) - } - }) -} - -/// ABI-compatible struct for ArrowArray from C Data Interface -/// See -/// This was created by bindgen -#[repr(C)] -#[derive(Debug)] -pub struct FFI_ArrowArray { - pub(crate) length: i64, - pub(crate) null_count: i64, - pub(crate) offset: i64, - pub(crate) n_buffers: i64, - pub(crate) n_children: i64, - pub(crate) buffers: *mut *const ::std::os::raw::c_void, - children: *mut *mut FFI_ArrowArray, - dictionary: *mut FFI_ArrowArray, - release: ::std::option::Option, - // When exported, this MUST contain everything that is owned by this array. - // for example, any buffer pointed to in `buffers` must be here, as well as the `buffers` pointer - // itself. 
- // In other words, everything in [FFI_ArrowArray] must be owned by `private_data` and can assume - // that they do not outlive `private_data`. - private_data: *mut ::std::os::raw::c_void, -} - -// callback used to drop [FFI_ArrowArray] when it is exported -unsafe extern "C" fn release_array(array: *mut FFI_ArrowArray) { - if array.is_null() { - return; - } - let array = &mut *array; - // take ownership of `private_data`, therefore dropping it - Box::from_raw(array.private_data as *mut PrivateData); - - array.release = None; -} - -struct PrivateData { - buffers: Vec>, - buffers_ptr: Box<[*const std::os::raw::c_void]>, - children: Box<[*mut FFI_ArrowArray]>, -} - -impl FFI_ArrowArray { - /// creates a new `FFI_ArrowArray` from existing data. - /// # Safety - /// This method releases `buffers`. Consumers of this struct *must* call `release` before - /// releasing this struct, or contents in `buffers` leak. - unsafe fn new( - length: i64, - null_count: i64, - offset: i64, - n_buffers: i64, - buffers: Vec>, - children: Vec<*mut FFI_ArrowArray>, - ) -> Self { - let buffers_ptr = buffers - .iter() - .map(|maybe_buffer| match maybe_buffer { - // note that `raw_data` takes into account the buffer's offset - Some(b) => b.as_ptr() as *const std::os::raw::c_void, - None => std::ptr::null(), - }) - .collect::>(); - let pointer = buffers_ptr.as_ptr() as *mut *const std::ffi::c_void; - - let children = children.into_boxed_slice(); - let children_ptr = children.as_ptr() as *mut *mut FFI_ArrowArray; - let n_children = children.len() as i64; - - // create the private data owning everything. - // any other data must be added here, e.g. via a struct, to track lifetime. - let private_data = Box::new(PrivateData { - buffers, - buffers_ptr, - children, - }); - - Self { - length, - null_count, - offset, - n_buffers, - n_children, - buffers: pointer, - children: children_ptr, - dictionary: std::ptr::null_mut(), - release: Some(release_array), - private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void, - } - } - - // create an empty `FFI_ArrowArray`, which can be used to import data into - fn empty() -> Self { - Self { - length: 0, - null_count: 0, - offset: 0, - n_buffers: 0, - n_children: 0, - buffers: std::ptr::null_mut(), - children: std::ptr::null_mut(), - dictionary: std::ptr::null_mut(), - release: None, - private_data: std::ptr::null_mut(), - } - } -} - -/// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer). -/// `bits` is the number of bits that the native type of this buffer has. -/// The size of the buffer will be `ceil(self.length * bits, 8)`. -/// # Panic -/// This function panics if `i` is larger or equal to `n_buffers`. 
-/// # Safety -/// This function assumes that `ceil(self.length * bits, 8)` is the size of the buffer -unsafe fn create_buffer( - array: Arc, - index: usize, - len: usize, -) -> Option { - if array.buffers.is_null() { - return None; - } - let buffers = array.buffers as *mut *const u8; - - assert!(index < array.n_buffers as usize); - let ptr = *buffers.add(index); - - NonNull::new(ptr as *mut u8).map(|ptr| Buffer::from_unowned(ptr, len, array)) -} - -unsafe fn create_child_arrays( - array: Arc, - schema: Arc, -) -> Result> { - (0..array.n_children as usize) - .map(|i| { - let arr_ptr = *array.children.add(i); - let schema_ptr = *schema.children.add(i); - let arrow_arr = ArrowArray::try_from_raw( - arr_ptr as *const FFI_ArrowArray, - schema_ptr as *const FFI_ArrowSchema, - )?; - ArrayData::try_from(arrow_arr) - }) - .collect() -} - -impl Drop for FFI_ArrowArray { - fn drop(&mut self) { - match self.release { - None => (), - Some(release) => unsafe { release(self) }, - }; - } -} - -/// Struct used to move an Array from and to the C Data Interface. -/// Its main responsibility is to expose functionality that requires -/// both [FFI_ArrowArray] and [FFI_ArrowSchema]. -/// -/// This struct has two main paths: -/// -/// ## Import from the C Data Interface -/// * [ArrowArray::empty] to allocate memory to be filled by an external call -/// * [ArrowArray::try_from_raw] to consume two non-null allocated pointers -/// ## Export to the C Data Interface -/// * [ArrowArray::try_new] to create a new [ArrowArray] from Rust-specific information -/// * [ArrowArray::into_raw] to expose two pointers for [FFI_ArrowArray] and [FFI_ArrowSchema]. -/// -/// # Safety -/// Whoever creates this struct is responsible for releasing their resources. Specifically, -/// consumers *must* call [ArrowArray::into_raw] and take ownership of the individual pointers, -/// calling [FFI_ArrowArray::release] and [FFI_ArrowSchema::release] accordingly. -/// -/// Furthermore, this struct assumes that the incoming data agrees with the C data interface. -#[derive(Debug)] -pub struct ArrowArray { - // these are ref-counted because they can be shared by multiple buffers. - array: Arc, - schema: Arc, -} - -impl ArrowArray { - /// creates a new `ArrowArray`. This is used to export to the C Data Interface. - /// # Safety - /// See safety of [ArrowArray] - #[allow(clippy::too_many_arguments)] - pub unsafe fn try_new( - data_type: &DataType, - len: usize, - null_count: usize, - null_buffer: Option, - offset: usize, - buffers: Vec, - child_data: Vec, - nullable: bool, - ) -> Result { - let format = from_datatype(data_type)?; - // * insert the null buffer at the start - // * make all others `Option`. - let new_buffers = iter::once(null_buffer) - .chain(buffers.iter().map(|b| Some(b.clone()))) - .collect::>(); - - let mut ffi_arrow_arrays = Vec::with_capacity(child_data.len()); - let mut ffi_arrow_schemas = Vec::with_capacity(child_data.len()); - - child_data.into_iter().for_each(|arrow_arr| { - let (arr, schema) = ArrowArray::into_raw(arrow_arr); - ffi_arrow_arrays.push(arr as *mut FFI_ArrowArray); - ffi_arrow_schemas.push(schema as *mut FFI_ArrowSchema); - }); - - let schema = Arc::new(FFI_ArrowSchema::new(&format, ffi_arrow_schemas, nullable)); - let array = Arc::new(FFI_ArrowArray::new( - len as i64, - null_count as i64, - offset as i64, - new_buffers.len() as i64, - new_buffers, - ffi_arrow_arrays, - )); - - Ok(ArrowArray { array, schema }) - } - - /// creates a new [ArrowArray] from two pointers. Used to import from the C Data Interface. 
- /// # Safety - /// See safety of [ArrowArray] - /// # Error - /// Errors if any of the pointers is null - pub unsafe fn try_from_raw( - array: *const FFI_ArrowArray, - schema: *const FFI_ArrowSchema, - ) -> Result { - if array.is_null() || schema.is_null() { - return Err(ArrowError::MemoryError( - "At least one of the pointers passed to `try_from_raw` is null" - .to_string(), - )); - }; - Ok(Self { - array: Arc::from_raw(array as *mut FFI_ArrowArray), - schema: Arc::from_raw(schema as *mut FFI_ArrowSchema), - }) - } - - /// creates a new empty [ArrowArray]. Used to import from the C Data Interface. - /// # Safety - /// See safety of [ArrowArray] - pub unsafe fn empty() -> Self { - let schema = Arc::new(FFI_ArrowSchema::empty()); - let array = Arc::new(FFI_ArrowArray::empty()); - ArrowArray { array, schema } - } - - /// exports [ArrowArray] to the C Data Interface - pub fn into_raw(this: ArrowArray) -> (*const FFI_ArrowArray, *const FFI_ArrowSchema) { - (Arc::into_raw(this.array), Arc::into_raw(this.schema)) - } - - /// returns the null bit buffer. - /// Rust implementation uses a buffer that is not part of the array of buffers. - /// The C Data interface's null buffer is part of the array of buffers. - pub fn null_bit_buffer(&self) -> Option { - // similar to `self.buffer_len(0)`, but without `Result`. - let buffer_len = bit_util::ceil(self.array.length as usize, 8); - - unsafe { create_buffer(self.array.clone(), 0, buffer_len) } - } - - /// Returns the length, in bytes, of the buffer `i` (indexed according to the C data interface) - // Rust implementation uses fixed-sized buffers, which require knowledge of their `len`. - // for variable-sized buffers, such as the second buffer of a stringArray, we need - // to fetch offset buffer's len to build the second buffer. - fn buffer_len(&self, i: usize) -> Result { - // Inner type is not important for buffer length. - let data_type = &self.data_type(None)?; - - Ok(match (data_type, i) { - (DataType::Utf8, 1) - | (DataType::LargeUtf8, 1) - | (DataType::Binary, 1) - | (DataType::LargeBinary, 1) - | (DataType::List(_), 1) - | (DataType::LargeList(_), 1) => { - // the len of the offset buffer (buffer 1) equals length + 1 - let bits = bit_width(data_type, i)?; - debug_assert_eq!(bits % 8, 0); - (self.array.length as usize + 1) * (bits / 8) - } - (DataType::Utf8, 2) | (DataType::Binary, 2) | (DataType::List(_), 2) => { - // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) - let len = self.buffer_len(1)?; - // first buffer is the null buffer => add(1) - // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets. - #[allow(clippy::cast_ptr_alignment)] - let offset_buffer = unsafe { - *(self.array.buffers as *mut *const u8).add(1) as *const i32 - }; - // get last offset - (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize - } - (DataType::LargeUtf8, 2) - | (DataType::LargeBinary, 2) - | (DataType::LargeList(_), 2) => { - // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) - let len = self.buffer_len(1)?; - // first buffer is the null buffer => add(1) - // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets. 
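// Editorial note (not part of the original source): worked example of the
// offset arithmetic in buffer_len for a Utf8 array ["a", null, "aaa"] with
// length 3: buffer 1 holds the i32 offsets [0, 1, 1, 4], i.e.
// (length + 1) * 4 = 16 bytes, and buffer 2 holds the 4 value bytes "aaaa",
// i.e. exactly the last offset. The Large* branch here is identical except
// that the offsets are read as i64.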
- #[allow(clippy::cast_ptr_alignment)] - let offset_buffer = unsafe { - *(self.array.buffers as *mut *const u8).add(1) as *const i64 - }; - // get last offset - (unsafe { *offset_buffer.add(len / size_of::() - 1) }) as usize - } - // buffer len of primitive types - _ => { - let bits = bit_width(data_type, i)?; - bit_util::ceil(self.array.length as usize * bits, 8) - } - }) - } - - /// returns all buffers, as organized by Rust (i.e. null buffer is skipped) - pub fn buffers(&self) -> Result> { - (0..self.array.n_buffers - 1) - .map(|index| { - // + 1: skip null buffer - let index = (index + 1) as usize; - - let len = self.buffer_len(index)?; - - unsafe { create_buffer(self.array.clone(), index, len) }.ok_or_else( - || { - ArrowError::CDataInterface(format!( - "The external buffer at position {} is null.", - index - 1 - )) - }, - ) - }) - .collect() - } - - /// returns the child data of this array - pub fn children(&self) -> Result> { - unsafe { create_child_arrays(self.array.clone(), self.schema.clone()) } - } - - /// the length of the array - pub fn len(&self) -> usize { - self.array.length as usize - } - - /// whether the array is empty - pub fn is_empty(&self) -> bool { - self.array.length == 0 - } - - /// the offset of the array - pub fn offset(&self) -> usize { - self.array.offset as usize - } - - /// the null count of the array - pub fn null_count(&self) -> usize { - self.array.null_count as usize - } - - /// the data_type as declared in the schema - pub fn data_type(&self, child_type: Option) -> Result { - to_datatype(self.schema.format(), child_type, self.schema.as_ref()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::{ - make_array, Array, ArrayData, BinaryOffsetSizeTrait, BooleanArray, - GenericBinaryArray, GenericListArray, GenericStringArray, Int32Array, - OffsetSizeTrait, StringOffsetSizeTrait, Time32MillisecondArray, - }; - use crate::compute::kernels; - use crate::datatypes::Field; - use std::convert::TryFrom; - use std::iter::FromIterator; - - #[test] - fn test_round_trip() -> Result<()> { - // create an array natively - let array = Int32Array::from(vec![1, 2, 3]); - - // export it - let array = ArrowArray::try_from(array.data().clone())?; - - // (simulate consumer) import it - let data = ArrayData::try_from(array)?; - let array = make_array(data); - - // perform some operation - let array = array.as_any().downcast_ref::().unwrap(); - let array = kernels::arithmetic::add(&array, &array).unwrap(); - - // verify - assert_eq!(array, Int32Array::from(vec![2, 4, 6])); - - // (drop/release) - Ok(()) - } - // case with nulls is tested in the docs, through the example on this module. 
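    // Illustrative addition (not part of the original test suite): the same
    // round trip as test_round_trip above, but with a null slot, mirroring the
    // example in the module documentation.
    fn round_trip_with_nulls_sketch() -> Result<()> {
        // create an array natively, including a null value
        let array = Int32Array::from(vec![Some(1), None, Some(3)]);

        // export it
        let array = ArrowArray::try_from(array.data().clone())?;

        // (simulate consumer) import it
        let data = ArrayData::try_from(array)?;
        let array = make_array(data);

        // the null buffer survives the round trip
        assert_eq!(array.len(), 3);
        assert_eq!(array.null_count(), 1);
        assert!(array.is_null(1));

        // (drop/release)
        Ok(())
    }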
- - fn test_generic_string() -> Result<()> { - // create an array natively - let array = - GenericStringArray::::from(vec![Some("a"), None, Some("aaa")]); - - // export it - let array = ArrowArray::try_from(array.data().clone())?; - - // (simulate consumer) import it - let data = ArrayData::try_from(array)?; - let array = make_array(data); - - // perform some operation - let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap(); - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - // verify - let expected = GenericStringArray::::from(vec![ - Some("a"), - None, - Some("aaa"), - Some("a"), - None, - Some("aaa"), - ]); - assert_eq!(array, &expected); - - // (drop/release) - Ok(()) - } - - #[test] - fn test_string() -> Result<()> { - test_generic_string::() - } - - #[test] - fn test_large_string() -> Result<()> { - test_generic_string::() - } - - fn test_generic_list() -> Result<()> { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(8) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7])) - .build(); - - // Construct a buffer for value offsets, for the nested array: - // [[0, 1, 2], [3, 4, 5], [6, 7]] - let value_offsets = Buffer::from_iter( - [0usize, 3, 6, 8] - .iter() - .map(|i| Offset::from_usize(*i).unwrap()), - ); - - // Construct a list array from the above two - let list_data_type = match std::mem::size_of::() { - 4 => DataType::List(Box::new(Field::new("item", DataType::Int32, false))), - _ => { - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))) - } - }; - - let list_data = ArrayData::builder(list_data_type) - .len(3) - .add_buffer(value_offsets) - .add_child_data(value_data) - .build(); - - // create an array natively - let array = GenericListArray::::from(list_data.clone()); - - // export it - let array = ArrowArray::try_from(array.data().clone())?; - - // (simulate consumer) import it - let data = ArrayData::try_from(array)?; - let array = make_array(data); - - // downcast - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - dbg!(&array); - - // verify - let expected = GenericListArray::::from(list_data); - assert_eq!(&array.value(0), &expected.value(0)); - assert_eq!(&array.value(1), &expected.value(1)); - assert_eq!(&array.value(2), &expected.value(2)); - - // (drop/release) - Ok(()) - } - - #[test] - fn test_list() -> Result<()> { - test_generic_list::() - } - - #[test] - fn test_large_list() -> Result<()> { - test_generic_list::() - } - - fn test_generic_binary() -> Result<()> { - // create an array natively - let array: Vec> = vec![Some(b"a"), None, Some(b"aaa")]; - let array = GenericBinaryArray::::from(array); - - // export it - let array = ArrowArray::try_from(array.data().clone())?; - - // (simulate consumer) import it - let data = ArrayData::try_from(array)?; - let array = make_array(data); - - // perform some operation - let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap(); - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - - // verify - let expected: Vec> = vec![ - Some(b"a"), - None, - Some(b"aaa"), - Some(b"a"), - None, - Some(b"aaa"), - ]; - let expected = GenericBinaryArray::::from(expected); - assert_eq!(array, &expected); - - // (drop/release) - Ok(()) - } - - #[test] - fn test_binary() -> Result<()> { - test_generic_binary::() - } - - #[test] - fn test_large_binary() -> Result<()> { - test_generic_binary::() - } - - #[test] - fn test_bool() -> Result<()> { - // create an array natively - 
let array = BooleanArray::from(vec![None, Some(true), Some(false)]); - - // export it - let array = ArrowArray::try_from(array.data().clone())?; - - // (simulate consumer) import it - let data = ArrayData::try_from(array)?; - let array = make_array(data); - - // perform some operation - let array = array.as_any().downcast_ref::().unwrap(); - let array = kernels::boolean::not(&array)?; - - // verify - assert_eq!( - array, - BooleanArray::from(vec![None, Some(false), Some(true)]) - ); - - // (drop/release) - Ok(()) - } - - #[test] - fn test_time32() -> Result<()> { - // create an array natively - let array = Time32MillisecondArray::from(vec![None, Some(1), Some(2)]); - - // export it - let array = ArrowArray::try_from(array.data().clone())?; - - // (simulate consumer) import it - let data = ArrayData::try_from(array)?; - let array = make_array(data); - - // perform some operation - let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).unwrap(); - let array = array - .as_any() - .downcast_ref::() - .unwrap(); - - // verify - assert_eq!( - array, - &Time32MillisecondArray::from(vec![ - None, - Some(1), - Some(2), - None, - Some(1), - Some(2) - ]) - ); - - // (drop/release) - Ok(()) - } -} diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs deleted file mode 100644 index 59d4d0b9089..00000000000 --- a/rust/arrow/src/ipc/convert.rs +++ /dev/null @@ -1,871 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Utilities for converting between IPC types and native Arrow types - -use crate::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; -use crate::error::{ArrowError, Result}; -use crate::ipc; - -use flatbuffers::{ - FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, -}; -use std::collections::{BTreeMap, HashMap}; - -use DataType::*; - -/// Serialize a schema in IPC format -pub fn schema_to_fb(schema: &Schema) -> FlatBufferBuilder { - let mut fbb = FlatBufferBuilder::new(); - - let root = schema_to_fb_offset(&mut fbb, schema); - - fbb.finish(root, None); - - fbb -} - -pub fn schema_to_fb_offset<'a>( - fbb: &mut FlatBufferBuilder<'a>, - schema: &Schema, -) -> WIPOffset> { - let mut fields = vec![]; - for field in schema.fields() { - let fb_field = build_field(fbb, field); - fields.push(fb_field); - } - - let mut custom_metadata = vec![]; - for (k, v) in schema.metadata() { - let fb_key_name = fbb.create_string(k.as_str()); - let fb_val_name = fbb.create_string(v.as_str()); - - let mut kv_builder = ipc::KeyValueBuilder::new(fbb); - kv_builder.add_key(fb_key_name); - kv_builder.add_value(fb_val_name); - custom_metadata.push(kv_builder.finish()); - } - - let fb_field_list = fbb.create_vector(&fields); - let fb_metadata_list = fbb.create_vector(&custom_metadata); - - let mut builder = ipc::SchemaBuilder::new(fbb); - builder.add_fields(fb_field_list); - builder.add_custom_metadata(fb_metadata_list); - builder.finish() -} - -/// Convert an IPC Field to Arrow Field -impl<'a> From> for Field { - fn from(field: ipc::Field) -> Field { - let mut arrow_field = if let Some(dictionary) = field.dictionary() { - Field::new_dict( - field.name().unwrap(), - get_data_type(field, true), - field.nullable(), - dictionary.id(), - dictionary.isOrdered(), - ) - } else { - Field::new( - field.name().unwrap(), - get_data_type(field, true), - field.nullable(), - ) - }; - - let mut metadata = None; - if let Some(list) = field.custom_metadata() { - let mut metadata_map = BTreeMap::default(); - for kv in list { - if let (Some(k), Some(v)) = (kv.key(), kv.value()) { - metadata_map.insert(k.to_string(), v.to_string()); - } - } - metadata = Some(metadata_map); - } - - arrow_field.set_metadata(metadata); - arrow_field - } -} - -/// Deserialize a Schema table from IPC format to Schema data type -pub fn fb_to_schema(fb: ipc::Schema) -> Schema { - let mut fields: Vec = vec![]; - let c_fields = fb.fields().unwrap(); - let len = c_fields.len(); - for i in 0..len { - let c_field: ipc::Field = c_fields.get(i); - match c_field.type_type() { - ipc::Type::Decimal if fb.endianness() == ipc::Endianness::Big => { - unimplemented!("Big Endian is not supported for Decimal!") - } - _ => (), - }; - fields.push(c_field.into()); - } - - let mut metadata: HashMap = HashMap::default(); - if let Some(md_fields) = fb.custom_metadata() { - let len = md_fields.len(); - for i in 0..len { - let kv = md_fields.get(i); - let k_str = kv.key(); - let v_str = kv.value(); - if let Some(k) = k_str { - if let Some(v) = v_str { - metadata.insert(k.to_string(), v.to_string()); - } - } - } - } - Schema::new_with_metadata(fields, metadata) -} - -/// Deserialize an IPC message into a schema -pub fn schema_from_bytes(bytes: &[u8]) -> Result { - if let Ok(ipc) = ipc::root_as_message(bytes) { - if let Some(schema) = ipc.header_as_schema().map(fb_to_schema) { - Ok(schema) - } else { - Err(ArrowError::IoError( - "Unable to get head as schema".to_string(), - )) - } - } else { - Err(ArrowError::IoError( - "Unable to get root as 
message".to_string(), - )) - } -} - -/// Get the Arrow data type from the flatbuffer Field table -pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataType { - if let Some(dictionary) = field.dictionary() { - if may_be_dictionary { - let int = dictionary.indexType().unwrap(); - let index_type = match (int.bitWidth(), int.is_signed()) { - (8, true) => DataType::Int8, - (8, false) => DataType::UInt8, - (16, true) => DataType::Int16, - (16, false) => DataType::UInt16, - (32, true) => DataType::Int32, - (32, false) => DataType::UInt32, - (64, true) => DataType::Int64, - (64, false) => DataType::UInt64, - _ => panic!("Unexpected bitwidth and signed"), - }; - return DataType::Dictionary( - Box::new(index_type), - Box::new(get_data_type(field, false)), - ); - } - } - - match field.type_type() { - ipc::Type::Null => DataType::Null, - ipc::Type::Bool => DataType::Boolean, - ipc::Type::Int => { - let int = field.type_as_int().unwrap(); - match (int.bitWidth(), int.is_signed()) { - (8, true) => DataType::Int8, - (8, false) => DataType::UInt8, - (16, true) => DataType::Int16, - (16, false) => DataType::UInt16, - (32, true) => DataType::Int32, - (32, false) => DataType::UInt32, - (64, true) => DataType::Int64, - (64, false) => DataType::UInt64, - z => panic!( - "Int type with bit width of {} and signed of {} not supported", - z.0, z.1 - ), - } - } - ipc::Type::Binary => DataType::Binary, - ipc::Type::LargeBinary => DataType::LargeBinary, - ipc::Type::Utf8 => DataType::Utf8, - ipc::Type::LargeUtf8 => DataType::LargeUtf8, - ipc::Type::FixedSizeBinary => { - let fsb = field.type_as_fixed_size_binary().unwrap(); - DataType::FixedSizeBinary(fsb.byteWidth()) - } - ipc::Type::FloatingPoint => { - let float = field.type_as_floating_point().unwrap(); - match float.precision() { - ipc::Precision::HALF => DataType::Float16, - ipc::Precision::SINGLE => DataType::Float32, - ipc::Precision::DOUBLE => DataType::Float64, - z => panic!("FloatingPoint type with precision of {:?} not supported", z), - } - } - ipc::Type::Date => { - let date = field.type_as_date().unwrap(); - match date.unit() { - ipc::DateUnit::DAY => DataType::Date32, - ipc::DateUnit::MILLISECOND => DataType::Date64, - z => panic!("Date type with unit of {:?} not supported", z), - } - } - ipc::Type::Time => { - let time = field.type_as_time().unwrap(); - match (time.bitWidth(), time.unit()) { - (32, ipc::TimeUnit::SECOND) => DataType::Time32(TimeUnit::Second), - (32, ipc::TimeUnit::MILLISECOND) => { - DataType::Time32(TimeUnit::Millisecond) - } - (64, ipc::TimeUnit::MICROSECOND) => { - DataType::Time64(TimeUnit::Microsecond) - } - (64, ipc::TimeUnit::NANOSECOND) => DataType::Time64(TimeUnit::Nanosecond), - z => panic!( - "Time type with bit width of {} and unit of {:?} not supported", - z.0, z.1 - ), - } - } - ipc::Type::Timestamp => { - let timestamp = field.type_as_timestamp().unwrap(); - let timezone: Option = timestamp.timezone().map(|tz| tz.to_string()); - match timestamp.unit() { - ipc::TimeUnit::SECOND => DataType::Timestamp(TimeUnit::Second, timezone), - ipc::TimeUnit::MILLISECOND => { - DataType::Timestamp(TimeUnit::Millisecond, timezone) - } - ipc::TimeUnit::MICROSECOND => { - DataType::Timestamp(TimeUnit::Microsecond, timezone) - } - ipc::TimeUnit::NANOSECOND => { - DataType::Timestamp(TimeUnit::Nanosecond, timezone) - } - z => panic!("Timestamp type with unit of {:?} not supported", z), - } - } - ipc::Type::Interval => { - let interval = field.type_as_interval().unwrap(); - match interval.unit() { - 
-                ipc::IntervalUnit::YEAR_MONTH => {
-                    DataType::Interval(IntervalUnit::YearMonth)
-                }
-                ipc::IntervalUnit::DAY_TIME => DataType::Interval(IntervalUnit::DayTime),
-                z => panic!("Interval type with unit of {:?} unsupported", z),
-            }
-        }
-        ipc::Type::Duration => {
-            let duration = field.type_as_duration().unwrap();
-            match duration.unit() {
-                ipc::TimeUnit::SECOND => DataType::Duration(TimeUnit::Second),
-                ipc::TimeUnit::MILLISECOND => DataType::Duration(TimeUnit::Millisecond),
-                ipc::TimeUnit::MICROSECOND => DataType::Duration(TimeUnit::Microsecond),
-                ipc::TimeUnit::NANOSECOND => DataType::Duration(TimeUnit::Nanosecond),
-                z => panic!("Duration type with unit of {:?} unsupported", z),
-            }
-        }
-        ipc::Type::List => {
-            let children = field.children().unwrap();
-            if children.len() != 1 {
-                panic!("expect a list to have one child")
-            }
-            DataType::List(Box::new(children.get(0).into()))
-        }
-        ipc::Type::LargeList => {
-            let children = field.children().unwrap();
-            if children.len() != 1 {
-                panic!("expect a large list to have one child")
-            }
-            DataType::LargeList(Box::new(children.get(0).into()))
-        }
-        ipc::Type::FixedSizeList => {
-            let children = field.children().unwrap();
-            if children.len() != 1 {
-                panic!("expect a list to have one child")
-            }
-            let fsl = field.type_as_fixed_size_list().unwrap();
-            DataType::FixedSizeList(Box::new(children.get(0).into()), fsl.listSize())
-        }
-        ipc::Type::Struct_ => {
-            let mut fields = vec![];
-            if let Some(children) = field.children() {
-                for i in 0..children.len() {
-                    fields.push(children.get(i).into());
-                }
-            };
-
-            DataType::Struct(fields)
-        }
-        ipc::Type::Decimal => {
-            let fsb = field.type_as_decimal().unwrap();
-            DataType::Decimal(fsb.precision() as usize, fsb.scale() as usize)
-        }
-        t => unimplemented!("Type {:?} not supported", t),
-    }
-}
-
-pub(crate) struct FBFieldType<'b> {
-    pub(crate) type_type: ipc::Type,
-    pub(crate) type_: WIPOffset<UnionWIPOffset>,
-    pub(crate) children: Option<WIPOffset<Vector<'b, ForwardsUOffset<ipc::Field<'b>>>>>,
-}
-
-/// Create an IPC Field from an Arrow Field
-pub(crate) fn build_field<'a>(
-    fbb: &mut FlatBufferBuilder<'a>,
-    field: &Field,
-) -> WIPOffset<ipc::Field<'a>> {
-    // Optional custom metadata.
-    let mut fb_metadata = None;
-    if let Some(metadata) = field.metadata() {
-        if !metadata.is_empty() {
-            let mut kv_vec = vec![];
-            for (k, v) in metadata {
-                let kv_args = ipc::KeyValueArgs {
-                    key: Some(fbb.create_string(k.as_str())),
-                    value: Some(fbb.create_string(v.as_str())),
-                };
-                let kv_offset = ipc::KeyValue::create(fbb, &kv_args);
-                kv_vec.push(kv_offset);
-            }
-            fb_metadata = Some(fbb.create_vector(&kv_vec));
-        }
-    };
-
-    let fb_field_name = fbb.create_string(field.name().as_str());
-    let field_type = get_fb_field_type(field.data_type(), field.is_nullable(), fbb);
-
-    let fb_dictionary = if let Dictionary(index_type, _) = field.data_type() {
-        Some(get_fb_dictionary(
-            index_type,
-            field
-                .dict_id()
-                .expect("All Dictionary types have `dict_id`"),
-            field
-                .dict_is_ordered()
-                .expect("All Dictionary types have `dict_is_ordered`"),
-            fbb,
-        ))
-    } else {
-        None
-    };
-
-    let mut field_builder = ipc::FieldBuilder::new(fbb);
-    field_builder.add_name(fb_field_name);
-    if let Some(dictionary) = fb_dictionary {
-        field_builder.add_dictionary(dictionary)
-    }
-    field_builder.add_type_type(field_type.type_type);
-    field_builder.add_nullable(field.is_nullable());
-    match field_type.children {
-        None => {}
-        Some(children) => field_builder.add_children(children),
-    };
-    field_builder.add_type_(field_type.type_);
-
-    if let Some(fb_metadata) = fb_metadata {
-        field_builder.add_custom_metadata(fb_metadata);
-    }
-
-    field_builder.finish()
-}
-
-/// Get the IPC type of a data type
-pub(crate) fn get_fb_field_type<'a>(
-    data_type: &DataType,
-    is_nullable: bool,
-    fbb: &mut FlatBufferBuilder<'a>,
-) -> FBFieldType<'a> {
-    // some IPC implementations expect an empty list for child data, instead of a null value.
-    // An empty field list is thus returned for primitive types
-    let empty_fields: Vec<WIPOffset<ipc::Field>> = vec![];
-    match data_type {
-        Null => FBFieldType {
-            type_type: ipc::Type::Null,
-            type_: ipc::NullBuilder::new(fbb).finish().as_union_value(),
-            children: Some(fbb.create_vector(&empty_fields[..])),
-        },
-        Boolean => FBFieldType {
-            type_type: ipc::Type::Bool,
-            type_: ipc::BoolBuilder::new(fbb).finish().as_union_value(),
-            children: Some(fbb.create_vector(&empty_fields[..])),
-        },
-        UInt8 | UInt16 | UInt32 | UInt64 => {
-            let children = fbb.create_vector(&empty_fields[..]);
-            let mut builder = ipc::IntBuilder::new(fbb);
-            builder.add_is_signed(false);
-            match data_type {
-                UInt8 => builder.add_bitWidth(8),
-                UInt16 => builder.add_bitWidth(16),
-                UInt32 => builder.add_bitWidth(32),
-                UInt64 => builder.add_bitWidth(64),
-                _ => {}
-            };
-            FBFieldType {
-                type_type: ipc::Type::Int,
-                type_: builder.finish().as_union_value(),
-                children: Some(children),
-            }
-        }
-        Int8 | Int16 | Int32 | Int64 => {
-            let children = fbb.create_vector(&empty_fields[..]);
-            let mut builder = ipc::IntBuilder::new(fbb);
-            builder.add_is_signed(true);
-            match data_type {
-                Int8 => builder.add_bitWidth(8),
-                Int16 => builder.add_bitWidth(16),
-                Int32 => builder.add_bitWidth(32),
-                Int64 => builder.add_bitWidth(64),
-                _ => {}
-            };
-            FBFieldType {
-                type_type: ipc::Type::Int,
-                type_: builder.finish().as_union_value(),
-                children: Some(children),
-            }
-        }
-        Float16 | Float32 | Float64 => {
-            let children = fbb.create_vector(&empty_fields[..]);
-            let mut builder = ipc::FloatingPointBuilder::new(fbb);
-            match data_type {
-                Float16 => builder.add_precision(ipc::Precision::HALF),
-                Float32 => builder.add_precision(ipc::Precision::SINGLE),
-                Float64 => builder.add_precision(ipc::Precision::DOUBLE),
-                _ => {}
-            };
-            FBFieldType {
-                type_type: ipc::Type::FloatingPoint,
-                type_: builder.finish().as_union_value(),
-                children: Some(children),
-            }
-        }
-        Binary => FBFieldType {
-            type_type: ipc::Type::Binary,
-            type_: ipc::BinaryBuilder::new(fbb).finish().as_union_value(),
-            children: Some(fbb.create_vector(&empty_fields[..])),
-        },
-        LargeBinary => FBFieldType {
-            type_type: ipc::Type::LargeBinary,
-            type_: ipc::LargeBinaryBuilder::new(fbb).finish().as_union_value(),
-            children: Some(fbb.create_vector(&empty_fields[..])),
-        },
-        Utf8 => FBFieldType {
-            type_type: ipc::Type::Utf8,
-            type_: ipc::Utf8Builder::new(fbb).finish().as_union_value(),
-            children: Some(fbb.create_vector(&empty_fields[..])),
-        },
-        LargeUtf8 => FBFieldType {
-            type_type: ipc::Type::LargeUtf8,
-            type_: ipc::LargeUtf8Builder::new(fbb).finish().as_union_value(),
-            children: Some(fbb.create_vector(&empty_fields[..])),
-        },
-        FixedSizeBinary(len) => {
-            let mut builder = ipc::FixedSizeBinaryBuilder::new(fbb);
-            builder.add_byteWidth(*len as i32);
-            FBFieldType {
-                type_type: ipc::Type::FixedSizeBinary,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&empty_fields[..])),
-            }
-        }
-        Date32 => {
-            let mut builder = ipc::DateBuilder::new(fbb);
-            builder.add_unit(ipc::DateUnit::DAY);
-            FBFieldType {
-                type_type: ipc::Type::Date,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&empty_fields[..])),
-            }
-        }
-        Date64 => {
-            let mut builder = ipc::DateBuilder::new(fbb);
-            builder.add_unit(ipc::DateUnit::MILLISECOND);
-            FBFieldType {
-                type_type: ipc::Type::Date,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&empty_fields[..])),
-            }
-        }
-        Time32(unit) | Time64(unit) => {
-            let mut builder = ipc::TimeBuilder::new(fbb);
-            match unit {
-                TimeUnit::Second => {
-                    builder.add_bitWidth(32);
-                    builder.add_unit(ipc::TimeUnit::SECOND);
-                }
-                TimeUnit::Millisecond => {
-                    builder.add_bitWidth(32);
-                    builder.add_unit(ipc::TimeUnit::MILLISECOND);
-                }
-                TimeUnit::Microsecond => {
-                    builder.add_bitWidth(64);
-                    builder.add_unit(ipc::TimeUnit::MICROSECOND);
-                }
-                TimeUnit::Nanosecond => {
-                    builder.add_bitWidth(64);
-                    builder.add_unit(ipc::TimeUnit::NANOSECOND);
-                }
-            }
-            FBFieldType {
-                type_type: ipc::Type::Time,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&empty_fields[..])),
-            }
-        }
-        Timestamp(unit, tz) => {
-            let tz = tz.clone().unwrap_or_else(String::new);
-            let tz_str = fbb.create_string(tz.as_str());
-            let mut builder = ipc::TimestampBuilder::new(fbb);
-            let time_unit = match unit {
-                TimeUnit::Second => ipc::TimeUnit::SECOND,
-                TimeUnit::Millisecond => ipc::TimeUnit::MILLISECOND,
-                TimeUnit::Microsecond => ipc::TimeUnit::MICROSECOND,
-                TimeUnit::Nanosecond => ipc::TimeUnit::NANOSECOND,
-            };
-            builder.add_unit(time_unit);
-            if !tz.is_empty() {
-                builder.add_timezone(tz_str);
-            }
-            FBFieldType {
-                type_type: ipc::Type::Timestamp,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&empty_fields[..])),
-            }
-        }
-        Interval(unit) => {
-            let mut builder = ipc::IntervalBuilder::new(fbb);
-            let interval_unit = match unit {
-                IntervalUnit::YearMonth => ipc::IntervalUnit::YEAR_MONTH,
-                IntervalUnit::DayTime => ipc::IntervalUnit::DAY_TIME,
-            };
-            builder.add_unit(interval_unit);
-            FBFieldType {
-                type_type: ipc::Type::Interval,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&empty_fields[..])),
-            }
-        }
-        Duration(unit) => {
-            let mut builder = ipc::DurationBuilder::new(fbb);
-            let time_unit = match unit {
-                TimeUnit::Second => ipc::TimeUnit::SECOND,
-                TimeUnit::Millisecond => ipc::TimeUnit::MILLISECOND,
-                TimeUnit::Microsecond => ipc::TimeUnit::MICROSECOND,
-                TimeUnit::Nanosecond => ipc::TimeUnit::NANOSECOND,
-            };
-            builder.add_unit(time_unit);
-            FBFieldType {
-                type_type: ipc::Type::Duration,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&empty_fields[..])),
-            }
-        }
-        List(ref list_type) => {
-            let child = build_field(fbb, list_type);
-            FBFieldType {
-                type_type: ipc::Type::List,
-                type_: ipc::ListBuilder::new(fbb).finish().as_union_value(),
-                children: Some(fbb.create_vector(&[child])),
-            }
-        }
-        LargeList(ref list_type) => {
-            let child = build_field(fbb, list_type);
-            FBFieldType {
-                type_type: ipc::Type::LargeList,
-                type_: ipc::LargeListBuilder::new(fbb).finish().as_union_value(),
-                children: Some(fbb.create_vector(&[child])),
-            }
-        }
-        FixedSizeList(ref list_type, len) => {
-            let child = build_field(fbb, list_type);
-            let mut builder = ipc::FixedSizeListBuilder::new(fbb);
-            builder.add_listSize(*len as i32);
-            FBFieldType {
-                type_type: ipc::Type::FixedSizeList,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&[child])),
-            }
-        }
-        Struct(fields) => {
-            // struct's fields are children
-            let mut children = vec![];
-            for field in fields {
-                let inner_types =
-                    get_fb_field_type(field.data_type(), field.is_nullable(), fbb);
-                let field_name = fbb.create_string(field.name());
-                children.push(ipc::Field::create(
-                    fbb,
-                    &ipc::FieldArgs {
-                        name: Some(field_name),
-                        nullable: field.is_nullable(),
-                        type_type: inner_types.type_type,
-                        type_: Some(inner_types.type_),
-                        dictionary: None,
-                        children: inner_types.children,
-                        custom_metadata: None,
-                    },
-                ));
-            }
-            FBFieldType {
-                type_type: ipc::Type::Struct_,
-                type_: ipc::Struct_Builder::new(fbb).finish().as_union_value(),
-                children: Some(fbb.create_vector(&children[..])),
-            }
-        }
-        Dictionary(_, value_type) => {
-            // In this library, the dictionary "type" is a logical construct. Here we
-            // pass through to the value type, as we've already captured the index
-            // type in the DictionaryEncoding metadata in the parent field
-            get_fb_field_type(value_type, is_nullable, fbb)
-        }
-        Decimal(precision, scale) => {
-            let mut builder = ipc::DecimalBuilder::new(fbb);
-            builder.add_precision(*precision as i32);
-            builder.add_scale(*scale as i32);
-            builder.add_bitWidth(128);
-            FBFieldType {
-                type_type: ipc::Type::Decimal,
-                type_: builder.finish().as_union_value(),
-                children: Some(fbb.create_vector(&empty_fields[..])),
-            }
-        }
-        t => unimplemented!("Type {:?} not supported", t),
-    }
-}
-
-/// Create an IPC dictionary encoding
-pub(crate) fn get_fb_dictionary<'a>(
-    index_type: &DataType,
-    dict_id: i64,
-    dict_is_ordered: bool,
-    fbb: &mut FlatBufferBuilder<'a>,
-) -> WIPOffset<ipc::DictionaryEncoding<'a>> {
-    // We assume that the dictionary index type (as an integer) has already been
-    // validated elsewhere, and can safely assume we are dealing with integers
-    let mut index_builder = ipc::IntBuilder::new(fbb);
-
-    match *index_type {
-        Int8 | Int16 | Int32 | Int64 => index_builder.add_is_signed(true),
-        UInt8 | UInt16 | UInt32 | UInt64 => index_builder.add_is_signed(false),
-        _ => {}
-    }
-
-    match *index_type {
-        Int8 | UInt8 => index_builder.add_bitWidth(8),
-        Int16 | UInt16 => index_builder.add_bitWidth(16),
-        Int32 | UInt32 => index_builder.add_bitWidth(32),
-        Int64 | UInt64 => index_builder.add_bitWidth(64),
-        _ => {}
-    }
-
-    let index_builder = index_builder.finish();
-
-    let mut builder = ipc::DictionaryEncodingBuilder::new(fbb);
-    builder.add_id(dict_id);
-    builder.add_indexType(index_builder);
-    builder.add_isOrdered(dict_is_ordered);
-
-    builder.finish()
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::datatypes::{DataType, Field, Schema};
-
-    #[test]
-    fn convert_schema_round_trip() {
-        let md: HashMap<String, String> = [("Key".to_string(), "value".to_string())]
-            .iter()
-            .cloned()
-            .collect();
-        let field_md: BTreeMap<String, String> = [("k".to_string(), "v".to_string())]
-            .iter()
-            .cloned()
-            .collect();
-        let schema = Schema::new_with_metadata(
-            vec![
-                {
-                    let mut f = Field::new("uint8", DataType::UInt8, false);
-                    f.set_metadata(Some(field_md));
-                    f
-                },
-                Field::new("uint16", DataType::UInt16, true),
-                Field::new("uint32", DataType::UInt32, false),
-                Field::new("uint64", DataType::UInt64, true),
-                Field::new("int8", DataType::Int8, true),
-                Field::new("int16", DataType::Int16, false),
-                Field::new("int32", DataType::Int32, true),
-                Field::new("int64", DataType::Int64, false),
-                Field::new("float16", DataType::Float16, true),
-                Field::new("float32", DataType::Float32, false),
-                Field::new("float64", DataType::Float64, true),
-                Field::new("null", DataType::Null, false),
-                Field::new("bool", DataType::Boolean, false),
-                Field::new("date32", DataType::Date32, false),
-                Field::new("date64", DataType::Date64, true),
-                Field::new("time32[s]", DataType::Time32(TimeUnit::Second), true),
-                Field::new("time32[ms]", DataType::Time32(TimeUnit::Millisecond), false),
-                Field::new("time64[us]", DataType::Time64(TimeUnit::Microsecond), false),
-                Field::new("time64[ns]", DataType::Time64(TimeUnit::Nanosecond), true),
-                Field::new(
-                    "timestamp[s]",
-                    DataType::Timestamp(TimeUnit::Second, None),
-                    false,
-                ),
-                Field::new(
-                    "timestamp[ms]",
-                    DataType::Timestamp(TimeUnit::Millisecond, None),
-                    true,
-                ),
-                Field::new(
-                    "timestamp[us]",
-                    DataType::Timestamp(
-                        TimeUnit::Microsecond,
-                        Some("Africa/Johannesburg".to_string()),
-                    ),
-                    false,
-                ),
-                Field::new(
-                    "timestamp[ns]",
-                    DataType::Timestamp(TimeUnit::Nanosecond, None),
-                    true,
-                ),
-                Field::new(
-                    "interval[ym]",
-                    DataType::Interval(IntervalUnit::YearMonth),
-                    true,
-                ),
-                Field::new(
-                    "interval[dt]",
-                    DataType::Interval(IntervalUnit::DayTime),
-                    true,
-                ),
-                Field::new("utf8", DataType::Utf8, false),
-                Field::new("binary", DataType::Binary, false),
-                Field::new(
-                    "list[u8]",
-                    DataType::List(Box::new(Field::new("item", DataType::UInt8, false))),
-                    true,
-                ),
-                Field::new(
-                    "list[struct]",
-                    DataType::List(Box::new(Field::new(
-                        "struct",
-                        DataType::Struct(vec![
-                            Field::new("float32", DataType::UInt8, false),
-                            Field::new("int32", DataType::Int32, true),
-                            Field::new("bool", DataType::Boolean, true),
-                        ]),
-                        true,
-                    ))),
-                    false,
-                ),
-                Field::new(
-                    "struct<int64, list[struct<date32, list[struct<>]>]>",
-                    DataType::Struct(vec![
-                        Field::new("int64", DataType::Int64, true),
-                        Field::new(
-                            "list[struct<date32, list[struct<>]>]",
-                            DataType::List(Box::new(Field::new(
-                                "struct",
-                                DataType::Struct(vec![
-                                    Field::new("date32", DataType::Date32, true),
-                                    Field::new(
-                                        "list[struct<>]",
-                                        DataType::List(Box::new(Field::new(
-                                            "struct",
-                                            DataType::Struct(vec![]),
-                                            false,
-                                        ))),
-                                        false,
-                                    ),
-                                ]),
-                                false,
-                            ))),
-                            false,
-                        ),
-                    ]),
-                    false,
-                ),
-                Field::new("struct<>", DataType::Struct(vec![]), true),
-                Field::new_dict(
-                    "dictionary",
-                    DataType::Dictionary(
-                        Box::new(DataType::Int32),
-                        Box::new(DataType::Utf8),
-                    ),
-                    true,
-                    123,
-                    true,
-                ),
-                Field::new_dict(
-                    "dictionary",
-                    DataType::Dictionary(
-                        Box::new(DataType::UInt8),
-                        Box::new(DataType::UInt32),
-                    ),
-                    true,
-                    123,
-                    true,
-                ),
-                Field::new("decimal", DataType::Decimal(10, 6), false),
-            ],
-            md,
-        );
-
-        let fb = schema_to_fb(&schema);
-
-        // read back fields
-        let ipc = ipc::root_as_schema(fb.finished_data()).unwrap();
-        let schema2 = fb_to_schema(ipc);
-        assert_eq!(schema, schema2);
-    }
-
-    #[test]
-    fn schema_from_bytes() {
-        // bytes of a schema generated from python (0.14.0), saved as an `ipc::Message`.
-        // the schema is: Field("field1", DataType::UInt32, false)
-        let bytes: Vec<u8> = vec![
-            16, 0, 0, 0, 0, 0, 10, 0, 12, 0, 6, 0, 5, 0, 8, 0, 10, 0, 0, 0, 0, 1, 3, 0,
-            12, 0, 0, 0, 8, 0, 8, 0, 0, 0, 4, 0, 8, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 20,
-            0, 0, 0, 16, 0, 20, 0, 8, 0, 0, 0, 7, 0, 12, 0, 0, 0, 16, 0, 16, 0, 0, 0, 0,
-            0, 0, 2, 32, 0, 0, 0, 20, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 8, 0,
-            4, 0, 6, 0, 0, 0, 32, 0, 0, 0, 6, 0, 0, 0, 102, 105, 101, 108, 100, 49, 0, 0,
-            0, 0, 0, 0,
-        ];
-        let ipc = ipc::root_as_message(&bytes[..]).unwrap();
-        let schema = ipc.header_as_schema().unwrap();
-
-        // a message generated from Rust, same as the Python one
-        let bytes: Vec<u8> = vec![
-            16, 0, 0, 0, 0, 0, 10, 0, 14, 0, 12, 0, 11, 0, 4, 0, 10, 0, 0, 0, 20, 0, 0,
-            0, 0, 0, 0, 1, 3, 0, 10, 0, 12, 0, 0, 0, 8, 0, 4, 0, 10, 0, 0, 0, 8, 0, 0, 0,
-            8, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 16, 0, 0, 0, 12, 0, 18, 0, 12, 0, 0, 0,
-            11, 0, 4, 0, 12, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 2, 20, 0, 0, 0, 0, 0, 6, 0,
-            8, 0, 4, 0, 6, 0, 0, 0, 32, 0, 0, 0, 6, 0, 0, 0, 102, 105, 101, 108, 100, 49,
-            0, 0,
-        ];
-        let ipc2 = ipc::root_as_message(&bytes[..]).unwrap();
-        let schema2 = ipc.header_as_schema().unwrap();
-
-        assert_eq!(schema, schema2);
-        assert_eq!(ipc.version(), ipc2.version());
-        assert_eq!(ipc.header_type(), ipc2.header_type());
-        assert_eq!(ipc.bodyLength(), ipc2.bodyLength());
-        assert!(ipc.custom_metadata().is_none());
-        assert!(ipc2.custom_metadata().is_none());
-    }
-}
diff --git a/rust/arrow/src/ipc/gen/File.rs b/rust/arrow/src/ipc/gen/File.rs
deleted file mode 100644
index 04cbc644137..00000000000
--- a/rust/arrow/src/ipc/gen/File.rs
+++ /dev/null
@@ -1,491 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#![allow(dead_code)]
-#![allow(unused_imports)]
-
-use crate::ipc::gen::Schema::*;
-use flatbuffers::EndianScalar;
-use std::{cmp::Ordering, mem};
-// automatically generated by the FlatBuffers compiler, do not modify
-
-// struct Block, aligned to 8
-#[repr(transparent)]
-#[derive(Clone, Copy, PartialEq)]
-pub struct Block(pub [u8; 24]);
-impl std::fmt::Debug for Block {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        f.debug_struct("Block")
-            .field("offset", &self.offset())
-            .field("metaDataLength", &self.metaDataLength())
-            .field("bodyLength", &self.bodyLength())
-            .finish()
-    }
-}
-
-impl flatbuffers::SimpleToVerifyInSlice for Block {}
-impl flatbuffers::SafeSliceAccess for Block {}
-impl<'a> flatbuffers::Follow<'a> for Block {
-    type Inner = &'a Block;
-    #[inline]
-    fn follow(buf: &'a [u8], loc: usize) -> Self::Inner {
-        <&'a Block>::follow(buf, loc)
-    }
-}
-impl<'a> flatbuffers::Follow<'a> for &'a Block {
-    type Inner = &'a Block;
-    #[inline]
-    fn follow(buf: &'a [u8], loc: usize) -> Self::Inner {
-        flatbuffers::follow_cast_ref::<Block>(buf, loc)
-    }
-}
-impl<'b> flatbuffers::Push for Block {
-    type Output = Block;
-    #[inline]
-    fn push(&self, dst: &mut [u8], _rest: &[u8]) {
-        let src = unsafe {
-            ::std::slice::from_raw_parts(self as *const Block as *const u8, Self::size())
-        };
-        dst.copy_from_slice(src);
-    }
-}
-impl<'b> flatbuffers::Push for &'b Block {
-    type Output = Block;
-
-    #[inline]
-    fn push(&self, dst: &mut [u8], _rest: &[u8]) {
-        let src = unsafe {
-            ::std::slice::from_raw_parts(*self as *const Block as *const u8, Self::size())
-        };
-        dst.copy_from_slice(src);
-    }
-}
-
-impl<'a> flatbuffers::Verifiable for Block {
-    #[inline]
-    fn run_verifier(
-        v: &mut flatbuffers::Verifier,
-        pos: usize,
-    ) -> Result<(), flatbuffers::InvalidFlatbuffer> {
-        use flatbuffers::Verifiable;
-        v.in_buffer::<Self>(pos)
-    }
-}
-impl Block {
-    #[allow(clippy::too_many_arguments)]
-    pub fn new(offset: i64, metaDataLength: i32, bodyLength: i64) -> Self {
-        let mut s = Self([0; 24]);
-        s.set_offset(offset);
-        s.set_metaDataLength(metaDataLength);
-        s.set_bodyLength(bodyLength);
-        s
-    }
-
-    /// Index to the start of the RecordBlock (note this is past the Message header)
-    pub fn offset(&self) -> i64 {
-        let mut mem = core::mem::MaybeUninit::<i64>::uninit();
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                self.0[0..].as_ptr(),
-                mem.as_mut_ptr() as *mut u8,
-                core::mem::size_of::<i64>(),
-            );
-            mem.assume_init()
-        }
-        .from_little_endian()
-    }
-
-    pub fn set_offset(&mut self, x: i64) {
-        let x_le = x.to_little_endian();
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                &x_le as *const i64 as *const u8,
-                self.0[0..].as_mut_ptr(),
-                core::mem::size_of::<i64>(),
-            );
-        }
-    }
-
-    /// Length of the metadata
-    pub fn metaDataLength(&self) -> i32 {
-        let mut mem = core::mem::MaybeUninit::<i32>::uninit();
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                self.0[8..].as_ptr(),
-                mem.as_mut_ptr() as *mut u8,
-                core::mem::size_of::<i32>(),
-            );
-            mem.assume_init()
-        }
-        .from_little_endian()
-    }
-
-    pub fn set_metaDataLength(&mut self, x: i32) {
-        let x_le = x.to_little_endian();
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                &x_le as *const i32 as *const u8,
-                self.0[8..].as_mut_ptr(),
-                core::mem::size_of::<i32>(),
-            );
-        }
-    }
-
-    /// Length of the data (this is aligned so there can be a gap between this and
-    /// the metadata).
-    pub fn bodyLength(&self) -> i64 {
-        let mut mem = core::mem::MaybeUninit::<i64>::uninit();
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                self.0[16..].as_ptr(),
-                mem.as_mut_ptr() as *mut u8,
-                core::mem::size_of::<i64>(),
-            );
-            mem.assume_init()
-        }
-        .from_little_endian()
-    }
-
-    pub fn set_bodyLength(&mut self, x: i64) {
-        let x_le = x.to_little_endian();
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                &x_le as *const i64 as *const u8,
-                self.0[16..].as_mut_ptr(),
-                core::mem::size_of::<i64>(),
-            );
-        }
-    }
-}
-
-pub enum FooterOffset {}
-#[derive(Copy, Clone, PartialEq)]
-
-/// ----------------------------------------------------------------------
-/// Arrow File metadata
-///
-pub struct Footer<'a> {
-    pub _tab: flatbuffers::Table<'a>,
-}
-
-impl<'a> flatbuffers::Follow<'a> for Footer<'a> {
-    type Inner = Footer<'a>;
-    #[inline]
-    fn follow(buf: &'a [u8], loc: usize) -> Self::Inner {
-        Self {
-            _tab: flatbuffers::Table { buf, loc },
-        }
-    }
-}
-
-impl<'a> Footer<'a> {
-    #[inline]
-    pub fn init_from_table(table: flatbuffers::Table<'a>) -> Self {
-        Footer { _tab: table }
-    }
-    #[allow(unused_mut)]
-    pub fn create<'bldr: 'args, 'args: 'mut_bldr, 'mut_bldr>(
-        _fbb: &'mut_bldr mut flatbuffers::FlatBufferBuilder<'bldr>,
-        args: &'args FooterArgs<'args>,
-    ) -> flatbuffers::WIPOffset<Footer<'bldr>> {
-        let mut builder = FooterBuilder::new(_fbb);
-        if let Some(x) = args.custom_metadata {
-            builder.add_custom_metadata(x);
-        }
-        if let Some(x) = args.recordBatches {
-            builder.add_recordBatches(x);
-        }
-        if let Some(x) = args.dictionaries {
-            builder.add_dictionaries(x);
-        }
-        if let Some(x) = args.schema {
-            builder.add_schema(x);
-        }
-        builder.add_version(args.version);
-        builder.finish()
-    }
-
-    pub const VT_VERSION: flatbuffers::VOffsetT = 4;
-    pub const VT_SCHEMA: flatbuffers::VOffsetT = 6;
-    pub const VT_DICTIONARIES: flatbuffers::VOffsetT = 8;
-    pub const VT_RECORDBATCHES: flatbuffers::VOffsetT = 10;
-    pub const VT_CUSTOM_METADATA: flatbuffers::VOffsetT = 12;
-
-    #[inline]
-    pub fn version(&self) -> MetadataVersion {
-        self._tab
-            .get::<MetadataVersion>(Footer::VT_VERSION, Some(MetadataVersion::V1))
-            .unwrap()
-    }
-    #[inline]
-    pub fn schema(&self) -> Option<Schema<'a>> {
-        self._tab
-            .get::<flatbuffers::ForwardsUOffset<Schema>>(Footer::VT_SCHEMA, None)
-    }
-    #[inline]
-    pub fn dictionaries(&self) -> Option<&'a [Block]> {
-        self._tab
-            .get::<flatbuffers::ForwardsUOffset<flatbuffers::Vector<'a, Block>>>(
-                Footer::VT_DICTIONARIES,
-                None,
-            )
-            .map(|v| v.safe_slice())
-    }
-    #[inline]
-    pub fn recordBatches(&self) -> Option<&'a [Block]> {
-        self._tab
-            .get::<flatbuffers::ForwardsUOffset<flatbuffers::Vector<'a, Block>>>(
-                Footer::VT_RECORDBATCHES,
-                None,
-            )
-            .map(|v| v.safe_slice())
-    }
-    /// User-defined metadata
-    #[inline]
-    pub fn custom_metadata(
-        &self,
-    ) -> Option<flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset<KeyValue<'a>>>> {
-        self._tab.get::<flatbuffers::ForwardsUOffset<
-            flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset<KeyValue<'a>>>,
-        >>(Footer::VT_CUSTOM_METADATA, None)
-    }
-}
-
-impl flatbuffers::Verifiable for Footer<'_> {
-    #[inline]
-    fn run_verifier(
-        v: &mut flatbuffers::Verifier,
-        pos: usize,
-    ) -> Result<(), flatbuffers::InvalidFlatbuffer> {
-        use flatbuffers::Verifiable;
-        v.visit_table(pos)?
-            .visit_field::<MetadataVersion>(&"version", Self::VT_VERSION, false)?
-            .visit_field::<flatbuffers::ForwardsUOffset<Schema>>(
-                &"schema",
-                Self::VT_SCHEMA,
-                false,
-            )?
-            .visit_field::<flatbuffers::ForwardsUOffset<flatbuffers::Vector<'_, Block>>>(
-                &"dictionaries",
-                Self::VT_DICTIONARIES,
-                false,
-            )?
-            .visit_field::<flatbuffers::ForwardsUOffset<flatbuffers::Vector<'_, Block>>>(
-                &"recordBatches",
-                Self::VT_RECORDBATCHES,
-                false,
-            )?
-            .visit_field::<flatbuffers::ForwardsUOffset<
-                flatbuffers::Vector<'_, flatbuffers::ForwardsUOffset<KeyValue<'_>>>,
-            >>(&"custom_metadata", Self::VT_CUSTOM_METADATA, false)?
-            .finish();
-        Ok(())
-    }
-}
-pub struct FooterArgs<'a> {
-    pub version: MetadataVersion,
-    pub schema: Option<flatbuffers::WIPOffset<Schema<'a>>>,
-    pub dictionaries: Option<flatbuffers::WIPOffset<flatbuffers::Vector<'a, Block>>>,
-    pub recordBatches: Option<flatbuffers::WIPOffset<flatbuffers::Vector<'a, Block>>>,
-    pub custom_metadata: Option<
-        flatbuffers::WIPOffset<
-            flatbuffers::Vector<'a, flatbuffers::ForwardsUOffset<KeyValue<'a>>>,
-        >,
-    >,
-}
-impl<'a> Default for FooterArgs<'a> {
-    #[inline]
-    fn default() -> Self {
-        FooterArgs {
-            version: MetadataVersion::V1,
-            schema: None,
-            dictionaries: None,
-            recordBatches: None,
-            custom_metadata: None,
-        }
-    }
-}
-pub struct FooterBuilder<'a: 'b, 'b> {
-    fbb_: &'b mut flatbuffers::FlatBufferBuilder<'a>,
-    start_: flatbuffers::WIPOffset<flatbuffers::TableUnfinishedWIPOffset>,
-}
-impl<'a: 'b, 'b> FooterBuilder<'a, 'b> {
-    #[inline]
-    pub fn add_version(&mut self, version: MetadataVersion) {
-        self.fbb_.push_slot::<MetadataVersion>(
-            Footer::VT_VERSION,
-            version,
-            MetadataVersion::V1,
-        );
-    }
-    #[inline]
-    pub fn add_schema(&mut self, schema: flatbuffers::WIPOffset<Schema<'b>>) {
-        self.fbb_
-            .push_slot_always::<flatbuffers::WIPOffset<Schema>>(
-                Footer::VT_SCHEMA,
-                schema,
-            );
-    }
-    #[inline]
-    pub fn add_dictionaries(
-        &mut self,
-        dictionaries: flatbuffers::WIPOffset<flatbuffers::Vector<'b, Block>>,
-    ) {
-        self.fbb_.push_slot_always::<flatbuffers::WIPOffset<_>>(
-            Footer::VT_DICTIONARIES,
-            dictionaries,
-        );
-    }
-    #[inline]
-    pub fn add_recordBatches(
-        &mut self,
-        recordBatches: flatbuffers::WIPOffset<flatbuffers::Vector<'b, Block>>,
-    ) {
-        self.fbb_.push_slot_always::<flatbuffers::WIPOffset<_>>(
-            Footer::VT_RECORDBATCHES,
-            recordBatches,
-        );
-    }
-    #[inline]
-    pub fn add_custom_metadata(
-        &mut self,
-        custom_metadata: flatbuffers::WIPOffset<
-            flatbuffers::Vector<'b, flatbuffers::ForwardsUOffset<KeyValue<'b>>>,
-        >,
-    ) {
-        self.fbb_.push_slot_always::<flatbuffers::WIPOffset<_>>(
-            Footer::VT_CUSTOM_METADATA,
-            custom_metadata,
-        );
-    }
-    #[inline]
-    pub fn new(
-        _fbb: &'b mut flatbuffers::FlatBufferBuilder<'a>,
-    ) -> FooterBuilder<'a, 'b> {
-        let start = _fbb.start_table();
-        FooterBuilder {
-            fbb_: _fbb,
-            start_: start,
-        }
-    }
-    #[inline]
-    pub fn finish(self) -> flatbuffers::WIPOffset<Footer<'a>> {
-        let o = self.fbb_.end_table(self.start_);
-        flatbuffers::WIPOffset::new(o.value())
-    }
-}
-
-impl std::fmt::Debug for Footer<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let mut ds = f.debug_struct("Footer");
-        ds.field("version", &self.version());
-        ds.field("schema", &self.schema());
-        ds.field("dictionaries", &self.dictionaries());
-        ds.field("recordBatches", &self.recordBatches());
-        ds.field("custom_metadata", &self.custom_metadata());
-        ds.finish()
-    }
-}
-#[inline]
-#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")]
-pub fn get_root_as_footer<'a>(buf: &'a [u8]) -> Footer<'a> {
-    unsafe { flatbuffers::root_unchecked::<Footer<'a>>(buf) }
-}
-
-#[inline]
-#[deprecated(since = "2.0.0", note = "Deprecated in favor of `root_as...` methods.")]
-pub fn get_size_prefixed_root_as_footer<'a>(buf: &'a [u8]) -> Footer<'a> {
-    unsafe { flatbuffers::size_prefixed_root_unchecked::<Footer<'a>>(buf) }
-}
-
-#[inline]
-/// Verifies that a buffer of bytes contains a `Footer`
-/// and returns it.
-/// Note that verification is still experimental and may not
-/// catch every error, or be maximally performant. For the
-/// previous, unchecked, behavior use
-/// `root_as_footer_unchecked`.
-pub fn root_as_footer(buf: &[u8]) -> Result<Footer, flatbuffers::InvalidFlatbuffer> {
-    flatbuffers::root::